In [1]:
import pandas as pd
import numpy as np

df = pd.read_csv('data.csv')
df.head()


Unnamed: 0,target,review
0,0,The film starts with a manager (Nicholas Bell)...
1,0,It must be assumed that those who praised this...
2,0,"This movie could have been very good, but come..."
3,0,I watched this video at a friend's house. I'm ...
4,0,"A friend of mine bought this film for £1, and ..."


In [5]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df['review'],
                                                    df['target'],
                                                    train_size = 0.8,
                                                    test_size = 0.2,
                                                    # random_state=14
                                                    )

In [6]:
print('X_train first entry:\n\n', X_train[0])
print('\n\nX_train shape: ', X_train.shape)

X_train first entry:

 The film starts with a manager (Nicholas Bell) giving welcome investors (Robert Carradine) to Primal Park . A secret project mutating a primal animal using fossilized DNA, like ¨Jurassik Park¨, and some scientists resurrect one of nature's most fearsome predators, the Sabretooth tiger or Smilodon . Scientific ambition turns deadly, however, and when the high voltage fence is opened the creature escape and begins savagely stalking its prey - the human visitors , tourists and scientific.Meanwhile some youngsters enter in the restricted area of the security center and are attacked by a pack of large pre-historical animals which are deadlier and bigger . In addition , a security agent (Stacy Haiduk) and her mate (Brian Wimmer) fight hardly against the carnivorous Smilodons. The Sabretooths, themselves , of course, are the real star stars and they are astounding terrifyingly though not convincing. The giant animals savagely are stalking its prey and the group run afou

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

vect = CountVectorizer().fit(X_train)


In [8]:
vect.get_feature_names()[::2000]

['00',
 'afterlives',
 'armitage',
 'beaudine',
 'bots',
 'caroles',
 'clocks',
 'councellor',
 'defying',
 'documenter',
 'emigrate',
 'famine',
 'francois',
 'gonads',
 'headedness',
 'ibsen',
 'irl',
 'kirshner',
 'lingers',
 'marry',
 'mixtures',
 'nevermind',
 'ovas',
 'physicallity',
 'projected',
 'recreated',
 'rollins',
 'seaquest',
 'sking',
 'stallyns',
 'swigs',
 'tissue',
 'uncomprehended',
 'vilest',
 'woodmobile']

In [9]:
len(vect.get_feature_names())

69167

In [10]:
X_train_vectorized = vect.transform(X_train)

In [11]:
X_train_vectorized

<20000x69167 sparse matrix of type '<class 'numpy.int64'>'
	with 2735524 stored elements in Compressed Sparse Row format>

In [12]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_train_vectorized, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [13]:
from sklearn.metrics import roc_auc_score  

predictions = model.predict(vect.transform(X_test))
print('AUC: ', roc_auc_score(y_test, predictions))

AUC:  0.881717075131


In [14]:
feature_names = np.array(vect.get_feature_names())
sorted_coef_index = model.coef_[0].argsort()

print('Smallest Coefs:\n{}\n'.format(feature_names[sorted_coef_index[:10]]))
print('Largest Coefs:\n{}'.format(feature_names[sorted_coef_index[:-11:-1]]))

Smallest Coefs:
['worst' 'disappointment' 'waste' 'poorly' 'awful' 'boring' 'lacks'
 'mildly' 'disappointing' 'mess']

Largest Coefs:
['refreshing' 'perfect' 'erotic' 'appreciated' 'excellent' 'wonderfully'
 'rare' 'surprisingly' 'superb' 'carrey']


In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

vect = TfidfVectorizer(min_df=5).fit(X_train)
len(vect.get_feature_names())

24390

In [16]:
X_train_vectorized = vect.transform(X_train)

model = LogisticRegression()
model.fit(X_train_vectorized, y_train)

predictions = model.predict(vect.transform(X_test))
print('AUC: ', roc_auc_score(y_test, predictions))

AUC:  0.884315901579


In [17]:
X_train_vectorized = vect.transform(X_train)

model = LogisticRegression()
model.fit(X_train_vectorized, y_train)

predictions = model.predict(vect.transform(X_test))
print('AUC: ', roc_auc_score(y_test, predictions))

AUC:  0.884315901579


In [18]:
feature_names = np.array(vect.get_feature_names())
sorted_tfidf_index = X_train_vectorized.max(0).toarray()[0].argsort()

print('Smallest tfidf:\n{}\n'.format(feature_names[sorted_tfidf_index[:10]]))
print('Largest tfidf:\n{}'.format(feature_names[sorted_tfidf_index[:-11:-1]]))

Smallest tfidf:
['cruiserweight' 'suplexes' 'pup' 'annihilated' 'gauche' 'statistic'
 'booed' 'hypocrites' 'mimics' 'oncoming']

Largest tfidf:
['name' 'pokemon' 'steve' 'dev' 'smallville' 'wei' 'woo' 'doodlebops'
 'casper' 'weller']


In [19]:
sorted_coef_index = model.coef_[0].argsort()

print('Smallest Coefs:\n{}\n'.format(feature_names[sorted_coef_index[:10]]))
print('Largest Coefs:\n{}'.format(feature_names[sorted_coef_index[:-11:-1]]))

Smallest Coefs:
['worst' 'bad' 'awful' 'boring' 'waste' 'poor' 'nothing' 'terrible' 'no'
 'worse']

Largest Coefs:
['great' 'excellent' 'best' 'perfect' 'wonderful' 'well' 'amazing' 'loved'
 'love' 'favorite']


In [20]:
vect = CountVectorizer(min_df=5, ngram_range=(1,2)).fit(X_train)
X_train_vectorized = vect.transform(X_train)

len(vect.get_feature_names())

128769

In [21]:
model = LogisticRegression()
model.fit(X_train_vectorized, y_train)

predictions = model.predict(vect.transform(X_test))
print('AUC: ', roc_auc_score(y_test, predictions))

AUC:  0.896329193644


In [35]:
import pickle
filename = 'finalized_model.sav'
pickle.dump(model, open(filename, 'wb'))

In [23]:
feature_names = np.array(vect.get_feature_names())
sorted_coef_index = model.coef_[0].argsort()

print('Smallest Coefs:\n{}\n'.format(feature_names[sorted_coef_index[:10]]))
print('Largest Coefs:\n{}'.format(feature_names[sorted_coef_index[:-11:-1]]))

Smallest Coefs:
['worst' 'awful' 'boring' 'waste' 'disappointment' 'poor' 'poorly'
 'disappointing' 'the worst' 'lame']

Largest Coefs:
['excellent' 'perfect' 'wonderful' 'superb' 'amazing' 'enjoyable'
 'brilliant' 'well worth' 'rare' 'refreshing']


In [1]:
print('The END')

The END


In [15]:
vect = CountVectorizer(min_df=5,
                       ngram_range=(1,3),
                       # stop_words='english'
                       ).fit(X_train)
X_train_vectorized = vect.transform(X_train)

len(vect.get_feature_names())

201791

In [63]:
model = LogisticRegression()                        # With 75% data
model.fit(X_train_vectorized, y_train)

predictions = model.predict(vect.transform(X_test))
print('AUC: ', roc_auc_score(y_test, predictions))

AUC:  0.898928105737


In [65]:
feature_names = np.array(vect.get_feature_names())
sorted_coef_index = model.coef_[0].argsort()

print('Smallest Coefs:\n{}\n'.format(feature_names[sorted_coef_index[:10]]))
print('Largest Coefs:\n{}'.format(feature_names[sorted_coef_index[:-11:-1]]))

Smallest Coefs:
['worst' 'awful' 'boring' 'waste' 'disappointing' 'poor' 'poorly'
 'terrible' 'lacks' 'disappointment']

Largest Coefs:
['excellent' 'wonderful' 'perfect' 'great' 'superb' 'brilliant' 'enjoyable'
 'amazing' 'enjoyed' 'must see']


In [16]:
model = LogisticRegression()            # 80% data, 1-3grams, min_df=5
model.fit(X_train_vectorized, y_train)

predictions = model.predict(vect.transform(X_test))
print('Accuracy: ', roc_auc_score(y_test, predictions))

Accuracy:  0.898117746903


In [29]:
vect = CountVectorizer(min_df=5,
                       ngram_range=(1,3),       # now without stopwords
                       # stop_words='english'
                       ).fit(X_train)
X_train_vectorized = vect.transform(X_train)

len(vect.get_feature_names())

201791

In [30]:
model = LogisticRegression()            # 80% data, 1-3grams, min_df=5, w/o stopwords
model.fit(X_train_vectorized, y_train)

predictions = model.predict(vect.transform(X_test))
print('Accuracy: ', roc_auc_score(y_test, predictions))

Accuracy:  0.898117746903


In [18]:
vect = CountVectorizer(analyzer = 'char',
                       min_df=5,
                       ngram_range=(1,3),       
                       # stop_words='english'
                       ).fit(X_train)
X_train_vectorized = vect.transform(X_train)

len(vect.get_feature_names())

19898

In [19]:
model = LogisticRegression()            
model.fit(X_train_vectorized, y_train)

predictions = model.predict(vect.transform(X_test))
print('Accuracy: ', roc_auc_score(y_test, predictions))

Accuracy:  0.84337182451


In [22]:
print(model.predict(vect.transform("txt_sentoken/neg/cv000_29416.txt")))

NameError: name 'r' is not defined

In [20]:
feature_names = np.array(vect.get_feature_names())
sorted_coef_index = model.coef_[0].argsort()

print('Smallest Coefs:\n{}\n'.format(feature_names[sorted_coef_index[:10]]))
print('Largest Coefs:\n{}'.format(feature_names[sorted_coef_index[:-11:-1]]))

Smallest Coefs:
['dul' 'ntm' 'hac' '4/1' 'lri' ' ok' 'x i' 'b m' 'oor' 'd!!']

Largest Coefs:
['7/' '7/1' 'rld' 't 5' 'm..' ' 8' 'awe' 'a 9' 'ct.' 's.b']


In [None]:
test_data = pd