In [1]:
import os
import numpy as np 
import nltk
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, f1_score

In [2]:
# Base directory for the data files: the directory the notebook runs from.
path = os.path.abspath(os.curdir)

In [4]:
# Load the negative-polarity reviews: one review per line of the file.
fn = os.path.join(path, 'data/rt-polarity.neg')

with open(fn, "r", encoding='latin-1') as f:
    # Iterate over the file rather than calling str.splitlines() on its full
    # contents: splitlines() also breaks on \x0b, \x0c, \x1c-\x1e and \x85,
    # and latin-1 decodes each of those from a single byte, so a review that
    # contains one would be cut into several bogus rows. File iteration only
    # splits on real newlines.
    texts_neg = np.array([line.rstrip('\n') for line in f])

# Show the first few reviews as a quick sanity check.
for review in texts_neg[:5]:
    print('\n', review)


 simplistic , silly and tedious . 

 it's so laddish and juvenile , only teenage boys could possibly find it funny . 

 exploitative and largely devoid of the depth or sophistication that would make watching such a graphic treatment of the crimes bearable . 

 [garbus] discards the potential for pathological study , exhuming instead , the skewed melodrama of the circumstantial situation . 

 a visually flashy but narratively opaque and emotionally vapid exercise in style and mystification . 


In [5]:
# Load the positive-polarity reviews: one review per line of the file.
fn = os.path.join(path, 'data/rt-polarity.pos')

with open(fn, "r", encoding='latin-1') as f:
    # Iterate over the file rather than calling str.splitlines() on its full
    # contents: splitlines() also breaks on \x0b, \x0c, \x1c-\x1e and \x85,
    # and latin-1 decodes each of those from a single byte, so a review that
    # contains one would be cut into several bogus rows. File iteration only
    # splits on real newlines.
    texts_pos = np.array([line.rstrip('\n') for line in f])

# Show the first few reviews as a quick sanity check.
for review in texts_pos[:5]:
    print('\n', review)


 the rock is destined to be the 21st century's new " conan " and that he's going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal . 

 the gorgeously elaborate continuation of " the lord of the rings " trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson's expanded vision of j . r . r . tolkien's middle-earth . 

 effective but too-tepid biopic

 if you sometimes like to go to the movies to have fun , wasabi is a good place to start . 

 emerges as something rare , an issue movie that's so honest and keenly observed that it doesn't feel like one . 


In [6]:
# Turn each 1-D array of reviews into a column and attach a label column
# next to it: 0 for negative reviews, 1 for positive ones. Joining text with
# numbers yields a string array, so the labels end up stored as '0.0'/'1.0'.
neg_column = texts_neg.reshape(-1, 1)
neg_labels = np.zeros(neg_column.shape)
texts_neg = np.concatenate((neg_column, neg_labels), axis=1)

pos_column = texts_pos.reshape(-1, 1)
pos_labels = np.ones(pos_column.shape)
texts_pos = np.concatenate((pos_column, pos_labels), axis=1)

In [7]:
# Stack the labelled negative rows on top of the labelled positive rows.
data = np.concatenate((texts_neg, texts_pos), axis=0)
data

array([['simplistic , silly and tedious . ', '0.0'],
       ["it's so laddish and juvenile , only teenage boys could possibly find it funny . ",
        '0.0'],
       ['exploitative and largely devoid of the depth or sophistication that would make watching such a graphic treatment of the crimes bearable . ',
        '0.0'],
       ...,
       ['standing in the shadows of motown is the best kind of documentary , one that makes a depleted yesterday feel very much like a brand-new tomorrow . ',
        '1.0'],
       ["it's nice to see piscopo again after all these years , and chaykin and headly are priceless . ",
        '1.0'],
       ['provides a porthole into that noble , trembling incoherence that defines us all . ',
        '1.0']], dtype='<U269')

In [8]:
# Column 0 holds the review text; column 1 holds the label as a string
# ('0.0' / '1.0'), so it is parsed as a float before being cast to an integer.
# The builtin `float` is used because the `np.float` alias was deprecated in
# NumPy 1.20 and removed in 1.24, where it raises AttributeError.
labels = data[:, 1].astype(float).astype(np.int16)
X_train, X_test, y_train, y_test = train_test_split(data[:, 0], labels,
                                                    random_state=2019)

In [9]:
# Learn a unigram + bigram vocabulary from the training reviews only.
vect = CountVectorizer(ngram_range=(1, 2)).fit(X_train)
# Number of features. `vocabulary_` maps each term to its column index, so its
# length is the feature count. It replaces get_feature_names(), which was
# removed in scikit-learn 1.2, and is available on older releases as well.
len(vect.vocabulary_)

100077

In [10]:
# Encode both splits with the vocabulary that was fitted on the training set.
X_train_vectorized, X_test_vectorized = (
    vect.transform(split) for split in (X_train, X_test)
)

In [11]:
# Hyper-parameters for the logistic-regression classifier, kept in one place.
logreg_params = dict(C=3, tol=0.06, max_iter=200, random_state=2019)
# Fit the classifier on the vectorized training reviews.
clf = LogisticRegression(**logreg_params).fit(X_train_vectorized, y_train)



In [12]:
# Hard class predictions and signed decision scores for the held-out reviews.
predictions = clf.predict(X_test_vectorized)
scores = clf.decision_function(X_test_vectorized)

# Compute all three test-set metrics first, then report them together.
test_f1 = f1_score(y_test, predictions)
test_auc = roc_auc_score(y_test, scores)  # AUC is computed from the scores, not the labels
test_accuracy = clf.score(X_test_vectorized, y_test)

print('f1: %0.03f' % test_f1)
print('AUC: %0.03f ' % test_auc)
print('Accuracy: %0.03f' % test_accuracy)

f1: 0.767
AUC: 0.839 
Accuracy: 0.769


In [13]:
# `vocabulary_` maps term -> column index; ordering the terms by that index
# lines the names up with the columns of clf.coef_. This replaces
# vect.get_feature_names(), which was removed in scikit-learn 1.2, and gives
# the same ordering on older releases.
vocabulary = vect.vocabulary_
feature_names = np.array(sorted(vocabulary, key=vocabulary.get))
# Feature indices ordered from the most negative to the most positive coefficient.
sorted_coef_index = clf.coef_[0].argsort()
# First ten entries: the terms with the smallest (most negative) coefficients.
print('Smallest coefs:\n{}\n'.format(feature_names[sorted_coef_index[:10]]))
# Last ten entries, walked backwards: the terms with the largest coefficients.
print('Largest coefs: \n{}'.format(feature_names[sorted_coef_index[:-11:-1]]))

Smallest coefs:
['dull' 'boring' 'routine' 'pretentious' 'too' 'flat' 'unfunny' 'worst'
 'tedious' 'mediocre']

Largest coefs: 
['wonderful' 'solid' 'fun' 'enjoyable' 'treat' 'still' 'cinema'
 'masterpiece' 'smart' 'refreshing']
