In [1]:
import numpy as np
import pandas as pd

# Load the tab-separated reviews file and preview its first five rows.
df = pd.read_csv('moviereviews2.tsv', delimiter='\t')
df.head(n=5)

Unnamed: 0,label,review
0,pos,I loved this movie and will watch it again. Or...
1,pos,"A warm, touching movie that has a fantasy-like..."
2,pos,I was not expecting the powerful filmmaking ex...
3,neg,"This so-called ""documentary"" tries to tell tha..."
4,pos,This show has been my escape from reality for ...


In [2]:
# Number of rows in the DataFrame as loaded.
len(df)

6000

In [3]:
from IPython.display import Markdown, display

# Render the review stored under index label 0 as a Markdown block quote.
first_review = df['review'][0]
display(Markdown('> ' + first_review))

> I loved this movie and will watch it again. Original twist to Plot of Man vs Man vs Self. I think this is Kurt Russell's best movie. His eyes conveyed more than most actors words. Perhaps there's hope for Mankind in spite of Government Intervention?

In [4]:
# Per-column count of missing values.
df.isna().sum()

label      0
review    20
dtype: int64

In [5]:
# Remove, in place, every row that has a missing value in any column.
df.dropna(axis=0, how='any', inplace=True)

# Remaining row count.
df.shape[0]

5980

In [6]:
# Collect the index labels of reviews that contain no visible text.
# `not text.strip()` is true for whitespace-only strings AND for the empty
# string (which `str.isspace()` reports as False, so it used to slip through).
# The isinstance guard skips non-string values such as NaN.
blanks = [
    idx
    for idx, _label, text in df.itertuples()  # rows are (index, label, review)
    if isinstance(text, str) and not text.strip()
]

print(len(blanks), 'blanks: ', blanks)

0 blanks:  []


In [7]:
# Remove, in place, the rows whose index labels were collected in `blanks`.
df.drop(index=blanks, inplace=True)

# Remaining row count.
df.shape[0]

5980

In [8]:
# Count how many rows carry each distinct label value.
df['label'].value_counts()

neg    2990
pos    2990
Name: label, dtype: int64

In [9]:
from sklearn.model_selection import train_test_split

# Features are the raw review texts; targets are their labels.
X, y = df['review'], df['label']

# Hold out 33% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [10]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

# Both pipelines vectorize the text with TF-IDF and then apply a classifier;
# every estimator is created with its default arguments.

# Naïve Bayes:
nb_steps = [
    ('tfidf', TfidfVectorizer()),
    ('clf', MultinomialNB()),
]
text_clf_nb = Pipeline(steps=nb_steps)

# Linear SVC:
lsvc_steps = [
    ('tfidf', TfidfVectorizer()),
    ('clf', LinearSVC()),
]
text_clf_lsvc = Pipeline(steps=lsvc_steps)

In [11]:
# Fit the TF-IDF + Naive Bayes pipeline on the training split.
text_clf_nb.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('clf',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [12]:
# Predict labels for the held-out test reviews with the Naive Bayes pipeline.
predictions = text_clf_nb.predict(X_test)

In [13]:
from sklearn import metrics

# Compare the current `predictions` against the true test labels.
confusion = metrics.confusion_matrix(y_test, predictions)
report = metrics.classification_report(y_test, predictions)

print(confusion)
print(report)

[[940  51]
 [136 847]]
              precision    recall  f1-score   support

         neg       0.87      0.95      0.91       991
         pos       0.94      0.86      0.90       983

    accuracy                           0.91      1974
   macro avg       0.91      0.91      0.91      1974
weighted avg       0.91      0.91      0.91      1974



In [14]:
# Fit the TF-IDF + Linear SVC pipeline on the same training split.
text_clf_lsvc.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('clf',
                 LinearSVC(C=1.0, class_weight=None, dual=True,
                           fit_intercept=True, intercept_scaling=1,
               

In [15]:
# Predict test labels with the Linear SVC pipeline.
# NOTE: this rebinds `predictions`, replacing the Naive Bayes results.
predictions = text_clf_lsvc.predict(X_test)

In [16]:
from sklearn import metrics

# Compare the current `predictions` against the true test labels.
confusion = metrics.confusion_matrix(y_test, predictions)
report = metrics.classification_report(y_test, predictions)

print(confusion)
print(report)

[[900  91]
 [ 63 920]]
              precision    recall  f1-score   support

         neg       0.93      0.91      0.92       991
         pos       0.91      0.94      0.92       983

    accuracy                           0.92      1974
   macro avg       0.92      0.92      0.92      1974
weighted avg       0.92      0.92      0.92      1974



In [49]:
import spacy
import en_core_web_md
import en_core_web_lg

In [50]:
# Load the large English model package; `nlp` is the pipeline used for the
# vector lookups in the cells that follow.
nlp = en_core_web_lg.load()

In [30]:
# Show the vector the pipeline produces for the one-word text 'girl'.
nlp(u'girl').vector

array([-2.6909e-01,  2.5307e-01, -5.7593e-01,  1.6235e-01,  1.6094e-01,
       -1.9802e-01, -2.8971e-02, -2.5352e-01, -7.4811e-02,  2.1331e+00,
       -4.6460e-01,  5.6153e-02, -5.0651e-01, -4.4885e-01, -4.7379e-01,
        4.4561e-01, -1.2656e-01,  6.4420e-01,  1.6048e-01,  3.4406e-01,
        1.1782e-01, -5.4542e-01, -3.9005e-01, -1.7133e-01, -1.0692e-01,
        4.0355e-01,  1.4711e-01, -2.1785e-01,  2.3270e-01,  4.2617e-03,
       -1.8534e-01,  6.4222e-02,  2.8307e-01, -4.6408e-03,  6.7360e-01,
       -1.2961e-01,  3.6804e-02, -2.9297e-01, -1.0704e-01, -1.0144e-01,
        3.6363e-01, -3.4643e-01,  8.9158e-02, -3.3299e-01, -1.4088e-01,
       -1.1175e-03, -1.5586e-01, -9.2016e-02,  2.0663e-01, -7.1369e-01,
       -4.2443e-01, -2.0274e-01, -4.9529e-01, -1.4574e-01, -1.2492e-01,
        4.8865e-01,  9.1048e-02, -4.8057e-02,  2.1768e-01,  1.3217e-02,
       -1.2011e-01, -3.8334e-01, -7.9559e-02, -2.4453e-01, -2.0607e-01,
       -1.6878e-01, -1.3342e-01,  4.5845e-01,  4.1422e-02,  2.04

In [31]:
# Build a Doc holding three tokens:
tokens = nlp(u'lion cat pet')

# Print the similarity of every ordered pair of tokens (self-pairs included):
for left in tokens:
    for right in tokens:
        score = left.similarity(right)
        print(left.text, right.text, score)

lion lion 1.0
lion cat 0.52654374
lion pet 0.39923766
cat lion 0.52654374
cat cat 1.0
cat pet 0.7505456
pet lion 0.39923766
pet cat 0.7505456
pet pet 1.0


In [51]:
# Shape of the model's vector table (presumably rows = stored vectors,
# columns = vector width; confirm against the spaCy Vectors docs).
nlp.vocab.vectors.shape

(684831, 300)

In [33]:
# Doc whose tokens are inspected for vector attributes in the next cell.
tokens = nlp(u'love wife Simran')

In [35]:
# For each token, print its text, whether it has a vector, the vector's norm,
# and its out-of-vocabulary flag.
for tok in tokens:
    print(tok.text, tok.has_vector, tok.vector_norm, tok.is_oov)

love True 6.04035 False
wife True 6.672992 False
Simran True 7.3687935 False


In [36]:
from scipy import spatial


def cosine_similarity(vec1, vec2):
    """Return the cosine similarity of two 1-D vectors.

    SciPy's ``spatial.distance.cosine`` returns the cosine *distance*
    (1 - similarity), so it is subtracted from 1 to recover the similarity.

    Args:
        vec1: First vector (any 1-D array-like accepted by SciPy).
        vec2: Second vector, same length as ``vec1``.

    Returns:
        The cosine of the angle between ``vec1`` and ``vec2``.
    """
    return 1 - spatial.distance.cosine(vec1, vec2)

In [37]:
# Look up the vocabulary vectors for the three words used in the analogy.
king, man, woman = (
    nlp.vocab[term].vector for term in ('king', 'man', 'woman')
)

In [38]:
# Build the analogy query vector "king" - "man" + "woman"; the goal is to find
# the vocabulary vectors closest to this result.
new_vector = king - man + woman
# Holds (lexeme, similarity) pairs for the candidate words.
computed_similarities = []

In [39]:
# Score every eligible vocabulary entry against the query vector.
# The list is rebuilt from scratch here, so re-running this cell does not
# append duplicate entries to results from a previous run.
# Entries are skipped when they have no vector, are not all lower-case,
# or are not purely alphabetic.
computed_similarities = [
    (word, cosine_similarity(new_vector, word.vector))
    for word in nlp.vocab
    if word.has_vector and word.is_lower and word.is_alpha
]

In [40]:
# Order the (lexeme, similarity) pairs from most to least similar.
computed_similarities.sort(key=lambda pair: -pair[1])

In [41]:
# Show the text of the ten highest-ranked entries.
top_ten = computed_similarities[:10]
print([lexeme.text for lexeme, _score in top_ten])

['king', 'queen', 'commoner', 'highness', 'prince', 'sultan', 'maharajas', 'princes', 'kumbia', 'kings']
