### From the command line (you must run this as admin or use sudo):

> 
> `python -m spacy download en_core_web_md`  
> `python -m spacy download en_core_web_lg`&emsp;&emsp;&ensp;*optional library*  
> `python -m spacy download en_vectors_web_lg`&emsp;*optional library*  

In [1]:
# spacy, textblob and nltk for language processing
from textblob import TextBlob, Word

In [4]:
# Import spaCy and load the language library
import spacy
nlp = spacy.load('en_core_web_lg')  # make sure to use a larger model!

OSError: [E050] Can't find model 'en_core_web_lg'. It doesn't seem to be a shortcut link, a Python package or a valid path to a data directory.

In [5]:
nlp('lion').vector.shape

(300,)

In [6]:
doc = nlp('The quick brown fox jumped over the lazy dogs.')
doc.vector.shape

(300,)

## Identifying similar vectors
The best way to expose vector relationships is through the `.similarity()` method of Doc tokens.

In [7]:
# Create a three-token Doc object:
tokens = nlp(u'lion cat pet electricity')

# Iterate through token combinations:
for token1 in tokens:
    for token2 in tokens:
        print(token1.text, token2.text, token1.similarity(token2))

lion lion 1.0
lion cat 0.52654374
lion pet 0.39923766
lion electricity 0.09239591
cat lion 0.52654374
cat cat 1.0
cat pet 0.7505456
cat electricity 0.13433506
pet lion 0.39923766
pet cat 0.7505456
pet pet 1.0
pet electricity 0.18337733
electricity lion 0.09239591
electricity cat 0.13433506
electricity pet 0.18337733
electricity electricity 1.0


## Vector norms
It's sometimes helpful to aggregate 300 dimensions into a [Euclidian (L2) norm](https://en.wikipedia.org/wiki/Norm_%28mathematics%29#Euclidean_norm), computed as the square root of the sum-of-squared-vectors. This is accessible as the `.vector_norm` token attribute. Other helpful attributes include `.has_vector` and `.is_oov` or *out of vocabulary*.

For example, our 685k vector library may not have the word "[nargle](https://en.wikibooks.org/wiki/Muggles%27_Guide_to_Harry_Potter/Magic/Nargle)". To test this:

In [8]:
tokens = nlp(u'dog cat nargle')

for token in tokens:
    print(token.text, token.has_vector, token.vector_norm, token.is_oov)

dog True 7.0336733 False
cat True 6.6808186 False
nargle False 0.0 True


### Vector arithmatic

In [311]:
from scipy import spatial

cosine_similarity = lambda x, y: 1 - spatial.distance.cosine(x, y)

king = nlp.vocab['king'].vector
man = nlp.vocab['man'].vector
woman = nlp.vocab['woman'].vector

# Now we find the closest vector in the vocabulary to the result of "man" - "woman" + "queen"
new_vector = king - man + woman
computed_similarities = []

for word in nlp.vocab:
    # Ignore words without vectors and mixed-case words:
    if word.has_vector:
        if word.is_lower:
            if word.is_alpha:# not number or something
                similarity = cosine_similarity(new_vector, word.vector)
                computed_similarities.append((word, similarity))

computed_similarities = sorted(computed_similarities, key=lambda item: -item[1])

print([w[0].text for w in computed_similarities[:10]])

['king', 'queen', 'prince', 'kings', 'princess', 'royal', 'throne', 'queens', 'monarch', 'kingdom']


## NLTK's VADER module
VADER is an NLTK module that provides sentiment scores based on words used ("completely" boosts a score, while "slightly" reduces it), on capitalization & punctuation ("GREAT!!!" is stronger than "great."), and negations (words like "isn't" and "doesn't" affect the outcome).
<br>To view the source code visit https://www.nltk.org/_modules/nltk/sentiment/vader.html

In [312]:
import nltk
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\ashamsa\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [313]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer

sid = SentimentIntensityAnalyzer()

In [314]:
a = 'This was a good movie.'
sid.polarity_scores(a)

{'neg': 0.0, 'neu': 0.508, 'pos': 0.492, 'compound': 0.4404}

In [315]:
a = 'This was the best, most awesome movie EVER MADE!!!'
sid.polarity_scores(a)

{'neg': 0.0, 'neu': 0.425, 'pos': 0.575, 'compound': 0.8877}

In [316]:
a = 'This was the worst film to ever disgrace the screen.'
sid.polarity_scores(a)

{'neg': 0.477, 'neu': 0.523, 'pos': 0.0, 'compound': -0.8074}

## Text Blob

In [9]:
def estimate_polarity(text):
    return TextBlob(text).sentiment.polarity

In [10]:
a = 'This was a good movie.'
estimate_polarity(a)

0.7

In [12]:
a = 'This was the best, most awesome movie EVER MADE!!!'
estimate_polarity(a)

0.8333333333333334

In [13]:
a = 'This was the worst film to ever disgrace the screen.'
estimate_polarity(a)

-1.0

## Use VADER to analyze Amazon Reviews
For this exercise we're going to apply `SentimentIntensityAnalyzer` to a dataset of 10,000 Amazon reviews. Like our movie reviews datasets, these are labeled as either "pos" or "neg". At the end we'll determine the accuracy of our sentiment analysis with VADER.

In [15]:
import numpy as np
import pandas as pd

df = pd.read_csv('data/amazonreviews.tsv', sep='\t')
df.head()

Unnamed: 0,label,review
0,pos,Stuning even for the non-gamer: This sound tra...
1,pos,The best soundtrack ever to anything.: I'm rea...
2,pos,Amazing!: This soundtrack is my favorite music...
3,pos,Excellent Soundtrack: I truly like this soundt...
4,pos,"Remember, Pull Your Jaw Off The Floor After He..."


In [318]:
df['label'].value_counts()

neg    5097
pos    4903
Name: label, dtype: int64

In [319]:
sid.polarity_scores(df.loc[0]['review'])

{'neg': 0.088, 'neu': 0.669, 'pos': 0.243, 'compound': 0.9454}

In [320]:
df.loc[0]['label']

'pos'

### Clean the data (optional):
Recall that our moviereviews.tsv file contained empty records. Let's check to see if any exist in amazonreviews.tsv.

In [321]:
# REMOVE NaN VALUES AND EMPTY STRINGS:
df.dropna(inplace=True)

blanks = []  # start with an empty list

for i,lb,rv in df.itertuples():  # iterate over the DataFrame
    if type(rv)==str:            # avoid NaN values
        if rv.isspace():         # test 'review' for whitespace
            blanks.append(i)     # add matching index numbers to the list

df.drop(blanks, inplace=True)

## Adding Scores and Labels to the DataFrame
In this next section we'll add columns to the original DataFrame to store polarity_score dictionaries, extracted compound scores, and new "pos/neg" labels derived from the compound score. We'll use this last column to perform an accuracy test.

In [322]:
df['scores'] = df['review'].apply(lambda review: sid.polarity_scores(review))
df['compound']  = df['scores'].apply(lambda score_dict: score_dict['compound'])
df['comp_score'] = df['compound'].apply(lambda c: 'pos' if c >=0 else 'neg')

In [323]:
df.head()

Unnamed: 0,label,review,scores,compound,comp_score
0,pos,Stuning even for the non-gamer: This sound tra...,"{'neg': 0.088, 'neu': 0.669, 'pos': 0.243, 'co...",0.9454,pos
1,pos,The best soundtrack ever to anything.: I'm rea...,"{'neg': 0.018, 'neu': 0.837, 'pos': 0.145, 'co...",0.8957,pos
2,pos,Amazing!: This soundtrack is my favorite music...,"{'neg': 0.04, 'neu': 0.692, 'pos': 0.268, 'com...",0.9858,pos
3,pos,Excellent Soundtrack: I truly like this soundt...,"{'neg': 0.09, 'neu': 0.615, 'pos': 0.295, 'com...",0.9814,pos
4,pos,"Remember, Pull Your Jaw Off The Floor After He...","{'neg': 0.0, 'neu': 0.746, 'pos': 0.254, 'comp...",0.9781,pos


## Report on Accuracy
Finally, we'll use scikit-learn to determine how close VADER came to our original 10,000 labels.

In [20]:
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix


In [325]:
accuracy_score(df['label'],df['comp_score'])

0.7091

In [326]:
print(classification_report(df['label'],df['comp_score']))

              precision    recall  f1-score   support

         neg       0.86      0.51      0.64      5097
         pos       0.64      0.91      0.75      4903

   micro avg       0.71      0.71      0.71     10000
   macro avg       0.75      0.71      0.70     10000
weighted avg       0.75      0.71      0.70     10000



In [327]:
print(confusion_matrix(df['label'],df['comp_score']))

[[2623 2474]
 [ 435 4468]]


### Text blob

In [17]:
df['scores'] = df['review'].apply(lambda review: estimate_polarity(review))
df.head()

Unnamed: 0,label,review,scores
0,pos,Stuning even for the non-gamer: This sound tra...,-0.021875
1,pos,The best soundtrack ever to anything.: I'm rea...,0.261111
2,pos,Amazing!: This soundtrack is my favorite music...,0.274691
3,pos,Excellent Soundtrack: I truly like this soundt...,0.272727
4,pos,"Remember, Pull Your Jaw Off The Floor After He...",0.324802


In [18]:
df['comp_score'] = df['scores'].apply(lambda c: 'pos' if c >=0 else 'neg')
df.head()

Unnamed: 0,label,review,scores,comp_score
0,pos,Stuning even for the non-gamer: This sound tra...,-0.021875,neg
1,pos,The best soundtrack ever to anything.: I'm rea...,0.261111,pos
2,pos,Amazing!: This soundtrack is my favorite music...,0.274691,pos
3,pos,Excellent Soundtrack: I truly like this soundt...,0.272727,pos
4,pos,"Remember, Pull Your Jaw Off The Floor After He...",0.324802,pos


In [21]:
accuracy_score(df['label'],df['comp_score'])

0.6881

## ML model-----------------Linear SVC

In [328]:
from sklearn.model_selection import train_test_split

X = df['review']  # this time we want to look at the text
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [329]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer,TfidfTransformer
from sklearn.svm import LinearSVC
nlp = spacy.load('en_core_web_sm')

def model_Pipeline():
    pipeline=Pipeline([('tfidf', TfidfVectorizer()),
                     ('clf', LinearSVC()),
                    ])
    return pipeline

model=model_Pipeline()
model.fit(X_train, y_train)
predictions = model.predict(X_test)

In [330]:
pd.DataFrame(confusion_matrix(y_test,predictions), index=['Pos','Neg'], columns=['Pos','Neg'])

Unnamed: 0,Pos,Neg
Pos,1467,182
Neg,244,1407


In [331]:
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

         neg       0.86      0.89      0.87      1649
         pos       0.89      0.85      0.87      1651

   micro avg       0.87      0.87      0.87      3300
   macro avg       0.87      0.87      0.87      3300
weighted avg       0.87      0.87      0.87      3300



In [332]:
print(accuracy_score(y_test,predictions))

0.8709090909090909


## Grid Search

In [333]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

In [345]:
def model_Pipeline(): 
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer()),
        ('clf', RandomForestClassifier())
    ])
    return pipeline

In [346]:
model_Pipeline().get_params().keys()

dict_keys(['memory', 'steps', 'tfidf', 'clf', 'tfidf__analyzer', 'tfidf__binary', 'tfidf__decode_error', 'tfidf__dtype', 'tfidf__encoding', 'tfidf__input', 'tfidf__lowercase', 'tfidf__max_df', 'tfidf__max_features', 'tfidf__min_df', 'tfidf__ngram_range', 'tfidf__norm', 'tfidf__preprocessor', 'tfidf__smooth_idf', 'tfidf__stop_words', 'tfidf__strip_accents', 'tfidf__sublinear_tf', 'tfidf__token_pattern', 'tfidf__tokenizer', 'tfidf__use_idf', 'tfidf__vocabulary', 'clf__bootstrap', 'clf__class_weight', 'clf__criterion', 'clf__max_depth', 'clf__max_features', 'clf__max_leaf_nodes', 'clf__min_impurity_decrease', 'clf__min_impurity_split', 'clf__min_samples_leaf', 'clf__min_samples_split', 'clf__min_weight_fraction_leaf', 'clf__n_estimators', 'clf__n_jobs', 'clf__oob_score', 'clf__random_state', 'clf__verbose', 'clf__warm_start'])

In [347]:
pipeline = model_Pipeline()
param = {'clf__n_estimators': [10, 150, 300],
         'clf__max_depth': [30, 60, 90, None]}
model = GridSearchCV(pipeline, param, cv=5, n_jobs=-1)
model.fit(X_train, y_train);
predictions = model.predict(X_test)
# display_results(y_test, y_pred)

In [348]:
print(accuracy_score(y_test,predictions))

0.8342424242424242


## ML with extracted sentiments from NTLK

In [349]:
df.head()

Unnamed: 0,label,review,scores,compound,comp_score
0,pos,Stuning even for the non-gamer: This sound tra...,"{'neg': 0.088, 'neu': 0.669, 'pos': 0.243, 'co...",0.9454,pos
1,pos,The best soundtrack ever to anything.: I'm rea...,"{'neg': 0.018, 'neu': 0.837, 'pos': 0.145, 'co...",0.8957,pos
2,pos,Amazing!: This soundtrack is my favorite music...,"{'neg': 0.04, 'neu': 0.692, 'pos': 0.268, 'com...",0.9858,pos
3,pos,Excellent Soundtrack: I truly like this soundt...,"{'neg': 0.09, 'neu': 0.615, 'pos': 0.295, 'com...",0.9814,pos
4,pos,"Remember, Pull Your Jaw Off The Floor After He...","{'neg': 0.0, 'neu': 0.746, 'pos': 0.254, 'comp...",0.9781,pos


In [350]:
df['positive'] = df['scores'].apply(lambda score_dict: score_dict['pos'])
df['negative'] = df['scores'].apply(lambda score_dict: score_dict['neg'])
df['neutral'] = df['scores'].apply(lambda score_dict: score_dict['neu'])

In [351]:
df.head()

Unnamed: 0,label,review,scores,compound,comp_score,positive,negative,neutral
0,pos,Stuning even for the non-gamer: This sound tra...,"{'neg': 0.088, 'neu': 0.669, 'pos': 0.243, 'co...",0.9454,pos,0.243,0.088,0.669
1,pos,The best soundtrack ever to anything.: I'm rea...,"{'neg': 0.018, 'neu': 0.837, 'pos': 0.145, 'co...",0.8957,pos,0.145,0.018,0.837
2,pos,Amazing!: This soundtrack is my favorite music...,"{'neg': 0.04, 'neu': 0.692, 'pos': 0.268, 'com...",0.9858,pos,0.268,0.04,0.692
3,pos,Excellent Soundtrack: I truly like this soundt...,"{'neg': 0.09, 'neu': 0.615, 'pos': 0.295, 'com...",0.9814,pos,0.295,0.09,0.615
4,pos,"Remember, Pull Your Jaw Off The Floor After He...","{'neg': 0.0, 'neu': 0.746, 'pos': 0.254, 'comp...",0.9781,pos,0.254,0.0,0.746


In [352]:
X = df[['positive','negative','neutral']]  # this time we want to look at the text
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [353]:
def model_Pipeline():
    pipeline=Pipeline([
                     ('clf', LinearSVC()),
                    ])
    return pipeline

model=model_Pipeline()
model.fit(X_train, y_train)
predictions = model.predict(X_test)

In [354]:
print(accuracy_score(y_test,predictions))

0.7721212121212121


## ML wih reviews and NTLK sentiments

https://scikit-learn.org/stable/auto_examples/compose/plot_column_transformer_mixed_types.html

In [387]:
X = df[['review','positive','negative','neutral']]  # this time we want to look at the text
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [406]:
from sklearn.preprocessing import StandardScaler
from sklearn.compose import make_column_transformer
preprocessor = make_column_transformer(
    (StandardScaler(),['positive','negative','neutral']),
        ( TfidfVectorizer(),'review')
    
)

In [407]:
def model_Pipeline():
    pipeline=Pipeline([
                    ('preprocessor', preprocessor),
                     ('clf', LinearSVC()),
                    ])
    return pipeline



In [408]:
model=model_Pipeline()


In [409]:
model.fit(X_train, y_train)


Pipeline(memory=None,
     steps=[('preprocessor', ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
         transformer_weights=None,
         transformers=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True), ['positive', 'negative', 'neutral']), ('tfidfvectorizer', TfidfVectori...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])

In [410]:
predictions = model.predict(X_test)

In [411]:
print(accuracy_score(y_test,predictions))

0.8724242424242424
