## Turn on multiline output 

In [1]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

## Amazon Food Review Analysis

Data Source: https://www.kaggle.com/snap/amazon-fine-food-reviews <br>

EDA: https://nycdatascience.com/blog/student-works/amazon-fine-foods-visualization/


The Amazon Fine Food Reviews dataset consists of reviews of fine foods from Amazon.<br>

Number of reviews: 568,454<br>
Number of users: 256,059<br>
Number of products: 74,258<br>
Timespan: Oct 1999 - Oct 2012<br>
Number of Attributes/Columns in data: 10 

Attribute Information:

1. Id
2. ProductId - unique identifier for the product
3. UserId - unqiue identifier for the user
4. ProfileName
5. HelpfulnessNumerator - number of users who found the review helpful
6. HelpfulnessDenominator - number of users who indicated whether they found the review helpful or not
7. Score - rating between 1 and 5
8. Time - timestamp for the review
9. Summary - brief summary of the review
10. Text - text of the review


#### Objective:
Given a review, determine whether the review is positive (Rating of 4 or 5) or negative (rating of 1 or 2).

<br>
[Q] How to determine if a review is positive or negative?<br>
<br> 
[Ans] We could use the Score/Rating. A rating of 4 or 5 could be cosnidered a positive review. A review of 1 or 2 could be considered negative. A review of 3 is nuetral and ignored. This is an approximate and proxy way of determining the polarity (positivity/negativity) of a review.


### Load the data using sqlite file 

In [1]:
import sqlite3
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import string

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, auc

import nltk
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

import re

from gensim.models import Word2Vec
from gensim.models import keyedvectors

import pickle

from tqdm import tqdm

import os

*download dataset from this link* : 
https://www.kaggle.com/snap/amazon-fine-food-reviews

In [2]:
con = sqlite3.connect("dataset/database.sqlite")

In [4]:
# now only fetch those rows which has score 1,2,4 and 5
data = pd.read_sql_query("SELECT * FROM Reviews WHERE Score!=3", con)
data.head(3)
data.shape

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...


(525814, 10)

In [5]:
# change reviews with value 4 and 5 ==> positive (value 1)
# and 1 and 2 ==> negative (value 0)
def positive_negative(review):
    if(review > 3):
        return 1
    else:
        return 0

scores = data["Score"]
scores = scores.map(positive_negative)
data["Score"] = scores
data.head(3)

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,1,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,0,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,1,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...


### Data Cleaning 

In [6]:
display= pd.read_sql_query("""
SELECT *
FROM Reviews
WHERE Score != 3 AND UserId="AR5J8UI46CURR"
ORDER BY ProductID
""", con)
display.head(3)

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,78445,B000HDL1RQ,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...
1,138317,B000HDOPYC,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...
2,138277,B000HDOPYM,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...


In [7]:
# Sorting data according to ProductId in ascending order
sorted_data = data.sort_values("ProductId", axis=0, ascending=True, inplace=False, kind="quicksort", na_position="last")

In [8]:
# remove duplicate data rows means deduplication
final_data = sorted_data.drop_duplicates(subset={"UserId", "ProfileName","Time","Text"}, keep="first", inplace=False)
final_data.shape

(364173, 10)

In [9]:
# See how much data we holds now
final_data.shape[0]*100 / data.shape[0]

69.25890143662969

**Observation**:- It was also seen that in two rows given below the value of HelpfulnessNumerator is greater than HelpfulnessDenominator which is not practically possible hence these two rows too are removed from calcualtions

In [10]:
display= pd.read_sql_query("""
SELECT *
FROM Reviews
WHERE Score != 3 AND Id=44737 OR Id=64422
ORDER BY ProductID
""", con)

display.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,64422,B000MIDROQ,A161DK06JJMCYF,"J. E. Stephens ""Jeanne""",3,1,5,1224892800,Bought This for My Son at College,My son loves spaghetti so I didn't hesitate or...
1,44737,B001EQ55RW,A2V0I904FH7ABY,Ram,3,2,4,1212883200,Pure cocoa taste with crunchy almonds inside,It was almost a 'love at first bite' - the per...


In [11]:
final_data = final_data[final_data.HelpfulnessNumerator <= final_data.HelpfulnessDenominator]

In [12]:
final_data["Score"].value_counts()
# 1 ==> +v1 ,  0 ==> -ve

1    307061
0     57110
Name: Score, dtype: int64

### Text Preprocessing 

Now that we have finished deduplication our data requires some preprocessing before we go on further with analysis and making the prediction model.

Hence in the Preprocessing phase we do the following in the order below:-

1. Begin by removing the html tags
2. Remove any punctuations or limited set of special characters like , or . or # etc.
3. Check if the word is made up of english letters and is not alpha-numeric
4. Check to see if the length of the word is greater than 2 (as it was researched that there is no adjective in 2-letters)
5. Convert the word to lowercase
6. Remove Stopwords
7. Finally Snowball Stemming the word (it was obsereved to be better than Porter Stemming)<br>

After which we collect the words used to describe positive and negative reviews

#### remove HTML tags

In [13]:
# find sentences containing HTML tags
index = 0 
for sent in final_data["Text"]:
    if(len(re.findall('<.*?>', sent))):
        print(index, " ------ ", sent)
        break
    index += 1

6  ------  I set aside at least an hour each day to read to my son (3 y/o). At this point, I consider myself a connoisseur of children's books and this is one of the best. Santa Clause put this under the tree. Since then, we've read it perpetually and he loves it.<br /><br />First, this book taught him the months of the year.<br /><br />Second, it's a pleasure to read. Well suited to 1.5 y/o old to 4+.<br /><br />Very few children's books are worth owning. Most should be borrowed from the library. This book, however, deserves a permanent spot on your shelf. Sendak's best.


In [14]:
# remover html tags
def cleanhtml(sentense):
    pattern = re.compile("<.*?>")
    sentense = re.sub(pattern, " ", sentense)
    return sentense

cleanhtml(final_data["Text"].values[6])
# exp
# we removed all the tags from review[6]

"I set aside at least an hour each day to read to my son (3 y/o). At this point, I consider myself a connoisseur of children's books and this is one of the best. Santa Clause put this under the tree. Since then, we've read it perpetually and he loves it.  First, this book taught him the months of the year.  Second, it's a pleasure to read. Well suited to 1.5 y/o old to 4+.  Very few children's books are worth owning. Most should be borrowed from the library. This book, however, deserves a permanent spot on your shelf. Sendak's best."

#### Remove punctuations and special characters

In [15]:
# punctuations and special charcters : \?!,;:_\-\.\*\(\)\{\}\[\]\"~`@#\$%\^&\+\|\=\/-
index = 0
for sent in final_data["Text"]:
    if(len(re.findall("[\?!,;:_\-\.\*\(\)\{\}\[\]\"~`@#\$%\^&\+\|\=\/-]", sent))):
        print(index, " ===== ", sent)
        break
    index += 1

0  =====  this witty little book makes my son laugh at loud. i recite it in the car as we're driving along and he always can sing the refrain. he's learned about whales, India, drooping roses:  i love all the new words this book  introduces and the silliness of it all.  this is a classic book i am  willing to bet my son will STILL be able to recite from memory when he is  in college


In [16]:
# remove punctuations and special charcters
def cleanpunc(sentense):
    pattern = "[\?!,;:_\-\.\*\(\)\{\}\[\]\"~`@#\$%\^&\+\|\=\/-]"
    sentense = re.sub(pattern, " ", sentense)
    return sentense

cleanpunc(final_data["Text"].values[0])

"this witty little book makes my son laugh at loud  i recite it in the car as we're driving along and he always can sing the refrain  he's learned about whales  India  drooping roses   i love all the new words this book  introduces and the silliness of it all   this is a classic book i am  willing to bet my son will STILL be able to recite from memory when he is  in college"

####   words which are made up of english letters

In [17]:
# words which are made up of english letters like {won't} ==> {will not}
index = 0
for sent in final_data["Text"]:
    if(len(re.findall("won't", sent))):
        print(index, " ===== ", sent)
        break
    index += 1

3  =====  This is a great little book to read aloud- it has a nice rhythm as well as good repetition that little ones like, in the lines about "chicken soup with rice".  The child gets to go through the months of the year and go to wonderful places like Bombay and down the Nile, all the while, eating, well, you know what they get to eat.  Some kids will have Maurice Sendak's version of ice skating or how to treat roses in their heads for a long time and they won't even know where it came from.  Surprise!  It came from this little witty book.  :-)


In [18]:
# remove words which are made up of english letters

def decontracted(sentense):
    # specific
    sentense = re.sub(r"won't", "will not", sentense)
    sentense = re.sub(r"can\'t", "can not", sentense)

    # general
    sentense = re.sub(r"n\'t", " not", sentense)
    sentense = re.sub(r"\'re", " are", sentense)
    sentense = re.sub(r"\'s", " is", sentense)
    sentense = re.sub(r"\'d", " would", sentense)
    sentense = re.sub(r"\'ll", " will", sentense)
    sentense = re.sub(r"\'t", " not", sentense)
    sentense = re.sub(r"\'ve", " have", sentense)
    sentense = re.sub(r"\'m", " am", sentense)
    return sentense

decontracted(final_data["Text"].values[3])

'This is a great little book to read aloud- it has a nice rhythm as well as good repetition that little ones like, in the lines about "chicken soup with rice".  The child gets to go through the months of the year and go to wonderful places like Bombay and down the Nile, all the while, eating, well, you know what they get to eat.  Some kids will have Maurice Sendak is version of ice skating or how to treat roses in their heads for a long time and they will not even know where it came from.  Surprise!  It came from this little witty book.  :-)'

#### Remove Stopwords

In [19]:
stop = set(stopwords.words("english"))

final_data["Text"].values[0]

"this witty little book makes my son laugh at loud. i recite it in the car as we're driving along and he always can sing the refrain. he's learned about whales, India, drooping roses:  i love all the new words this book  introduces and the silliness of it all.  this is a classic book i am  willing to bet my son will STILL be able to recite from memory when he is  in college"

In [154]:
# remove stopwords
def clean_stopword(sentense):
    sentense = " ".join(word.lower() for word in sentense.split() if word.lower() not in stop)
    return sentense

clean_stopword(final_data["Text"].values[0])

"witty little book makes son laugh loud. recite car we're driving along always sing refrain. he's learned whales, india, drooping roses: love new words book introduces silliness all. classic book willing bet son still able recite memory college"

####  Remove Websites links

In [None]:
# website links : http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+
index = 0
for sent in final_data["Text"]:
    if(len(re.findall("http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+", sent))):
        print(index, " ===== ", sent)
        break
    index += 1

In [23]:
# remove website links
def cleanlinks(sentense):
    pattern = "http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
    sentense = re.sub(pattern, " ", sentense)
    return sentense

cleanlinks(final_data["Text"].values[85])

'I was intially introduced to Pro-Treat Beef Liver Freeze Dried Dog Treats at my veterinarian\'s office.  Rudy, our Jack Russell Terrior, usually not all that interested in dog treats, responded like I\'d never seen before!<br /><br />I bought a couple of cans (they are pretty pricey), but gave them to him sparingly. The tricks he\'d do for the Pro-Treats were really entertaining.  Now, Rudy has to take meds. for his glaucoma and thank heavens I found the large size here which is a good value, so he not only takes his medicine, but gives great tricks in return each day! He\'s a happy camper, despite the loss of sight in one eye.<br /><a href=" ">Pro-Treat Beef Liver, Freeze Dried Dog Treats, 21 Ounce</a><br /><br />Rudy says, "They are the best part of my day!"  We think so too!  ~ CJ'

#### Remove Numbers 

In [90]:
# Remove Numbers : \S*\d\S*
index = 0
for sent in final_data["Text"]:
    if(len(re.findall("\S*\d\S*", sent))):
        print(index, " ===== \n", sent)
        break
    index += 1

6  ===== 
 I set aside at least an hour each day to read to my son (3 y/o). At this point, I consider myself a connoisseur of children's books and this is one of the best. Santa Clause put this under the tree. Since then, we've read it perpetually and he loves it.<br /><br />First, this book taught him the months of the year.<br /><br />Second, it's a pleasure to read. Well suited to 1.5 y/o old to 4+.<br /><br />Very few children's books are worth owning. Most should be borrowed from the library. This book, however, deserves a permanent spot on your shelf. Sendak's best.


In [91]:
# remove numbers
def cleannumbers(sentense):
    pattern = "\S*\d\S*"
    sentense = re.sub(pattern, " ", sentense)
    return sentense

cleannumbers(final_data["Text"].values[6])

"I set aside at least an hour each day to read to my son   y/o). At this point, I consider myself a connoisseur of children's books and this is one of the best. Santa Clause put this under the tree. Since then, we've read it perpetually and he loves it.<br /><br />First, this book taught him the months of the year.<br /><br />Second, it's a pleasure to read. Well suited to   y/o old to   /><br />Very few children's books are worth owning. Most should be borrowed from the library. This book, however, deserves a permanent spot on your shelf. Sendak's best."

#### Final Changes 

In [92]:
processed_reviews = []
for sentense in tqdm(final_data["Text"]):
    sentense = cleanhtml(sentense)
    sentense = cleanpunc(sentense)
    sentense = decontracted(sentense)
    sentense = clean_stopword(sentense)
    sentense = cleanlinks(sentense)
    sentense = cleannumbers(sentense)
    
    processed_reviews.append(sentense.strip())

100%|███████████████████████████████████████████████████████████████████████| 364171/364171 [00:28<00:00, 12591.85it/s]


In [93]:
# add processed_reviews into final_data
final_data["CleanedText"] = processed_reviews

In [94]:
# Store this DataFrame into SQLITE database

con = sqlite3.connect("dataset/database.sqlite")

final_data.to_sql("Cleaned_Reviews", con, schema=None, if_exists="replace")

## BOW

In [100]:
count_vect = CountVectorizer()
final_vectors = count_vect.fit(final_data["CleanedText"].values)

"Some Features Names", final_vectors.get_feature_names()[:10]

('Some Features Names',
 ['aa',
  'aaa',
  'aaaa',
  'aaaaa',
  'aaaaaa',
  'aaaaaaaaaaa',
  'aaaaaaaaaaaa',
  'aaaaaaaaaaaaa',
  'aaaaaaaaaaaaaa',
  'aaaaaaaaaaaaaaa'])

In [103]:
final_vectors = count_vect.transform(final_data["CleanedText"].values)
type(final_vectors)
final_vectors.get_shape()

scipy.sparse.csr.csr_matrix

(364171, 103093)

## Bi-grams, N-grams 

In [148]:
positive_words = [word for sentense in final_data[final_data["Score"]==1]["CleanedText"] for word in sentense.split()]
len(positive_words)

positive_word_freq = nltk.FreqDist(positive_words)
print("Most Common Frequent Positive Words :>>> ", positive_word_freq.most_common(20))

11736030

Most Common Frequent Positive Words :>>>  [('like', 124893), ('good', 110874), ('great', 103776), ('one', 89440), ('taste', 83861), ('tea', 76097), ('product', 75650), ('coffee', 75416), ('flavor', 74811), ('love', 74109), ('would', 72835), ('get', 56169), ('food', 52842), ('really', 52806), ('use', 51314), ('amazon', 49880), ('also', 48132), ('much', 47910), ('best', 47788), ('time', 46525)]


In [147]:
negative_words = [word for sentense in final_data[final_data["Score"]==0]["CleanedText"] for word in sentense.split()]
len(negative_words)

negative_word_freq = nltk.FreqDist(negative_words)
print("Most Common Frequent Negative Words :>>> ", negative_word_freq.most_common(20))

2372745

Most Common Frequent Negative Words :>>>  [('like', 30319), ('product', 23660), ('would', 23348), ('taste', 22601), ('one', 18977), ('good', 14762), ('coffee', 14189), ('flavor', 14098), ('get', 11024), ('even', 10945), ('food', 10850), ('tea', 10717), ('amazon', 10131), ('buy', 10086), ('much', 9465), ('really', 9414), ('could', 8695), ('box', 8328), ('tried', 8094), ('time', 7893)]


In [155]:
# observation
# like word is most frequently in both type of reviews
# because here we only used uni-gram and "not like" word may be used into negative review 
# but because of uni gram we extrace "like" from negative review also

# and we remove "not" from reviews using stopwords
# so fisrt we keep "stop" into reviews and again clean all reviews

stop = set(stopwords.words("english"))
stop.remove("not")

processed_reviews = []
for sentense in tqdm(final_data["Text"]):
    sentense = cleanhtml(sentense)
    sentense = cleanpunc(sentense)
    sentense = decontracted(sentense)
    sentense = clean_stopword(sentense)
    sentense = cleanlinks(sentense)
    sentense = cleannumbers(sentense)
    
    processed_reviews.append(sentense.strip())

100%|███████████████████████████████████████████████████████████████████████| 364171/364171 [00:29<00:00, 12472.14it/s]


In [156]:
# add processed_reviews into final_data
final_data["include_not_word"] = processed_reviews

In [158]:
# Store this DataFrame into SQLITE database

con = sqlite3.connect("dataset/database.sqlite")
final_data.to_sql("Cleaned_Reviews", con, schema=None, if_exists="replace")

In [159]:
count_vect = CountVectorizer(ngram_range=(1,2))
final_vectors = count_vect.fit(final_data["include_not_word"].values)

"Some Features Names", final_vectors.get_feature_names()[:10]

('Some Features Names',
 ['aa',
  'aa actually',
  'aa amazon',
  'aa aroma',
  'aa batteries',
  'aa battery',
  'aa beans',
  'aa big',
  'aa brand',
  'aa caffene'])

In [160]:
final_vectors = count_vect.transform(final_data["include_not_word"].values)
type(final_vectors)
final_vectors.get_shape()

scipy.sparse.csr.csr_matrix

(364171, 3950422)

## tf-idf 

In [198]:
tf_idf_vec = TfidfVectorizer(ngram_range=(1,2))
final_vectors = tf_idf_vec.fit(final_data["include_not_word"].values)

In [199]:
features = final_vectors.get_feature_names()
print("Some Features Names", features[:10])

final_vectors = count_vect.transform(final_data["include_not_word"].values)
type(final_vectors)
final_vectors.get_shape()

Some Features Names ['aa', 'aa actually', 'aa amazon', 'aa aroma', 'aa batteries', 'aa battery', 'aa beans', 'aa big', 'aa brand', 'aa caffene']


scipy.sparse.csr.csr_matrix

(364171, 3950422)

In [189]:
# 3rd Row means for 3rd review
x = final_vectors[3, :].toarray()[0]
x
np.unique(x)

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

array([0, 1, 2, 3], dtype=int64)

In [201]:
# get top tf-idf features
def top_tfidf_features(row, features, top_n=25):
    top_n_features = np.argsort(row)[::-1][:25]
    top_n_features = [(features[i], row[i]) for i in top_n_features]
    df = pd.DataFrame(top_n_features, columns=["Feature", "tfidf"])
    return df

# let's get top features for 3rd review
x = final_vectors[3, :].toarray()[0]
top_tfidf_features(x, features, 25)

Unnamed: 0,Feature,tfidf
0,little,3
1,well,2
2,like,2
3,go,2
4,came,2
5,book,2
6,know,2
7,ones like,1
8,great,1
9,witty,1


## Word2Vec 

In [39]:
# Train our own Word2Vec using our own data corpus

list_of_sentenses = []
for sentense in final_data["CleanedText"]:
    list_of_sentenses.append(sentense.split())

In [40]:
w2v = Word2Vec(list_of_sentenses, min_count=5, size=50, workers=4)

In [218]:
w2v.wv.most_similar("like")

[('weird', 0.7784483432769775),
 ('okay', 0.7326413989067078),
 ('ok', 0.7103697061538696),
 ('good', 0.6892138719558716),
 ('gross', 0.687997043132782),
 ('yucky', 0.6782313585281372),
 ('resemble', 0.6759936213493347),
 ('kind', 0.673346757888794),
 ('strange', 0.6602465510368347),
 ('odd', 0.6582733392715454)]

In [219]:
w2v.wv.most_similar("great")

[('fantastic', 0.8846423625946045),
 ('terrific', 0.8666932582855225),
 ('awesome', 0.8664771914482117),
 ('good', 0.8627448081970215),
 ('excellent', 0.8490008115768433),
 ('wonderful', 0.8131095170974731),
 ('perfect', 0.7790266275405884),
 ('nice', 0.7610616087913513),
 ('amazing', 0.7420182824134827),
 ('fabulous', 0.7315464615821838)]

In [220]:
w2v.wv.most_similar("worst")

[('nastiest', 0.8647022247314453),
 ('greatest', 0.7747658491134644),
 ('disgusting', 0.7339673042297363),
 ('best', 0.7146674394607544),
 ('weakest', 0.7022820711135864),
 ('terrible', 0.7001622915267944),
 ('horrid', 0.6941112875938416),
 ('horrible', 0.6799733638763428),
 ('tastiest', 0.6714720726013184),
 ('vile', 0.6614094972610474)]

In [18]:
w2v.wv.most_similar("tasty")

[('delicious', 0.8269699215888977),
 ('tastey', 0.8031918406486511),
 ('satisfying', 0.7969828844070435),
 ('yummy', 0.7885133028030396),
 ('filling', 0.7560304403305054),
 ('flavorful', 0.7488271594047546),
 ('surprisingly', 0.6680410504341125),
 ('hearty', 0.6585946083068848),
 ('delish', 0.6532188653945923),
 ('good', 0.6506571769714355)]

In [41]:
vocabulary = list(w2v.wv.vocab)
len(vocabulary)
print(vocabulary[:20])

34360

['witty', 'little', 'book', 'makes', 'son', 'laugh', 'loud', 'recite', 'car', 'driving', 'along', 'always', 'sing', 'refrain', 'learned', 'whales', 'india', 'drooping', 'roses', 'love']


## Average W2V and tf-idf weighted W2V 

### Average W2V 

In [7]:
final_vectors = []
for sentence in tqdm(list_of_sentenses):
    vector_sum = np.zeros(50)
    count = 0
    for word in sentence:
        if word in vocabulary:
            vec = w2v.wv[word]
            vector_sum += vec
            count += 1
    if count != 0:
        vector_sum /= count
    final_vectors.append(vector_sum)
print(len(sent_vectors))
print(len(sent_vectors[0]))

 51%|█████████████████████████████████████▏                                   | 185825/364171 [11:38<11:10, 266.06it/s]


KeyboardInterrupt: 

###  tf-idf weighted W2V

In [38]:
tfidf = TfidfVectorizer()
tfidf.fit(final_data["CleanedText"])

# we are converting a dictionary with word as a key, and the idf as a value
dictionary = dict(zip(tfidf.get_feature_names(), list(tfidf.idf_)))
dictionary

TfidfVectorizer()

{'aa': 9.487893448949146,
 'aaa': 10.4041841808233,
 'aaaa': 11.50279646949141,
 'aaaaa': 11.407486289687085,
 'aaaaaa': 12.706769273817345,
 'aaaaaaaaaaa': 13.11223438192551,
 'aaaaaaaaaaaa': 13.11223438192551,
 'aaaaaaaaaaaaa': 13.11223438192551,
 'aaaaaaaaaaaaaa': 13.11223438192551,
 'aaaaaaaaaaaaaaa': 13.11223438192551,
 'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa': 13.11223438192551,
 'aaaaaaaaaaaaaaaaaaaargh': 13.11223438192551,
 'aaaaaaaaaaaaaaaaacccccccckkkkkk': 13.11223438192551,
 'aaaaaaaaagghh': 13.11223438192551,
 'aaaaaaah': 13.11223438192551,
 'aaaaaaahhhhhh': 13.11223438192551,
 'aaaaaaarrrrrggghhh': 13.11223438192551,
 'aaaaaah': 12.706769273817345,
 'aaaaaahhh': 13.11223438192551,
 'aaaaaahhhh': 13.11223438192551,
 'aaaaaahhhhh': 13.11223438192551,
 'aaaaaahhhhhyaaaaaa': 13.11223438192551,
 'aaaaaand': 13.11223438192551,
 'aaaaaawwwwwwwwww': 13.11223438192551,
 'aaaaah': 12.706769273817345,
 'aaaaahhhhhhhhhhhhhhhh': 13.11223438192551,
 'aaaaallll': 13.11223438192551,

In [42]:
final_vectors = []

for sentense in tqdm(list_of_sentenses):
    vector_sum = np.zeros(50)
    tfidf_sum = 0
    for word in sentense:
        if(word in vocabulary and word in dictionary):
            tfidf = (sentense.count(word)/len(sentense)) * dictionary[word]
            vector = w2v.wv[word]
            vector_sum += tfidf * vector
            tfidf_sum += tfidf
    if(tfidf_sum != 0):
        vector_sum /= tfidf_sum
    final_vectors.append(vector_sum)

  2%|█▏                                                                         | 5675/364171 [00:24<26:17, 227.20it/s]


KeyboardInterrupt: 

In [None]:
len(final_vectors)
len(final_vectors[0])