# amazon_food_review_analysis

In [1]:
import warnings
import numpy as np
import pandas as pd
import sqlite3
import seaborn as sn
import matplotlib.pyplot as plt
import nltk# natural language tool kit
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
import string
from tqdm import tqdm
from sklearn.metrics import roc_curve,auc
from nltk.stem.porter import PorterStemmer#stemming

In [2]:
warnings.filterwarnings("ignore")

In [3]:
#using sqlite to read data from data base:
con=sqlite3.connect('./amazon-fine-food-reviews/database.sqlite')
#filtering positive and negative data
filtered_data=pd.read_sql_query("""
SELECT *
FROM REVIEWS
WHERE SCORE !=3
""",con)
def partition(x):
    if x<3:
        return 'negative'
    return 'positive'
actualscore=filtered_data['Score']
positive_negative=actualscore.map(partition)
filtered_data['Score']=positive_negative

In [4]:
print(filtered_data.shape)
filtered_data.head(10)

(525814, 10)


Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,positive,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,negative,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,positive,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,negative,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,positive,1350777600,Great taffy,Great taffy at a great price. There was a wid...
5,6,B006K2ZZ7K,ADT0SRK1MGOEU,Twoapennything,0,0,positive,1342051200,Nice Taffy,I got a wild hair for taffy and ordered this f...
6,7,B006K2ZZ7K,A1SP2KVKFXXRU1,David C. Sullivan,0,0,positive,1340150400,Great! Just as good as the expensive brands!,This saltwater taffy had great flavors and was...
7,8,B006K2ZZ7K,A3JRGQVEQN31IQ,Pamela G. Williams,0,0,positive,1336003200,"Wonderful, tasty taffy",This taffy is so good. It is very soft and ch...
8,9,B000E7L2R4,A1MZYO9TZK0BBI,R. James,1,1,positive,1322006400,Yay Barley,Right now I'm mostly just sprouting this so my...
9,10,B00171APVA,A21BT40VZCCYT4,Carol A. Reed,0,0,positive,1351209600,Healthy Dog Food,This is a very healthy dog food. Good for thei...


## Removing duplicate data:

In [5]:
display=pd.read_sql_query("""
SELECT *
FROM REVIEWS
WHERE SCORE !=3 AND USERID="AR5J8UI46CURR"
ORDER BY PRODUCTID
""",con)
display

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,78445,B000HDL1RQ,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...
1,138317,B000HDOPYC,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...
2,138277,B000HDOPYM,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...
3,73791,B000HDOPZG,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...
4,155049,B000PAQ75C,AR5J8UI46CURR,Geetha Krishnan,2,2,5,1199577600,LOACKER QUADRATINI VANILLA WAFERS,DELICIOUS WAFERS. I FIND THAT EUROPEAN WAFERS ...


In [6]:
print(filtered_data.sort_values.__doc__)
print('********************************')
print(filtered_data.drop_duplicates.__doc__)


Sort by the values along either axis.

Parameters
----------
        by : str or list of str
            Name or list of names to sort by.

            - if `axis` is 0 or `'index'` then `by` may contain index
              levels and/or column labels.
            - if `axis` is 1 or `'columns'` then `by` may contain column
              levels and/or index labels.

            .. versionchanged:: 0.23.0

               Allow specifying index or column level names.
axis : {0 or 'index', 1 or 'columns'}, default 0
     Axis to be sorted.
ascending : bool or list of bool, default True
     Sort ascending vs. descending. Specify list for multiple sort
     orders.  If this is a list of bools, must match the length of
     the by.
inplace : bool, default False
     If True, perform operation in-place.
kind : {'quicksort', 'mergesort', 'heapsort'}, default 'quicksort'
     Choice of sorting algorithm. See also ndarray.np.sort for more
     information.  `mergesort` is the only stable algor

In [7]:
#sorting data
#inplace means to return the output after proforming task
sorted_data=filtered_data.sort_values('ProductId',axis=0,ascending=True)
#removing duplicate
final=filtered_data.drop_duplicates(subset={"UserId","ProfileName","Time","Text"},keep='first',inplace=False)
final.shape

(364173, 10)

In [8]:
final=final[final.HelpfulnessNumerator<=final.HelpfulnessDenominator]
final.shape

(364171, 10)

##  BOW

In [9]:
count_vect=CountVectorizer()
final_count=count_vect.fit_transform(final["Text"].values)

In [10]:
print(count_vect.__doc__)

Convert a collection of text documents to a matrix of token counts

    This implementation produces a sparse representation of the counts using
    scipy.sparse.csr_matrix.

    If you do not provide an a-priori dictionary and you do not use an analyzer
    that does some kind of feature selection then the number of features will
    be equal to the vocabulary size found by analyzing the data.

    Read more in the :ref:`User Guide <text_feature_extraction>`.

    Parameters
    ----------
    input : string {'filename', 'file', 'content'}, default='content'
        If 'filename', the sequence passed as an argument to fit is
        expected to be a list of filenames that need reading to fetch
        the raw content to analyze.

        If 'file', the sequence items must have a 'read' method (file-like
        object) that is called to fetch the bytes in memory.

        Otherwise the input is expected to be a sequence of items that
        can be of type string or byte.

    encodin

In [11]:
type(final_count)

scipy.sparse.csr.csr_matrix

### Pre processing of text:


In [12]:
import re
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer# break the sentence into word
#finding all html tag in the dat given
i=0
for sent in final['Text'].values:
    if(len(re.findall('<.*?>',sent))):
        print(i)
        print(sent)
        break;
    i+=1;

10
I don't know if it's the cactus or the tequila or just the unique combination of ingredients, but the flavour of this hot sauce makes it one of a kind!  We picked up a bottle once on a trip we were on and brought it back home with us and were totally blown away!  When we realized that we simply couldn't find it anywhere in our city we were bummed.<br /><br />Now, because of the magic of the internet, we have a case of the sauce and are ecstatic because of it.<br /><br />If you love hot sauce..I mean really love hot sauce, but don't want a sauce that tastelessly burns your throat, grab a bottle of Tequila Picante Gourmet de Inclan.  Just realize that once you taste it, you will never want to use any other sauce.<br /><br />Thank you for the personal, incredible service!


In [13]:
print(nltk.stem.snowball.SnowballStemmer.__doc__)


    Snowball Stemmer

    The following languages are supported:
    Arabic, Danish, Dutch, English, Finnish, French, German,
    Hungarian, Italian, Norwegian, Portuguese, Romanian, Russian,
    Spanish and Swedish.

    The algorithm for English is documented here:

        Porter, M. "An algorithm for suffix stripping."
        Program 14.3 (1980): 130-137.

    The algorithms have been developed by Martin Porter.
    These stemmers are called Snowball, because Porter created
    a programming language with this name for creating
    new stemming algorithms. There is more information available
    at http://snowball.tartarus.org/

    The stemmer is invoked as shown below:

    >>> from nltk.stem import SnowballStemmer
    >>> print(" ".join(SnowballStemmer.languages)) # See which languages are supported
    arabic danish dutch english finnish french german hungarian
    italian norwegian porter portuguese romanian russian
    spanish swedish
    >>> stemmer = SnowballStemmer("germ

In [14]:
nltk.download('stopwords')
stop=set(stopwords.words('english'))
#initialising the snow ball stemmer
sno=nltk.stem.snowball.SnowballStemmer('english')#derive root word 
def cleanhtml(sentence):
    cleanr=re.compile('<.*?>')
    cleantext=re.sub(cleanr,' ',sentence)
    return cleantext
def cleanpunc(sentence):
    cleaned =re.sub(r'[?|!|\'|"|#]',r'',sentence)
    cleaned=re.sub(r'[.|,|)|(|\|/]',r'',sentence)
    return cleaned
stop   

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/mahaseth_rahul/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [15]:
#code for implenting step-by-step the checks mentioned in pre-processing
from tqdm import tqdm
i=0
str1=' '
final_string=[]
all_positive_words=[]
all_negative_words=[]
s=''
for sent in tqdm(final['Text'].values):
    filtered_setentence=[]
    sent=cleanhtml(sent)
    for w in sent.split():
        #print(sent.split())
        for cleaned_word in cleanpunc(w).split():
            #print(cleanpunc(w).split())
            if ((cleaned_word.isalpha())&(len(cleaned_word)>2)):              
                if(cleaned_word.lower() not in stop):
                    s=(sno.stem(cleaned_word.lower())).encode('utf8')
                    filtered_sentence.append(s)
                    if(final['Score'].values)[i]=='positive':
                        all_positive_words.append(s)
                    if(final["Score"].values)[i]=='negative':
                        all_negative_words.append(s)
                else:
                    continue
            else:
                continue
    str1=b" ".join(filtered_sentence)
    final_string.append(str1)
    i+=1
                 


In [31]:
# https://stackoverflow.com/a/47091490/4084039
import re

def decontracted(phrase):
    # specific
    phrase = re.sub(r"won't", "will not", phrase)
    phrase = re.sub(r"can\'t", "can not", phrase)

    # general
    phrase = re.sub(r"n\'t", " not", phrase)
    phrase = re.sub(r"\'re", " are", phrase)
    phrase = re.sub(r"\'s", " is", phrase)
    phrase = re.sub(r"\'d", " would", phrase)
    phrase = re.sub(r"\'ll", " will", phrase)
    phrase = re.sub(r"\'t", " not", phrase)
    phrase = re.sub(r"\'ve", " have", phrase)
    phrase = re.sub(r"\'m", " am", phrase)
    return phrase

In [32]:
# https://gist.github.com/sebleier/554280
# we are removing the words from the stop words list: 'no', 'nor', 'not'
# <br /><br /> ==> after the above steps, we are getting "br br"
# we are including them into stop words list
# instead of <br /> if we have <br/> these tags would have revmoved in the 1st step

stopwords= set(['br', 'the', 'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've",\
            "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', \
            'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their',\
            'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', \
            'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', \
            'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', \
            'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after',\
            'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further',\
            'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more',\
            'most', 'other', 'some', 'such', 'only', 'own', 'same', 'so', 'than', 'too', 'very', \
            's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', \
            've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn',\
            "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn',\
            "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", \
            'won', "won't", 'wouldn', "wouldn't"])

In [40]:
#print(final["Text"].values)
from tqdm import tqdm
from bs4 import BeautifulSoup   
preprocessed_reviewed_text=[]
for sentence in tqdm(final["Text"].values):
    sentence=re.sub(r"http\S+","",sentence)#remove all the url from the rex
    sentence=BeautifulSoup(sentence,"lxml").get_text()
    sentence=decontracted(sentence)
    sentence=re.sub("\S*\d\S*","",sentence).strip()
    sentence=re.sub('[^A-Za-z]+'," ",sentence)
    #print(sentence)
    sentence = ' '.join(e.lower() for e in sentence.split() if e.lower() not in stopwords)
    #print(sentence)
    preprocessed_reviewed_text.append(sentence.strip())


100%|██████████| 364171/364171 [02:17<00:00, 2657.28it/s]


In [41]:
preprocessed_reviewed_text[0:8]

['bought several vitality canned dog food products found good quality product looks like stew processed meat smells better labrador finicky appreciates product better',
 'product arrived labeled jumbo salted peanuts peanuts actually small sized unsalted not sure error vendor intended represent product jumbo',
 'confection around centuries light pillowy citrus gelatin nuts case filberts cut tiny squares liberally coated powdered sugar tiny mouthful heaven not chewy flavorful highly recommend yummy treat familiar story c lewis lion witch wardrobe treat seduces edmund selling brother sisters witch',
 'looking secret ingredient robitussin believe found got addition root beer extract ordered good made cherry soda flavor medicinal',
 'great taffy great price wide assortment yummy taffy delivery quick taffy lover deal',
 'got wild hair taffy ordered five pound bag taffy enjoyable many flavors watermelon root beer melon peppermint grape etc complaint bit much red black licorice flavored pieces

In [43]:
final['CleanedText']=preprocessed_reviewed_text
#adding a column of filter dat



In [44]:
final.head(5)

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,CleanedText
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,positive,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...,bought several vitality canned dog food produc...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,negative,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,product arrived labeled jumbo salted peanuts p...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,positive,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...,confection around centuries light pillowy citr...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,negative,1307923200,Cough Medicine,If you are looking for the secret ingredient i...,looking secret ingredient robitussin believe f...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,positive,1350777600,Great taffy,Great taffy at a great price. There was a wid...,great taffy great price wide assortment yummy ...


In [46]:
conn=sqlite3.connect('final.sqlite')
print(conn.__doc__)
c=conn.cursor()
print(c.__doc__)
conn.text_factory= str
final.to_sql('Reviews',conn,schema=None, if_exists='replace')
print(final.to_sql.__doc__)

SQLite database connection object.
SQLite database cursor class.

        Write records stored in a DataFrame to a SQL database.

        Databases supported by SQLAlchemy [1]_ are supported. Tables can be
        newly created, appended to, or overwritten.

        Parameters
        ----------
        name : str
            Name of SQL table.
        con : sqlalchemy.engine.(Engine or Connection) or sqlite3.Connection
            Using SQLAlchemy makes it possible to use any DB supported by that
            library. Legacy support is provided for sqlite3.Connection objects. The user
            is responsible for engine disposal and connection closure for the SQLAlchemy
            connectable See `here                 <https://docs.sqlalchemy.org/en/13/core/connections.html>`_.

        schema : str, optional
            Specify the schema (if database flavor supports this). If None, use
            default schema.
        if_exists : {'fail', 'replace', 'append'}, default 'fail'
  

### Bi-grams and n-grams

In [23]:
dir(nltk)


['AbstractLazySequence',
 'AffixTagger',
 'AlignedSent',
 'Alignment',
 'AnnotationTask',
 'ApplicationExpression',
 'Assignment',
 'BigramAssocMeasures',
 'BigramCollocationFinder',
 'BigramTagger',
 'BinaryMaxentFeatureEncoding',
 'BlanklineTokenizer',
 'BllipParser',
 'BottomUpChartParser',
 'BottomUpLeftCornerChartParser',
 'BottomUpProbabilisticChartParser',
 'Boxer',
 'BrillTagger',
 'BrillTaggerTrainer',
 'CFG',
 'CRFTagger',
 'CfgReadingCommand',
 'ChartParser',
 'ChunkParserI',
 'ChunkScore',
 'Cistem',
 'ClassifierBasedPOSTagger',
 'ClassifierBasedTagger',
 'ClassifierI',
 'ConcordanceIndex',
 'ConditionalExponentialClassifier',
 'ConditionalFreqDist',
 'ConditionalProbDist',
 'ConditionalProbDistI',
 'ConfusionMatrix',
 'ContextIndex',
 'ContextTagger',
 'ContingencyMeasures',
 'CoreNLPDependencyParser',
 'CoreNLPParser',
 'Counter',
 'CrossValidationProbDist',
 'DRS',
 'DecisionTreeClassifier',
 'DefaultTagger',
 'DependencyEvaluator',
 'DependencyGrammar',
 'DependencyGrap

In [24]:
print(nltk.FreqDist.__doc__)


    A frequency distribution for the outcomes of an experiment.  A
    frequency distribution records the number of times each outcome of
    an experiment has occurred.  For example, a frequency distribution
    could be used to record the frequency of each word type in a
    document.  Formally, a frequency distribution can be defined as a
    function mapping from each sample to the number of times that
    sample occurred as an outcome.

    Frequency distributions are generally constructed by running a
    number of experiments, and incrementing the count for a sample
    every time it is an outcome of an experiment.  For example, the
    following code will produce a frequency distribution that encodes
    how often each word occurs in a text:

        >>> from nltk.tokenize import word_tokenize
        >>> from nltk.probability import FreqDist
        >>> sent = 'This is an example sentence'
        >>> fdist = FreqDist()
        >>> for word in word_tokenize(sent):
        ...

In [25]:
freq_dist_positive=nltk.FreqDist(all_positive_words)
freq_dist_negative=nltk.FreqDist(all_negative_words)
print("most common positive words:",freq_dist_positive.most_common(20))
print("most_common negative words:",freq_dist_negative.most_common(20))

most common positive words: [(b'like', 138528), (b'tast', 126157), (b'good', 107583), (b'love', 106314), (b'flavor', 106291), (b'use', 103249), (b'great', 98290), (b'one', 94766), (b'product', 86413), (b'tri', 85387), (b'tea', 80631), (b'coffe', 75774), (b'make', 74686), (b'get', 71758), (b'food', 62463), (b'would', 55400), (b'time', 53612), (b'buy', 53479), (b'realli', 52432), (b'eat', 51179)]
most_common negative words: [(b'tast', 33878), (b'like', 32139), (b'product', 27341), (b'one', 20206), (b'flavor', 18754), (b'would', 17929), (b'tri', 17642), (b'use', 15173), (b'good', 14597), (b'coffe', 14188), (b'get', 13734), (b'buy', 13563), (b'order', 12739), (b'food', 12286), (b'tea', 11254), (b'even', 11037), (b'box', 10517), (b'make', 9806), (b'time', 9580), (b'bag', 9459)]


In [26]:
#bi gram,tri gram n gram
count_vect=CountVectorizer(ngram_range=(1,2))
final_bigram_count=count_vect.fit_transform(final['Text'].values)

In [27]:
final_bigram_count.get_shape()

(364171, 2910192)

### TF_IDF

In [28]:
print(TfidfVectorizer.__doc__)

Convert a collection of raw documents to a matrix of TF-IDF features.

    Equivalent to :class:`CountVectorizer` followed by
    :class:`TfidfTransformer`.

    Read more in the :ref:`User Guide <text_feature_extraction>`.

    Parameters
    ----------
    input : str {'filename', 'file', 'content'}
        If 'filename', the sequence passed as an argument to fit is
        expected to be a list of filenames that need reading to fetch
        the raw content to analyze.

        If 'file', the sequence items must have a 'read' method (file-like
        object) that is called to fetch the bytes in memory.

        Otherwise the input is expected to be a sequence of items that
        can be of type string or byte.

    encoding : str, default='utf-8'
        If bytes or files are given to analyze, this encoding is used to
        decode.

    decode_error : {'strict', 'ignore', 'replace'} (default='strict')
        Instruction on what to do if a byte sequence is given to analyze that


In [29]:
print(TfidfTransformer.__doc__)

Transform a count matrix to a normalized tf or tf-idf representation

    Tf means term-frequency while tf-idf means term-frequency times inverse
    document-frequency. This is a common term weighting scheme in information
    retrieval, that has also found good use in document classification.

    The goal of using tf-idf instead of the raw frequencies of occurrence of a
    token in a given document is to scale down the impact of tokens that occur
    very frequently in a given corpus and that are hence empirically less
    informative than features that occur in a small fraction of the training
    corpus.

    The formula that is used to compute the tf-idf for a term t of a document d
    in a document set is tf-idf(t, d) = tf(t, d) * idf(t), and the idf is
    computed as idf(t) = log [ n / df(t) ] + 1 (if ``smooth_idf=False``), where
    n is the total number of documents in the document set and df(t) is the
    document frequency of t; the document frequency is the number of do

In [30]:
tf_idf_vect=TfidfVectorizer(ngram_range=(1,2))
final_td_idf=tf_idf_vect.fit_transform(final['Text'].values)

In [31]:
final_td_idf.get_shape()

(364171, 2910192)

In [32]:
feature=tf_idf_vect.get_feature_names()
len(feature)

2910192

In [33]:
feature[100000:100015]

['ales until',
 'ales ve',
 'ales would',
 'ales you',
 'alessandra',
 'alessandra ambrosia',
 'alessi',
 'alessi added',
 'alessi also',
 'alessi and',
 'alessi are',
 'alessi at',
 'alessi brand',
 'alessi breadsticks',
 'alessi caffe']

In [34]:
print(tf_idf_vect.__doc__)

Convert a collection of raw documents to a matrix of TF-IDF features.

    Equivalent to :class:`CountVectorizer` followed by
    :class:`TfidfTransformer`.

    Read more in the :ref:`User Guide <text_feature_extraction>`.

    Parameters
    ----------
    input : str {'filename', 'file', 'content'}
        If 'filename', the sequence passed as an argument to fit is
        expected to be a list of filenames that need reading to fetch
        the raw content to analyze.

        If 'file', the sequence items must have a 'read' method (file-like
        object) that is called to fetch the bytes in memory.

        Otherwise the input is expected to be a sequence of items that
        can be of type string or byte.

    encoding : str, default='utf-8'
        If bytes or files are given to analyze, this encoding is used to
        decode.

    decode_error : {'strict', 'ignore', 'replace'} (default='strict')
        Instruction on what to do if a byte sequence is given to analyze that


In [35]:
type(final_td_idf)
print(final_td_idf[3,:].toarray()[0])#list of array
final_td_idf.shape

[0. 0. 0. ... 0. 0. 0.]


(364171, 2910192)

In [36]:
def top_tfidf_feats(row, features, top_n=25):
    '''get top n tfidf values in row and return them with their corresponding'''
    topn_ids= np.argsort(row)[::-1][:top_n]#this function sort the data ince nd then change to dec and select top 25 val
    top_feats=[(feature[i],row[i]) for i in topn_ids]
    df=pd.DataFrame(top_feats)
    df.columns=['features','tfidf']
    return df
top_ifidf=top_tfidf_feats(final_td_idf[1,:].toarray()[0],feature,25)


In [37]:
top_ifidf

Unnamed: 0,features,tfidf
0,as jumbo,0.390489
1,jumbo,0.260971
2,unsalted not,0.201475
3,jumbo salted,0.201475
4,vendor intended,0.201475
5,sized unsalted,0.201475
6,arrived labeled,0.187395
7,peanuts,0.186777
8,actually small,0.184594
9,error or,0.176745


In [38]:
np.argsort(final_td_idf[1,:].toarray()[0])[::-1][:25]
#this function sort the data incen then change to dec and select top 25 val
    

array([ 208746, 1366174, 2700457, 1366261, 2741354, 2270266,  202002,
       1881462,   67006,  860270, 2105227, 2616754, 2162281, 2283613,
       1881968, 1881910,  136880, 2105180, 2803448, 2530577, 1401864,
       1305606, 2700388, 2538834,  860179])

###  WORD2VE

In [39]:
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import pickle
import gensim

unable to import 'smart_open.gcs', disabling that module


In [40]:
print(Word2Vec.__doc__)
print(KeyedVectors.__doc__)


Train, use and evaluate neural networks described in https://code.google.com/p/word2vec/.

    Once you're finished training a model (=no more updates, only querying)
    store and use only the :class:`~gensim.models.keyedvectors.KeyedVectors` instance in `self.wv` to reduce memory.

    The model can be stored/loaded via its :meth:`~gensim.models.word2vec.Word2Vec.save` and
    :meth:`~gensim.models.word2vec.Word2Vec.load` methods.

    The trained word vectors can also be stored/loaded from a format compatible with the
    original word2vec implementation via `self.wv.save_word2vec_format`
    and :meth:`gensim.models.keyedvectors.KeyedVectors.load_word2vec_format`.

    Some important attributes are the following:

    Attributes
    ----------
    wv : :class:`~gensim.models.keyedvectors.Word2VecKeyedVectors`
        This object essentially contains the mapping between words and embeddings. After training, it can be used
        directly to query those embeddings in various ways. S

In [41]:
model =KeyedVectors.load_word2vec_format("GoogleNews-vectors-negative300.bin",binary=True)


In [42]:
model.wv['computer']

array([ 1.07421875e-01, -2.01171875e-01,  1.23046875e-01,  2.11914062e-01,
       -9.13085938e-02,  2.16796875e-01, -1.31835938e-01,  8.30078125e-02,
        2.02148438e-01,  4.78515625e-02,  3.66210938e-02, -2.45361328e-02,
        2.39257812e-02, -1.60156250e-01, -2.61230469e-02,  9.71679688e-02,
       -6.34765625e-02,  1.84570312e-01,  1.70898438e-01, -1.63085938e-01,
       -1.09375000e-01,  1.49414062e-01, -4.65393066e-04,  9.61914062e-02,
        1.68945312e-01,  2.60925293e-03,  8.93554688e-02,  6.49414062e-02,
        3.56445312e-02, -6.93359375e-02, -1.46484375e-01, -1.21093750e-01,
       -2.27539062e-01,  2.45361328e-02, -1.24511719e-01, -3.18359375e-01,
       -2.20703125e-01,  1.30859375e-01,  3.66210938e-02, -3.63769531e-02,
       -1.13281250e-01,  1.95312500e-01,  9.76562500e-02,  1.26953125e-01,
        6.59179688e-02,  6.93359375e-02,  1.02539062e-02,  1.75781250e-01,
       -1.68945312e-01,  1.21307373e-03, -2.98828125e-01, -1.15234375e-01,
        5.66406250e-02, -

In [1]:
model.wv.most_similar('women')

NameError: name 'model' is not defined

In [None]:
model.wv.most_similar('tasti')

In [None]:
model.wv.most_similar('tasty')

In [None]:
model.wv.similarity('man','women')

In [None]:
model.wv.similarity('tasty','taste')

In [None]:
#train own word2vec

i=0
list_of_sent=[]
for sent in final['Text'].values:
    filtered_sentence=[]
    sent=cleanhtml(sent)
    for w in sent.split():
        for cleaned_word in cleanpunc(w).split():
            if(cleaned_word.isalpha()):
                filtered_sentence.append(cleaned_word.lower())
            else:
                continue
    list_of_sent.append(filtered_sentence)

In [None]:
w2v_model=Word2Vec(list_of_sent,min_count=5,size=50,workers=4)
print(w2v_model.__doc__)

In [None]:
words=list(w2v_model.wv.vocab)
print(len(words))
words

In [None]:
w2v_model.wv.most_similar('tasty')

In [None]:
w2v_model.wv.most_similar('like')

###  AVG W2V, TFIDF-W2V

In [None]:
# AVERAGE WORD2VEC
#COMPUTE AVERAGE WORD2VEC FOR EACH REVIEW
sent_vectors=[];
for sent in list_of_sent:
    sent_vec=np.zeros(50)
    #print(sent_vec)
    cnt_word=0
    for word in sent:
        try:
            vec=w2v_model.ww[word]
            sent_vec+=vec
            cnt_word+=1
        except:
            pass
    sent_vec/=cnt_word
    sent_vectors.append(sent_vec)
print(len(sent_vectors))
print(len(sent_vectors[0]))

In [None]:
#TF-IDF weighted word2vec
tfidf_feat=tf_idf_vect.get_feature_names()
tfidf_Sent_vectors=[]
row=0;
for sent in list_of_sent:
    sent_vec=np.zeros(50)
    weight_sum=0;
    for word in sent:
        try:
            vec=w2v_model.wv[word]# this will return vector corresponding to word
            tfidf=final_td_idf[rows,tfidf_feat.index(word)]#this will extract the tfidf value correspong to word 
            sent_vec+= (vec*tfidf)
            weight_sum+=tf_idf
        except:
            pass
    sent_vec/=weight_sum
    tfidf_Sent_vectors.append(sent_vec)
    row+=1
print(len(sent_vectors))
print(len(sent_vectors[0]))
                               
    