In [1]:
import pandas as pd
import numpy as np
from nltk.sentiment import SentimentIntensityAnalyzer
from tqdm.notebook import tqdm

from sklearn.metrics import accuracy_score

from nltk import download
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

In [2]:
sia = SentimentIntensityAnalyzer()

In [3]:
def clean_words(string, leng = "english"):
    
    stop_words = stopwords.words(leng)
    stop_words.extend(["-", ".", ",", ";", "(", ")"])
    
    words = word_tokenize(string)
       
    return [word.lower() for word in words if not word in stop_words and len(word) >= 3]

In [4]:
def sentimiento(df, col):
    result = {}
    for _, row in tqdm(df.iterrows(), total = len(df)):
        result[row["Id"]] = sia.polarity_scores(row[col])
        
    vader_result = pd.DataFrame(result).T
    vader_result = vader_result.reset_index().rename(columns = {"index": "Id"})
    df_result_end = pd.concat([df, vader_result], axis = 1).dropna()

    return df_result_end

In [5]:
def stop_words_clean(string, word_token = True, leng = "english"):
    
    stop_words = stopwords.words(leng)
    stop_words.extend(["-", ".", ",", ";", "(", ")"])
    
    if word_token == True:
        words = word_tokenize(string)
    else:
        words = sent_tokenize(string)
        
    result = [word.lower() for word in words if not word in stop_words and len(word) >= 3] 
    
    result_end = " ".join(result)
    
    return result_end

In [6]:
def string_stemmer(words_list):
    stemmer=PorterStemmer()
    stemmer_words = [stemmer.stem(word) for word in words_list]
    return " ".join(stemmer_words)

In [7]:
def lemma(words):
    
    lemmatizer = WordNetLemmatizer()
    lemma_words = [lemmatizer.lemmatize(word) for word in words]

    return " ".join(lemma_words)

In [8]:
def scale(value):
    if value > 0:
        return 5
    elif value == 0:
        return 3
    else:
        return 1

In [9]:
def predict(df):
    df['Predict'] = np.vectorize(scale)(df['compound'])
    df = df[(df["Score"] != 2) & (df["Score"] != 4)]
    
    return "El accurracy es: {}".format(accuracy_score(df["Score"], df["Predict"]))

In [10]:
download(["names", "stopwords", "vader_lexicon"])

[nltk_data] Downloading package names to
[nltk_data]     C:\Users\juan.avendano\AppData\Roaming\nltk_data...
[nltk_data]   Package names is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\juan.avendano\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\juan.avendano\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [11]:
df_amazon = pd.read_csv("Reviews.csv")
print(df_amazon.shape)
df_amazon.head(3)

(568454, 10)


Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...


In [12]:
df_amazon_new = df_amazon.iloc[0:10000, :]
df_amazon_new.shape

(10000, 10)

In [13]:
methods = {'lemma' : lemma, 
           'stemmer': string_stemmer, 
           'stop': stop_words_clean}

In [14]:
for key, value in methods.items():
    if key == 'stop':
        df_amazon_new["Text_"+key] = df_amazon_new["Text"].apply(lambda x: value(string = x, 
                                                                                word_token = True, 
                                                                                leng = "english"))
    else:
        df_amazon_new["Text_"+key] = df_amazon_new["Text"].apply(lambda x: value(clean_words(x)))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_amazon_new["Text_"+key] = df_amazon_new["Text"].apply(lambda x: value(clean_words(x)))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_amazon_new["Text_"+key] = df_amazon_new["Text"].apply(lambda x: value(clean_words(x)))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_amazon_new["Text_"+key

In [15]:
df_result_end = sentimiento(df_amazon_new, "Text_lemma")
df_result_end.head(3)

  0%|          | 0/10000 [00:00<?, ?it/s]

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,Text_lemma,Text_stemmer,Text_stop,Id.1,neg,neu,pos,compound
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...,bought several vitality canned dog food produc...,bought sever vital can dog food product found ...,bought several vitality canned dog food produc...,1,0.0,0.517,0.483,0.9413
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,product arrived labeled jumbo salted peanut .....,product arriv label jumbo salt peanut ... pean...,product arrived labeled jumbo salted peanuts ....,2,0.088,0.81,0.102,0.0762
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...,this confection around century light pillowy c...,thi confect around centuri light pillowi citru...,this confection around centuries light pillowy...,3,0.115,0.631,0.254,0.8624


In [16]:
print(predict(df_result_end) + f" en lemmatizer")

El accurracy es: 0.7937821236053654 en lemmatizer


In [17]:
df_result_end = sentimiento(df_amazon_new, "Text_stop")
df_result_end.head(3)

  0%|          | 0/10000 [00:00<?, ?it/s]

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,Text_lemma,Text_stemmer,Text_stop,Id.1,neg,neu,pos,compound
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...,bought several vitality canned dog food produc...,bought sever vital can dog food product found ...,bought several vitality canned dog food produc...,1,0.0,0.517,0.483,0.9413
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,product arrived labeled jumbo salted peanut .....,product arriv label jumbo salt peanut ... pean...,product arrived labeled jumbo salted peanuts ....,2,0.088,0.81,0.102,0.0762
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...,this confection around century light pillowy c...,thi confect around centuri light pillowi citru...,this confection around centuries light pillowy...,3,0.149,0.602,0.249,0.8073


In [18]:
print(predict(df_result_end) + f" en combinación stop words")

El accurracy es: 0.7944089256612762 en combinación stop words


In [19]:
df_result_end = sentimiento(df_amazon_new, "Text_stemmer")
df_result_end.head(3)

  0%|          | 0/10000 [00:00<?, ?it/s]

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,Text_lemma,Text_stemmer,Text_stop,Id.1,neg,neu,pos,compound
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...,bought several vitality canned dog food produc...,bought sever vital can dog food product found ...,bought several vitality canned dog food produc...,1,0.0,0.586,0.414,0.9081
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,product arrived labeled jumbo salted peanut .....,product arriv label jumbo salt peanut ... pean...,product arrived labeled jumbo salted peanuts ....,2,0.088,0.81,0.102,0.0762
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...,this confection around century light pillowy c...,thi confect around centuri light pillowi citru...,this confection around centuries light pillowy...,3,0.122,0.686,0.192,0.6249


In [20]:
print(predict(df_result_end) + f" en combinación stemmer")

El accurracy es: 0.7668296352012035 en combinación stemmer


In [21]:
df_amazon_new["Text_lemma_stop"] = df_amazon_new["Text"].apply(lambda x: stop_words_clean(string = x, 
                                                                                          word_token = True, 
                                                                                          leng = "english"))
df_amazon_new["Text_lemma_stop"] = df_amazon_new["Text_lemma_stop"].apply(lambda x: lemma(clean_words(x)))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_amazon_new["Text_lemma_stop"] = df_amazon_new["Text"].apply(lambda x: stop_words_clean(string = x,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_amazon_new["Text_lemma_stop"] = df_amazon_new["Text_lemma_stop"].apply(lambda x: lemma(clean_words(x)))


In [22]:
df_result_end = sentimiento(df_amazon_new, "Text_lemma_stop")
df_result_end.head(3)

  0%|          | 0/10000 [00:00<?, ?it/s]

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,Text_lemma,Text_stemmer,Text_stop,Text_lemma_stop,Id.1,neg,neu,pos,compound
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...,bought several vitality canned dog food produc...,bought sever vital can dog food product found ...,bought several vitality canned dog food produc...,bought several vitality canned dog food produc...,1,0.0,0.503,0.497,0.9413
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,product arrived labeled jumbo salted peanut .....,product arriv label jumbo salt peanut ... pean...,product arrived labeled jumbo salted peanuts ....,product arrived labeled jumbo salted peanut .....,2,0.123,0.773,0.105,-0.1027
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...,this confection around century light pillowy c...,thi confect around centuri light pillowi citru...,this confection around centuries light pillowy...,confection around century light pillowy citrus...,3,0.13,0.584,0.286,0.8624


In [23]:
print(predict(df_result_end) + f" en combinación lemmatizer + stop_words")

El accurracy es: 0.7922777986711796 en combinación lemmatizer + stop_words


In [24]:
df_amazon_new["Text_lemma_stop_stemmer"] = df_amazon_new["Text"].apply(lambda x: stop_words_clean(string = x, 
                                                                                          word_token = True, 
                                                                                          leng = "english"))

df_amazon_new["Text_lemma_stop_stemmer"] = df_amazon_new["Text_lemma_stop_stemmer"].apply(lambda x: lemma(clean_words(x)))
df_amazon_new["Text_lemma_stop_stemmer"] = df_amazon_new["Text_lemma_stop_stemmer"].apply(lambda x: string_stemmer(clean_words(x)))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_amazon_new["Text_lemma_stop_stemmer"] = df_amazon_new["Text"].apply(lambda x: stop_words_clean(string = x,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_amazon_new["Text_lemma_stop_stemmer"] = df_amazon_new["Text_lemma_stop_stemmer"].apply(lambda x: lemma(clean_words(x)))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#retur

In [25]:
df_result_end = sentimiento(df_amazon_new, "Text_lemma_stop_stemmer")
df_result_end.head(3)

  0%|          | 0/10000 [00:00<?, ?it/s]

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,Text_lemma,Text_stemmer,Text_stop,Text_lemma_stop,Text_lemma_stop_stemmer,Id.1,neg,neu,pos,compound
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...,bought several vitality canned dog food produc...,bought sever vital can dog food product found ...,bought several vitality canned dog food produc...,bought several vitality canned dog food produc...,bought sever vital can dog food product found ...,1,0.0,0.573,0.427,0.9081
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,product arrived labeled jumbo salted peanut .....,product arriv label jumbo salt peanut ... pean...,product arrived labeled jumbo salted peanuts ....,product arrived labeled jumbo salted peanut .....,product arriv label jumbo salt peanut ... pean...,2,0.123,0.773,0.105,-0.1027
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...,this confection around century light pillowy c...,thi confect around centuri light pillowi citru...,this confection around centuries light pillowy...,confection around century light pillowy citrus...,confect around centuri light pillowi citru gel...,3,0.138,0.643,0.218,0.6249


In [26]:
print(predict(df_result_end) + f" en combinación  lemmatizer + stop words + stemmer")

El accurracy es: 0.7668296352012035 en combinación  lemmatizer + stop words + stemmer


In [27]:
df_amazon_new["Text_stop_stemmer"] = df_amazon_new["Text"].apply(lambda x: stop_words_clean(string = x, 
                                                                                          word_token = True, 
                                                                                          leng = "english"))

df_amazon_new["Text_stop_stemmer"] = df_amazon_new["Text_stop_stemmer"].apply(lambda x: string_stemmer(clean_words(x)))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_amazon_new["Text_stop_stemmer"] = df_amazon_new["Text"].apply(lambda x: stop_words_clean(string = x,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_amazon_new["Text_stop_stemmer"] = df_amazon_new["Text_stop_stemmer"].apply(lambda x: string_stemmer(clean_words(x)))


In [28]:
df_result_end = sentimiento(df_amazon_new, "Text_stop_stemmer")
df_result_end.head(3)

  0%|          | 0/10000 [00:00<?, ?it/s]

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,...,Text_stemmer,Text_stop,Text_lemma_stop,Text_lemma_stop_stemmer,Text_stop_stemmer,Id.1,neg,neu,pos,compound
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...,...,bought sever vital can dog food product found ...,bought several vitality canned dog food produc...,bought several vitality canned dog food produc...,bought sever vital can dog food product found ...,bought sever vital can dog food product found ...,1,0.0,0.573,0.427,0.9081
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,...,product arriv label jumbo salt peanut ... pean...,product arrived labeled jumbo salted peanuts ....,product arrived labeled jumbo salted peanut .....,product arriv label jumbo salt peanut ... pean...,product arriv label jumbo salt peanut ... pean...,2,0.123,0.773,0.105,-0.1027
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...,...,thi confect around centuri light pillowi citru...,this confection around centuries light pillowy...,confection around century light pillowy citrus...,confect around centuri light pillowi citru gel...,confect around centuri light pillowi citru gel...,3,0.138,0.643,0.218,0.6249


In [29]:
print(predict(df_result_end) + f" en combinación  stop words + stemmer")

El accurracy es: 0.7668296352012035 en combinación  stop words + stemmer


In [30]:
df_amazon_new["Text_lemma_stemmer"] = df_amazon_new["Text"].apply(lambda x: lemma(clean_words(x)))
df_amazon_new["Text_lemma_stemmer"] = df_amazon_new["Text_lemma_stemmer"].apply(lambda x: string_stemmer(clean_words(x)))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_amazon_new["Text_lemma_stemmer"] = df_amazon_new["Text"].apply(lambda x: lemma(clean_words(x)))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_amazon_new["Text_lemma_stemmer"] = df_amazon_new["Text_lemma_stemmer"].apply(lambda x: string_stemmer(clean_words(x)))


In [31]:
df_result_end = sentimiento(df_amazon_new, "Text_lemma_stemmer")
df_result_end.head(3)

  0%|          | 0/10000 [00:00<?, ?it/s]

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text,...,Text_stop,Text_lemma_stop,Text_lemma_stop_stemmer,Text_stop_stemmer,Text_lemma_stemmer,Id.1,neg,neu,pos,compound
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...,...,bought several vitality canned dog food produc...,bought several vitality canned dog food produc...,bought sever vital can dog food product found ...,bought sever vital can dog food product found ...,bought sever vital can dog food product found ...,1,0.0,0.573,0.427,0.9081
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,...,product arrived labeled jumbo salted peanuts ....,product arrived labeled jumbo salted peanut .....,product arriv label jumbo salt peanut ... pean...,product arriv label jumbo salt peanut ... pean...,product arriv label jumbo salt peanut ... pean...,2,0.123,0.773,0.105,-0.1027
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...,...,this confection around centuries light pillowy...,confection around century light pillowy citrus...,confect around centuri light pillowi citru gel...,confect around centuri light pillowi citru gel...,confect around centuri light pillowi citru gel...,3,0.138,0.643,0.218,0.6249


In [32]:
print(predict(df_result_end) + f" en combinación  lemmatizer + stemmer")

El accurracy es: 0.7668296352012035 en combinación  lemmatizer + stemmer
