<a href="https://colab.research.google.com/github/shrinik/infosys-colab-hackathon/blob/main/Review_Classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Add all import statements here
import pandas as pd
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

# Fetch the NLTK resources this notebook relies on: tokenizer models, the English
# stop-word list and the VADER lexicon. 'punkt_tab' is requested alongside 'punkt'
# because newer NLTK releases load the word-tokenizer models from that package.
nltk.download([
    'punkt',
    'punkt_tab',
    'stopwords',
    'vader_lexicon'
])

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


True

In [None]:
# Clean the dataset by stripping out invalid characters and creating a clean new file.
# Bytes that are not valid UTF-8 are dropped while reading (errors='ignore'). The copy
# is written explicitly as UTF-8 so the result does not depend on the platform's
# default encoding, and both handles are owned by the `with` statement so they are
# closed even if the read or write fails.
with open('flipkart_product.csv', 'r', encoding="utf8", errors='ignore') as f1, \
     open("flipkart_product_cleaned_up.csv", "w", encoding="utf8") as f2:
  f2.write(f1.read())
print("File clean up completed")

File clean up completed


In [None]:
# Load the cleaned copy of the dataset and preview its first rows
df_input = pd.read_csv(filepath_or_buffer='flipkart_product_cleaned_up.csv')
df_input.head()

Unnamed: 0,ProductName,Price,Rate,Review,Summary
0,"Candes 12 L Room/Personal Air Cooler??(White, ...","??3,999",5,Super!,Great cooler.. excellent air flow and for this...
1,"Candes 12 L Room/Personal Air Cooler??(White, ...","??3,999",5,Awesome,Best budget 2 fit cooler. Nice cooling
2,"Candes 12 L Room/Personal Air Cooler??(White, ...","??3,999",3,Fair,The quality is good but the power of air is de...
3,"Candes 12 L Room/Personal Air Cooler??(White, ...","??3,999",1,Useless product,Very bad product it's a only a fan
4,"Candes 12 L Room/Personal Air Cooler??(White, ...","??3,999",3,Fair,Ok ok product


In [None]:
# Work on an independent copy that holds only the rating, the review title and the summary
df = df_input.loc[:, ['Rate', 'Review', 'Summary']].copy()
df.head()

Unnamed: 0,Rate,Review,Summary
0,5,Super!,Great cooler.. excellent air flow and for this...
1,5,Awesome,Best budget 2 fit cooler. Nice cooling
2,3,Fair,The quality is good but the power of air is de...
3,1,Useless product,Very bad product it's a only a fan
4,3,Fair,Ok ok product


In [None]:
# Summarise every column, including the non-numeric ones (include='all')
df.describe(include='all')

Unnamed: 0,Rate,Review,Summary
count,189873,189870,189860
unique,9,1264,98152
top,5,Nan,Good
freq,108694,20862,14175


In [None]:
# Discard every row that is missing a Rate, Review or Summary value, then re-check the summary
df = df.dropna(axis='index', how='any')
df.describe(include='all')

Unnamed: 0,Rate,Review,Summary
count,189857,189857,189857
unique,9,1264,98149
top,5,Nan,Good
freq,108684,20852,14175


In [None]:
# Split the review title and the summary text into word tokens with NLTK
df['review_words'] = df.Review.apply(nltk.word_tokenize)
df['summary_words'] = df.Summary.apply(nltk.word_tokenize)
df.head()

Unnamed: 0,Rate,Review,Summary,review_words,summary_words
0,5,Super!,Great cooler.. excellent air flow and for this...,"[Super, !]","[Great, cooler, .., excellent, air, flow, and,..."
1,5,Awesome,Best budget 2 fit cooler. Nice cooling,[Awesome],"[Best, budget, 2, fit, cooler, ., Nice, cooling]"
2,3,Fair,The quality is good but the power of air is de...,[Fair],"[The, quality, is, good, but, the, power, of, ..."
3,1,Useless product,Very bad product it's a only a fan,"[Useless, product]","[Very, bad, product, it, 's, a, only, a, fan]"
4,3,Fair,Ok ok product,[Fair],"[Ok, ok, product]"


In [None]:
# English stop words, held in a set so each per-token membership test is a hash lookup
# rather than a scan of the whole word list.
stopwords = set(nltk.corpus.stopwords.words("english"))

def fun(words, stop_words=None):
    """Keep only alphabetic tokens that are not stop words.

    Args:
        words: Iterable of string tokens.
        stop_words: Optional collection of lower-case words to drop. When omitted,
            the module-level ``stopwords`` collection is used.

    Returns:
        A new list holding the surviving tokens in their original order and casing.
    """
    excluded = stopwords if stop_words is None else stop_words
    # isalpha() removes emojis, digits and punctuation tokens; the stop-word check
    # lower-cases the token first, so it is case-insensitive.
    return [word for word in words if word.isalpha() and word.lower() not in excluded]

# Replace both token columns with their filtered versions
df['review_words'] = df['review_words'].apply(fun)
df['summary_words'] = df['summary_words'].apply(fun)
df.head()

Unnamed: 0,Rate,Review,Summary,review_words,summary_words
0,5,Super!,Great cooler.. excellent air flow and for this...,[Super],"[Great, cooler, excellent, air, flow, price, a..."
1,5,Awesome,Best budget 2 fit cooler. Nice cooling,[Awesome],"[Best, budget, fit, cooler, Nice, cooling]"
2,3,Fair,The quality is good but the power of air is de...,[Fair],"[quality, good, power, air, decent]"
3,1,Useless product,Very bad product it's a only a fan,"[Useless, product]","[bad, product, fan]"
4,3,Fair,Ok ok product,[Fair],"[Ok, ok, product]"


In [None]:
# Join the words into a sentence for sentiment processing
def fun(words):
    """Join tokens into one space-separated sentence.

    Args:
        words: Iterable of string tokens.

    Returns:
        The tokens separated by single spaces with surrounding whitespace removed;
        an empty string when ``words`` is empty.
    """
    # str.join builds the sentence in one pass instead of repeated concatenation
    return " ".join(words).strip()

# Build one plain-text column per token column; these are the inputs to sentiment scoring
df['summary_text'] = df['summary_words'].apply(fun)
df['review_text'] = df['review_words'].apply(fun)
df.head()

Unnamed: 0,Rate,Review,Summary,review_words,summary_words,summary_text,review_text
0,5,Super!,Great cooler.. excellent air flow and for this...,[Super],"[Great, cooler, excellent, air, flow, price, a...",Great cooler excellent air flow price amazing ...,Super
1,5,Awesome,Best budget 2 fit cooler. Nice cooling,[Awesome],"[Best, budget, fit, cooler, Nice, cooling]",Best budget fit cooler Nice cooling,Awesome
2,3,Fair,The quality is good but the power of air is de...,[Fair],"[quality, good, power, air, decent]",quality good power air decent,Fair
3,1,Useless product,Very bad product it's a only a fan,"[Useless, product]","[bad, product, fan]",bad product fan,Useless product
4,3,Fair,Ok ok product,[Fair],"[Ok, ok, product]",Ok ok product,Fair


In [None]:
# One analyzer instance shared by every scoring call below
sia = SentimentIntensityAnalyzer()

def fun(text):
    """Return the 'compound' entry of the analyzer's polarity scores for ``text``."""
    return sia.polarity_scores(text)['compound']

# Score the cleaned summary and the cleaned review title separately
df['compound_summary'] = df['summary_text'].apply(fun)
df['compound_review'] = df['review_text'].apply(fun)

# Average the two compound scores into a single value per row
df['compound_mean'] = df['compound_summary'].add(df['compound_review']).div(2)
df.head()

Unnamed: 0,Rate,Review,Summary,review_words,summary_words,summary_text,review_text,compound_summary,compound_review,compound_mean
0,5,Super!,Great cooler.. excellent air flow and for this...,[Super],"[Great, cooler, excellent, air, flow, price, a...",Great cooler excellent air flow price amazing ...,Super,0.9501,0.5994,0.77475
1,5,Awesome,Best budget 2 fit cooler. Nice cooling,[Awesome],"[Best, budget, fit, cooler, Nice, cooling]",Best budget fit cooler Nice cooling,Awesome,0.8591,0.6249,0.742
2,3,Fair,The quality is good but the power of air is de...,[Fair],"[quality, good, power, air, decent]",quality good power air decent,Fair,0.4404,0.3182,0.3793
3,1,Useless product,Very bad product it's a only a fan,"[Useless, product]","[bad, product, fan]",bad product fan,Useless product,-0.296,-0.4215,-0.35875
4,3,Fair,Ok ok product,[Fair],"[Ok, ok, product]",Ok ok product,Fair,0.5267,0.3182,0.42245


In [None]:
# A mean compound score above zero marks the review as positive
df['inferred_positives'] = df['compound_mean'] > 0

def fun(rate):
    """Return True when a rating counts as positive (3 or higher).

    Args:
        rate: The rating value taken from the ``Rate`` column. Strings such as
            ``'5'`` are parsed as integers; plain numbers are accepted as well.

    Returns:
        True for ratings greater than or equal to 3; False for lower ratings and
        for any value that cannot be read as a number.
    """
    if isinstance(rate, str):
        # isdecimal() (unlike isnumeric()) only accepts characters that int() can
        # parse, so values such as '½' or '²' yield False instead of raising ValueError.
        return rate.isdecimal() and int(rate) >= 3
    try:
        # Non-string ratings, e.g. when the column was parsed as a numeric dtype
        return float(rate) >= 3
    except (TypeError, ValueError):
        return False

# Label each row from its user rating; this is the reference the inferred label is compared against
df['actual_positives'] = df['Rate'].apply(fun)
df.head()

Unnamed: 0,Rate,Review,Summary,review_words,summary_words,summary_text,review_text,compound_summary,compound_review,compound_mean,inferred_positives,actual_positives
0,5,Super!,Great cooler.. excellent air flow and for this...,[Super],"[Great, cooler, excellent, air, flow, price, a...",Great cooler excellent air flow price amazing ...,Super,0.9501,0.5994,0.77475,True,True
1,5,Awesome,Best budget 2 fit cooler. Nice cooling,[Awesome],"[Best, budget, fit, cooler, Nice, cooling]",Best budget fit cooler Nice cooling,Awesome,0.8591,0.6249,0.742,True,True
2,3,Fair,The quality is good but the power of air is de...,[Fair],"[quality, good, power, air, decent]",quality good power air decent,Fair,0.4404,0.3182,0.3793,True,True
3,1,Useless product,Very bad product it's a only a fan,"[Useless, product]","[bad, product, fan]",bad product fan,Useless product,-0.296,-0.4215,-0.35875,False,False
4,3,Fair,Ok ok product,[Fair],"[Ok, ok, product]",Ok ok product,Fair,0.5267,0.3182,0.42245,True,True


In [None]:
# Calculate the accuracy of the sentiment analyzer with the review rating
actual_bool = df['actual_positives'] == True
infer_bool = df['inferred_positives'] == True
print('Actual % of Positive Reviews: ' + str((actual_bool.sum()/len(df))*100))
print('Inferred % of Positive Reviews: ' + str((infer_bool.sum()/len(df))*100))
# Two similar positive rates do not show that the labels agree row by row, so also
# report the accuracy itself: the share of rows whose inferred label equals the
# rating-based label.
print('Accuracy % of Inferred vs Actual: ' + str(((actual_bool == infer_bool).sum()/len(df))*100))

Actual % of Positive Reviews: 86.38764965210659
Inferred % of Positive Reviews: 86.8232406495415


In [None]:
# pip install textblob

In [None]:
# from textblob import TextBlob

In [None]:
# Disabled earlier experiment: the whole cell is a single string literal, so none of
# it runs; the notebook only echoes the text back as the cell's output.
# NOTE(review) before re-enabling: the final 'compound' filter reads from t_df rather
# than t_df_cleaned, so the dummy-row removal and drop_duplicates just above it are
# discarded. It also relies on DataFrame.append, which pandas 2.0 and later no longer
# provide — confirm the target pandas version.
'''
import pandas as pd
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')

text_1 = "The movie was so awesome."
text_2 = "The food here tastes terrible."


#df = pd.read_excel (r'Path where the Excel file is stored\File name.xlsx')
df = pd.read_csv (r'sample_data/flipkart_product3.csv')
#df = pd.read_csv(r'sample_data/flipkart_product3.csv', encoding="utf8", errors='ignore')
# adding an row_id field to the dataframe, which will be useful for joining later
df["row_id"] = df.index + 1
#print first 10 rows
#print (df.head(10))

#create a new data frame with "id" and "comment" fields
df_subset = df[['row_id', 'Summary']].copy()

#data clean-up
#remove all non-aphabet characters
df_subset['Summary'] = df_subset['Summary'].str.replace("[^a-zA-Z#]", " ")
#covert to lower-case
df_subset['Summary'] = df_subset['Summary'].str.casefold()
print (df_subset.head(10))

# set up empty dataframe for staging output
df1=pd.DataFrame()
df1['row_id']=['99999999999']
df1['sentiment_type']='NA999NA'
df1['sentiment_score']=0

print('Processing sentiment analysis...')
sid = SentimentIntensityAnalyzer()
t_df = df1
for index, row in df_subset.iterrows():
    scores = sid.polarity_scores(row[1])
    for key, value in scores.items():
        temp = [key,value,row[0]]
        df1['row_id']=row[0]
        df1['sentiment_type']=key
        df1['sentiment_score']=value
        t_df=t_df.append(df1)
#remove dummy row with row_id = 99999999999
t_df_cleaned = t_df[t_df.row_id != '99999999999']
#remove duplicates if any exist
t_df_cleaned = t_df_cleaned.drop_duplicates()
# only keep rows where sentiment_type = compound
t_df_cleaned = t_df[t_df.sentiment_type == 'compound']
print(t_df_cleaned.head(10))

#merge dataframes
df_output = pd.merge(df, t_df_cleaned, on='row_id', how='inner')
print(df_output.head(10))


df_output[["sentiment_score"]].describe()

#generate mean of sentiment_score by period
dfg = df_output.groupby(['Rate'])['sentiment_score'].mean()
#create a bar plot
dfg.plot(kind='bar', title='Sentiment Score', ylabel='Mean Sentiment Score',
         xlabel='Rate', figsize=(6, 5))
'''

'\nimport pandas as pd\nimport nltk\nfrom nltk.sentiment.vader import SentimentIntensityAnalyzer\nnltk.download(\'vader_lexicon\')\n\ntext_1 = "The movie was so awesome."\ntext_2 = "The food here tastes terrible."\n\n\n#df = pd.read_excel (r\'Path where the Excel file is stored\\File name.xlsx\')\ndf = pd.read_csv (r\'sample_data/flipkart_product3.csv\')\n#df = pd.read_csv(r\'sample_data/flipkart_product3.csv\', encoding="utf8", errors=\'ignore\')\n# adding an row_id field to the dataframe, which will be useful for joining later\ndf["row_id"] = df.index + 1\n#print first 10 rows\n#print (df.head(10))\n\n#create a new data frame with "id" and "comment" fields\ndf_subset = df[[\'row_id\', \'Summary\']].copy()\n\n#data clean-up\n#remove all non-aphabet characters\ndf_subset[\'Summary\'] = df_subset[\'Summary\'].str.replace("[^a-zA-Z#]", " ")\n#covert to lower-case\ndf_subset[\'Summary\'] = df_subset[\'Summary\'].str.casefold()\nprint (df_subset.head(10))\n\n# set up empty dataframe for st

In [None]:
# Disabled plotting experiment: the cell is a single string literal, so nothing in it runs.
# NOTE(review) before re-enabling: df_output is only assigned inside the disabled
# string cell above, so it would not exist at this point; the title also says
# 'by Team' while the x axis is 'Rate' — confirm the intended wording.
'''
import seaborn as sns
#create seaborn boxplots by group
sns.boxplot(x='Rate', y='sentiment_score', notch = True,
            data=df_output, showfliers=False).set(title='Sentiment Score by Team')
#modify axis labels
#plt.xlabel('Rate')
#plt.ylabel('Sentiment Score')
#plt.xticks(rotation=90)
'''

"\nimport seaborn as sns\n#create seaborn boxplots by group\nsns.boxplot(x='Rate', y='sentiment_score', notch = True,\n            data=df_output, showfliers=False).set(title='Sentiment Score by Team')\n#modify axis labels\n#plt.xlabel('Rate')\n#plt.ylabel('Sentiment Score')\n#plt.xticks(rotation=90)\n"