# Amazon Mobile Reviews Sentiment Analysis

In [127]:
# Mount Google Drive into the Colab filesystem; the dataset is later read
# from a path under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Import Libraries

In [128]:
!pip install emoji
!pip install missingpy



In [0]:
import json
import os
import pandas as pd
import re
import numpy as np
import spacy
nlp = spacy.load("en_core_web_sm")
from nltk.tokenize import word_tokenize
import emoji

#ignore warnings
import warnings
warnings.filterwarnings('ignore')

#install wordcloud
from wordcloud import WordCloud

import matplotlib.pyplot as plt
%matplotlib inline

#Train test split
from sklearn.model_selection import train_test_split

#Tf-idf
from sklearn.feature_extraction.text import TfidfVectorizer

#Models
from sklearn import model_selection, naive_bayes, svm
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, recall_score, precision_score,confusion_matrix,f1_score

#Impute
from sklearn.preprocessing import Imputer
from missingpy import KNNImputer
from sklearn.impute import SimpleImputer

#datetime
import datetime

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

from xgboost.sklearn import XGBClassifier
#from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold

In [0]:
# Load the raw Amazon unlocked-mobile review data from the mounted Drive.
# NOTE(review): absolute Colab/Drive path — adjust when running elsewhere.
df=pd.read_csv('/content/drive/My Drive/Colab Notebooks/NLP/sentimental_Analysis/Amazon_Unlocked_Mobile.csv')

In [131]:
df.shape

(413840, 6)

In [132]:
df.head()

Unnamed: 0,Product Name,Brand Name,Price,Rating,Reviews,Review Votes
0,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,I feel so LUCKY to have found this used (phone...,1.0
1,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,"nice phone, nice up grade from my pantach revu...",0.0
2,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,Very pleased,0.0
3,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,It works good but it goes slow sometimes but i...,0.0
4,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,Great phone to replace my lost phone. The only...,0.0


In [133]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 413840 entries, 0 to 413839
Data columns (total 6 columns):
Product Name    413840 non-null object
Brand Name      348669 non-null object
Price           407907 non-null float64
Rating          413840 non-null int64
Reviews         413778 non-null object
Review Votes    401544 non-null float64
dtypes: float64(2), int64(1), object(3)
memory usage: 18.9+ MB


In [134]:
df.isnull().sum()

Product Name        0
Brand Name      65171
Price            5933
Rating              0
Reviews            62
Review Votes    12296
dtype: int64

In [0]:
# Drop the spaces from every column label ("Product Name" -> "ProductName").
df.columns = [label.replace(' ', '') for label in df.columns]

In [136]:
df.columns

Index(['ProductName', 'BrandName', 'Price', 'Rating', 'Reviews',
       'ReviewVotes'],
      dtype='object')

In [0]:
# Discard every row that has a missing value in any column.
df = df.dropna(axis=0, how='any')

In [138]:
df.shape

(334335, 6)

# Removing the duplicates

In [139]:
# Report the row count on either side of de-duplication.
print("Length before duplicates",len(df))
# NOTE(review): keep=False removes *every* copy of a duplicated row, not just
# the extra ones. Left unchanged to preserve the downstream row count; switch
# to keep='first' if one copy of each review should survive.
df = df.drop_duplicates(keep = False)
# The original label repeated "before" here, which made the output misleading.
print("Length after duplicates",len(df))

Length before duplicates 334335
Length before duplicates 234445


In [0]:
# Importing HTMLParser
from html.parser import HTMLParser
html_parser = HTMLParser()

In [141]:
# html.unescape is the supported stdlib API; HTMLParser.unescape() was
# deprecated and then removed in Python 3.9.
import html

# Create a new column, clean_review, holding the review text with HTML
# entities (&amp;, &#39;, ...) decoded. All later cleaning works on this column.
df['clean_review'] = df['Reviews'].apply(html.unescape)
df.head()

Unnamed: 0,ProductName,BrandName,Price,Rating,Reviews,ReviewVotes,clean_review
0,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,I feel so LUCKY to have found this used (phone...,1.0,I feel so LUCKY to have found this used (phone...
1,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,"nice phone, nice up grade from my pantach revu...",0.0,"nice phone, nice up grade from my pantach revu..."
2,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,5,Very pleased,0.0,Very pleased
3,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,It works good but it goes slow sometimes but i...,0.0,It works good but it goes slow sometimes but i...
4,"""CLEAR CLEAN ESN"" Sprint EPIC 4G Galaxy SPH-D7...",Samsung,199.99,4,Great phone to replace my lost phone. The only...,0.0,Great phone to replace my lost phone. The only...


# Handling Emoticons

In [0]:
# Maps text emoticons to a sentiment word ("happy" / "sad").
# Keys are compared against whitespace-delimited tokens, so they must not
# contain leading or trailing spaces.
emoticon_dict = {
":)": "happy",
":-)": "happy",  # ASCII hyphen-minus form; previously missing
":‑)": "happy",  # U+2011 non-breaking-hyphen form, kept for compatibility
":-]": "happy",
":-3": "happy",
":->": "happy",
"8-)": "happy",
":-}": "happy",
":o)": "happy",
":c)": "happy",
":^)": "happy",
"=]": "happy",
"=)": "happy",
"<3": "happy",
":-(": "sad",
":(": "sad",
":c": "sad",
":<": "sad",
":[": "sad",
">:[": "sad",
":{": "sad",
">:(": "sad",
":-c": "sad",
":-<": "sad",  # trailing space removed from the key so it can match a token
":-[": "sad",
":-||": "sad"
}

In [0]:
def lookup_dict(text,dictionary):
  """Replace whole whitespace-delimited tokens of ``text`` using ``dictionary``.

  Each token is lower-cased for the lookup; when the lower-cased token is a
  key of ``dictionary`` the token is replaced by the mapped value, otherwise
  it is left untouched. Whitespace between tokens is preserved as-is.

  The previous implementation called ``text.replace(word, ...)``, which
  rewrote every *substring* occurrence (so the token "u" also mangled "but"
  and "ugly"), and it skipped tokens whose original casing differed from
  their lower-cased form.

  Args:
    text: the string to process.
    dictionary: mapping of lower-case token -> replacement string.

  Returns:
    The text with matching tokens substituted.
  """
  def _substitute(match):
    token = match.group(0)
    # Fall back to the token itself when there is no entry for it.
    return dictionary.get(token.lower(), token)

  # \S+ selects exactly the runs that str.split() would yield as tokens.
  return re.sub(r'\S+', _substitute, text)

In [0]:
# Swap emoticons in each review for their sentiment word.
df['clean_review'] = df['clean_review'].apply(lookup_dict, args=(emoticon_dict,))

In [145]:
df.shape

(234445, 7)

# Removing @username mentions from the reviews

In [0]:
def remove_pattern(input_text,pattern):
  """Delete every match of the regular expression ``pattern`` from ``input_text``.

  The previous implementation collected the matches with ``re.findall`` and
  then fed each matched *string* back into ``re.sub`` as a new regex. That
  misinterprets matches containing regex metacharacters (or raises
  ``re.error``), and a short match such as "@ab" also removed the prefix of a
  longer one such as "@abc". Substituting with the pattern directly avoids both.

  Args:
    input_text: the string to clean.
    pattern: regular expression describing the text to remove.

  Returns:
    ``input_text`` with all matches removed.
  """
  return re.sub(pattern, '', input_text)

In [0]:
# Remove @user mentions. The pattern is a raw string so that \w reaches the
# regex engine unchanged instead of relying on an invalid string escape.
df['clean_review']=df['clean_review'].apply(lambda x: remove_pattern(x,r"@[\w]*"))

In [148]:
df.columns

Index(['ProductName', 'BrandName', 'Price', 'Rating', 'Reviews', 'ReviewVotes',
       'clean_review'],
      dtype='object')

# Remove unwanted columns

In [0]:
# Only the rating and the cleaned review text are needed from here on.
unused_columns = ['ProductName', 'BrandName', 'Price', 'Reviews', 'ReviewVotes']
df = df.drop(columns=unused_columns)

In [150]:
df.head()

Unnamed: 0,Rating,clean_review
0,5,I feel so LUCKY to have found this used (phone...
1,4,"nice phone, nice up grade from my pantach revu..."
2,5,Very pleased
3,4,It works good but it goes slow sometimes but i...
4,4,Great phone to replace my lost phone. The only...


# Removing $ticker symbols from all the reviews

In [0]:
# Strip "$" followed by any run of word characters (e.g. "$AAPL", "$200").
df['clean_review'] = df['clean_review'].map(
    lambda review: re.sub(r"\$[\w]*", "", review)
)

# Removing URLs, punctuation, special characters, and numbers

In [0]:
# Replace URLs with a space. The dot after "www" is escaped so that only a
# literal "www." starts a match; unescaped, it also swallowed ordinary words
# containing four or more consecutive w's (e.g. "awwww").
df['clean_review']=df['clean_review'].apply(lambda x: re.sub(r'http\S*|www\.\S*', ' ',x))
# Replace every character that is not an ASCII letter (digits, punctuation,
# symbols, and whitespace other than a plain space) with a space.
df['clean_review']=df['clean_review'].apply(lambda x: re.sub(r'[^a-zA-Z]',' ',x))
# A former third pass, re.sub(r'[^\w\s]', ' ', x), was removed: after the line
# above only ASCII letters and spaces remain, so it could never match anything.

In [153]:
df.head()

Unnamed: 0,Rating,clean_review
0,5,I feel so LUCKY to have found this used phone...
1,4,nice phone nice up grade from my pantach revu...
2,5,Very pleased
3,4,It works good but it goes slow sometimes but i...
4,4,Great phone to replace my lost phone The only...


# Converting to lower case

In [154]:
# Normalise the review text to lower case.
df['clean_review'] = df['clean_review'].str.lower()
df.head()

Unnamed: 0,Rating,clean_review
0,5,i feel so lucky to have found this used phone...
1,4,nice phone nice up grade from my pantach revu...
2,5,very pleased
3,4,it works good but it goes slow sometimes but i...
4,4,great phone to replace my lost phone the only...


In [155]:
df.dtypes

Rating           int64
clean_review    object
dtype: object

# Converting Rating data type to category (currently disabled)

In [0]:
#df['Rating']=df['Rating'].astype('category')

In [157]:
df.dtypes

Rating           int64
clean_review    object
dtype: object

# Handling Contractions and Short Words


In [0]:
# Handling contractions
# Maps an English contraction to its expanded form. Lookups are performed on
# lower-cased tokens, so the lower-case keys are the ones that take effect;
# the capitalised "I'..." keys are kept for callers that index the dict directly.
contractions = {
"ain't": "is not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he'll've": "he will have",  # was "he he will have" (duplicated word)
"he's": "he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how is",
"I'd": "I would",
"I'd've": "I would have",
"I'll": "I will",
"I'll've": "I will have",
"I'm": "I am",
"I've": "I have",
"i'd": "i would",
"i'd've": "i would have",
"i'll": "i will",
"i'll've": "i will have",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'd've": "it would have",
"it'll": "it will",
"it'll've": "it will have",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she would",
"she'd've": "she would have",
"she'll": "she will",
"she'll've": "she will have",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so as",
"that'd": "that would",
"that'd've": "that would have",
"that's": "that is",
"there'd": "there would",
"there'd've": "there would have",
"there's": "there is",
"they'd": "they would",
"they'd've": "they would have",
"they'll": "they will",
"they'll've": "they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
"we'd": "we would",
"we'd've": "we would have",
"we'll": "we will",
"we'll've": "we will have",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what'll've": "what will have",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"when's": "when is",
"when've": "when have",
"where'd": "where did",
"where's": "where is",
"where've": "where have",
"who'll": "who will",
"who'll've": "who will have",
"who's": "who is",
"who've": "who have",
"why's": "why is",
"why've": "why have",
"will've": "will have",
"won't": "will not",
"won't've": "will not have",
"would've": "would have",
"wouldn't": "would not",
"wouldn't've": "would not have",
"y'all": "you all",
"y'all'd": "you all would",
"y'all'd've": "you all would have",
"y'all're": "you all are",
"y'all've": "you all have",
"you'd": "you would",
"you'd've": "you would have",
"you'll": "you will",
"you'll've": "you will have",
"you're": "you are",
"you've": "you have"
}

In [0]:
# Maps lower-case chat/SMS abbreviations to their spelled-out form.
# Several keys contain digits or "/" ("b4", "gr8", "a/s/l"), and several
# values contain capital letters, commas or apostrophes, so substituting them
# into text that was already lower-cased and stripped of punctuation
# re-introduces those characters.
short_word_dict = {
"121": "one to one",
"a/s/l": "age, sex, location",
"adn": "any day now",
"afaik": "as far as I know",
"afk": "away from keyboard",
"aight": "alright",
"alol": "actually laughing out loud",
"b4": "before",
"b4n": "bye for now",
"bak": "back at the keyboard",
"bf": "boyfriend",
"bff": "best friends forever",
"bfn": "bye for now",
"bg": "big grin",
"bta": "but then again",
"btw": "by the way",
"cid": "crying in disgrace",
"cnp": "continued in my next post",
"cp": "chat post",
"cu": "see you",
"cul": "see you later",
"cul8r": "see you later",
"cya": "bye",
"cyo": "see you online",
"dbau": "doing business as usual",
"fud": "fear, uncertainty, and doubt",
"fwiw": "for what it's worth",
"fyi": "for your information",
"g": "grin",
"g2g": "got to go",
"ga": "go ahead",
"gal": "get a life",
"gf": "girlfriend",
"gfn": "gone for now",
"gmbo": "giggling my butt off",
"gmta": "great minds think alike",
"h8": "hate",
"hagn": "have a good night",
"hdop": "help delete online predators",
"hhis": "hanging head in shame",
"iac": "in any case",
"ianal": "I am not a lawyer",
"ic": "I see",
"idk": "I don't know",
"imao": "in my arrogant opinion",
"imnsho": "in my not so humble opinion",
"imo": "in my opinion",
"iow": "in other words",
"ipn": "I’m posting naked",
"irl": "in real life",
"jk": "just kidding",
"l8r": "later",
"ld": "later, dude",
"ldr": "long distance relationship",
"llta": "lots and lots of thunderous applause",
"lmao": "laugh my ass off",
"lmirl": "let's meet in real life",
"lol": "laugh out loud",
"ltr": "longterm relationship",
"lulab": "love you like a brother",
"lulas": "love you like a sister",
"luv": "love",
"m/f": "male or female",
"m8": "mate",
"milf": "mother I would like to fuck",
"oll": "online love",
"omg": "oh my god",
"otoh": "on the other hand",
"pir": "parent in room",
"ppl": "people",
"r": "are",
"rofl": "roll on the floor laughing",
"rpg": "role playing games",
"ru": "are you",
"shid": "slaps head in disgust",
"somy": "sick of me yet",
"sot": "short of time",
"thanx": "thanks",
"thx": "thanks",
"ttyl": "talk to you later",
"u": "you",
"ur": "you are",
"uw": "you’re welcome",
"wb": "welcome back",
"wfm": "works for me",
"wibni": "wouldn't it be nice if",
"wtf": "what the fuck",
"wtg": "way to go",
"wtgp": "want to go private",
"ym": "young man",
"gr8": "great"
}

In [160]:
# Expand contractions first, then chat abbreviations, via lookup_dict.
# NOTE(review): the earlier punctuation pass has already replaced every
# non-letter character (apostrophes, digits, "/") with a space, so keys such
# as "can't", "b4" or "a/s/l" can no longer occur as tokens here. Consider
# running these expansions before punctuation is stripped.
df['clean_review']=df['clean_review'].apply(lambda x:lookup_dict(x,contractions))
df['clean_review']=df['clean_review'].apply(lambda x:lookup_dict(x,short_word_dict))
df.head()

Unnamed: 0,Rating,clean_review
0,5,i feel so lucky to have found this used phone...
1,4,nice phone nice up grade from my pantach revu...
2,5,very pleased
3,4,it works good but it goes slow sometimes but i...
4,4,great phone to replace my lost phone the only...


# Stopwords, Stemming, Lemmatization

In [0]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [162]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [163]:
# Split each review string into a list of word tokens.
df['clean_review'] = df['clean_review'].apply(word_tokenize)
df.head()

Unnamed: 0,Rating,clean_review
0,5,"[i, feel, so, lucky, to, have, found, this, us..."
1,4,"[nice, phone, nice, up, grade, from, my, panta..."
2,5,"[very, pleased]"
3,4,"[it, works, good, but, it, goes, slow, sometim..."
4,4,"[great, phone, to, replace, my, lost, phone, t..."


In [164]:
#Importing library for lemmatization
from nltk.stem.wordnet import WordNetLemmatizer
lemmatizing=WordNetLemmatizer()
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [165]:
# Lemmatize every token and stitch the tokens back into one space-separated string.
df['clean_review'] = df['clean_review'].apply(
    lambda tokens: ' '.join(map(lemmatizing.lemmatize, tokens))
)
df.head()

Unnamed: 0,Rating,clean_review
0,5,i feel so lucky to have found this used phone ...
1,4,nice phone nice up grade from my pantach revue...
2,5,very pleased
3,4,it work good but it go slow sometimes but it a...
4,4,great phone to replace my lost phone the only ...


# Rating is considered positive if Rating > 3, else negative.

In [166]:
# Binary sentiment label: 1 when the rating is above 3, otherwise 0.
df['new_rating'] = np.where(df['Rating'].gt(3), 1, 0)
df.head()

Unnamed: 0,Rating,clean_review,new_rating
0,5,i feel so lucky to have found this used phone ...,1
1,4,nice phone nice up grade from my pantach revue...,1
2,5,very pleased,1
3,4,it work good but it go slow sometimes but it a...,1
4,4,great phone to replace my lost phone the only ...,1


In [167]:
df['new_rating'].unique()

array([1, 0])

In [168]:
# The raw rating is no longer needed once the binary label exists.
df = df.drop(columns=['Rating'])
df.head()

Unnamed: 0,clean_review,new_rating
0,i feel so lucky to have found this used phone ...,1
1,nice phone nice up grade from my pantach revue...,1
2,very pleased,1
3,it work good but it go slow sometimes but it a...,1
4,great phone to replace my lost phone the only ...,1


In [0]:
# Persist the cleaned reviews and their binary label to Drive.
# NOTE(review): no index= argument is passed, so pandas' default applies —
# confirm whether the row index should be written to the file.
df.to_csv(r'/content/drive/My Drive/Colab Notebooks/NLP/sentimental_Analysis/Amazon_Unlocked_Mobile_cleaned.csv')