In [5]:
from warnings import filterwarnings
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from PIL import Image
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, GridSearchCV, cross_validate
from sklearn.preprocessing import LabelEncoder
from textblob import Word, TextBlob
from wordcloud import WordCloud
import nltk
# Suppress every warning so cell outputs stay readable.
filterwarnings('ignore')

# Pandas display settings: two-decimal floats, 200-character wide output,
# and no cap on the number of columns shown.
_DISPLAY_OPTIONS = {
    'display.float_format': lambda x: '%.2f' % x,
    'display.width': 200,
    'display.max_columns': None,
}
for _option, _value in _DISPLAY_OPTIONS.items():
    pd.set_option(_option, _value)

In [4]:
import pandas as pd
import ast

# Load the preprocessed reviews.
df = pd.read_csv("nlp/datasets/amazon_reviews_preprocessed.csv")


def _tokens_to_text(cell):
    """Turn one stored `reviewText` cell into a plain space-separated string.

    String cells are parsed with ``ast.literal_eval`` (they are expected to be
    the literal form of a sequence of tokens) and the tokens are joined with
    single spaces. Anything that is not a string -- in practice a missing
    value read as NaN -- becomes an empty string, so the downstream sentiment
    analyzer and vectorizers always receive text instead of a float.
    """
    if not isinstance(cell, str):
        return ""
    return " ".join(ast.literal_eval(cell))


df['reviewText'] = df['reviewText'].apply(_tokens_to_text)

df['reviewText'].head()

0                                                issue
1    purchased device worked advertised never much ...
2    work expected higher capacity think made bit e...
3    think worked gb card went south one held prett...
4    bought retail packaging arrived legit envelope...
Name: reviewText, dtype: object

In [6]:
import nltk
# Download the VADER lexicon resource used by NLTK's SentimentIntensityAnalyzer.
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\ramiz\AppData\Roaming\nltk_data...


True

In [10]:
# Build the analyzer once; the following cells reuse `sia`.
sia = SentimentIntensityAnalyzer()

# Sanity check on an obviously positive sentence.
positive_example = "I love this product!"
sia.polarity_scores(positive_example)

{'neg': 0.0, 'neu': 0.308, 'pos': 0.692, 'compound': 0.6696}

In [11]:
# Counterpart check on an obviously negative sentence.
negative_example = "I hate this product!"
sia.polarity_scores(negative_example)

{'neg': 0.666, 'neu': 0.334, 'pos': 0.0, 'compound': -0.6114}

In [12]:
def _compound_score(text):
    """Return the 'compound' entry of the analyzer's polarity scores for ``text``."""
    return sia.polarity_scores(text)['compound']


# One compound score per review, stored next to the text.
df['polarity_scores'] = df['reviewText'].map(_compound_score)

In [14]:
# Compare the star rating (`overall`) with the computed compound score.
df.loc[:, ['reviewText', 'overall', 'polarity_scores']]

Unnamed: 0,reviewText,overall,polarity_scores
0,issue,4.00,0.00
1,purchased device worked advertised never much ...,5.00,0.00
2,work expected higher capacity think made bit e...,4.00,0.40
3,think worked gb card went south one held prett...,5.00,0.65
4,bought retail packaging arrived legit envelope...,5.00,0.86
...,...,...,...
4910,bought sandisk gb class use htc inspire month ...,1.00,0.08
4911,used capability samsung galaxy note greatly ex...,5.00,0.18
4912,great card fast reliable come optional adapter...,5.00,0.85
4913,good amount space stuff want fit gopro say,5.00,0.69


### Feature Engineering

In [17]:
# Derive the label from the compound score already stored in `polarity_scores`
# rather than running the analyzer over every review a second time.
# Strictly positive compound -> 'pos'; zero or negative -> 'neg'
# (so neutral reviews with a score of exactly 0 are labelled 'neg').
df['sentiment_label'] = np.where(df['polarity_scores'] > 0, 'pos', 'neg')

In [18]:
# Inspect the text, its compound score and the label derived from it.
df.loc[:, ['reviewText', 'polarity_scores', 'sentiment_label']]

Unnamed: 0,reviewText,polarity_scores,sentiment_label
0,issue,0.00,neg
1,purchased device worked advertised never much ...,0.00,neg
2,work expected higher capacity think made bit e...,0.40,pos
3,think worked gb card went south one held prett...,0.65,pos
4,bought retail packaging arrived legit envelope...,0.86,pos
...,...,...,...
4910,bought sandisk gb class use htc inspire month ...,0.08,pos
4911,used capability samsung galaxy note greatly ex...,0.18,pos
4912,great card fast reliable come optional adapter...,0.85,pos
4913,good amount space stuff want fit gopro say,0.69,pos


In [19]:
# Class balance of the derived labels.
sentiment_labels = df['sentiment_label']
sentiment_labels.value_counts()

sentiment_label
pos    3944
neg     971
Name: count, dtype: int64

In [20]:
# Average star rating within each sentiment label.
ratings_by_label = df.groupby('sentiment_label')['overall']
ratings_by_label.mean()

sentiment_label
neg   4.09
pos   4.71
Name: overall, dtype: float64

### Count Vectors

In [21]:
# Replace the string labels with integer codes. LabelEncoder assigns codes in
# sorted class order, so with the 'neg'/'pos' labels: 'neg' -> 0, 'pos' -> 1.
label_encoder = LabelEncoder()
df['sentiment_label'] = label_encoder.fit_transform(df['sentiment_label'])

In [22]:
# Model inputs: review text as the feature, encoded sentiment label as the target.
X, y = df['reviewText'], df['sentiment_label']

In [23]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-words features: learn the vocabulary from all reviews and build the
# document-term count matrix in one step.
vectorizer = CountVectorizer()
X_count = vectorizer.fit_transform(X)

In [26]:
# Peek at the first ten terms of the learned vocabulary.
vectorizer.get_feature_names_out()[:10]

array(['ability', 'able', 'absolute', 'absolutely', 'abuse', 'accept',
       'acceptable', 'accepted', 'accepting', 'accepts'], dtype=object)

### TF-IDF

In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Word-level TF-IDF features; the fitted vectorizer is kept so new text can be
# transformed into the same feature space later.
# NOTE(review): the vectorizer is fit on every review before the train/test
# split below, so the IDF weights are computed with the test documents included.
tf_idf_word_vectorizer = TfidfVectorizer()
X_tf_idf_word = tf_idf_word_vectorizer.fit_transform(X)

### Sentiment Modelling

In [30]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [31]:
# Hold out 20% of the reviews, stratified on the label so both splits keep the
# same class proportions; the fixed seed makes the split repeatable.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X_tf_idf_word, y, **split_options)

In [40]:
# Fit the classifier used by the prediction cells on the training split.
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

# 5-fold accuracy over the full TF-IDF matrix. cross_val_score fits clones of
# the estimator per fold, so this score does not come from the fit above.
log_reg_cv_accuracy = cross_val_score(log_reg, X_tf_idf_word, y, cv=5, scoring='accuracy')
log_reg_cv_accuracy.mean()

np.float64(0.8299084435401831)

In [41]:
new_review = ["This product is great! I love it."]

# Reuse the vectorizer that produced the training features instead of fitting a
# fresh TfidfVectorizer on the corpus: this guarantees the same vocabulary and
# IDF weights the model was trained with and avoids a redundant full refit.
new_review_vectorized = tf_idf_word_vectorizer.transform(new_review)

In [42]:
# Predict the encoded label for the hand-written review
# (presumably 1 = 'pos', 0 = 'neg', following the label encoding step).
log_reg.predict(new_review_vectorized)

array([1])

In [43]:
# Draw one review from the corpus. The seed makes the pick (and therefore the
# prediction in the next cell) reproducible on a fresh kernel run.
random_review = pd.Series(df['reviewText'].sample(1, random_state=42).values)
random_review

0    easy install good performance used phone photo...
dtype: object

In [44]:
# Transform with the vectorizer the model was trained on rather than refitting
# a new TfidfVectorizer on the whole corpus for a single prediction.
log_reg.predict(tf_idf_word_vectorizer.transform(random_review))

array([1])

### Random Forest

In [45]:
# Base estimator for the search; the seed keeps the forests repeatable.
rf_model = RandomForestClassifier(random_state=17)

# Hyper-parameter grid: 3 * 2 * 3 * 3 = 54 candidate combinations.
rf_params = dict(
    max_depth=[5, 8, None],
    n_estimators=[100, 200],
    min_samples_split=[2, 5, 8],
    max_features=[3, 5, 7],
)

# Exhaustive 5-fold search on the training split, using all available cores.
grid_search = GridSearchCV(rf_model, rf_params, cv=5, n_jobs=-1, verbose=True)
rf_best_grid = grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 54 candidates, totalling 270 fits


In [46]:
# Parameter combination selected by the grid search.
rf_best_grid.best_params_

{'max_depth': None,
 'max_features': 7,
 'min_samples_split': 2,
 'n_estimators': 100}

In [47]:
# Refit a forest with the selected hyper-parameters on the training split.
# The seed is passed explicitly with the same value the search used: the
# tuned parameters do not include `random_state`, and without it the final
# model and its cross-validation score would change from run to run.
rf_final = RandomForestClassifier(**rf_best_grid.best_params_, random_state=17).fit(X_train, y_train)

In [50]:
# 5-fold accuracy of the tuned forest over the full TF-IDF matrix.
rf_cv_accuracy = cross_val_score(rf_final, X_tf_idf_word, y, cv=5, scoring='accuracy')
rf_cv_accuracy.mean()

np.float64(0.8115971515768058)