# Sentiment analysis

IMPORTS

In [1]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.corpus import wordnet
import pandas as pd
import numpy as np
import nltk
import re

nltk.download('averaged_perceptron_tagger') 

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/ambrose/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

LOADING THE DATASET

In [2]:
columns_to_load = ['categories', 'name','reviews.rating' , 'reviews.text']
df = pd.read_csv('1429_1.csv', usecols=columns_to_load)

print(df.dtypes)
df.head()

name               object
categories         object
reviews.rating    float64
reviews.text       object
dtype: object


  df = pd.read_csv('1429_1.csv', usecols=columns_to_load)


Unnamed: 0,name,categories,reviews.rating,reviews.text
0,"All-New Fire HD 8 Tablet, 8 HD Display, Wi-Fi,...","Electronics,iPad & Tablets,All Tablets,Fire Ta...",5.0,This product so far has not disappointed. My c...
1,"All-New Fire HD 8 Tablet, 8 HD Display, Wi-Fi,...","Electronics,iPad & Tablets,All Tablets,Fire Ta...",5.0,great for beginner or experienced person. Boug...
2,"All-New Fire HD 8 Tablet, 8 HD Display, Wi-Fi,...","Electronics,iPad & Tablets,All Tablets,Fire Ta...",5.0,Inexpensive tablet for him to use and learn on...
3,"All-New Fire HD 8 Tablet, 8 HD Display, Wi-Fi,...","Electronics,iPad & Tablets,All Tablets,Fire Ta...",4.0,I've had my Fire HD 8 two weeks now and I love...
4,"All-New Fire HD 8 Tablet, 8 HD Display, Wi-Fi,...","Electronics,iPad & Tablets,All Tablets,Fire Ta...",5.0,I bought this for my grand daughter when she c...


## Preprocessing

In [3]:
#Nans
print(df.isnull().sum())

#dropping rows with Nan
df = df.dropna()

#checking for duplicates
df.duplicated().sum()

#dropping rows with duplicates
df = df.drop_duplicates()


#checking results
print(df.isnull().sum())
print(df.duplicated().sum())

print(df.head())

name              6760
categories           0
reviews.rating      33
reviews.text         1
dtype: int64
name              0
categories        0
reviews.rating    0
reviews.text      0
dtype: int64
0
                                                name  \
0  All-New Fire HD 8 Tablet, 8 HD Display, Wi-Fi,...   
1  All-New Fire HD 8 Tablet, 8 HD Display, Wi-Fi,...   
2  All-New Fire HD 8 Tablet, 8 HD Display, Wi-Fi,...   
3  All-New Fire HD 8 Tablet, 8 HD Display, Wi-Fi,...   
4  All-New Fire HD 8 Tablet, 8 HD Display, Wi-Fi,...   

                                          categories  reviews.rating  \
0  Electronics,iPad & Tablets,All Tablets,Fire Ta...             5.0   
1  Electronics,iPad & Tablets,All Tablets,Fire Ta...             5.0   
2  Electronics,iPad & Tablets,All Tablets,Fire Ta...             5.0   
3  Electronics,iPad & Tablets,All Tablets,Fire Ta...             4.0   
4  Electronics,iPad & Tablets,All Tablets,Fire Ta...             5.0   

                              

In [5]:
#normalizing the star rating column to later concat with the tf-idf matrix
scaler = MinMaxScaler()

# Fit and transform the 'reviews.rating' column
scaled_ratings =  scaler.fit_transform(df[['reviews.rating']])

In [6]:
rvs = df[['reviews.text']]

#remove non-alphanumeric characters and convert to lowercase
def clean_text(text):
    return (re.sub(r'[^a-zA-Z0-9 ]', '', text)).lower()

rvs['reviews.re'] = rvs['reviews.text'].apply(clean_text)

#loading stopwords
stopwords = set(stopwords.words('english'))

#removing stopwords
def remove_stopwords(text):
    return ' '.join(word for word in text.split() if word not in stopwords)

rvs['reviews.nostpwrds'] = rvs['reviews.re'].apply(remove_stopwords)

#tokenising text
def tokenize_text(text):
    return text.split()

rvs['tokens'] = rvs['reviews.nostpwrds'].apply(tokenize_text)

rvs.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rvs['reviews.re'] = rvs['reviews.text'].apply(clean_text)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rvs['reviews.nostpwrds'] = rvs['reviews.re'].apply(remove_stopwords)


Unnamed: 0,reviews.text,reviews.re,reviews.nostpwrds,tokens
0,This product so far has not disappointed. My c...,this product so far has not disappointed my ch...,product far disappointed children love use lik...,"[product, far, disappointed, children, love, u..."
1,great for beginner or experienced person. Boug...,great for beginner or experienced person bough...,great beginner experienced person bought gift ...,"[great, beginner, experienced, person, bought,..."
2,Inexpensive tablet for him to use and learn on...,inexpensive tablet for him to use and learn on...,inexpensive tablet use learn step nabi thrille...,"[inexpensive, tablet, use, learn, step, nabi, ..."
3,I've had my Fire HD 8 two weeks now and I love...,ive had my fire hd 8 two weeks now and i love ...,ive fire hd 8 two weeks love tablet great valu...,"[ive, fire, hd, 8, two, weeks, love, tablet, g..."
4,I bought this for my grand daughter when she c...,i bought this for my grand daughter when she c...,bought grand daughter comes visit set user ent...,"[bought, grand, daughter, comes, visit, set, u..."


In [7]:
#initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

#function to get the POS tag for lemmatization
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADJ
    else:
        return wordnet.NOUN
    
#lemmatize tokens with POS tagging
def lemmatize_tokens(tokens):
    # POS tagging
    pos_tags = nltk.pos_tag(tokens)
    # Lemmatizing with the correct POS
    return [lemmatizer.lemmatize(token, get_wordnet_pos(pos)) for token, pos in pos_tags]

rvs['lemm.tokens'] = rvs['tokens'].apply(lemmatize_tokens)

#join lemmatized tokens back into a string for vectorization
rvs['lemm_tokens_joined'] = rvs['lemm.tokens'].apply(lambda tokens: ' '.join(tokens))

rvs.head()

Unnamed: 0,reviews.text,reviews.re,reviews.nostpwrds,tokens,lemm.tokens,lemm_tokens_joined
0,This product so far has not disappointed. My c...,this product so far has not disappointed my ch...,product far disappointed children love use lik...,"[product, far, disappointed, children, love, u...","[product, far, disappointed, child, love, use,...",product far disappointed child love use like a...
1,great for beginner or experienced person. Boug...,great for beginner or experienced person bough...,great beginner experienced person bought gift ...,"[great, beginner, experienced, person, bought,...","[great, beginner, experience, person, buy, gif...",great beginner experience person buy gift love
2,Inexpensive tablet for him to use and learn on...,inexpensive tablet for him to use and learn on...,inexpensive tablet use learn step nabi thrille...,"[inexpensive, tablet, use, learn, step, nabi, ...","[inexpensive, tablet, use, learn, step, nabi, ...",inexpensive tablet use learn step nabi thrill ...
3,I've had my Fire HD 8 two weeks now and I love...,ive had my fire hd 8 two weeks now and i love ...,ive fire hd 8 two weeks love tablet great valu...,"[ive, fire, hd, 8, two, weeks, love, tablet, g...","[ive, fire, hd, 8, two, week, love, tablet, gr...",ive fire hd 8 two week love tablet great value...
4,I bought this for my grand daughter when she c...,i bought this for my grand daughter when she c...,bought grand daughter comes visit set user ent...,"[bought, grand, daughter, comes, visit, set, u...","[buy, grand, daughter, come, visit, set, user,...",buy grand daughter come visit set user enter a...


## TRAINING A MODEL ON HALF THE DATASET

### Creating the alternative dataset

In [8]:
#preparing the labels

def sentiment_labels(rating):
    if rating >= 4:
        return 'positive'
    elif rating == 3:
        return 'neutral'
    else:
        return 'negative'
    

#I am going to pick 50% of the dataset for training and then use the trained model
# to predict the sentiment of the whole dataset.

#creating alternative dataset to split
training_data = pd.DataFrame()
training_data['lemm_tokens_joined'] = rvs['lemm_tokens_joined']
training_data['rating'] = df['reviews.rating']
training_data['label'] = df['reviews.rating']

#splitting the dataset in half
training_data = training_data.sample(frac=0.5, random_state=42)

#normalizing the star rating column to later concat it with the tf-idf matrix
scaled_ratings_train =  scaler.fit_transform(training_data[['label']])

#defining X
training_data['label'] = training_data['label'].apply(sentiment_labels)

training_data.head(30)

Unnamed: 0,lemm_tokens_joined,rating,label
6135,love reading kindle camera great,5.0,positive
11238,search internet great bonus device,5.0,positive
5578,light weight user friendly tablet everything e...,5.0,positive
25164,alexa pretty awesome sirius xm stream howard s...,5.0,positive
14348,perfect kid play read online wifi touch respon...,4.0,positive
26711,item useless play music voice recognition unle...,1.0,negative
16262,get 2 year old granddaughter christmas play ti...,5.0,positive
27853,good product,5.0,positive
26435,great product convenient easy set connect hue ...,5.0,positive
312,bought dad late 60 mainly listen music read eb...,4.0,positive


TF-IDF

In [9]:
#initializing tfidf
tfidf_vectorizer = TfidfVectorizer(min_df = 5,max_df =0.8, max_features=5000,ngram_range=(2,3))

tfidf_matrix_train = tfidf_vectorizer.fit_transform(training_data['lemm_tokens_joined'])

#combining the TF-IDF matrix and scaled star ratings
combined_features_train = np.hstack([tfidf_matrix_train.toarray(), scaled_ratings_train])

### Train and test split

In [10]:
X_train, X_test, y_train, y_test = train_test_split( combined_features_train, training_data[['label']], test_size=0.3, random_state=42)
print ('Train set:', X_train.shape,  y_train.shape)
print ('Test set:', X_test.shape,  y_test.shape)

Train set: (9753, 5001) (9753, 1)
Test set: (4181, 5001) (4181, 1)


Applying SMOTE

In [11]:
# Applying SMOTE to handle imbalance
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

#print("Before SMOTE:", X_train.shape, y_train.value_counts())
#print("After SMOTE:", X_resampled.shape, pd.Series(y_resampled).value_counts())



### Training and testing with Logistic Regression

In [12]:
#initializing Logistic Regression
logreg = LogisticRegression(
    solver='liblinear',
    penalty='l2',
    C=1.0,
    random_state=42
)

#Fitting the model
logreg.fit(X_resampled, y_resampled)

#Predicting on the test set
y_pred = logreg.predict(X_test)

  y = column_or_1d(y, warn=True)


### Evaluation

In [13]:
# Accuracy score
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Confusion Matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Accuracy: 1.00
Classification Report:
              precision    recall  f1-score   support

    negative       0.98      0.99      0.98        89
     neutral       0.98      0.97      0.98       186
    positive       1.00      1.00      1.00      3906

    accuracy                           1.00      4181
   macro avg       0.99      0.99      0.99      4181
weighted avg       1.00      1.00      1.00      4181

Confusion Matrix:
[[  88    1    0]
 [   2  181    3]
 [   0    2 3904]]


## PREDICTING ON THE WHOLE DATASET

TF-IDF

In [14]:
#applyinf TF-IDF
tfidf_matrix = tfidf_vectorizer.fit_transform(rvs['lemm_tokens_joined'])

#combining the TF-IDF matrix and scaled star ratings
combined_features = np.hstack([tfidf_matrix.toarray(), scaled_ratings])

Creating a clean dataframe to export

In [21]:
#initializing dataframe
reviews_sentiment = pd.DataFrame()

#adding columns
reviews_sentiment[['category', 'product_names', 'review', 'star_rating']] = df[['categories', 'name', 'reviews.text', 'reviews.rating']]

reviews_sentiment.head()

Unnamed: 0,category,product_names,review,star_rating
0,"Electronics,iPad & Tablets,All Tablets,Fire Ta...","All-New Fire HD 8 Tablet, 8 HD Display, Wi-Fi,...",This product so far has not disappointed. My c...,5.0
1,"Electronics,iPad & Tablets,All Tablets,Fire Ta...","All-New Fire HD 8 Tablet, 8 HD Display, Wi-Fi,...",great for beginner or experienced person. Boug...,5.0
2,"Electronics,iPad & Tablets,All Tablets,Fire Ta...","All-New Fire HD 8 Tablet, 8 HD Display, Wi-Fi,...",Inexpensive tablet for him to use and learn on...,5.0
3,"Electronics,iPad & Tablets,All Tablets,Fire Ta...","All-New Fire HD 8 Tablet, 8 HD Display, Wi-Fi,...",I've had my Fire HD 8 two weeks now and I love...,4.0
4,"Electronics,iPad & Tablets,All Tablets,Fire Ta...","All-New Fire HD 8 Tablet, 8 HD Display, Wi-Fi,...",I bought this for my grand daughter when she c...,5.0


Predicting with the trained Logistic Regression Model

In [22]:
#Predicting on the whole dataset
reviews_sentiment['sentiment'] = logreg.predict(combined_features)

Export CSV

In [23]:
reviews_sentiment.to_csv('reviews_sentiment.csv', index=False)
