In [222]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [223]:
cd '/content/drive/MyDrive/Interview/ZenDutyMLChallenge'

/content/drive/MyDrive/Interview/ZenDutyMLChallenge



> Task 1: Sentiment Analysis with NLP

> Subtasks
*   Preprocess the review texts for NLP analysis (tokenization, stemming, removing stop words)
*   Implement sentiment analysis to classify reviews into categories such as positive, neutral, and negative.
*   Evaluate the sentiment analysis model using accuracy, precision, recall, and F1-score.






**Subtask 1**

In [224]:
# Sub Task 1. Import the necessary libraries.
import pandas as pd
import nltk
from nltk import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

# Fetch the NLTK resources needed for preprocessing: the 'punkt' tokenizer
# data and the 'stopwords' corpus read by stopwords.words('english').
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [225]:
# Load the dataset. Later cells read its 'Review' and 'Rating' columns, plus
# 'battery life', 'camera quality', 'screen', 'performance' and 'price'.
df = pd.read_csv('zen_duty_challenge/review_data.csv')

In [226]:
# Function for preprocessing the text
# Built once: loading the stop-word list and constructing the stemmer on every
# call is wasted work when the function is applied row by row.
_STOP_WORDS = frozenset(stopwords.words('english'))
_STEMMER = PorterStemmer()


def preprocess_text(txt):
  """Tokenize a review, drop English stop words and stem what remains.

  Args:
    txt: The raw review text.

  Returns:
    A list of Porter stems, in the order the tokens appear in ``txt``.
    Punctuation tokens are not filtered here.
  """
  tokens = word_tokenize(txt)
  # The stop-word list is lowercase, so compare case-insensitively; otherwise
  # capitalised stop words such as "I" or "The" slip through and come out of
  # the stemmer as "i" / "the".
  kept = [token for token in tokens if token.lower() not in _STOP_WORDS]
  return [_STEMMER.stem(token) for token in kept]

In [227]:
# Apply it to dataframe: each review becomes the list returned by preprocess_text.
df['preprocessed_reviews'] = df['Review'].apply(preprocess_text)

In [228]:
# Some stopwords survive the first pass, so gather every distinct token that
# is still present across all reviews before repeating the clean-up.
unique_words = {
    token
    for review in df['preprocessed_reviews'].values
    for token in review
}

In [229]:
# The vocabulary is small, so the stopwords that are still left are listed by hand.
left_over_stopwords = ["'m", ",", ".", "i", "the"]


def remove_left_over_stopwords(tokens):
  """Return a new list holding the tokens that are not leftover stopwords."""
  kept = []
  for token in tokens:
    if token in left_over_stopwords:
      continue
    kept.append(token)
  return kept

In [230]:
# Apply it to dataframe, overwriting the column with the filtered token lists.
df['preprocessed_reviews'] = df['preprocessed_reviews'].apply(remove_left_over_stopwords)

In [231]:
# Preview the token lists after both clean-up passes.
df['preprocessed_reviews']

0       [batteri, life, good, camera, qualiti, quit, n...
1                                [overal, product, excel]
2       [batteri, life, disappoint, screen, disappoint...
3       [screen, satisfactori, perform, quit, nice, pr...
4         [screen, terribl, price, worst, higher, expect]
                              ...                        
9995    [batteri, life, perfect, camera, qualiti, perf...
9996                       [screen, okay, higher, expect]
9997    [camera, qualiti, outstand, screen, excel, per...
9998    [batteri, life, bad, camera, qualiti, bad, per...
9999    [batteri, life, satisfactori, camera, qualiti,...
Name: preprocessed_reviews, Length: 10000, dtype: object

**Subtask 2 & 3**:
Given that we have reviews together with their ratings, we can treat these tasks as supervised learning. For that, we first need to derive sentiment labels from the ratings.

In [232]:
# Import the libraries
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.linear_model import LogisticRegression

In [233]:
# Preparing the sentiments
def extract_sentiment_from_ratings(rating):
  """Map a numeric rating to a sentiment label.

  A rating of 2 or less is 'Negative', a rating of exactly 3 is 'Neutral',
  and any other rating is 'Positive'.
  """
  if rating <= 2:
    return 'Negative'
  if rating == 3:
    return 'Neutral'
  return 'Positive'

In [234]:
# Derive the sentiment label of every review from its rating.
df['sentiment'] = df['Rating'].apply(extract_sentiment_from_ratings)

In [241]:
# Preview the first few derived labels.
df['sentiment'].head()

0    Positive
1    Positive
2    Negative
3    Positive
4    Negative
Name: sentiment, dtype: object

In [235]:
# Train test split: 20% of the rows are held out, with a fixed seed so the split is repeatable.
# astype(str) turns each token list into its string representation, so the
# vectorizer in the next cell receives strings rather than lists.
X_train, X_test, y_train, y_test = train_test_split(df['preprocessed_reviews'].astype(str), df['sentiment'], test_size=0.2, random_state=42)

In [236]:
# Text Vectorizer
vectorizer = TfidfVectorizer()
# Learn the vocabulary and IDF weights from the training split only.
X_train_tfidf = vectorizer.fit_transform(X_train)
# Reuse the fitted vectorizer for the test split. Calling fit_transform here
# would re-learn vocabulary and IDF from the test data, so the columns would no
# longer be guaranteed to line up with the features the model is trained on.
X_test_tfidf = vectorizer.transform(X_test)

In [237]:
# Train the model. Let's try with simple logistic regression first, later optimize.
# The classifier is fitted on the TF-IDF features of the training split.
model = LogisticRegression(random_state=0).fit(X_train_tfidf,y_train)

In [238]:
# Evaluate the classifier on the held-out split.
predictions = model.predict(X_test_tfidf)

# Evaluating on the different metrics
accuracy = accuracy_score(y_test,predictions)
# For each class one will get a different precision, recall and f1_score,
# so these three results are arrays with one entry per sentiment class.
precision,recall,f1_score, support = precision_recall_fscore_support(y_test,predictions)

In [240]:
# Report overall accuracy, then the per-class precision, recall and F1 arrays.
print('Accuracy',accuracy)
print('Precision',precision)
print('Recall',recall)
print('F1-score',f1_score)

Accuracy 1.0
Precision [1. 1. 1.]
Recall [1. 1. 1.]
F1-score [1. 1. 1.]


> Task 2: Correlation Analysis between Sentiment Indicators and Ratings. Investigate the correlation between the presence of specific sentiment indicators in the reviews (e.g., mentions of "satisfied", "quality", etc.) and the product ratings.

> Subtasks
*   Identify key sentiment indicators and use NLP to detect their presence in each review.
*   Create binary features indicating the presence or absence of each sentiment indicator.
*   Perform correlation analysis to determine which sentiments are strongly associated with higher or lower ratings.


**Subtasks 1&2**

In [242]:
# Import the necessary libraries
import numpy as np

In [243]:
# First let's get a list of sentiments. In task 1 we stored it under unique words.
different_sentiments = ['averag','bad','disappoint','dread','excel','good','higher','mediocr',
                        'nice','okay','outstand','perfect','poor','satisfactori','satisfi','terribl','worst']


def _has_any_stem(tokens, *stems):
  """Return 1 if any element of ``tokens`` equals one of ``stems``, else 0.

  Shared by every ``detect_*`` function below so the matching logic lives in
  one place. ``any`` stops at the first match instead of scanning the
  remaining tokens.
  """
  return int(any(token in stems for token in tokens))


def detect_average(tokens):
  """Return 1 if the stem 'averag' is among ``tokens``, else 0."""
  return _has_any_stem(tokens, 'averag')


def detect_bad(tokens):
  """Return 1 if the stem 'bad' is among ``tokens``, else 0."""
  return _has_any_stem(tokens, 'bad')


def detect_disappointing(tokens):
  """Return 1 if the stem 'disappoint' is among ``tokens``, else 0."""
  return _has_any_stem(tokens, 'disappoint')


def detect_dread(tokens):
  """Return 1 if the stem 'dread' is among ``tokens``, else 0."""
  return _has_any_stem(tokens, 'dread')


def detect_excel(tokens):
  """Return 1 if the stem 'excel' is among ``tokens``, else 0."""
  return _has_any_stem(tokens, 'excel')


def detect_good(tokens):
  """Return 1 if the stem 'good' is among ``tokens``, else 0."""
  return _has_any_stem(tokens, 'good')


def detect_higher(tokens):
  """Return 1 if the stem 'higher' is among ``tokens``, else 0."""
  return _has_any_stem(tokens, 'higher')


def detect_mediocre(tokens):
  """Return 1 if the stem 'mediocr' is among ``tokens``, else 0."""
  return _has_any_stem(tokens, 'mediocr')


def detect_nice(tokens):
  """Return 1 if the stem 'nice' is among ``tokens``, else 0."""
  return _has_any_stem(tokens, 'nice')


def detect_okay(tokens):
  """Return 1 if the stem 'okay' is among ``tokens``, else 0."""
  return _has_any_stem(tokens, 'okay')


def detect_outstanding(tokens):
  """Return 1 if the stem 'outstand' is among ``tokens``, else 0."""
  return _has_any_stem(tokens, 'outstand')


def detect_perfect(tokens):
  """Return 1 if the stem 'perfect' is among ``tokens``, else 0."""
  return _has_any_stem(tokens, 'perfect')


def detect_poor(tokens):
  """Return 1 if the stem 'poor' is among ``tokens``, else 0."""
  return _has_any_stem(tokens, 'poor')


def detect_satisfactory(tokens):
  """Return 1 if the stem 'satisfactori' or 'satisfi' is among ``tokens``, else 0."""
  return _has_any_stem(tokens, 'satisfactori', 'satisfi')


def detect_terrible(tokens):
  """Return 1 if the stem 'terribl' is among ``tokens``, else 0."""
  return _has_any_stem(tokens, 'terribl')


def detect_worst(tokens):
  """Return 1 if the stem 'worst' is among ``tokens``, else 0."""
  return _has_any_stem(tokens, 'worst')

In [244]:
# Column name -> detector function. One binary indicator column is added to
# df per entry, in this order.
indicator_detectors = {
    'average': detect_average,
    'bad': detect_bad,
    'disappointing': detect_disappointing,
    'dread': detect_dread,
    'excel': detect_excel,
    'good': detect_good,
    'higher': detect_higher,
    'mediocre': detect_mediocre,
    'nice': detect_nice,
    'okay': detect_okay,
    'outstanding': detect_outstanding,
    'perfect': detect_perfect,
    'poor': detect_poor,
    'satisfactory': detect_satisfactory,
    'terrible': detect_terrible,
    'worst': detect_worst,
}
for column_name, detector in indicator_detectors.items():
  df[column_name] = df['preprocessed_reviews'].apply(detector)

**Subtask 3**

In [246]:
# For correlation, make binary flags for negative, positive and neutral sentiment. This will help to identify which is associated with higher or lower rating.
def detect_negative(token):
  """Return 1 if ``token`` equals 'Negative', else 0."""
  return 1 if token == 'Negative' else 0

def detect_positive(token):
  """Return 1 if ``token`` equals 'Positive', else 0."""
  return 1 if token == 'Positive' else 0

def detect_neutral(token):
  """Return 1 if ``token`` equals 'Neutral', else 0."""
  return 1 if token == 'Neutral' else 0

In [247]:
# One 0/1 column per sentiment class, derived from the 'sentiment' label.
df['negative'] = df['sentiment'].apply(detect_negative)
df['positive'] = df['sentiment'].apply(detect_positive)
df['neutral'] = df['sentiment'].apply(detect_neutral)

In [248]:
# Columns that take part in the correlation analysis: the sixteen sentiment
# indicator flags followed by the three sentiment-class flags. The copy keeps
# df_corr independent of df.
df_corr = df[[
    'average', 'bad', 'disappointing', 'dread',
    'excel', 'good', 'higher', 'mediocre',
    'nice', 'okay', 'outstanding', 'perfect',
    'poor', 'satisfactory', 'terrible', 'worst',
    'negative', 'positive', 'neutral',
]].copy()

In [250]:
# Show the first rows as a sample of the features; every column is a 0/1 flag.
df_corr.head()

Unnamed: 0,average,bad,disappointing,dread,excel,good,higher,mediocre,nice,okay,outstanding,perfect,poor,satisfactory,terrible,worst,negative,positive,neutral
0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,1,0
1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0
2,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0
3,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0
4,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,1,1,0,0


In [251]:
# Pairwise correlation between all flag columns (pandas' default method).
df_corr.corr()

Unnamed: 0,average,bad,disappointing,dread,excel,good,higher,mediocre,nice,okay,outstanding,perfect,poor,satisfactory,terrible,worst,negative,positive,neutral
average,1.0,-0.132463,-0.136863,-0.108438,-0.168852,-0.197697,0.432365,0.454478,-0.197077,0.484463,-0.164955,-0.167534,-0.132714,-0.424411,-0.113384,-0.106882,-0.245438,-0.43533,0.746859
bad,-0.132463,1.0,0.507834,-0.077419,-0.120551,-0.141144,0.31186,-0.129287,-0.140702,-0.131307,-0.117769,-0.11961,0.504802,-0.303006,-0.080949,-0.076308,0.539699,-0.310801,-0.17736
disappointing,-0.136863,0.507834,1.0,-0.07999,-0.124555,-0.145833,0.318526,-0.133582,-0.145376,-0.135668,-0.121681,-0.123583,0.494386,-0.31307,-0.083638,-0.078842,0.557626,-0.321125,-0.183251
dread,-0.108438,-0.077419,-0.07999,1.0,-0.098687,-0.115545,0.252414,-0.105838,-0.115183,-0.107492,-0.096409,-0.097916,-0.077566,-0.24805,0.518839,0.501346,0.441814,-0.254431,-0.145192
excel,-0.168852,-0.120551,-0.124555,-0.098687,1.0,-0.179919,-0.37708,-0.164805,-0.179356,-0.167379,0.467593,0.499133,-0.12078,0.387255,-0.103188,-0.097271,-0.223367,0.387872,-0.226083
good,-0.197697,-0.141144,-0.145833,-0.115545,-0.179919,1.0,-0.441495,-0.192957,0.414004,-0.195972,-0.175767,-0.178514,-0.141412,0.449458,-0.120815,-0.113887,-0.261524,0.454131,-0.264704
higher,0.432365,0.31186,0.318526,0.252414,-0.37708,-0.441495,1.0,0.426971,-0.440112,0.430581,-0.368377,-0.374135,0.311057,-0.947792,0.267145,0.248538,0.559324,-0.972175,0.56719
mediocre,0.454478,-0.129287,-0.133582,-0.105838,-0.164805,-0.192957,0.426971,1.0,-0.192353,0.469985,-0.161001,-0.163518,-0.129533,-0.414237,-0.110665,-0.104319,-0.239554,-0.424894,0.728955
nice,-0.197077,-0.140702,-0.145376,-0.115183,-0.179356,0.414004,-0.440112,-0.192353,1.0,-0.195358,-0.175216,-0.177955,-0.140969,0.447965,-0.120436,-0.11353,-0.260705,0.452708,-0.263875
okay,0.484463,-0.131307,-0.135668,-0.107492,-0.167379,-0.195972,0.430581,0.469985,-0.195358,1.0,-0.163516,-0.166072,-0.131556,-0.420708,-0.112394,-0.105949,-0.243296,-0.431531,0.740342


Highest Correlation Pairs

*   Average - Neutral
*   Bad - Negative
*   Satisfactory - Positive








In [252]:
# Save the correlation matrix for future use. df_corr itself holds the raw
# binary flags, so the matrix has to be computed with .corr() before writing.
df_corr.corr().to_csv('corellation_matrix.csv')

> Task 3: Predicting Ratings Using Machine Learning

> Subtasks
*   Leverage features from sentiment analysis and the presence of sentiment indicators to build a predictive model for product ratings.
*   Split the dataset into training and testing sets to validate the model's performance.
*   Assess the model's accuracy and explore areas for improvement.

Use the findings from sentiment and correlation analysis to identify key factors influencing customer satisfaction.


**Subtask 1,2 & 3.**

  

*   Preparing a new dataframe for these tasks.
*   In the correlation task I had already turned the sentiments into binary features, so I will be using those.
*   Will also be making use of presence or absence of sentiments.
*   All features are in binary format already
*   Making use of linear regression first and then, time permitting, other models.
*   For evaluation making use of mean squared error













In [253]:
# Import necessary libraries
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor

In [254]:
# Prepare the dataframe: start from the same nineteen binary columns that were
# used for the correlation analysis, copied so later additions do not touch
# df_corr.
# NOTE(review): 'negative', 'positive' and 'neutral' are computed from
# df['Rating'], which is also the prediction target in the split that follows,
# so these three features leak the label — confirm this is intended.
df_ratings_pred = df_corr[[
    'average', 'bad', 'disappointing', 'dread',
    'excel', 'good', 'higher', 'mediocre',
    'nice', 'okay', 'outstanding', 'perfect',
    'poor', 'satisfactory', 'terrible', 'worst',
    'negative', 'positive', 'neutral',
]].copy()

In [255]:
# Train test split. The labels come from the original dataset, that is the ratings.
# 20% of the rows are held out; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(df_ratings_pred, df['Rating'], test_size=0.2, random_state=42)

In [256]:
# Linear Regression: fit on the training split, predict the held-out ratings
# and display the mean squared error as the cell's output.
reg_model = LinearRegression().fit(X_train, y_train)
reg_predictions = reg_model.predict(X_test)
mean_squared_error(y_test,reg_predictions)

0.02467288464680314

Improvement 1: Making use of features like battery_life, camera_quality, etc. from the original dataset. This gives a very slight improvement in mean squared error.

In [257]:
# Features from the original data, copied over under snake_case column names
# (new column name -> source column in df).
original_feature_columns = {
    'battery_life': 'battery life',
    'camera_quality': 'camera quality',
    'screen': 'screen',
    'performance': 'performance',
    'price': 'price',
}
for target_column, source_column in original_feature_columns.items():
  df_ratings_pred[target_column] = df[source_column]

In [258]:
# Train test split. The labels come from the original dataset, that is the ratings.
# Re-run because df_ratings_pred has gained columns; same seed and test size as before.
X_train, X_test, y_train, y_test = train_test_split(df_ratings_pred, df['Rating'], test_size=0.2, random_state=42)

In [259]:
# Linear Regression refitted on the extended feature set; the cell's output is
# the mean squared error on the held-out split.
reg_model = LinearRegression().fit(X_train, y_train)
reg_predictions = reg_model.predict(X_test)
mean_squared_error(y_test,reg_predictions)

0.024601937517523764

Improvement 2: Making use of a different model like decision tree regressor.

In [260]:
# Decision Tree Regressor. Gives us a very good model.
# random_state is fixed so that re-running the cell rebuilds the same tree:
# scikit-learn permutes the candidate features at every split, so ties between
# equally good splits can otherwise be broken differently from run to run.
reg_model = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)
reg_predictions = reg_model.predict(X_test)
mean_squared_error(y_test,reg_predictions)

0.0