## Importing neccessary dependencies

In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import gc
from tqdm.notebook import tqdm
tqdm.pandas()

from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('./air_conditioners_reviews.csv')
df.head()

Unnamed: 0,title,description,reviews,ratings
0,Voltas 1.5 Ton 3 Star Split Inverter AC - Whi...,,Review after 1 month.Best in performance!Pros-...,5
1,Voltas 1.5 Ton 3 Star Split Inverter AC - Whi...,,"Voltas - Damn good AC for the 10x10 room , it ...",5
2,Voltas 1.5 Ton 3 Star Split Inverter AC - Whi...,,I bought this one 15 days ago for 32850 rs wit...,4
3,Voltas 1.5 Ton 3 Star Split Inverter AC - Whi...,,I could say that voltas provides you best AC a...,4
4,Voltas 1.5 Ton 3 Star Split Inverter AC - Whi...,,Biggest Fraud I have ever encountered by Flipk...,1


##### Finding if our dataset consists of any duplicates and if any duplicates are found we will remove them

In [3]:
duplicate_rows = df[df.duplicated('reviews')]
duplicate_rows

Unnamed: 0,title,description,reviews,ratings


In [5]:
len(df['title'].unique())

23

In [6]:
df['ratings'].value_counts()

5    2852
4     829
1     774
3     264
2     157
Name: ratings, dtype: int64

In [7]:
df.drop(['title', 'description'], axis=1, inplace=True)
df.head()

Unnamed: 0,reviews,ratings
0,Review after 1 month.Best in performance!Pros-...,5
1,"Voltas - Damn good AC for the 10x10 room , it ...",5
2,I bought this one 15 days ago for 32850 rs wit...,4
3,I could say that voltas provides you best AC a...,4
4,Biggest Fraud I have ever encountered by Flipk...,1


### Text Processing

##### We are using NLP Techniques to clean our texts. For this we are using NLTK dependencies.

In [8]:
!pip install nltk





In [9]:
import re
import nltk
nltk.download('stopwords')
nltk.download('omw-1.4')
nltk.download('wordnet')

from nltk.corpus import stopwords
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import SmoothingFunction
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\shri0\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\shri0\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\shri0\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [10]:
mapping = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not",
"didn't": "did not", "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not",
"he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",
"I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would",
"i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would",
"it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam",
"mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have",
"mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock",
"oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have",
"she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is",
"should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as",
"this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would",
"there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have",
"they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have",
"wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are",
"we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",
"what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is",
"where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have",
"why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have",
"would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all",
"y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have",
"you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have",
"you're": "you are", "you've": "you have"}

In [11]:
StopWords = set(stopwords.words('english')) - set(['no','not'])
lemmatizer = WordNetLemmatizer() 

def clean_text(text):
    text = text.lower() #converting input to lowercase
    text = re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", text)
    text = re.sub(r'\([^)]*\)', '', text) #Removing punctuations and special characters.
    text = re.sub('"','', text) #Removing double quotes.
    text = ' '.join([mapping[t] if t in mapping else t for t in text.split(" ")]) #Replacing contractions.   
    text = re.sub(r"'s\b","",text) #Eliminating apostrophe.
    text = re.sub("[^a-zA-Z0-9]", " ", text) #Removing non-alphabetical & numerical characters
    text = ' '.join([word for word in text.split() if word not in StopWords]) #Removing stopwords.
    text = ' '.join([word for word in text.split() if len(word) >= 3]) #Removing very short words
    text = ' '.join([lemmatizer.lemmatize(word) for word in text.split()])
    
    return text

In [12]:
df.isnull().sum()

reviews    0
ratings    0
dtype: int64

In [13]:
# applying our function over the column of DataFrame

df["cleaned_reviews"] = df["reviews"].progress_apply(lambda text: clean_text(text))

  0%|          | 0/4876 [00:00<?, ?it/s]

In [14]:
df.head()

Unnamed: 0,reviews,ratings,cleaned_reviews
0,Review after 1 month.Best in performance!Pros-...,5,review monthbest performanceproslow noiseexcel...
1,"Voltas - Damn good AC for the 10x10 room , it ...",5,volta damn good 10x10 room get cold night swit...
2,I bought this one 15 days ago for 32850 rs wit...,4,bought one day ago 32850 400 stabilizer analys...
3,I could say that voltas provides you best AC a...,4,could say volta provides best price segment 30...
4,Biggest Fraud I have ever encountered by Flipk...,1,biggest fraud ever encountered flipkart volta ...


##### Labels :- Negatives as 0 and Positives as 1
##### Considering ratings from (1-3 as Negative) and (4-5 as Positive)

In [15]:
# We have considered doing Binary Classification of the reviews

df['bin_label'] = df['ratings'].map({1: 0, 2:0, 3:0, 4:1, 5:1})
df.head()

Unnamed: 0,reviews,ratings,cleaned_reviews,bin_label
0,Review after 1 month.Best in performance!Pros-...,5,review monthbest performanceproslow noiseexcel...,1
1,"Voltas - Damn good AC for the 10x10 room , it ...",5,volta damn good 10x10 room get cold night swit...,1
2,I bought this one 15 days ago for 32850 rs wit...,4,bought one day ago 32850 400 stabilizer analys...,1
3,I could say that voltas provides you best AC a...,4,could say volta provides best price segment 30...,1
4,Biggest Fraud I have ever encountered by Flipk...,1,biggest fraud ever encountered flipkart volta ...,0


In [16]:
df = df[['cleaned_reviews', 'bin_label']]
df.head()

Unnamed: 0,cleaned_reviews,bin_label
0,review monthbest performanceproslow noiseexcel...,1
1,volta damn good 10x10 room get cold night swit...,1
2,bought one day ago 32850 400 stabilizer analys...,1
3,could say volta provides best price segment 30...,1
4,biggest fraud ever encountered flipkart volta ...,0


In [20]:
df.shape

(4876, 2)

### Vectorizing the Texts and Upsampling the minority classes
##### Inorder to make predictions over the texts we need to convert the texts into a numeric form cause Machines only understand numbers, here we have used TF-IDF Vectorization technique which helps to create a Sparse Vector of Numbers for the corresponding text.
##### As we can see the negative classes are not even half of the Positive Classes, hence in order to get a balanced data we have used an Imbalanced Learning technique which is SMOTE which helps in the SAMPLING of the minority classes.

In [39]:
X_train, X_test, y_train, y_test = train_test_split(df['cleaned_reviews'], df['bin_label'], test_size=0.33, random_state=42)

In [43]:
type(X_train)

pandas.core.series.Series

In [44]:
X_train.shape

(3266,)

In [45]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

In [46]:
type(X_train)

scipy.sparse._csr.csr_matrix

In [47]:
X_train.shape

(3266, 4609)

In [19]:
from collections import Counter
Counter(y_train)

Counter({1: 2474, 0: 792})

In [21]:
### Implementing Imbalanced learning

from imblearn.over_sampling import SMOTE

smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [22]:
Counter(y_train_resampled)

Counter({1: 2474, 0: 2474})

### Training the Model
##### As of now we don't very huge data hence we are using Traditional Machine Learning Algorithms to make predictions. We have used Logistic Regression as we are doing only the Binary Classification of the Reviews.

In [23]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression()

In [24]:
clf.fit(X_train_resampled, y_train_resampled)

### Results
##### We have chosen Accuracy Score and Classification Report as our result metrics and we have got around 90% as Accuracy Score

In [25]:
y_pred = clf.predict(X_test)

In [30]:
from sklearn.metrics import classification_report, accuracy_score

print(classification_report(y_test, y_pred))
print("\n")
print(f"Acc: {accuracy_score(y_test, y_pred)}")

              precision    recall  f1-score   support

           0       0.81      0.82      0.81       403
           1       0.94      0.93      0.94      1207

    accuracy                           0.91      1610
   macro avg       0.87      0.88      0.88      1610
weighted avg       0.91      0.91      0.91      1610



Acc: 0.9055900621118013


In [27]:
y_pred[:10]

array([1, 1, 1, 1, 1, 1, 0, 0, 1, 1], dtype=int64)

In [28]:
y_test[:10]

1598    1
3945    1
2851    1
4610    1
3745    1
3879    1
501     0
2677    0
3203    1
2342    1
Name: bin_label, dtype: int64

In [32]:
X_test[0].shape

(1, 4609)

In [37]:
X_train

<3266x4609 sparse matrix of type '<class 'numpy.float64'>'
	with 27670 stored elements in Compressed Sparse Row format>

In [36]:
X_train_resampled

<4948x4609 sparse matrix of type '<class 'numpy.float64'>'
	with 53926 stored elements in Compressed Sparse Row format>

### Prediction
##### We have picked some random Reviews of a Product of both the classes and we saw some decent Results.

#### Positive Review prediction

In [34]:
sent = "Great phone in this price range. Excellent performance and good battery back up. Mediatek dimensity 700 performance works well in day to day normal usage. Android 12 is used in the phone with MiUi 13. My father liked it a lot. One disadvantage of phone is little heavy in hand."

In [35]:
sent = clean_text(sent)
sent

'great phone price range excellent performance good battery back mediatek dimensity 700 performance work well day day normal usage android used phone miui father liked lot one disadvantage phone little heavy hand'

In [54]:
s = pd.Series(sent)
s

0    great phone price range excellent performance ...
dtype: object

In [56]:
s_test = vectorizer.transform(s)

In [57]:
clf.predict(s_test)

array([1], dtype=int64)

#### Negative Review prediction

In [60]:
neg_sent = "I am posting this review after using phone about 2 months battery is very bad while using it doesn't working like that 5000 it's like 3000 .. Cameras in day light Are good but in low light it's not upto mark also in several conditions it doesn't take good photos... Processor is good but overall not a good phon .. Generate lots of heating.. Even in general usage I am not a gamer.. Jio 5g is working fine... Overall experience bad... Not satisfied"

In [61]:
neg_sent = clean_text(neg_sent)
neg_sent

'posting review using phone month battery bad using doesnt working like 5000 like 3000 camera day light good low light not upto mark also several condition doesnt take good photo processor good overall not good phon generate lot heating even general usage not gamer jio working fine overall experience bad not satisfied'

In [62]:
neg = pd.Series(neg_sent)
neg

0    posting review using phone month battery bad u...
dtype: object

In [63]:
neg_test = vectorizer.transform(neg)

In [64]:
clf.predict(neg_test)

array([0], dtype=int64)

### Saving Model and Vectorizer as Pickle
#### We have saved our trained classifier and vectorizer as PICKLE format for future use.

In [65]:
import pickle
pickle.dump(clf, open('log_reg.pkl', 'wb'))

In [66]:
pickle.dump(vectorizer, open('vectorizer.pkl', 'wb'))

In [67]:
pickled_model = pickle.load(open('log_reg.pkl', 'rb'))
pickled_model.predict(neg_test)

array([0], dtype=int64)