## Loading basic libraries

In [1]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings(action="ignore")

## Loading Dataset

In [2]:
data=pd.read_csv("Downloads/review.csv")
data.head()

Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
0,0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates
1,1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses
2,2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses
3,3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants
4,4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses


## Data Pre-Processing
    Merging all text reviews into one feature
    Converting all 5 Star based rating into binary 0 and 1

In [3]:
df=data.copy()
df["Review"]=(df["Title"].map(str).replace("nan","")+df["Review Text"].map(str)).apply(lambda row : row.strip())
df["Rating"]=[1 if rating>3 else 0 for rating in df["Rating"]]
df=df[["Review","Rating"]]
df.head()

Unnamed: 0,Review,Rating
0,Absolutely wonderful - silky and sexy and comf...,1
1,Love this dress! it's sooo pretty. i happene...,1
2,Some major design flawsI had such high hopes f...,0
3,"My favorite buy!I love, love, love this jumpsu...",1
4,Flattering shirtThis shirt is very flattering ...,1


## Dropping the records without review

In [4]:
df=df[df["Review"]!=""]
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 23486 entries, 0 to 23485
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Review  23486 non-null  object
 1   Rating  23486 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 550.5+ KB


## Checking class imbalance

In [5]:
df["Rating"].value_counts()

1    18208
0     5278
Name: Rating, dtype: int64

## Separating train and test data

In [6]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(df["Review"],df["Rating"],random_state=42)
x_train.shape,x_test.shape

((17614,), (5872,))

## Checking class imbalance in x_train and x_test

In [7]:
from collections import Counter
Counter(y_train),Counter(y_test)

(Counter({1: 13703, 0: 3911}), Counter({1: 4505, 0: 1367}))

## CASE 1:Doing Classification using basic NLP count based features

In [8]:
import string

X_train=pd.DataFrame(x_train,columns=["Review"])
X_test=pd.DataFrame(x_test,columns=["Review"])

#creating new features in x_train dataset

X_train["char_count"]=X_train["Review"].apply(len)
X_train["word_count"]=X_train["Review"].apply(lambda x:len(x.split()))
X_train["word_density"]=X_train["char_count"]/(X_train["word_count"]+1)
X_train["punctuation_count"]=X_train["Review"].apply(lambda x: len("".join(_ for _ in x if _ in string.punctuation)))
X_train["title_word_count"]=X_train['Review'].apply(lambda x: len([wrd for wrd in x.split() if wrd.istitle()]))
X_train["upper_case_word_count"]= X_train['Review'].apply(lambda x: len([wrd for wrd in x.split() if wrd.isupper()]))

# creating the same new features in x_test dataset

X_test['char_count'] = X_test['Review'].apply(len)
X_test['word_count'] = X_test['Review'].apply(lambda x: len(x.split()))
X_test['word_density'] = X_test['char_count'] / (X_test['word_count']+1)
X_test['punctuation_count'] = X_test['Review'].apply(lambda x: len("".join(_ for _ in x if _ in string.punctuation))) 
X_test['title_word_count'] = X_test['Review'].apply(lambda x: len([wrd for wrd in x.split() if wrd.istitle()]))
X_test['upper_case_word_count'] = X_test['Review'].apply(lambda x: len([wrd for wrd in x.split() if wrd.isupper()]))

## Training a Logistic Regression model on this new set of features

In [9]:
from sklearn.linear_model import LogisticRegression
model=LogisticRegression(random_state=42,solver='liblinear')

In [10]:
model.fit(X_train.drop(columns=["Review"]),y_train)
y_pred=model.predict(X_test.drop(columns=["Review"]))

## Evaluating Model Performance

In [11]:
from sklearn.metrics import confusion_matrix,classification_report
print(classification_report(y_test,y_pred))
cm=confusion_matrix(y_test,y_pred)
pd.DataFrame(cm,columns=["0","1"])

              precision    recall  f1-score   support

           0       0.00      0.00      0.00      1367
           1       0.77      1.00      0.87      4505

    accuracy                           0.77      5872
   macro avg       0.38      0.50      0.43      5872
weighted avg       0.59      0.77      0.67      5872



Unnamed: 0,0,1
0,0,1367
1,0,4505


In [12]:
# not a good result as all are being classified as class 1 so we need some new features to learn.As currently our model
# is not learning anything from features and is classifying every product as good

## CASE 2:Taking advantage of text sentiment to create some new features(sentiment analysis)

In [13]:
import textblob

#creating sentiment based features for X_train dataset

x_train_snt_obj=X_train["Review"].apply(lambda data:textblob.TextBlob(data).sentiment)
X_train["Polarity"]=[obj.polarity for obj in x_train_snt_obj.values]
X_train["Subjectivity"]=[obj.subjectivity for obj in x_train_snt_obj.values]

#creating sentiment based features for X_test dataset

x_test_snt_obj=X_test["Review"].apply(lambda data:textblob.TextBlob(data).sentiment)
X_test["Polarity"]=[obj.polarity for obj in x_test_snt_obj.values]
X_test["Subjectivity"]=[obj.subjectivity for obj in x_test_snt_obj.values]

## Training Model on these new set of features

In [14]:
model.fit(X_train.drop(columns=["Review"]),y_train)
y_pred=model.predict(X_test.drop(columns=["Review"]))

## Evaluating Model Performance

In [15]:
print(classification_report(y_test,y_pred))
cm=confusion_matrix(y_test,y_pred)
pd.DataFrame(cm,columns=["0","1"])

              precision    recall  f1-score   support

           0       0.68      0.20      0.31      1367
           1       0.80      0.97      0.88      4505

    accuracy                           0.79      5872
   macro avg       0.74      0.59      0.60      5872
weighted avg       0.77      0.79      0.75      5872



Unnamed: 0,0,1
0,279,1088
1,133,4372


In [16]:
# Now we cam see that we are able to classify bad products with recall 20% and precision is also good at 68% for class 0
#lets see if we can do more better or not

## CASE 3:Using Bag of Words based features 

### Text Pre-Processing

In [17]:
import nltk
import contractions
import re

# removing some stop words to capture negations in n-grams
stop_words=nltk.corpus.stopwords.words("english")
stop_words.remove('no')
stop_words.remove('not')
stop_words.remove('but')

In [18]:
ps=nltk.porter.PorterStemmer()

#creating pre-processing function

def pre_processor(text):
    
    # convert to lower case
    text=str(text).lower()
    # fixing contractions
    text=contractions.fix(text)
    # removing unnecessary characters
    text=re.sub(r"[^a-zA-Z]"," ",text)
    text=re.sub(r"nbsp","",text)
    text=re.sub(" +"," ",text)
    # simple porter stemming
    text=" ".join([ps.stem(word) for word in text.split()])
    # removing stop words
    text=" ".join([word for word in text.split() if word not in stop_words])
    
    return text
processor=np.vectorize(pre_processor)   

In [19]:
X_train["clean_review"]=processor(X_train["Review"].values)
X_test["clean_review"]=processor(X_test["Review"].values)

In [20]:
X_train.head()

Unnamed: 0,Review,char_count,word_count,word_density,punctuation_count,title_word_count,upper_case_word_count,Polarity,Subjectivity,clean_review
22348,Perfect fitI am not a one-piece body type. thi...,264,50,5.176471,11,1,0,0.598661,0.467946,perfect fiti not one piec bodi type thi best b...
19754,Well qualityThis dress is made well. love the ...,180,35,5.0,4,1,0,0.14,0.32,well qualitythi dress made well love design co...
7628,Soft and flattering!This is a very flattering ...,520,104,4.952381,13,1,0,0.223873,0.485579,soft flatter thi veri flatter blous but note h...
10873,My new favoriteI love everything about this dr...,376,76,4.883117,7,1,0,0.446645,0.584986,new favoritei love everyth thi dress beauti vi...
11519,Doesn't look the same as pictureI was so excit...,532,105,5.018868,9,0,0,0.210417,0.497917,doe not look picturei wa excit thi coat saw ca...


### Separating structured features

In [21]:
X_train_metadata=X_train.drop(columns=["Review","clean_review"]).reset_index(drop=True)
X_test_metadata=X_test.drop(columns=["Review","clean_review"]).reset_index(drop=True)
X_train_metadata.head()

Unnamed: 0,char_count,word_count,word_density,punctuation_count,title_word_count,upper_case_word_count,Polarity,Subjectivity
0,264,50,5.176471,11,1,0,0.598661,0.467946
1,180,35,5.0,4,1,0,0.14,0.32
2,520,104,4.952381,13,1,0,0.223873,0.485579
3,376,76,4.883117,7,1,0,0.446645,0.584986
4,532,105,5.018868,9,0,0,0.210417,0.497917


### Creating BOW based features:1-grams

In [22]:
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer(min_df=0.0,max_df=1.0,ngram_range=(1,1))

In [23]:
x_train_cv=cv.fit_transform(X_train["clean_review"]).toarray()
x_traincv=pd.DataFrame(x_train_cv,columns=cv.get_feature_names())

In [24]:
x_testcv = cv.transform(X_test['clean_review']).toarray()
x_testcv = pd.DataFrame(x_testcv, columns=cv.get_feature_names())

In [25]:
x_traincv.head()

Unnamed: 0,aa,aaaaandidon,aaaaannnnnnd,aam,ab,abbey,abbi,abdomen,abercrombi,abhor,...,zippersi,zipperso,zipperthi,zippi,zoe,zombi,zone,zooland,zoom,zuma
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [26]:
x_train_comb=pd.concat([X_train_metadata,x_traincv],axis=1)
x_test_comb=pd.concat([X_test_metadata,x_testcv],axis=1)

In [27]:
x_test_comb.head()

Unnamed: 0,char_count,word_count,word_density,punctuation_count,title_word_count,upper_case_word_count,Polarity,Subjectivity,aa,aaaaandidon,...,zippersi,zipperso,zipperthi,zippi,zoe,zombi,zone,zooland,zoom,zuma
0,511,97,5.214286,30,1,0,0.094196,0.419048,0,0,...,0,0,0,0,0,0,0,0,0,0
1,108,20,5.142857,1,1,0,0.343333,0.585,0,0,...,0,0,0,0,0,0,0,0,0,0
2,81,14,5.4,2,0,0,0.33125,0.525,0,0,...,0,0,0,0,0,0,0,0,0,0
3,522,97,5.326531,15,1,0,-0.02803,0.490909,0,0,...,0,0,0,0,0,0,0,0,0,0
4,412,81,5.02439,17,1,0,0.275,0.569379,0,0,...,0,0,0,0,0,0,0,0,0,0


## Training Model

In [28]:
model.fit(x_train_comb,y_train)
y_pred=model.predict(x_test_comb)

## Evaluating Model Performance

In [29]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.78      0.67      0.72      1367
           1       0.90      0.94      0.92      4505

    accuracy                           0.88      5872
   macro avg       0.84      0.81      0.82      5872
weighted avg       0.87      0.88      0.88      5872



In [30]:
cm=confusion_matrix(y_test,y_pred)
pd.DataFrame(cm,columns=["0","1"])

Unnamed: 0,0,1
0,919,448
1,263,4242


In [31]:
#Now we can see our model is doing better than before f1 score is 72% for bad reviews and 92% for good reviews and 
#overall its 88% which is quite good