In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Data cleaning and Feature engineering

In [None]:
# Load the raw reviews CSV. The path points at the Kaggle input directory, so it
# only resolves inside a Kaggle runtime with this dataset attached.
df=pd.read_csv("/kaggle/input/womens-ecommerce-clothing-reviews/Womens Clothing E-Commerce Reviews.csv")

In [None]:
df.tail()  # peek at the last rows of the raw data

In [None]:
df.info()  # column dtypes and non-null counts before any cleaning

In [None]:
# Discard every row that has no review text.
df.dropna(subset=['Review Text'], inplace=True)

In [None]:
# Collect the index labels of rows whose review text or title is a string made
# up only of whitespace, i.e. a blank that was not parsed as NaN.
blanks = []

for i, review_text, title in df[['Review Text', 'Title']].itertuples():
    # itertuples() yields (index, 'Review Text', 'Title') in column order. The
    # previous loop only inspected the last field (the title), so a
    # whitespace-only review text was never detected.
    for value in (review_text, title):
        if isinstance(value, str) and value.isspace():  # NaN is a float and is skipped
            blanks.append(i)
            break  # record each row at most once

In [None]:
blanks  # display the collected labels; the author's run found no whitespace-only values standing in for NaN

In [None]:
df.info()  # re-inspect non-null counts after dropping rows without review text

In [None]:
#df[df['Rating']==3]

In [None]:
# A missing title becomes a single space so it can still be joined with the review text.
df['Title'] = df['Title'].fillna(" ")

In [None]:
df.info()  # 'Title' should no longer report missing values

In [None]:
#df[df['Division Name'].isna()]

In [None]:
# Replace missing divisions with the most frequent value. The result is assigned
# back to the column: calling fillna(inplace=True) on the selected column is a
# chained in-place update that pandas deprecates and that no longer modifies df
# under Copy-on-Write.
df['Division Name'] = df['Division Name'].fillna(df['Division Name'].mode()[0])

In [None]:
# Replace missing departments with the most frequent value. Assigning the result
# back avoids the chained fillna(inplace=True) call, which pandas deprecates and
# which does not update df under Copy-on-Write.
df['Department Name'] = df['Department Name'].fillna(df['Department Name'].mode()[0])

In [None]:
# Replace missing class names with the most frequent value. Assigning the result
# back avoids the chained fillna(inplace=True) call, which pandas deprecates and
# which does not update df under Copy-on-Write.
df['Class Name'] = df['Class Name'].fillna(df['Class Name'].mode()[0])

In [None]:
df.info()  # final check: the data types look fine and no null values should remain

In [None]:
df.head()  # preview the cleaned frame

In [None]:
# Build one text field per row: "<title> <review text>".
df['Title-Review Text'] = df['Title'].str.cat(df['Review Text'], sep=' ')

In [None]:
df.head(6)  # first six rows, now including the combined text column

In [None]:
# Remove the "Unnamed: 0" column (presumably the row number saved with the CSV - confirm).
df.drop(columns=["Unnamed: 0"], inplace=True)

In [None]:
def _sentiment_label(rating):
    """Bucket a rating: above 3 is positive, below 3 is negative, exactly 3 is neutral."""
    if rating > 3:
        return "positive"
    if rating < 3:
        return "negative"
    if rating == 3:
        return "neutral"
    return rating  # not comparable to 3 (e.g. NaN): leave the value untouched


df['Review'] = df['Rating'].apply(_sentiment_label)

In [None]:
df.head()  # confirm the new 'Review' sentiment column

In [None]:
# Order the rows by product and renumber them from 0 so labels match positions.
df = df.sort_values(by="Clothing ID").reset_index(drop=True)

In [None]:
# Distinct products found from row position 18111 onward (the rows later used as the test set).
df['Clothing ID'].iloc[18111:].unique()

# EDA (Exploratory Data Analysis)

In [None]:
from wordcloud import WordCloud, ImageColorGenerator, STOPWORDS
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [None]:
#import nltk
#nltk.download("stopwords")
#nltk.download('punkt')

In [None]:
"""
Reference from Ken Jee : https://github.com/PlayingNumbers/ds_salary_proj
"""
# Concatenate the combined title/review text of every positive review into one string.
is_positive = df['Review'] == "positive"
words = " ".join(df.loc[is_positive, 'Title-Review Text'])

def punctuation_stop(text):
    """Tokenize ``text`` and return its lower-cased, alphabetic, non-stop-word tokens.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    list of str
        Tokens in their original order, lower-cased, with English stop words
        and every token containing a non-alphabetic character removed.
    """
    stop_words = set(stopwords.words('english'))
    filtered = []
    for token in word_tokenize(text):
        lowered = token.lower()
        # Test the lower-cased form: the stop-word list is lower-case, so
        # checking the raw token let capitalised stop words ("The", "I") through.
        if token.isalpha() and lowered not in stop_words:
            filtered.append(lowered)
    return filtered


words_filtered = punctuation_stop(words)

# Re-join the cleaned tokens into the single string handed to WordCloud.
text = " ".join(words_filtered)

wc = WordCloud(
    background_color="white",
    random_state=1,
    stopwords=STOPWORDS,
    max_words=2000,
    width=1000,
    height=1500,
)
wc.generate(text)

# Render the cloud without axes.
plt.figure(figsize=[10, 10])
plt.imshow(wc, interpolation="bilinear")
plt.axis('off')
plt.show()

Words such as love, top, perfect and great dominate the positive reviews.

In [None]:
# Word cloud of the negative reviews: gather their text, clean it, then draw.
negative_text = df.loc[df['Review'] == "negative", 'Title-Review Text']
words = " ".join(negative_text)

words_filtered = punctuation_stop(words)
text = " ".join(words_filtered)

wc = WordCloud(
    background_color="white",
    random_state=1,
    stopwords=STOPWORDS,
    max_words=2000,
    width=1000,
    height=1500,
)
wc.generate(text)

plt.figure(figsize=[10, 10])
plt.imshow(wc, interpolation="bilinear")
plt.axis('off')
plt.show()

Here we can see words that read like complaints, about things such as fitting, material, look and size.

In [None]:
# Word cloud of the neutral reviews: gather their text, clean it, then draw.
neutral_text = df.loc[df['Review'] == "neutral", 'Title-Review Text']
words = " ".join(neutral_text)

words_filtered = punctuation_stop(words)
text = " ".join(words_filtered)

wc = WordCloud(
    background_color="white",
    random_state=1,
    stopwords=STOPWORDS,
    max_words=2000,
    width=1000,
    height=1500,
)
wc.generate(text)

plt.figure(figsize=[10, 10])
plt.imshow(wc, interpolation="bilinear")
plt.axis('off')
plt.show()

This looks like it contains both complaints and compliments.

In [None]:
# Reviewer age aggregated per review class.
sns.barplot(data=df, x="Review", y="Age")

It looks like, on average, people of every age have reviewed equally: the average reviewer age is about the same for each review class.

In [None]:
# "Recommended IND" aggregated per review class
# (presumably a 0/1 flag, so the bar height reads as a recommendation rate - confirm).
sns.barplot(data=df, x="Review", y="Recommended IND")

As expected, customers who leave positive reviews recommend the product most often, followed by neutral and then negative reviews.

In [None]:
# Positive feedback count aggregated per review class.
sns.barplot(data=df, x="Review", y="Positive Feedback Count")

We can see that the positive feedback count varies only slightly between the review classes.

# Data Preprocessing

In [None]:
# Make sure the index runs 0..n-1 so integer labels equal row positions in the steps that follow.
df.reset_index(inplace=True, drop=True)

In [None]:
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import re

# Normalise every combined title/review string: keep letters only, lower-case,
# drop English stop words and stem what is left with the Porter stemmer.
ps = PorterStemmer()

# Load the stop-word list once as a set. The previous comprehension called
# stopwords.words('english') for every token, rebuilding the list each time and
# scanning it linearly, which dominated the run time of this cell.
stop_words = set(stopwords.words('english'))
non_letters = re.compile('[^a-zA-Z]')

corpus = []
for raw_text in df['Title-Review Text']:
    tokens = non_letters.sub(' ', raw_text).lower().split()
    stemmed = [ps.stem(token) for token in tokens if token not in stop_words]
    corpus.append(' '.join(stemmed))

In [None]:
# Overwrite the raw combined text with its cleaned, stemmed version.
# Note: this makes the cell above non-repeatable without reloading the data.
df['Title-Review Text']=corpus

In [None]:
df.columns  # list the columns available at this point

In [None]:
# Encode the sentiment labels as integers: negative -> 0, neutral -> 1, positive -> 2.
label_codes = {"negative": 0, "neutral": 1, "positive": 2}
df['Review'] = df['Review'].apply(lambda label: label_codes.get(label, label))  # other values pass through

In [None]:
#df=df.sort_values("Clothing ID")
#df.reset_index(drop=True,inplace=True)

In [None]:
# Model inputs: the cleaned text as the feature column, the integer sentiment code as the target.
words, y = df['Title-Review Text'], df['Review']

In [None]:
#words

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Learn the vocabulary and IDF weights from the training rows only (the first
# 18111 rows, matching the positional train/test split used in this notebook),
# then transform every row. Fitting on the whole corpus, as before, leaked
# test-set term statistics into the features the models are evaluated on.
n_train_rows = 18111
tfidf_v = TfidfVectorizer(max_features=3000, ngram_range=(1, 3))
tfidf_v.fit(words.iloc[:n_train_rows])

# Convert the TF-IDF matrix to a dense NumPy array with one row per review.
words = tfidf_v.transform(words).toarray()

In [None]:
# Positional hold-out split: the first 18111 rows train the models, the rest test them.
split_at = 18111
X_train, X_test = words[:split_at], words[split_at:]
y_train = y.iloc[:split_at].to_numpy()
y_test = y.iloc[split_at:].to_numpy()

In [None]:
#pd.DataFrame(words).to_csv("words.csv")

# ML Algorithms

## Gaussian Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes fitted on the TF-IDF training features.
classifier = GaussianNB()
classifier.fit(X=X_train, y=y_train)

In [None]:
# Predict the sentiment code for every held-out row with the model fitted above.
y_pred=classifier.predict(X_test)

In [None]:
from sklearn.metrics import classification_report

# Per-class metrics for the Gaussian Naive Bayes predictions.
report = classification_report(y_test, y_pred)
print("Classification Report:\n ", report)

## Multinomial Naive Bayes

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes fitted on the TF-IDF training features.
classifier = MultinomialNB()
classifier.fit(X=X_train, y=y_train)

In [None]:
from sklearn.metrics import classification_report

# Score the held-out rows and print the per-class metrics for Multinomial Naive Bayes.
y_pred = classifier.predict(X_test)
report = classification_report(y_test, y_pred)
print("Classification Report:\n ", report)

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with class weights balanced against the label frequencies.
classifier = LogisticRegression(class_weight="balanced")
classifier.fit(X=X_train, y=y_train)

In [None]:
from sklearn.metrics import classification_report

# Score the held-out rows and print the per-class metrics for logistic regression.
y_pred = classifier.predict(X_test)
report = classification_report(y_test, y_pred)
print("Classification Report:\n ", report)

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with default hyper-parameters. The seed is fixed because the
# forest is built with random bootstrapping and feature sampling; without it the
# reported metrics change on every run and cannot be compared with the other models.
classifier = RandomForestClassifier(random_state=42)
classifier.fit(X_train, y_train)

In [None]:
from sklearn.metrics import classification_report

# Score the held-out rows and print the per-class metrics for the random forest.
y_pred = classifier.predict(X_test)
report = classification_report(y_test, y_pred)
print("Classification Report:\n ", report)

## XGBoost

In [None]:
from xgboost import XGBClassifier

# Gradient-boosted trees with the library's default settings.
classifier = XGBClassifier()
classifier.fit(X=X_train, y=y_train)

In [None]:
from sklearn.metrics import classification_report

# Score the held-out rows and print the per-class metrics for XGBoost.
y_pred = classifier.predict(X_test)
report = classification_report(y_test, y_pred)
print("Classification Report:\n ", report)

###### Logistic regression gives the best results for classes 0 (negative) and 1 (neutral), so we will use logistic regression as the final model

In [None]:
#from imblearn.combine import SMOTETomek

In [None]:
#smk = SMOTETomek(random_state=42)

In [None]:
#X_train_res,y_train_res=smk.fit_sample(X_train,y_train)

In [None]:
from sklearn.linear_model import LogisticRegression

# Refit the chosen model: logistic regression with balanced class weights.
classifier = LogisticRegression(class_weight="balanced")
classifier.fit(X=X_train, y=y_train)

In [None]:
from sklearn.metrics import classification_report

# Score the held-out rows with the final model and print its per-class metrics.
y_pred = classifier.predict(X_test)
report = classification_report(y_test, y_pred)
print("Classification Report:\n ", report)

In [None]:
#import pickle 
#pickle.dump(classifier,open("model.pkl","wb"))

## Creating Pipeline

In [None]:
#from sklearn.pipeline import Pipeline
#from sklearn.feature_extraction.text import TfidfVectorizer
#from sklearn.linear_model import LogisticRegression
#text_clf_nb = Pipeline([('tfidf', TfidfVectorizer(max_features=3000,ngram_range=(1,3))),
#                     ('clf', LogisticRegression(class_weight="balanced")),
#])
#text_clf_nb.fit(X_train,y_train)

In [None]:
#import pickle 
#pickle.dump(text_clf_nb,open("model1.pkl","wb"))

In [None]:
#text_clf_nb.predict(X_test)

In [None]:
#df=pd.read_csv("women-clothing.csv")
#load=pickle.load(open('model1.pkl','rb'))