In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re

In [2]:
data = pd.read_csv('../Resources/beauty_reviews.csv')
data

Unnamed: 0,reviewText,label
0,its up to you but as per the poppyaustin.com s...,0
1,I didn't have this item in my home for more th...,0
2,Description states there should be 144 tattoos...,0
3,"Agreed, this hair is nothing like what you act...",0
4,Not quite what I expected!,0
...,...,...
99995,This is very good for eczema and even more eff...,1
99996,Love the shimmer and a chap stick as well.,1
99997,Thanks!!!,1
99998,"Love it, and have been wearing this for 30 years",1


In [3]:
data['pre_process'] = data['reviewText'].apply(lambda x: ' '.join(x.lower() for x in str(x).split()))

In [4]:
from bs4 import BeautifulSoup
data['pre_process']=data['pre_process'].apply(lambda x: BeautifulSoup(x).get_text())

data['pre_process']=data['pre_process'].apply(lambda x: re.sub(r'http\S+', '', x))



In [5]:
def process_text(doc):
    sw = set(stopwords.words('english'))
    regex = re.compile("[^a-zA-Z ]")
    re_clean = regex.sub('', doc)
    lemmatizer = WordNetLemmatizer()
    lem = [lemmatizer.lemmatize(w) for w in word_tokenize(re_clean)]
    output = [word.lower() for word in lem if word.lower() not in sw]
    return ' '.join(output)
data['pre_process'] = data['pre_process'].apply(lambda x: process_text(x))

In [6]:
data

Unnamed: 0,reviewText,label,pre_process
0,its up to you but as per the poppyaustin.com s...,0,per poppyaustincom saysfor avoidance doubt pro...
1,I didn't have this item in my home for more th...,0,didnt item home two hour onoff button wouldnt ...
2,Description states there should be 144 tattoos...,0,description state tattoo received disappointed...
3,"Agreed, this hair is nothing like what you act...",0,agreed hair nothing like actually receive clea...
4,Not quite what I expected!,0,quite expected
...,...,...,...
99995,This is very good for eczema and even more eff...,1,good eczema even effective used shower skin st...
99996,Love the shimmer and a chap stick as well.,1,love shimmer chap stick well
99997,Thanks!!!,1,thanks
99998,"Love it, and have been wearing this for 30 years",1,love wearing year


In [7]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data['pre_process'], data['label'], test_size=0.25, random_state=30)
print(f'Train: {X_train.shape,y_train.shape}\nTest: {X_test.shape,y_test.shape}')

Train: ((75000,), (75000,))
Test: ((25000,), (25000,))


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer= TfidfVectorizer()
tf_X_train = vectorizer.fit_transform(X_train)
tf_X_test = vectorizer.transform(X_test)

In [11]:
tf_X_train[0].toarray().sum()

5.051536498556645

In [12]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(max_iter=1000,solver='saga')

In [13]:
clf.fit(tf_X_train,y_train)

LogisticRegression(max_iter=1000, solver='saga')

In [14]:
y_test_pred=clf.predict(tf_X_test)

In [15]:
from sklearn.metrics import classification_report
report = pd.DataFrame(classification_report(y_test_pred, 
                                        y_test, digits=2,
                                        output_dict=True)).T

report['support'] = report.support.apply(int)

report.style.background_gradient(cmap='RdYlGn',
                             subset=pd.IndexSlice['0':'9', :'f1-score'])

Unnamed: 0,precision,recall,f1-score,support
0,0.890665,0.880582,0.885595,12720
1,0.877737,0.888029,0.882853,12280
accuracy,0.88424,0.88424,0.88424,0
macro avg,0.884201,0.884306,0.884224,25000
weighted avg,0.884314,0.88424,0.884248,25000


In [16]:
from sklearn.svm import LinearSVC
clf = LinearSVC(random_state=0)

In [17]:
clf.fit(tf_X_train,y_train)

LinearSVC(random_state=0)

In [18]:
y_test_pred=clf.predict(tf_X_test)

In [19]:
from sklearn.metrics import classification_report
report = pd.DataFrame(classification_report(y_test_pred, 
                                        y_test, digits=2,
                                        output_dict=True)).T

report['support'] = report.support.apply(int)

report.style.background_gradient(cmap='RdYlGn',
                             subset=pd.IndexSlice['0':'9', :'f1-score'])

Unnamed: 0,precision,recall,f1-score,support
0,0.88645,0.880708,0.88357,12658
1,0.878461,0.884298,0.88137,12342
accuracy,0.88248,0.88248,0.88248,0
macro avg,0.882456,0.882503,0.88247,25000
weighted avg,0.882506,0.88248,0.882484,25000


In [22]:
new_string = "This product is not very bad or gross and I do not not like it at all"

processed_new = process_text(new_string)

tfidf_new = vectorizer.transform([processed_new])

new_predict = clf.predict(tfidf_new)
new_predict

array([0], dtype=int64)

In [21]:
new_string = "This product is very good and awesome and I am very happy with my purchase"

processed_new = process_text(new_string)

tfidf_new = vectorizer.transform([processed_new])

new_predict = clf.predict(tfidf_new)
new_predict

array([1], dtype=int64)