In [1]:
import pandas as pd
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score,confusion_matrix,recall_score,f1_score,classification_report

import warnings
# NOTE(review): with no category/module argument this silences *every* warning
# for the rest of the kernel session, including pandas chained-assignment and
# scikit-learn convergence warnings. Consider narrowing it once the noisy
# warning has been identified.
warnings.filterwarnings('ignore')

In [3]:
# Load the labelled reviews. The raw frame is kept untouched so this cell can
# be re-run safely: the previous in-place drop removed 'Label' from
# `reviews_df`, so a second execution failed on `reviews_df['Label']`.
reviews_raw = pd.read_csv('Amazon_Reviews.csv')

# Target vector (the 'Label' column) and the feature frame without it.
y = reviews_raw['Label']
reviews_df = reviews_raw.drop(columns='Label')

# Last expression of the cell: rendered as the cell output.
reviews_df

Unnamed: 0,Review
0,Stuning even for the non-gamer: This sound tr...
1,The best soundtrack ever to anything.: I'm re...
2,Amazing!: This soundtrack is my favorite musi...
3,Excellent Soundtrack: I truly like this sound...
4,"Remember, Pull Your Jaw Off The Floor After H..."
...,...
194,A Book That Is Worth a Second Look: This book...
195,Best game ever: This games makes even amazing...
196,Guitar in Absentia: With all due respect to a...
197,Stiff and Smells like drying paint: You get w...


In [10]:
# Shared text-processing objects used by `preprocessing` and the TF-IDF cells.

# Tokenizer that keeps runs of word characters (punctuation is discarded).
regexp = RegexpTokenizer(r'\w+')

# English stop words, stored as a set: `preprocessing` tests every token for
# membership, which is a hash lookup on a set but a linear scan on the list
# that `stopwords.words` returns.
stopwords_en = set(stopwords.words('english'))

lemmatizer = WordNetLemmatizer()
vectorizer = TfidfVectorizer()

In [12]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    reviews_df,
    y,
    test_size=0.2,
    random_state=42,
)

In [13]:
def preprocessing(review):
    """Normalize one raw review into a space-joined string of lemmatized tokens.

    The text is tokenized with the notebook-level ``regexp`` tokenizer,
    lowercased, filtered against ``stopwords_en`` and each remaining token is
    lemmatized as a verb with ``lemmatizer``.

    Parameters
    ----------
    review : str
        Raw review text. Missing values (``None`` / NaN, as pandas produces
        for empty CSV cells) yield an empty string instead of raising; any
        other non-string scalar is converted with ``str()`` first.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces ('' when nothing is left).
    """
    if not isinstance(review, str):
        # Empty CSV cells arrive as NaN floats, which the tokenizer cannot handle.
        if pd.isna(review):
            return ''
        review = str(review)

    # Lowercase once per token, then drop stop words.
    lowered = (token.lower() for token in regexp.tokenize(review))
    kept = [token for token in lowered if token not in stopwords_en]

    # pos='v' treats every token as a verb when lemmatizing.
    return ' '.join(lemmatizer.lemmatize(token, pos='v') for token in kept)

In [14]:
# Clean the review text in both partitions. `assign` builds a new frame rather
# than writing a column into the frames returned by train_test_split, which is
# the chained-assignment pattern pandas warns about (SettingWithCopyWarning).
X_train = X_train.assign(Review=X_train['Review'].apply(preprocessing))
X_test = X_test.assign(Review=X_test['Review'].apply(preprocessing))

In [15]:
# The vectorizer is fitted on the training reviews only; the test reviews are
# then transformed with that already-fitted vectorizer.
X_train_tfidf = vectorizer.fit_transform(X_train['Review'])
X_test_tfidf = vectorizer.transform(X_test['Review'])

In [16]:
# Train a logistic-regression classifier (default hyper-parameters) on the
# TF-IDF features; `fit` returns the estimator itself, so it can be chained.
logreg = LogisticRegression().fit(X_train_tfidf, y_train)

# Predicted labels for the held-out reviews.
logreg_pred = logreg.predict(X_test_tfidf)

In [17]:
# Rows are the true labels, columns the predicted labels.
confusion_matrix(y_true=y_test, y_pred=logreg_pred)

array([[ 9,  6],
       [ 1, 24]], dtype=int64)

In [21]:
# The report is a multi-line string, so it is printed rather than shown as a
# bare expression.
print(classification_report(y_true=y_test, y_pred=logreg_pred))

              precision    recall  f1-score   support

           0       0.90      0.60      0.72        15
           1       0.80      0.96      0.87        25

    accuracy                           0.82        40
   macro avg       0.85      0.78      0.80        40
weighted avg       0.84      0.82      0.82        40

