In [1]:
import pandas as pd
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score,confusion_matrix,recall_score,f1_score

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the reviews, then detach the target column: `pop` returns the
# 'Label' Series and removes it from the frame, leaving only the features.
reviews_df = pd.read_csv('Amazon_Reviews.csv')
y = reviews_df.pop('Label')

In [3]:
# Tokenizer that extracts runs of word characters, dropping punctuation.
regexp = RegexpTokenizer(r'\w+')

# The stop words are only consulted through `in` tests (once per token in
# `preprocessing`), so keep them in a set for constant-time lookups instead
# of scanning a list every time.
stopwords_en = set(stopwords.words('english'))

lemmatizer = WordNetLemmatizer()
vectorizer = TfidfVectorizer()

In [5]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    reviews_df,
    y,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Peek at the raw text of the training review whose index *label* is 0
# (a label lookup, not "first row of the shuffled training set").
vectorizer.decode(X_train.loc[0, 'Review'])

' Stuning even for the non-gamer: This sound track was beautiful! It paints the senery in your mind so well I would recomend it even to people who hate video game music! I have played the game Chrono Cross but out of all of the games I have ever played it has the best music! It backs away from crude keyboarding and takes a fresher step with grate guitars and soulful orchestras. It would impress anyone who cares to listen! ^_^'

In [7]:
def preprocessing(review):
    """Normalise one raw review into a space-separated string of lemmas.

    The text is split into word-character tokens with ``regexp``, each token
    is lower-cased, tokens found in ``stopwords_en`` are dropped, and the
    survivors are lemmatised as verbs (``pos='v'``) with ``lemmatizer``.

    Parameters
    ----------
    review : str
        Raw review text. Any non-string value (e.g. ``None`` or a float NaN
        standing in for a missing review) is treated as an empty document.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces; ``''`` when nothing
        survives or the input is not a string.
    """
    # Guard first so a missing value yields an empty document instead of
    # crashing inside the tokenizer.
    if not isinstance(review, str):
        return ''

    # Lower-case each token exactly once, then filter on the lowered form.
    lowered = (token.lower() for token in regexp.tokenize(review))
    kept = [token for token in lowered if token not in stopwords_en]

    return ' '.join(lemmatizer.lemmatize(token, pos='v') for token in kept)

In [8]:
# Build new frames with the cleaned text rather than assigning a column into
# the frames handed back by train_test_split; assigning into such a slice is
# the pattern that can raise pandas' SettingWithCopyWarning.
X_train = X_train.assign(Review=X_train['Review'].apply(preprocessing))
X_test = X_test.assign(Review=X_test['Review'].apply(preprocessing))

In [9]:
# Learn the vocabulary and IDF weights from the training reviews only ...
X_train_tfidf = vectorizer.fit_transform(X_train['Review'])

# ... and reuse that fitted vectorizer, without refitting, on the test reviews.
X_test_tfidf = vectorizer.transform(X_test['Review'])

In [10]:
# Fit a default-configured logistic regression on the TF-IDF training
# features (`fit` returns the estimator itself), then label the test set.
logreg = LogisticRegression().fit(X_train_tfidf, y_train)
logreg_pred = logreg.predict(X_test_tfidf)

In [11]:
# Confusion matrix for the logistic-regression predictions on the test set.
confusion_matrix(y_true=y_test, y_pred=logreg_pred)

array([[ 9,  6],
       [ 1, 24]], dtype=int64)

In [20]:
# Show the repr of the sparse TF-IDF training matrix (shape, dtype and
# number of stored elements).
X_train_tfidf

<159x2348 sparse matrix of type '<class 'numpy.float64'>'
	with 5522 stored elements in Compressed Sparse Row format>

In [22]:
# Expand both sparse TF-IDF matrices into dense NumPy arrays.
X_train_tfidf_converted, X_test_tfidf_converted = (
    sparse_matrix.toarray() for sparse_matrix in (X_train_tfidf, X_test_tfidf)
)

In [23]:
from sklearn.naive_bayes import GaussianNB

# Fit Gaussian naive Bayes on the dense arrays derived from the TF-IDF
# matrices.
gnb = GaussianNB()
gnb.fit(X_train_tfidf_converted, y_train)

# Keep the predictions in a variable (mirroring `logreg_pred`) so they can be
# scored against y_test later, then display them as the cell output.
gnb_pred = gnb.predict(X_test_tfidf_converted)
gnb_pred

array([1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0], dtype=int64)