In [None]:
# Importing Libraries:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

In [None]:
# Read the Amazon unlocked-mobile reviews CSV from the Kaggle input directory.
df = pd.read_csv('/kaggle/input/amazon-reviews-unlocked-mobile-phones/Amazon_Unlocked_Mobile.csv')

# Quick sanity report: number of rows, then the count of missing values per column.
n_rows = len(df)
missing_per_column = df.isna().sum()
print('Total Rows ==', n_rows)
print('-' * 80)
print('Missing values: ', '\n', missing_per_column)

#### Since we only need the Reviews and Rating columns, let's drop the rest of the columns:

In [None]:
# Discard the product metadata columns that the model does not use, then
# remove every row that still has a missing value in a remaining column.
metadata_columns = ['Product Name', 'Brand Name', 'Price', 'Review Votes']
df = df.drop(columns=metadata_columns).dropna()

### We will classify Rating into 2 classes:
1. Positive (1)
2. Negative (0)

### So, ratings of 1 and 2 will become 0 (negative) and ratings of 4 and 5 will become 1 (positive); rows with a rating of 3 are dropped.

In [None]:
# Build the binary sentiment label.
#   1. Keep only polarised reviews: ratings <= 2 or >= 4 (neutral 3-star rows are discarded).
#   2. Map 1-2 -> 0 (negative) and 4-5 -> 1 (positive).
# The mapping is applied to the Rating column only: a frame-wide replace would
# also scan the (large) review-text column and rewrite any cell there that
# happens to equal 1, 2, 4 or 5.
# NOTE: this cell rebinds `df`; re-running it without re-running the cells
# above would map the already-binarised 1s to 0.
df = (
    df[(df['Rating'] <= 2) | (df['Rating'] >= 4)]
    .assign(Rating=lambda kept: kept['Rating'].replace({1: 0, 2: 0, 4: 1, 5: 1}))
    .reset_index(drop=True)
)
df.head()

### We will be using the TF-IDF Vectorizer for the text.
It will tokenize the reviews, remove English stop words, and extract n-grams (from single words up to three-word phrases), all at once.

In [None]:
trans = TfidfVectorizer(stop_words='english', ngram_range=(1,3), max_features=100000)
X = trans.fit_transform(df['Reviews'])

In [None]:
X_train,X_test, y_train, y_test = train_test_split(X, df['Rating'], test_size=0.15,
                                                  random_state=101)

In [None]:
# Logistic-regression classifier; the iteration cap is raised to 10,000 so the
# solver has room to converge on the TF-IDF features.
lr = LogisticRegression(max_iter=10_000)
lr.fit(X=X_train, y=y_train)

In [None]:
# Predicted class labels (0 = negative, 1 = positive) for the held-out reviews.
prediction = lr.predict(X=X_test)

In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Per-class precision / recall / F1 on the held-out split, followed by the
# overall accuracy.
report = classification_report(y_test, prediction)
accuracy = accuracy_score(y_test, prediction)
print(report)
print('-' * 80)
print('Accuracy', accuracy)

### Good to go!