In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read the reviews CSV into a DataFrame
csv_path = '/content/amazon.csv'
data = pd.read_csv(csv_path)

In [3]:
# Preview the first five rows of the loaded frame
data.head(5)

Unnamed: 0,reviewText,Positive
0,This is a one of the best apps acording to a b...,1
1,This is a pretty good version of the game for ...,1
2,this is a really cool game. there are a bunch ...,1
3,"This is a silly game and can be frustrating, b...",1
4,This is a terrific game on any pad. Hrs of fun...,1


In [4]:
# Print a structural summary of the frame: index range, columns,
# non-null counts, dtypes and memory usage
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   reviewText  20000 non-null  object
 1   Positive    20000 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 312.6+ KB


In [5]:
# Count the missing values in each column
missing_per_column = data.isnull().sum()
missing_per_column

Unnamed: 0,0
reviewText,0
Positive,0


In [6]:
# Discard any row that lacks either the review text or its label
required_columns = ['reviewText', 'Positive']
data = data.dropna(subset=required_columns)

In [7]:
# Importing libraries for text preprocessing
import re
import nltk
# Fetch the NLTK stopword corpus so that stopwords.words('english') can load it
nltk.download('stopwords')
from nltk.corpus import stopwords
# Used in a later cell to split the data into training and testing sets
from sklearn.model_selection import train_test_split

# Defining text preprocessing function
_ENGLISH_STOPWORDS = None  # lazily-filled cache of the NLTK English stopword list


def _english_stopwords():
    """Return the NLTK English stopwords as a frozenset, loading them only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def preprocess_text(text, stop_words=None):
    """Normalize one review string for vectorization.

    Steps, in order: lowercase, strip punctuation, strip digit runs, then
    drop stopwords and re-join the remaining tokens with single spaces.

    Parameters
    ----------
    text : str
        Raw review text.
    stop_words : iterable of str, optional
        Words to remove. When omitted, the NLTK English stopword list is
        used; it is loaded once and cached instead of being re-read for
        every token.

    Returns
    -------
    str
        The cleaned text (may be empty).
    """
    # Set membership is O(1); the list is no longer rebuilt per word.
    stop_set = _english_stopwords() if stop_words is None else frozenset(stop_words)

    text = text.lower()  # Convert to lowercase
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\d+', '', text)  # Remove numbers
    # NOTE(review): punctuation is stripped before stopword filtering, so any
    # stopword entry that contains an apostrophe can never match a token here.
    # Confirm whether that is intended.
    return ' '.join(word for word in text.split() if word not in stop_set)

# Clean every review and store the result alongside the original text
raw_reviews = data['reviewText']
data['cleaned_reviewText'] = raw_reviews.apply(preprocess_text)

# Show raw and cleaned text side by side for the first few rows
preview_columns = ['reviewText', 'cleaned_reviewText']
data[preview_columns].head()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Unnamed: 0,reviewText,cleaned_reviewText
0,This is a one of the best apps acording to a b...,one best apps acording bunch people agree bomb...
1,This is a pretty good version of the game for ...,pretty good version game free lots different l...
2,this is a really cool game. there are a bunch ...,really cool game bunch levels find golden eggs...
3,"This is a silly game and can be frustrating, b...",silly game frustrating lots fun definitely rec...
4,This is a terrific game on any pad. Hrs of fun...,terrific game pad hrs fun grandkids love great...


In [8]:

# Features are the cleaned review text; the target is the Positive label
X, y = data['cleaned_reviewText'], data['Positive']

# Hold out 20% of the rows for testing, with a fixed seed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [9]:

from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer limited to at most 5000 features
max_vocabulary = 5000
tfidf = TfidfVectorizer(max_features=max_vocabulary)

# Fit on the training text only, then apply the fitted vectorizer to the test text
X_train_tfidf, X_test_tfidf = tfidf.fit_transform(X_train), tfidf.transform(X_test)

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

# Initializing models
log_reg = LogisticRegression()
# Seed the forest so its reported metrics are reproducible across kernel
# restarts; 42 matches the seed used for the train/test split.
random_forest = RandomForestClassifier(random_state=42)
svm = SVC()

# Dictionary to store the models, keyed by the display name used in later cells
models = {'Logistic Regression': log_reg, 'Random Forest': random_forest, 'SVM': svm}

In [11]:
# Fit every candidate model on the TF-IDF training matrix
for model_name in models:
    model = models[model_name]
    model.fit(X_train_tfidf, y_train)
    print(f"{model_name} trained successfully")

Logistic Regression trained successfully
Random Forest trained successfully
SVM trained successfully


In [12]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# Function to evaluate models
def evaluate_model(model, X_test, y_test):
    """Print accuracy, precision, recall, F1 and the confusion matrix for a model.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``
    X_test : feature matrix passed to ``model.predict``
    y_test : true labels compared against the predictions

    Returns
    -------
    None
        Results are printed, not returned.
    """
    predictions = model.predict(X_test)

    scorers = (
        ("Accuracy", accuracy_score),
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1 Score", f1_score),
    )
    # Compute everything first so nothing is printed if a metric fails
    scores = [(label, scorer(y_test, predictions)) for label, scorer in scorers]
    matrix = confusion_matrix(y_test, predictions)

    for label, value in scores:
        print(f"{label}: {value:.4f}")
    print(f"Confusion Matrix: \n{matrix}\n")

# Report test-set metrics for each fitted model
for model_name in models:
    model = models[model_name]
    print(f"Evaluating {model_name}")
    evaluate_model(model, X_test_tfidf, y_test)

Evaluating Logistic Regression
Accuracy: 0.8925
Precision: 0.8999
Recall: 0.9661
F1 Score: 0.9318
Confusion Matrix: 
[[ 631  327]
 [ 103 2939]]

Evaluating Random Forest
Accuracy: 0.8725
Precision: 0.8850
Recall: 0.9566
F1 Score: 0.9194
Confusion Matrix: 
[[ 580  378]
 [ 132 2910]]

Evaluating SVM
Accuracy: 0.8938
Precision: 0.9030
Recall: 0.9638
F1 Score: 0.9324
Confusion Matrix: 
[[ 643  315]
 [ 110 2932]]



In [13]:
from sklearn.model_selection import GridSearchCV

# Candidate hyperparameter values for the random forest (3 x 3 x 3 = 27 combinations)
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5, 10],
)

# 3-fold cross-validated search over the grid, parallelised with n_jobs=-1
grid_search = GridSearchCV(
    estimator=random_forest,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X_train_tfidf, y_train)

# Report the winning combination
print("Best parameters found by Grid Search:", grid_search.best_params_)

Fitting 3 folds for each of 27 candidates, totalling 81 fits
Best parameters found by Grid Search: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 200}


In [14]:
# Importing necessary libraries for evaluation metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Function to evaluate and return metrics
def get_metrics(model, X_test, y_test):
    """Score a fitted model on held-out data.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``
    X_test : feature matrix passed to ``model.predict``
    y_test : true labels compared against the predictions

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1)``, in that order.
    """
    predictions = model.predict(X_test)
    scorers = (accuracy_score, precision_score, recall_score, f1_score)
    return tuple(scorer(y_test, predictions) for scorer in scorers)

# Collect each model's test metrics, keyed by model name
model_performance = {}
for model_name, model in models.items():
    accuracy, precision, recall, f1 = get_metrics(model, X_test_tfidf, y_test)
    model_performance[model_name] = [accuracy, precision, recall, f1]

# Build the comparison table and flip it: one row per model, one column per metric
metric_names = ['Accuracy', 'Precision', 'Recall', 'F1 Score']
performance_df = pd.DataFrame(model_performance, index=metric_names).T

# Displaying the performance comparison
print(performance_df)

                     Accuracy  Precision    Recall  F1 Score
Logistic Regression   0.89250   0.899878  0.966141  0.931833
Random Forest         0.87250   0.885036  0.956607  0.919431
SVM                   0.89375   0.902987  0.963840  0.932422
