In [1]:
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import string
import re
from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
from textblob import TextBlob
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [2]:
import os

# Location of the labelled review CSV. The default is the original hard-coded,
# machine-specific path; set the TINDER_REVIEWS_CSV environment variable to run
# the notebook on another machine without editing this cell.
DATA_PATH = os.environ.get(
    "TINDER_REVIEWS_CSV",
    r"C:\Users\hp\Desktop\CP2\tinder_review_analysis.csv",
)
df = pd.read_csv(DATA_PATH)

In [3]:
# Preview the first five rows of the loaded reviews.
df.head(n=5)

Unnamed: 0,Reviews,polarity,Sentiment
0,Terrible keep steal money removed credit card ...,-1.0,Negative
1,One worst could ever use They account reason s...,-0.4,Negative
2,bad,-0.7,Negative
3,Easy,0.433333,Positive
4,still filled fake ugly dont want talk,-0.266667,Negative


In [4]:
# (rows, columns) of the loaded DataFrame.
df.shape

(65031, 3)

In [4]:
# Replace missing review text with an empty string so the text column passed to
# the vectorizer contains strings rather than NaN.
# The result is assigned back to the column: calling fillna(inplace=True) on the
# `df['Reviews']` selection is a chained in-place operation that pandas warns
# about and that leaves `df` unchanged when Copy-on-Write is enabled.
df['Reviews'] = df['Reviews'].fillna('')

# Naive Bayes classification

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Hold out 20% of the reviews for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    df['Reviews'],
    df['Sentiment'],
    test_size=0.2,
    random_state=42,
)

# Word-level TF-IDF features with English stop words removed.
# The vocabulary and IDF weights are learned from the training text only and
# then applied unchanged to the test text.
tfidf_vectorizer = TfidfVectorizer(
    analyzer='word',
    stop_words='english',
    use_idf=True,
)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [6]:
# Show the training feature matrix summary (shape, dtype, stored elements).
X_train_tfidf

<52024x7875 sparse matrix of type '<class 'numpy.float64'>'
	with 336030 stored elements in Compressed Sparse Row format>

In [7]:
from sklearn.naive_bayes import BernoulliNB

# Bernoulli Naive Bayes with additive smoothing alpha=1, fitted on the TF-IDF
# training features.
# NOTE(review): BernoulliNB models binary feature presence, so the TF-IDF
# weights presumably act only as present/absent flags here - confirm this is
# intended.
model = BernoulliNB(alpha=1)
model.fit(X=X_train_tfidf, y=y_train)

In [8]:
# Naive Bayes predictions for the training split, then the held-out test split.
y_pred_train_NB, y_pred_test_NB = map(model.predict, (X_train_tfidf, X_test_tfidf))

In [9]:
# Per-class precision / recall / F1 of the Naive Bayes model on the training split.
report_nb_train = classification_report(y_train, y_pred_train_NB)
print("\nClassification Report on Train (Naive-Bayes):\n", report_nb_train)


Classification Report on Train (Naive-Bayes):
               precision    recall  f1-score   support

    Negative       0.76      0.67      0.72     12677
    Positive       0.90      0.93      0.91     39347

    accuracy                           0.87     52024
   macro avg       0.83      0.80      0.82     52024
weighted avg       0.87      0.87      0.87     52024



In [10]:
# Per-class precision / recall / F1 of the Naive Bayes model on the test split.
report_nb_test = classification_report(y_test, y_pred_test_NB)
print("\nClassification Report on Test (Naive-Bayes):\n", report_nb_test)


Classification Report on Test (Naive-Bayes):
               precision    recall  f1-score   support

    Negative       0.68      0.60      0.64      3134
    Positive       0.88      0.91      0.89      9873

    accuracy                           0.84     13007
   macro avg       0.78      0.75      0.77     13007
weighted avg       0.83      0.84      0.83     13007



# Decision Tree

In [11]:
# Hyper-parameter grid shared by the decision-tree and random-forest searches:
# 4 depths x 5 split sizes x 4 leaf sizes = 80 candidates.
# List order is kept as-is so the candidate order seen by the search is unchanged.
params = dict(
    max_depth=[2, 3, 4, 5],
    min_samples_split=[100, 150, 200, 250, 225],
    min_samples_leaf=[50, 75, 115, 100],
)

In [12]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

# Exhaustive 10-fold cross-validated search over the `params` grid.
# The tree is seeded with the same seed used for the train/test split so that
# the selected best_params_ do not change between runs of the notebook.
gridcv = GridSearchCV(
    DecisionTreeClassifier(random_state=42),
    params,
    verbose=1,
    cv=10,
)
gridcv.fit(X_train_tfidf, y_train)

Fitting 10 folds for each of 80 candidates, totalling 800 fits


In [13]:
from sklearn.tree import DecisionTreeClassifier

# Final decision tree built from the hyper-parameters selected by the grid search.
# random_state pins the tree so the reported metrics are reproducible.
# NOTE: `gridcv` is reassigned by the random-forest search later in the notebook,
# so this cell must run before that one to pick up the decision-tree result.
ModelDT = DecisionTreeClassifier(**gridcv.best_params_, random_state=42)

In [14]:
# Train the tuned decision tree on the TF-IDF training features.
ModelDT.fit(X=X_train_tfidf, y=y_train)

In [15]:
# Decision-tree predictions for the training split, then the held-out test split.
y_pred_train_DT, y_pred_test_DT = map(ModelDT.predict, (X_train_tfidf, X_test_tfidf))

In [16]:
# Per-class precision / recall / F1 of the decision tree on the training split.
report_dt_train = classification_report(y_train, y_pred_train_DT)
print("\nClassification Report on Train (Decision Tree):\n", report_dt_train)


Classification Report on Train (Decision Tree):
               precision    recall  f1-score   support

    Negative       0.81      0.37      0.51     12677
    Positive       0.83      0.97      0.89     39347

    accuracy                           0.83     52024
   macro avg       0.82      0.67      0.70     52024
weighted avg       0.82      0.83      0.80     52024



In [17]:
# Per-class precision / recall / F1 of the decision tree on the test split.
report_dt_test = classification_report(y_test, y_pred_test_DT)
print("\nClassification Report on Test (Decision Tree):\n", report_dt_test)


Classification Report on Test (Decision Tree):
               precision    recall  f1-score   support

    Negative       0.81      0.36      0.50      3134
    Positive       0.83      0.97      0.89      9873

    accuracy                           0.83     13007
   macro avg       0.82      0.67      0.70     13007
weighted avg       0.82      0.83      0.80     13007



# Random forest

In [18]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# Exhaustive 10-fold cross-validated search over the `params` grid (the same
# grid that was defined for the decision tree).
# The searched estimator mirrors the forest that is instantiated after the
# search (25 trees, gini, sqrt features); searching over a default-configured
# forest would tune the hyper-parameters for a different model than the one
# that is finally trained. The seed makes the search reproducible.
# NOTE: this overwrites the `gridcv` object produced by the decision-tree search.
gridcv = GridSearchCV(
    RandomForestClassifier(
        n_estimators=25,
        criterion='gini',
        max_features='sqrt',
        random_state=42,
    ),
    params,
    verbose=1,
    cv=10,
)
gridcv.fit(X_train_tfidf, y_train)

Fitting 10 folds for each of 80 candidates, totalling 800 fits


In [19]:
from sklearn.ensemble import RandomForestClassifier

# Final random forest: the grid-search winners plus a fixed forest configuration.
# random_state seeds the bootstrap sampling and feature sub-sampling so the
# reported metrics are reproducible between runs.
# NOTE(review): the classification reports recorded in this notebook show the
# forest predicting only 'Positive' (precision/recall 0.00 for 'Negative').
# The shallow, large-leaf grid shared with the decision tree is a likely cause -
# consider a forest-specific grid or class_weight='balanced'.
ModelRF = RandomForestClassifier(
    **gridcv.best_params_,
    n_estimators=25,
    criterion='gini',
    max_features='sqrt',
    random_state=42,
)

In [20]:
# Train the tuned random forest on the TF-IDF training features.
ModelRF.fit(X=X_train_tfidf, y=y_train)

In [21]:
# Random-forest predictions for the training split, then the held-out test split.
y_pred_train_RF, y_pred_test_RF = map(ModelRF.predict, (X_train_tfidf, X_test_tfidf))

In [22]:
# Per-class precision / recall / F1 of the random forest on the training split.
report_rf_train = classification_report(y_train, y_pred_train_RF)
print("\nClassification Report on Train (Random Forest):\n", report_rf_train)


Classification Report on Train (Random Forest):
               precision    recall  f1-score   support

    Negative       0.00      0.00      0.00     12677
    Positive       0.76      1.00      0.86     39347

    accuracy                           0.76     52024
   macro avg       0.38      0.50      0.43     52024
weighted avg       0.57      0.76      0.65     52024



In [23]:
# Per-class precision / recall / F1 of the random forest on the test split.
report_rf_test = classification_report(y_test, y_pred_test_RF)
print("\nClassification Report on Test (Random Forest):\n", report_rf_test)


Classification Report on Test (Random Forest):
               precision    recall  f1-score   support

    Negative       0.00      0.00      0.00      3134
    Positive       0.76      1.00      0.86      9873

    accuracy                           0.76     13007
   macro avg       0.38      0.50      0.43     13007
weighted avg       0.58      0.76      0.66     13007



# Accuracy comparison

In [24]:
from sklearn.metrics import accuracy_score
def model_evaluation(prediction, actual):
    """Return the fraction of predictions that match the true labels.

    Parameters
    ----------
    prediction : array-like
        Labels predicted by a model.
    actual : array-like
        Ground-truth labels, aligned element-wise with ``prediction``.

    Returns
    -------
    float
        Accuracy as computed by ``sklearn.metrics.accuracy_score``.
    """
    # accuracy_score is declared as (y_true, y_pred); pass by keyword so the
    # ground truth and the predictions land in their documented slots.
    return accuracy_score(y_true=actual, y_pred=prediction)

In [25]:
# Train / test accuracy for each model, evaluated in the order NB, DT, RF.
nb_train, nb_test = (
    model_evaluation(y_pred_train_NB, y_train),
    model_evaluation(y_pred_test_NB, y_test),
)
dt_train, dt_test = (
    model_evaluation(y_pred_train_DT, y_train),
    model_evaluation(y_pred_test_DT, y_test),
)
rf_train, rf_test = (
    model_evaluation(y_pred_train_RF, y_train),
    model_evaluation(y_pred_test_RF, y_test),
)

In [27]:
# Accuracy summary: each value is shown as a percentage rounded to a whole
# number, one train/test pair per model followed by a blank line.
print(
    f"Accuracy of the 'Naive-Bayes Train Model' is : {np.round(nb_train*100)}%",
    f"Accuracy of the 'Naive-Bayes Test Model' is : {np.round(nb_test*100)}%",
    "",
    sep="\n",
)

print(
    f"Accuracy of the 'Decision Tree Train Model' is : {np.round(dt_train*100)}%",
    f"Accuracy of the 'Decision Tree Test Model' is : {np.round(dt_test*100)}%",
    "",
    sep="\n",
)

print(
    f"Accuracy of the 'Random Forest Train Model' is : {np.round(rf_train*100)}%",
    f"Accuracy of the 'Random Forest Test Model' is : {np.round(rf_test*100)}%",
    "",
    sep="\n",
)

Accuracy of the 'Naive-Bayes Train Model' is : 87.0%
Accuracy of the 'Naive-Bayes Test Model' is : 84.0%

Accuracy of the 'Decision Tree Train Model' is : 83.0%
Accuracy of the 'Decision Tree Test Model' is : 83.0%

Accuracy of the 'Random Forest Train Model' is : 76.0%
Accuracy of the 'Random Forest Test Model' is : 76.0%

