In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict, cross_val_score, KFold
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import pickle
from sklearn.svm import SVC
from sklearn.ensemble import GradientBoostingClassifier
from datetime import datetime

In [2]:
df = pd.read_csv("yelp_dataset.csv")

In [3]:
df.shape

(40000, 7)

In [4]:
df.columns

Index(['Unnamed: 0', 'label', 'reviews', 'review_after_removing_html_tags',
       'review_after_converted_emoji', 'review_after_removing_punctuation',
       'review_after_pos_and_lemmatization'],
      dtype='object')

In [5]:
df.isna().sum()

Unnamed: 0                            0
label                                 0
reviews                               0
review_after_removing_html_tags       0
review_after_converted_emoji          0
review_after_removing_punctuation     2
review_after_pos_and_lemmatization    4
dtype: int64

In [6]:
df.rename(columns={
    'label':'sentiment'
}, inplace=True)

In [7]:
df.dropna(inplace=True)

In [8]:
df["sentiment"] = df.sentiment.apply(lambda x: 0 if x == 1 else 1)

In [9]:
df[ df.sentiment == 1 ].shape

(19999, 7)

In [10]:
df[ df.sentiment == 0 ].shape

(19997, 7)

In [18]:
df = df.reset_index()

In [19]:
ngram_model = CountVectorizer(analyzer='word', ngram_range=(1,3))

X = ngram_model.fit_transform(df["review_after_pos_and_lemmatization"])
y = df["sentiment"]

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [21]:
y_train[ y_train == 1 ].shape

(15957,)

In [22]:
y_train[ y_train == 0 ].shape

(16039,)

In [23]:
y_test[ y_test == 1 ].shape

(4042,)

In [24]:
y_test[ y_test == 0 ].shape

(3958,)

#### KFold cross-validation

In [25]:
# Set up k-fold cross-validation
kf = KFold(n_splits=4, shuffle=True)

random_forest_scores = []
logistic_scores = []
svm_scores = []
gd_boost_scores = []

# Generate indices for each fold
for train_index, test_index in kf.split(X):
    x_train_data, x_test_data = X[train_index], X[test_index]
    y_train_data, y_test_data = y[train_index], y[test_index]

    ## Random Forest
    model_forest = RandomForestClassifier(max_depth=10, n_estimators=500)

    print("Fitting random forest to training data....")    
    model_forest = model_forest.fit(x_train_data, y_train_data)
    
    y_pred_data = model_forest.predict(x_test_data)
    score = accuracy_score(y_test_data, y_pred_data)
    random_forest_scores.append(score)
    print(random_forest_scores)
    
    ## Logistic Regression
    
    model_logistic_regression = LogisticRegression(penalty='l1', solver='liblinear', C=0.1)
    print("Fitting Logistic Regression to training data....")  
    model_logistic_regression = model_logistic_regression.fit(x_train_data, y_train_data)
    
    y_pred_data = model_logistic_regression.predict(x_test_data)
    score = accuracy_score(y_test_data, y_pred_data)
    logistic_scores.append(score)
    print(logistic_scores)
    
    ## SVM
    model_svm = SVC(C=0.1, kernel='linear')
    print("Fitting SVM to training data....")  
    model_svm = model_svm.fit(x_train_data, y_train_data)
    
    y_pred_data = model_svm.predict(x_test_data)
    score = accuracy_score(y_test_data, y_pred_data)
    svm_scores.append(score)
    print(svm_scores)
    
    ## Gradient Boosting
    gb_classifier = GradientBoostingClassifier(random_state=2)
    print("Fitting Gradient Boosting to training data....")    
    gb_classifier.fit(x_train_data, y_train_data)
    
    y_pred_data = gb_classifier.predict(x_test_data)
    score = accuracy_score(y_test_data, y_pred_data)
    gd_boost_scores.append(score)
    print(gd_boost_scores)
    
    print("=======================================================")

# print(f"Cross-validation scores: {scores}")
# print(f"Average score: {np.mean(scores)}")

Fitting random forest to training data....
[0.8084808480848085]
Fitting Logistic Regression to training data....
[0.8982898289828983]
Fitting SVM to training data....
[0.9104910491049105]
Fitting Gradient Boosting to training data....
[0.8473847384738474]
Fitting random forest to training data....
[0.8084808480848085, 0.7833783378337834]
Fitting Logistic Regression to training data....
[0.8982898289828983, 0.9025902590259026]
Fitting SVM to training data....
[0.9104910491049105, 0.9091909190919092]
Fitting Gradient Boosting to training data....
[0.8473847384738474, 0.843984398439844]
Fitting random forest to training data....
[0.8084808480848085, 0.7833783378337834, 0.8135813581358136]
Fitting Logistic Regression to training data....
[0.8982898289828983, 0.9025902590259026, 0.9032903290329033]
Fitting SVM to training data....
[0.9104910491049105, 0.9091909190919092, 0.9150915091509151]
Fitting Gradient Boosting to training data....
[0.8473847384738474, 0.843984398439844, 0.843884388438

In [26]:
print("Random Forest KFold cross-validation")
print(f"Cross-validation scores: {random_forest_scores}")
print(f"Average score: {np.mean(random_forest_scores)}\n")
print("Logistic Regression KFold cross-validation")
print(f"Cross-validation scores: {logistic_scores}")
print(f"Average score: {np.mean(logistic_scores)}\n")
print("SVM cross-validation")
print(f"Cross-validation scores: {svm_scores}")
print(f"Average score: {np.mean(svm_scores)}\n")
print("Gradient Boosting KFold cross-validation")
print(f"Cross-validation scores: {gd_boost_scores}")
print(f"Average score: {np.mean(gd_boost_scores)}\n")

Random Forest KFold cross-validation
Cross-validation scores: [0.8084808480848085, 0.7833783378337834, 0.8135813581358136, 0.7676767676767676]
Average score: 0.7932793279327933

Logistic Regression KFold cross-validation
Cross-validation scores: [0.8982898289828983, 0.9025902590259026, 0.9032903290329033, 0.9042904290429042]
Average score: 0.9021152115211521

SVM cross-validation
Cross-validation scores: [0.9104910491049105, 0.9091909190919092, 0.9150915091509151, 0.9118911891189119]
Average score: 0.9116661666166617

Gradient Boosting KFold cross-validation
Cross-validation scores: [0.8473847384738474, 0.843984398439844, 0.8438843884388438, 0.8472847284728473]
Average score: 0.8456345634563456



#### Random Forest

In [27]:
# Start the timer
start_time = datetime.now()

model_forest = RandomForestClassifier(max_depth=10, n_estimators=500)

print("Fitting random forest to training data....")    
model_forest = model_forest.fit(X_train, y_train)

# Stop the timer
end_time = datetime.now()

# Calculate the runtime
runtime = end_time - start_time

# Print the runtime
print('Random Forest Training Runtime:', runtime)

Fitting random forest to training data....
Random Forest Training Runtime: 0:04:31.578396


In [28]:
model_forest.score(X_test, y_test)

0.799875

In [29]:
model_forest.score(X_train, y_train)

0.8310101262657832

In [30]:
# Start the timer
start_time = datetime.now()

y_pred = model_forest.predict(X_test)

# Stop the timer
end_time = datetime.now()

# Calculate the runtime
runtime = end_time - start_time

# Print the runtime
print('Random Forest Prediction Runtime:', runtime)

Random Forest Prediction Runtime: 0:00:17.315010


In [31]:
y_pred = cross_val_predict(model_forest, X_test, y_test)

In [32]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.89      0.56      0.69      3958
           1       0.68      0.93      0.79      4042

    accuracy                           0.75      8000
   macro avg       0.79      0.75      0.74      8000
weighted avg       0.79      0.75      0.74      8000



In [33]:
print(confusion_matrix(y_test, y_pred))

[[2212 1746]
 [ 274 3768]]


In [34]:
filename = 'models/rd_ngram_ml_model.sav'
pickle.dump(model_forest, open(filename, 'wb'))

#### Logistic Regression

In [35]:
# Start the timer
start_time = datetime.now()

model_logistic_regression = LogisticRegression(penalty='l1', solver='liblinear', C=0.1)

print("Fitting Logistic Regression to training data....")  
model_logistic_regression = model_logistic_regression.fit(X_train, y_train)

# Stop the timer
end_time = datetime.now()

# Calculate the runtime
runtime = end_time - start_time

# Print the runtime
print('Logistic Regression Training Runtime:', runtime)

Fitting Logistic Regression to training data....
Logistic Regression Training Runtime: 0:00:07.996968


In [36]:
model_logistic_regression.score(X_test, y_test)

0.9035

In [37]:
model_logistic_regression.score(X_train, y_train)

0.9147080885110639

In [38]:
# Start the timer
start_time = datetime.now()

y_pred = model_logistic_regression.predict(X_test)

# Stop the timer
end_time = datetime.now()

# Calculate the runtime
runtime = end_time - start_time

# Print the runtime
print('Logistic Regression Prediction Runtime:', runtime)

Logistic Regression Prediction Runtime: 0:00:00.015573


In [39]:
y_pred = cross_val_predict(model_logistic_regression, X_test, y_test)

In [40]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.87      0.86      0.87      3958
           1       0.87      0.88      0.87      4042

    accuracy                           0.87      8000
   macro avg       0.87      0.87      0.87      8000
weighted avg       0.87      0.87      0.87      8000



In [41]:
print(confusion_matrix(y_test, y_pred))

[[3408  550]
 [ 502 3540]]


In [42]:
filename = 'models/lg_ngram_ml_model.sav'
pickle.dump(model_logistic_regression, open(filename, 'wb'))

#### SVM

In [43]:
# Start the timer
start_time = datetime.now()

model_svm = SVC(C=0.1, kernel='linear')

model_svm = model_svm.fit(X_train, y_train)

# Stop the timer
end_time = datetime.now()

# Calculate the runtime
runtime = end_time - start_time

# Print the runtime
print('SVM Prediction Runtime:', runtime)

SVM Prediction Runtime: 0:23:38.717431


In [44]:
model_svm.score(X_test, y_test)

0.916875

In [45]:
model_svm.score(X_train, y_train)

0.9997187148393549

In [46]:
# Start the timer
start_time = datetime.now()

y_pred = model_svm.predict(X_test)

# Stop the timer
end_time = datetime.now()

# Calculate the runtime
runtime = end_time - start_time

# Print the runtime
print('SVM Prediction Runtime:', runtime)

SVM Prediction Runtime: 0:01:17.650696


In [47]:
y_pred = cross_val_predict(model_svm, X_test, y_test)

In [48]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.88      0.89      0.88      3958
           1       0.89      0.88      0.89      4042

    accuracy                           0.88      8000
   macro avg       0.88      0.88      0.88      8000
weighted avg       0.88      0.88      0.88      8000



In [49]:
print(confusion_matrix(y_test, y_pred))

[[3503  455]
 [ 470 3572]]


In [50]:
filename = 'models/svm_ngram_ml_model.sav'
pickle.dump(model_svm, open(filename, 'wb'))

#### Gradient Boosting Classifier

In [51]:
# Start the timer
start_time = datetime.now()


# create a Gradient Boosting classifier with default hyperparameters
gb_classifier = GradientBoostingClassifier(random_state=2)

# fit the classifier to the training data
gb_classifier.fit(X_train, y_train)

# Stop the timer
end_time = datetime.now()

# Calculate the runtime
runtime = end_time - start_time

# Print the runtime
print('Gradient Boosting Prediction Runtime:', runtime)


Gradient Boosting Prediction Runtime: 4:18:16.020259


In [52]:
gb_classifier.score(X_test, y_test)

0.84875

In [53]:
gb_classifier.score(X_train, y_train)

0.853450431303913

In [54]:
# Start the timer
start_time = datetime.now()

y_pred = gb_classifier.predict(X_test)

# Stop the timer
end_time = datetime.now()

# Calculate the runtime
runtime = end_time - start_time

# Print the runtime
print('SVM Prediction Runtime:', runtime)

SVM Prediction Runtime: 0:00:00.046805


In [55]:
y_pred = cross_val_predict(gb_classifier, X_test, y_test)

In [56]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.84      0.84      0.84      3958
           1       0.85      0.84      0.85      4042

    accuracy                           0.84      8000
   macro avg       0.84      0.84      0.84      8000
weighted avg       0.84      0.84      0.84      8000



In [57]:
filename = 'models/gb_boosting_ngram_ml_model.sav'
pickle.dump(gb_classifier, open(filename, 'wb'))