# Imports

In [23]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, make_scorer
import pandas as pd 
import string
import re
import numpy as np
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.inspection import permutation_importance
from sklearn.model_selection import ParameterGrid
from sklearn.preprocessing import LabelEncoder




# Unigram

## Dataset Preprocessing

In [45]:
# Read the train/test review tables, then pull out the review text and label
# columns as arrays.
traindf = pd.read_csv("train_reviews.csv")
trainsentences, trainlabels = traindf["Review"].values, traindf["Label"].values
testdf = pd.read_csv("test_reviews.csv")
testsentences, testlabels = testdf["Review"].values, testdf["Label"].values

# Word lists consulted by preprocess_text: one set entry per line of each file.
with open("english", "r") as file:
    stop_words = {entry for entry in file.read().splitlines()}
# NOTE(review): every line of the CSV (including a header row, if the file has
# one) becomes an entry, and entries are not lower-cased — confirm file format.
with open("hotel_names.csv", "r") as file:
    hotel_names = {entry for entry in file.read().splitlines()}


def preprocess_text(text):
    """Normalise one review string before vectorisation.

    Steps, in order: delete every character listed in ``string.punctuation``,
    lower-case the result, delete runs of digits, split on whitespace, discard
    tokens that appear in the module-level ``stop_words`` or ``hotel_names``
    sets, and join the remaining tokens with single spaces.

    NOTE(review): punctuation is deleted rather than replaced by a space, so a
    token such as "don't" becomes "dont" before the stop-word lookup — confirm
    this matches how the stop-word list is written.

    Parameters:
    - text: the raw review (must support ``str.translate``).

    Returns:
    - the cleaned review as a single space-separated string.
    """
    punctuation_table = str.maketrans('', '', string.punctuation)
    cleaned = re.sub(r'\d+', '', text.translate(punctuation_table).lower())

    kept_tokens = []
    for token in cleaned.split():
        if token in stop_words or token in hotel_names:
            continue
        kept_tokens.append(token)

    return ' '.join(kept_tokens).strip()

# Clean every review, then pair the cleaned text back up with its original label.
train_sentences_preprocessed = list(map(preprocess_text, trainsentences))
test_sentences_preprocessed = list(map(preprocess_text, testsentences))
train_df_preprocessed = pd.DataFrame(
    {'Review': train_sentences_preprocessed, 'Label': trainlabels}
)
test_df_preprocessed = pd.DataFrame(
    {'Review': test_sentences_preprocessed, 'Label': testlabels}
)

In [46]:
# Cleaned review text for both splits (rebinds the raw-text names loaded earlier).
trainsentences = train_df_preprocessed["Review"].values
testsentences = test_df_preprocessed["Review"].values

# Integer-encode the labels: the encoder is fitted on the training labels and
# the same mapping is then applied to the test labels.
le = LabelEncoder()
y_train = le.fit_transform(train_df_preprocessed["Label"].values)
y_test = le.transform(test_df_preprocessed["Label"].values)

# Bag-of-words counts. ngram_range selects the n-gram sizes:
# (1,1) - unigram, (2,2) - bigram, (1,2) - both. This section uses unigrams.
vectorizer = CountVectorizer(ngram_range=(1, 1))
X_train = vectorizer.fit_transform(trainsentences)  # vocabulary learned from train only
X_test = vectorizer.transform(testsentences)

In [47]:
def evaluate_model(y_pred, y_test, average='binary'):
    """Compute accuracy, precision, recall and F1 for a set of predictions.

    Parameters:
    - y_pred: predicted labels. Note the argument order: predictions come
      first and ground truth second, the reverse of scikit-learn's metrics.
    - y_test: ground-truth labels.
    - average: averaging mode forwarded to precision_score, recall_score and
      f1_score. Defaults to 'binary', the value this function always used.

    Returns:
    - dict with the keys 'accuracy', 'precision', 'recall' and 'f1'.
    """
    # Named `metrics` rather than `dict` so the builtin is not shadowed.
    metrics = {
        'accuracy': accuracy_score(y_test, y_pred),
        'precision': precision_score(y_test, y_pred, average=average),
        'recall': recall_score(y_test, y_pred, average=average),
        'f1': f1_score(y_test, y_pred, average=average),
    }
    return metrics

## Multinomial naive Bayes

### Train and accuracy without feature selection

In [48]:
# Multinomial NB on the full unigram count matrix (no feature selection).
multinomial_naive_bayes = MultinomialNB(alpha=1.5, fit_prior=True).fit(X_train, y_train)
# Predict the test split and collect the metrics dict.
y_pred_mb1 = multinomial_naive_bayes.predict(X_test)
dict_mb1 = evaluate_model(y_pred_mb1, y_test)

### Train and accuracy with chi squared test for feature selection

In [49]:
# Keep the 2000 count features with the highest chi-squared score against the
# training labels; the test matrix is reduced with the same fitted selector.
chi2_selector = SelectKBest(chi2, k=2000) #2000 features perform best
X_train_chi2 = chi2_selector.fit_transform(X_train, y_train)
X_test_chi2 = chi2_selector.transform(X_test)

# Multinomial NB with default hyper-parameters on the reduced matrix.
nb_model = MultinomialNB().fit(X_train_chi2, y_train)
y_pred = nb_model.predict(X_test_chi2)

# Vocabulary entries that survived the selection.
selected_feature_indices = chi2_selector.get_support(indices=True)
selected_feature_names = vectorizer.get_feature_names_out()[selected_feature_indices]

# Test-set metrics. These use average='weighted', whereas evaluate_model in
# this notebook uses average='binary', so the numbers are not directly comparable.
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    scorer(y_test, y_pred, average='weighted')
    for scorer in (precision_score, recall_score, f1_score)
)

print(
    f"Selected Features: {selected_feature_names}",
    f"Test Set Accuracy: {accuracy}",
    f"Precision: {precision:.4f}",
    f"Recall: {recall:.4f}",
    f"F1 Score: {f1:.4f}",
    sep="\n",
)

Selected Features: ['abassador' 'accept' 'accepted' ... 'yunan' 'zone' 'zoo']
Test Set Accuracy: 0.85625
Precision: 0.8608
Recall: 0.8562
F1 Score: 0.8558


In [50]:
# Multinomial NB (alpha=0.1) on the chi-squared-selected unigram features.
# This rebinds `multinomial_naive_bayes`, replacing the model fitted earlier.
multinomial_naive_bayes = MultinomialNB(alpha=0.1, fit_prior=True).fit(X_train_chi2, y_train)
# Predict the reduced test matrix and collect the metrics dict.
y_pred_mbc2 = multinomial_naive_bayes.predict(X_test_chi2)
dict_mbc2 = evaluate_model(y_pred_mbc2, y_test)

## Logistic Regression

### Train and accuracy

In [51]:
# L1-penalised logistic regression (liblinear solver, C=5) on unigram counts.
logistic_regression = LogisticRegression(
    penalty='l1',
    solver='liblinear',
    C=5,
).fit(X_train, y_train)
# Predict the test split and collect the metrics dict.
y_pred_l1 = logistic_regression.predict(X_test)
dict_l1 = evaluate_model(y_pred_l1, y_test)

## Classification Trees

### Train and accuracy

In [52]:
# Single decision tree with fixed hyper-parameters and a fixed seed, on unigram counts.
decision_tree = DecisionTreeClassifier(
    ccp_alpha=0.0,
    criterion='gini',
    max_depth=8,
    min_samples_leaf=6,
    min_samples_split=2,
    random_state=42,
).fit(X_train, y_train)
# Predict the test split and collect the metrics dict.
y_pred_ct1 = decision_tree.predict(X_test)
dict_ct1 = evaluate_model(y_pred_ct1, y_test)

## Random Forest

### Train and accuracy

In [53]:
# Random forest of 100 entropy-criterion trees with a fixed seed, on unigram counts.
random_forest = RandomForestClassifier(
    criterion='entropy',
    max_depth=10,
    max_features="sqrt",
    min_samples_leaf=2,
    min_samples_split=5,
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)
# Predict the test split and collect the metrics dict.
y_pred_rf1 = random_forest.predict(X_test)
dict_rf1 = evaluate_model(y_pred_rf1, y_test)

# Bigram

## Dataset Preprocessing

In [54]:
# Reload the raw train/test tables so the bigram section starts from
# unprocessed text, then pull out the review text and label columns.
traindf = pd.read_csv("train_reviews.csv")
trainsentences, trainlabels = traindf["Review"].values, traindf["Label"].values
testdf = pd.read_csv("test_reviews.csv")
testsentences, testlabels = testdf["Review"].values, testdf["Label"].values

# Stop-word list: one set entry per line of the file.
with open("english", "r") as file:
    stop_words = {entry for entry in file.read().splitlines()}


def preprocess_text(text):
    """Normalise one review string before bigram vectorisation.

    Steps, in order: delete every character listed in ``string.punctuation``,
    lower-case the result, delete runs of digits, split on whitespace, discard
    tokens that appear in the module-level ``stop_words`` set, and join the
    remaining tokens with single spaces.

    This redefinition replaces the ``preprocess_text`` defined earlier in the
    notebook. Unlike that version it does not filter ``hotel_names``.
    NOTE(review): confirm the difference is intentional.

    Parameters:
    - text: the raw review (must support ``str.translate``).

    Returns:
    - the cleaned review as a single space-separated string.
    """
    punctuation_table = str.maketrans('', '', string.punctuation)
    cleaned = re.sub(r'\d+', '', text.translate(punctuation_table).lower())

    kept_tokens = []
    for token in cleaned.split():
        if token in stop_words:
            continue
        kept_tokens.append(token)

    return ' '.join(kept_tokens).strip()

# Clean every review with the bigram-section preprocess_text, then pair the
# cleaned text back up with its original label.
train_sentences_preprocessed = list(map(preprocess_text, trainsentences))
test_sentences_preprocessed = list(map(preprocess_text, testsentences))
train_df_preprocessed = pd.DataFrame(
    {'Review': train_sentences_preprocessed, 'Label': trainlabels}
)
test_df_preprocessed = pd.DataFrame(
    {'Review': test_sentences_preprocessed, 'Label': testlabels}
)

In [55]:
# Cleaned review text for both splits (rebinds the raw-text names loaded above).
trainsentences = train_df_preprocessed["Review"].values
testsentences = test_df_preprocessed["Review"].values

# Integer-encode the labels: the encoder is fitted on the training labels and
# the same mapping is then applied to the test labels.
le = LabelEncoder()
y_train = le.fit_transform(train_df_preprocessed["Label"].values)
y_test = le.transform(test_df_preprocessed["Label"].values)

# Bag-of-words counts. ngram_range selects the n-gram sizes:
# (1,1) - unigram, (2,2) - bigram, (1,2) - both. This section uses bigrams only.
vectorizer = CountVectorizer(ngram_range=(2, 2))
X_train = vectorizer.fit_transform(trainsentences)  # vocabulary learned from train only
X_test = vectorizer.transform(testsentences)

## Multinomial naive Bayes

### Train and accuracy without feature selection

In [56]:
# Multinomial NB (alpha=0.1) on the full bigram count matrix (no feature selection).
multinomial_naive_bayes = MultinomialNB(alpha=0.1, fit_prior=True).fit(X_train, y_train)
# Predict the test split and collect the metrics dict.
y_pred_mb2 = multinomial_naive_bayes.predict(X_test)
dict_mb2 = evaluate_model(y_pred_mb2, y_test)

### Train and accuracy with chi squared test for feature selection

In [57]:
# Keep the 2000 bigram features with the highest chi-squared score against the
# training labels; the test matrix is reduced with the same fitted selector.
chi2_selector = SelectKBest(chi2, k=2000) #2000 features perform best
X_train_chi2 = chi2_selector.fit_transform(X_train, y_train)
X_test_chi2 = chi2_selector.transform(X_test)

# Multinomial NB with default hyper-parameters on the reduced matrix.
nb_model = MultinomialNB().fit(X_train_chi2, y_train)
y_pred = nb_model.predict(X_test_chi2)

# Vocabulary entries that survived the selection.
selected_feature_indices = chi2_selector.get_support(indices=True)
selected_feature_names = vectorizer.get_feature_names_out()[selected_feature_indices]

# Test-set metrics. These use average='weighted', whereas evaluate_model in
# this notebook uses average='binary', so the numbers are not directly comparable.
accuracy = accuracy_score(y_test, y_pred)
precision, recall, f1 = (
    scorer(y_test, y_pred, average='weighted')
    for scorer in (precision_score, recall_score, f1_score)
)

print(
    f"Selected Features: {selected_feature_names}",
    f"Test Set Accuracy: {accuracy}",
    f"Precision: {precision:.4f}",
    f"Recall: {recall:.4f}",
    f"F1 Score: {f1:.4f}",
    sep="\n",
)

Selected Features: ['able relax' 'accommodate us' 'across street' ... 'young people'
 'youre going' 'youre traveler']
Test Set Accuracy: 0.725
Precision: 0.7370
Recall: 0.7250
F1 Score: 0.7215


In [58]:
# Multinomial NB (alpha=0.1) on the chi-squared-selected bigram features.
# This rebinds `multinomial_naive_bayes`, replacing the model fitted earlier.
multinomial_naive_bayes = MultinomialNB(alpha=0.1, fit_prior=True).fit(X_train_chi2, y_train)
# Predict the reduced test matrix and collect the metrics dict.
y_pred_mbc22 = multinomial_naive_bayes.predict(X_test_chi2)
dict_mbc22 = evaluate_model(y_pred_mbc22, y_test)

### Feature importance

## Logistic Regression

### Train and accuracy

In [59]:
# L1-penalised logistic regression (liblinear solver, C=10) on bigram counts.
logistic_regression = LogisticRegression(
    penalty='l1',
    solver='liblinear',
    C=10,
).fit(X_train, y_train)
# Predict the test split and collect the metrics dict.
y_pred_l2 = logistic_regression.predict(X_test)
dict_l2 = evaluate_model(y_pred_l2, y_test)

## Classification Trees

### Train and accuracy

In [60]:
# Single decision tree with fixed hyper-parameters and a fixed seed, on bigram counts.
decision_tree = DecisionTreeClassifier(
    ccp_alpha=0.0,
    criterion='gini',
    max_depth=8,
    min_samples_leaf=6,
    min_samples_split=2,
    random_state=42,
).fit(X_train, y_train)
# Predict the test split and collect the metrics dict.
y_pred_ct2 = decision_tree.predict(X_test)
dict_ct2 = evaluate_model(y_pred_ct2, y_test)

## Random Forest

### Train and accuracy

In [61]:
# Random forest of 100 entropy-criterion trees with unlimited depth and a
# fixed seed, on bigram counts.
random_forest = RandomForestClassifier(
    criterion='entropy',
    max_depth=None,
    max_features="sqrt",
    min_samples_leaf=2,
    min_samples_split=2,
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)
# Predict the test split and collect the metrics dict.
y_pred_rf2 = random_forest.predict(X_test)
dict_rf2 = evaluate_model(y_pred_rf2, y_test)

**Statistical Analysis** 

Using McNemar's test to compare the accuracy of the models pairwise


In [62]:
from sklearn.metrics import accuracy_score, precision_score, recall_score
from statsmodels.stats.contingency_tables import mcnemar

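# Reminder: the predictions compared below were computed in the sections above
# (suffix 1 = unigram features, suffix 2 = bigram features):
# y_pred_ct1 / y_pred_ct2: decision tree
# y_pred_rf1 / y_pred_rf2: random forest
# y_pred_l1 / y_pred_l2: logistic regression
# y_pred_mb1 / y_pred_mb2: multinomial naive Bayes
# y_pred_mbc2 / y_pred_mbc22: multinomial naive Bayes on chi-squared-selected features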
def contingency_matrics(y_true, pred1, pred2):
    """Build the 2x2 correct/incorrect table for two models on the same samples.

    Each model's predictions are compared against ``y_true`` and the samples
    are counted by which of the two models got them right.

    Parameters:
    - y_true: ground-truth labels (any array-like).
    - pred1: predictions from Model 1 (same shape as y_true).
    - pred2: predictions from Model 2 (same shape as y_true).

    Returns:
    - np.ndarray ``[[a, b], [c, d]]`` where a = both correct,
      b = only Model 1 correct, c = only Model 2 correct, d = both incorrect.

    Raises:
    - ValueError: if the three inputs do not have the same shape.
    """
    # Coerce to arrays: with plain lists, `pred == y_true` is a single bool
    # rather than an element-wise comparison, which would corrupt the counts.
    y_true = np.asarray(y_true)
    pred1 = np.asarray(pred1)
    pred2 = np.asarray(pred2)
    if not (y_true.shape == pred1.shape == pred2.shape):
        raise ValueError(
            "y_true, pred1 and pred2 must have the same shape, got "
            f"{y_true.shape}, {pred1.shape} and {pred2.shape}"
        )

    correct1 = pred1 == y_true
    correct2 = pred2 == y_true

    a = np.sum(correct1 & correct2)    # Both models correct
    b = np.sum(correct1 & ~correct2)   # Model 1 correct, Model 2 incorrect
    c = np.sum(~correct1 & correct2)   # Model 1 incorrect, Model 2 correct
    d = np.sum(~correct1 & ~correct2)  # Both models incorrect

    contingency_matrix = np.array([[a, b], [c, d]])
    return contingency_matrix

# Helper function to run McNemar's test on a 2x2 contingency table
def mcnemar_test(contingency_matrix, exact=False, correction=True):
    """Run statsmodels' McNemar test and return its statistic and p-value.

    Parameters:
    - contingency_matrix: 2x2 table as produced by contingency_matrics.
    - exact: forwarded to statsmodels. False (default, the original behaviour)
      uses the chi-squared approximation; True uses the exact binomial
      distribution, which is preferable when the discordant counts are small.
    - correction: forwarded to statsmodels. Applies a continuity correction
      when the chi-squared approximation is used (default True).

    Returns:
    - (statistic, p_value). Per the statsmodels documentation the statistic is
      the chi-squared value when exact is False and min(n1, n2) of the
      discordant counts when exact is True.
    """
    result = mcnemar(contingency_matrix, exact=exact, correction=correction)
    # Local names avoid `chi2`, which this notebook imports from
    # sklearn.feature_selection.
    statistic = result.statistic
    p_value = result.pvalue
    return statistic, p_value

def single_pairing_test(y_true, pred1, pred2, model_name1, model_name2, n_test):
    """
    Compare two models on the same test set using McNemar's test.

    Parameters:
    - y_true: Ground truth labels
    - pred1: Predictions from Model 1
    - pred2: Predictions from Model 2
    - model_name1: Display name of Model 1, used in the printed summary
    - model_name2: Display name of Model 2, used in the printed summary
    - n_test: Number of comparisons in the family; the 0.05 significance
      level is divided by it (Bonferroni correction). Must be >= 1.

    Returns:
    - (contingency table, chi-squared statistic, p-value). Also prints the
      table, the statistic, the p-value and whether H₀ (both models have the
      same error rate) is rejected at the corrected level.

    Raises:
    - ValueError: if n_test is smaller than 1.
    """
    if n_test < 1:
        raise ValueError(f"n_test must be at least 1, got {n_test}")

    cont_m = contingency_matrics(y_true, pred1, pred2)
    # Named `statistic` rather than `chi2` to avoid shadowing the sklearn import.
    statistic, p_value = mcnemar_test(cont_m)

    print(f'Contingency Table:\n{cont_m}')
    print(f'Chi-squared: {statistic}')
    print(f'p-value: {p_value}')
    if p_value < 0.05 / n_test:
        print("The difference in performance is statistically significant.\nReject the null hypothesis.\n")
        print(f"This means that {model_name1}'s accuracy is significantly different from {model_name2}'s accuracy.\n")
    else:
        # A non-significant result does not prove the null hypothesis, so the
        # message says "fail to reject" rather than "accept".
        print("There's no statistically significant difference in performance.\nFail to reject the null hypothesis.\n")
        print(f"This means that {model_name1}'s accuracy is not significantly different from {model_name2}'s accuracy.\n")

    return cont_m, statistic, p_value



In [63]:
# Print the metrics dict of every baseline model, grouped by feature type.
for section_title, section_entries in (
    ("Unigram", (
        ("Evaluation Naive Bayes", dict_mb1),
        ("Evaluation Logistic Regression", dict_l1),
        ("Evaluation Decision Tree", dict_ct1),
        ("Evaluation Random Forest", dict_rf1),
    )),
    ("Bigram", (
        ("Evaluation Naive Bayes", dict_mb2),
        ("Evaluation Logistic Regression", dict_l2),
        ("Evaluation Decision Tree", dict_ct2),
        ("Evaluation Random Forest", dict_rf2),
    )),
):
    print(section_title)
    for entry_heading, entry_metrics in section_entries:
        print(entry_heading)
        print(entry_metrics)


Unigram
Evaluation Naive Bayes
{'accuracy': 0.85, 'precision': np.float64(0.9117647058823529), 'recall': np.float64(0.775), 'f1': np.float64(0.8378378378378378)}
Evaluation Logistic Regression
{'accuracy': 0.775, 'precision': np.float64(0.8055555555555556), 'recall': np.float64(0.725), 'f1': np.float64(0.7631578947368421)}
Evaluation Decision Tree
{'accuracy': 0.625, 'precision': np.float64(0.5943396226415094), 'recall': np.float64(0.7875), 'f1': np.float64(0.6774193548387096)}
Evaluation Random Forest
{'accuracy': 0.8125, 'precision': np.float64(0.7906976744186046), 'recall': np.float64(0.85), 'f1': np.float64(0.8192771084337349)}
Bigram
Evaluation Naive Bayes
{'accuracy': 0.7625, 'precision': np.float64(0.8088235294117647), 'recall': np.float64(0.6875), 'f1': np.float64(0.7432432432432432)}
Evaluation Logistic Regression
{'accuracy': 0.65, 'precision': np.float64(0.6016949152542372), 'recall': np.float64(0.8875), 'f1': np.float64(0.7171717171717171)}
Evaluation Decision Tree
{'accura

**UNIGRAMS**

In [64]:
# McNemar tests for the unigram models: every pair among random forest,
# decision tree, logistic regression, multinomial NB and multinomial NB with
# chi-squared feature selection.
unigram_comparisons = (
    (y_pred_rf1, y_pred_ct1, 'Random Forest', 'Decision Tree'),
    (y_pred_rf1, y_pred_l1, 'Random Forest', 'Logistic Regression'),
    (y_pred_rf1, y_pred_mb1, 'Random Forest', 'Multinomial Naive Bayes'),
    (y_pred_rf1, y_pred_mbc2, 'Random Forest', 'Multinomial Naive Bayes chi_square'),
    (y_pred_ct1, y_pred_l1, 'Decision Tree', 'Logistic Regression'),
    (y_pred_ct1, y_pred_mb1, 'Decision Tree', 'Multinomial Naive Bayes'),
    (y_pred_ct1, y_pred_mbc2, 'Decision Tree', 'Multinomial Naive Bayes chi_square'),
    (y_pred_l1, y_pred_mb1, 'Logistic Regression', 'Multinomial Naive Bayes'),
    (y_pred_l1, y_pred_mbc2, 'Logistic Regression', 'Multinomial Naive Bayes chi_square'),
    (y_pred_mb1, y_pred_mbc2, 'Multinomial Naive Bayes', 'Multinomial Naive Bayes chi_square'),
)

# The Bonferroni divisor must equal the number of comparisons actually run.
# It was hard-coded to 8 although ten tests are performed (the bigram cell
# uses 10), so it is now derived from the list itself.
n_test = len(unigram_comparisons)

unigram_results = [
    single_pairing_test(y_test, first_pred, second_pred, first_name, second_name, n_test)
    for first_pred, second_pred, first_name, second_name in unigram_comparisons
]

# Unpack into the original variable names, in the same order as the
# comparisons above. NOTE(review): the `*_n_test*` names look like the result
# of a search-and-replace on a former "3" — kept unchanged in case later cells
# use them; confirm before renaming.
(
    (cont_m1, chi2_1, p_value_1),
    (cont_m2, chi2_2, p_value_2),
    (cont_mn_test, chi2_n_test, p_value_n_test),
    (cont_mn_test_1, chi2_n_test_1, p_value_n_test_1),
    (cont_m4, chi2_4, p_value_4),
    (cont_m5, chi2_5, p_value_5),
    (cont_m5_1, chi2_5_1, p_value_5_1),
    (cont_m6, chi2_6, p_value_6),
    (cont_m6_1, chi2_6_1, p_value_6_1),
    (cont_m7, chi2_7, p_value_7),
) = unigram_results


Contingency Table:
[[91 39]
 [ 9 21]]
Chi-squared: 17.520833333333332
p-value: 2.8417667670336965e-05
The difference in performance is statistically significant.
Reject the null hypothesis.

This means that Random Forest's accuracy is significantly different from Decision Tree's accuracy.

Contingency Table:
[[109  21]
 [ 15  15]]
Chi-squared: 0.6944444444444444
p-value: 0.40465676192728617
There's no statistically significant difference in performance.
Accept the null hypothesis.

This means that Random Forest's accuracy is no significantly different from Logistic Regression's accuracy.

Contingency Table:
[[118  12]
 [ 18  12]]
Chi-squared: 0.8333333333333334
p-value: 0.3613104285261789
There's no statistically significant difference in performance.
Accept the null hypothesis.

This means that Random Forest's accuracy is no significantly different from Multinomial Naive Bayes's accuracy.

Contingency Table:
[[115  15]
 [ 18  12]]
Chi-squared: 0.12121212121212122
p-value: 0.7277235466

**BIGRAMS**

In [65]:
# McNemar tests for the bigram models: ten pairwise comparisons, so the
# divisor handed to single_pairing_test for its significance threshold is 10.
n_test = 10
bigram_comparisons = (
    (y_pred_rf2, y_pred_ct2, 'Random Forest', 'Decision Tree'),
    (y_pred_rf2, y_pred_l2, 'Random Forest', 'Logistic Regression'),
    (y_pred_rf2, y_pred_mb2, 'Random Forest', 'Multinomial Naive Bayes'),
    (y_pred_rf2, y_pred_mbc22, 'Random Forest', 'Multinomial Naive Bayes chi_square'),
    (y_pred_ct2, y_pred_l2, 'Decision Tree', 'Logistic Regression'),
    (y_pred_ct2, y_pred_mb2, 'Decision Tree', 'Multinomial Naive Bayes'),
    (y_pred_ct2, y_pred_mbc22, 'Decision Tree', 'Multinomial Naive Bayes chi_square'),
    (y_pred_l2, y_pred_mb2, 'Logistic Regression', 'Multinomial Naive Bayes'),
    (y_pred_l2, y_pred_mbc22, 'Logistic Regression', 'Multinomial Naive Bayes chi_square'),
    (y_pred_mb2, y_pred_mbc22, 'Multinomial Naive Bayes', 'Multinomial Naive Bayes chi_square'),
)

# Run the tests in the listed order (each call prints its own report).
bigram_results = [
    single_pairing_test(y_test, first_pred, second_pred, first_name, second_name, n_test)
    for first_pred, second_pred, first_name, second_name in bigram_comparisons
]

# Unpack into the same variable names the cell has always assigned; these
# overwrite the values stored by the unigram comparison cell.
(
    (cont_m1, chi2_1, p_value_1),
    (cont_m2, chi2_2, p_value_2),
    (cont_mn_test, chi2_n_test, p_value_n_test),
    (cont_mn_test_1, chi2_n_test_1, p_value_n_test_1),
    (cont_m4, chi2_4, p_value_4),
    (cont_m5, chi2_5, p_value_5),
    (cont_m5_1, chi2_5_1, p_value_5_1),
    (cont_m6, chi2_6, p_value_6),
    (cont_m6_1, chi2_6_1, p_value_6_1),
    (cont_m7, chi2_7, p_value_7),
) = bigram_results


Contingency Table:
[[93 21]
 [ 4 42]]
Chi-squared: 10.24
p-value: 0.0013742758758316976
The difference in performance is statistically significant.
Reject the null hypothesis.

This means that Random Forest's accuracy is significantly different from Decision Tree's accuracy.

Contingency Table:
[[96 18]
 [ 8 38]]
Chi-squared: 3.1153846153846154
p-value: 0.0775561667436654
There's no statistically significant difference in performance.
Accept the null hypothesis.

This means that Random Forest's accuracy is no significantly different from Logistic Regression's accuracy.

Contingency Table:
[[92 22]
 [30 16]]
Chi-squared: 0.9423076923076923
p-value: 0.33168506805685966
There's no statistically significant difference in performance.
Accept the null hypothesis.

This means that Random Forest's accuracy is no significantly different from Multinomial Naive Bayes's accuracy.

Contingency Table:
[[83 31]
 [27 19]]
Chi-squared: 0.15517241379310345
p-value: 0.6936406217837585
There's no statisti