# Imports

In [74]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, make_scorer
import pandas as pd 
import string
import re
import numpy as np
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.inspection import permutation_importance


# Unigram

## Dataset Preprocessing

In [75]:
traindf = pd.read_csv("train_reviews.csv")
trainsentences = traindf["Review"].values
trainlabels = traindf["Label"].values
testdf = pd.read_csv("test_reviews.csv")
testsentences = testdf["Review"].values
testlabels = testdf["Label"].values 

with open("refined_english_stopwords.txt", "r") as file:
    stop_words = set(file.read().splitlines())


def preprocess_text(text):
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = text.lower()
    text = re.sub(r'\d+', '', text) 
    words = text.split()   
    words = [word for word in words if word not in stop_words]    
    text = ' '.join(words).strip()
    
    return text

train_sentences_preprocessed = [preprocess_text(sentence) for sentence in trainsentences]
test_sentences_preprocessed = [preprocess_text(sentence) for sentence in testsentences]
train_df_preprocessed = pd.DataFrame({'Review': train_sentences_preprocessed,'Label': trainlabels})
test_df_preprocessed = pd.DataFrame({'Review': test_sentences_preprocessed,'Label': testlabels})

In [76]:
trainsentences = train_df_preprocessed["Review"].values
y_train = train_df_preprocessed["Label"].values

testsentences = test_df_preprocessed["Review"].values
y_test = test_df_preprocessed["Label"].values

# vectorizing
vectorizer = CountVectorizer(ngram_range=(1,1))
 # ngram range for specifying unigrams and bigrams (1,1) - unigram, (2,2) - bigram, (1,2) - both
X_train = vectorizer.fit_transform(trainsentences)
X_test = vectorizer.transform(testsentences)


In [77]:
def evaluate_model(y_pred, y_test):#, exclude_sentiment=False):
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average='macro')
    recall = recall_score(y_test, y_pred, average='macro')
    f1 = f1_score(y_test, y_pred, average='macro')
    dict={'accuracy':accuracy, 'precision':precision, 'recall':recall, 'f1':f1}
    return dict

In [78]:
from sklearn.metrics import accuracy_score, precision_score, recall_score
from statsmodels.stats.contingency_tables import mcnemar

#let's remember that we have computed the predictions for the four models:
# multinomial_naive_bayes, logistic_regression, decision_tree, random_forest

def contingency_matrics(y_true, pred1, pred2):
    # Contingency Table
    # Here, we compare both model predictions, not against y_true but against each other
    a = np.sum((pred1 == y_true) & (pred2 == y_true))  # Both models correct
    b = np.sum((pred1 == y_true) & (pred2 != y_true))  # Model 1 correct, Model 2 incorrect
    c = np.sum((pred1 != y_true) & (pred2 == y_true))  # Model 1 incorrect, Model 2 correct
    d = np.sum((pred1 != y_true) & (pred2 != y_true))  # Both models incorrect

    # Print the contingency table
    contingency_matrix = np.array([[a, b], [c, d]])
    return contingency_matrix

# Helper function to perform McNemar's test and print the contingency table
def mcnemar_test(contingency_matrix):
    
    # Perform McNemar's test (you could also try exact=True for small samples)
    mcnemar_r = mcnemar(contingency_matrix, exact=False, correction=True)
    chi2 = mcnemar_r.statistic  # Access the test statistic
    p_value = mcnemar_r.pvalue  # Access the p-value
    return chi2, p_value

def single_pairing_test(y_true, pred1, pred2, model_name1, model_name2):
    """
    Compare three models pairwise using McNemar's test.

    Parameters:
    - y_true: Ground truth labels --> column 'post' from the dataset
    - pred1: Predictions from Model 1 
    - pred2: Predictions from Model 2

    Returns:
    - chi-squared statistics and p-values for pairwise comparisons, including a print statement showing if H₀ is accepted or rejected.
    """
    cont_m=contingency_matrics(y_true, pred1, pred2)
    chi2, p_value = mcnemar_test(cont_m)
    print(f'Contingency Table:\n{cont_m}')
    print(f'Chi-squared: {chi2}')
    print(f'p-value: {p_value}')
    if p_value < (0.05/4):
        print("The difference in performance is statistically significant.\nReject the null hypothesis.\n")
        print(f"This means that {model_name1}'s accuracy is significantly different from {model_name2}'s accuracy.")
    else:
        print("There's no statistically significant difference in performance.\nAccept the null hypothesis.\n")
        print(f"This means that {model_name1}'s accuracy is no significantly different from {model_name2}'s accuracy.")
    
    return cont_m, chi2, p_value

## Multinomial naive Bayes

### Chi squared test for feature selection

In [79]:
# Perform Chi-squared feature selection
chi2_selector = SelectKBest(chi2, k=2000) #2000 features perform best
X_train_chi2 = chi2_selector.fit_transform(X_train, y_train)
#print(X_train_chi2.shape)
X_test_chi2 = chi2_selector.transform(X_test)

# Train a Multinomial Naive Bayes model
nb_model = MultinomialNB()
nb_model.fit(X_train_chi2, y_train)

# Predict on the test set
y_pred_chi2 = nb_model.predict(X_test_chi2)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred_chi2)

# Get the feature names for the selected features
selected_feature_indices = chi2_selector.get_support(indices=True)
selected_feature_names = vectorizer.get_feature_names_out()[selected_feature_indices]

print(f"Selected Features: {selected_feature_names}")
print(f"Test Set Accuracy: {accuracy}")

Selected Features: ['abassador' 'accept' 'accepted' ... 'yunan' 'zone' 'zoo']
Test Set Accuracy: 0.86875


### Train and accuracy

In [80]:
# fit the model
multinomial_naive_bayes = MultinomialNB()
multinomial_naive_bayes.fit(X_train, y_train)
# predictions
y_pred_bayes = multinomial_naive_bayes.predict(X_test)
dict_bayes=evaluate_model(y_pred_bayes, y_test)
print(dict_bayes)

{'accuracy': 0.86875, 'precision': 0.8734768159518911, 'recall': 0.86875, 'f1': 0.8683333986441475}


### Feature importance

In [81]:
# Get log probabilities
log_probs = multinomial_naive_bayes.feature_log_prob_

feature_names = vectorizer.get_feature_names_out()
print(len(log_probs[0]))
# Create DataFrames for each class
deceptive_features = pd.DataFrame({'Feature': feature_names, 'Log Prob': log_probs[1]})
genuine_features = pd.DataFrame({'Feature': feature_names, 'Log Prob': log_probs[0]})

# Sort by log probabilities
deceptive_features = deceptive_features.sort_values(by='Log Prob', ascending=False)
genuine_features = genuine_features.sort_values(by='Log Prob', ascending=False)

# Print the top 10 features for each class
print("Top features for Deceptive Reviews:")
print(deceptive_features.head())

print("\nTop features for Genuine Reviews:")
print(genuine_features.head())

6967
Top features for Deceptive Reviews:
     Feature  Log Prob
6758      we -3.908500
5141    room -4.044659
2955   hotel -4.122824
4078     not -4.218297
6170    they -4.402409

Top features for Genuine Reviews:
      Feature  Log Prob
5141     room -3.832078
2955    hotel -3.911186
6758       we -3.919316
4078      not -4.107368
1020  chicago -4.597608


### Cross validation

In [82]:
scores = cross_val_score(multinomial_naive_bayes, X_test, y_test, cv=10, scoring = "accuracy") #try f1_macro
print("accuracy for splits: {}".format(scores))
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

accuracy for splits: [1.     0.875  0.875  0.75   0.8125 0.875  0.8125 0.875  0.875  0.875 ]
0.86 accuracy with a standard deviation of 0.06


### Gridsearch

In [83]:
# parameters
param_grid = {
    'alpha': [0.1, 0.5, 1.0, 1.5, 2.0],  # Laplace smoothing parameter
    'fit_prior': [True, False]
}
multinomial_naive_bayes = MultinomialNB()

# Set up GridSearchCV
grid_search = GridSearchCV(multinomial_naive_bayes, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Best parameters and score
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Score:", grid_search.best_score_)

# Evaluate on test set
test_score = grid_search.score(X_test, y_test)
print("Test Set Score:", test_score)

Best Parameters: {'alpha': 1.0, 'fit_prior': True}
Best Cross-Validation Score: 0.846875
Test Set Score: 0.86875


## Logistic Regression

### Train and accuracy

In [84]:
# fit the model
logistic_regression = LogisticRegression(penalty='l1', solver='liblinear', C=1.0)
logistic_regression.fit(X_train, y_train)
# predictions
y_pred_log = logistic_regression.predict(X_test)
dict_log=evaluate_model(y_pred_log, y_test)
print(dict_log)

{'accuracy': 0.80625, 'precision': 0.8074509803921568, 'recall': 0.80625, 'f1': 0.8060606060606061}


### Feature importance

In [85]:
# Get the coefficients
coefficients = logistic_regression.coef_[0]
feature_names = vectorizer.get_feature_names_out()

# Create a DataFrame for better visualization
coef_df = pd.DataFrame({'feature': feature_names, 'coefficient': coefficients})

# Separate features for genuine and deceptive reviews
genuine_features = coef_df[coef_df['coefficient'] > 0].sort_values(by='coefficient', ascending=False)
deceptive_features = coef_df[coef_df['coefficient'] < 0].sort_values(by='coefficient')

# Display the results
print("Important Features for Genuine Reviews:")
print(genuine_features.head())

print("\nImportant Features for Deceptive Reviews:")
print(deceptive_features.head())

Important Features for Genuine Reviews:
         feature  coefficient
5783        star     1.949962
6897       world     1.375677
5876      street     1.318635
6382         try     1.156914
1271  conference     1.129812

Important Features for Deceptive Reviews:
      feature  coefficient
4624   prices    -1.744230
2327  finally    -1.699424
1566  decided    -1.381841
1020  chicago    -1.325233
3622   luxury    -1.235582


### Gridsearch

In [86]:
# parameters
param_grid = {
    'C': [0.001, 0.01, 0.1, 1.0, 10]
}

logistic_regression = LogisticRegression(penalty='l1', solver='liblinear', C=1.0)

# Set up GridSearchCV
grid_search = GridSearchCV(logistic_regression, param_grid, cv=10, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Best parameters and score
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Score:", grid_search.best_score_)

# Evaluate on test set
test_score = grid_search.score(X_test, y_test)
print("Test Set Score:", test_score)

Best Parameters: {'C': 10}
Best Cross-Validation Score: 0.803125
Test Set Score: 0.76875


## Classification Trees

### Train and accuracy

In [87]:
# fit the model
decision_tree = DecisionTreeClassifier(criterion='gini', max_depth=None, random_state=42)
decision_tree.fit(X_train, y_train)
# predictions
y_pred_ct = decision_tree.predict(X_test)
dict_ct=evaluate_model(y_pred_ct, y_test)
print(dict_ct)

{'accuracy': 0.625, 'precision': 0.626984126984127, 'recall': 0.625, 'f1': 0.6235294117647059}


### Feature Importance

In [88]:
# Get feature importances
importances = decision_tree.feature_importances_
feature_names = vectorizer.get_feature_names_out()

# Create a DataFrame for better visualization
importance_df = pd.DataFrame({'feature': feature_names, 'importance': importances})
importance_df = importance_df.sort_values(by='importance', ascending=False)

# Get the top 5 important features for genuine reviews
top_5_genuine_review_terms = importance_df.head(5)  # or use .head(5) based on your label encoding
print(top_5_genuine_review_terms)

# Get the top 5 important features for deceptive reviews
top_5_genuine_review_terms = importance_df.tail(5)  # or use .head(5) based on your label encoding
print("\n", top_5_genuine_review_terms)

      feature  importance
1020  chicago    0.192117
1358     cool    0.031134
5418   sheets    0.031132
5839    still    0.028451
3676     many    0.026788

        feature  importance
2333    finger         0.0
2332    finest         0.0
2331  finejust         0.0
2330      fine         0.0
6966       zoo         0.0


In [89]:
# Convert the sparse matrix X_test to a dense format
X_test_dense = X_test.toarray()
perm_importance = permutation_importance(decision_tree, X_test_dense, y_test, n_repeats=10, random_state=42)

feature_names = vectorizer.get_feature_names_out()
# Display the importance scores for the positive class
importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': perm_importance.importances_mean
}).sort_values(by='Importance', ascending=False)

print("\nPermutation Importance genuine:")
print(importance_df.head())

print("\nPermutation Importance deceptive:")
print(importance_df.tail())


Permutation Importance genuine:
       Feature  Importance
1020   chicago    0.081250
3676      many    0.018750
3794  michigan    0.015000
1358      cool    0.011875
6862    within    0.010625

Permutation Importance deceptive:
     Feature  Importance
4025   night   -0.004375
4049     non   -0.006875
1468  crumbs   -0.007500
348   arrive   -0.011250
4358     pay   -0.014375


: 

### Gridsearch

In [None]:
# parameters
param_grid = {
    'criterion': ['gini', 'entropy'],  # Function to measure the quality of a split
    'max_depth': [None, 2, 4, 6, 8, 10],  # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4, 6],  # Minimum number of samples required to be at a leaf node
    'ccp_alpha': np.linspace(0, 0.1, 11) 
}


decision_tree = DecisionTreeClassifier(criterion='gini', max_depth=None, random_state=42)

# Set up GridSearchCV
grid_search = GridSearchCV(decision_tree, param_grid, cv=10, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Best parameters and score
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Score:", grid_search.best_score_)

# Evaluate on test set
test_score = grid_search.score(X_test, y_test)
print("Test Set Score:", test_score)

## Random Forest

### Train and accuracy

In [223]:
# fit the model
random_forest = RandomForestClassifier(criterion = 'gini', max_depth = None, random_state = 42)
random_forest.fit(X_train, y_train)
# predictions
y_pred_rf = random_forest.predict(X_test)
dict_rf=evaluate_model(y_pred_rf, y_test)

Accuracy: 0.8000
Precision: 0.8000
Recall: 0.8000
F1 Score: 0.8000


### Feature Importance

In [206]:
# Get feature importances
importances = random_forest.feature_importances_
feature_names = vectorizer.get_feature_names_out()

# Create a DataFrame for better visualization
importance_df = pd.DataFrame({'feature': feature_names, 'importance': importances})
importance_df = importance_df.sort_values(by='importance', ascending=False)

# Get the top 5 important features for genuine reviews
top_5_genuine_review_terms = importance_df.head(5)  # or use .head(5) based on your label encoding
print(top_5_genuine_review_terms)

         feature  importance
1020     chicago    0.030663
3544    location    0.010557
5585       smell    0.007676
2180  experience    0.007068
2677       great    0.006658


In [None]:
# X_train = X_train.toarray() 
# X_test = X_test.toarray()

In [221]:
perm_importance = permutation_importance(random_forest, X_test, y_test, n_repeats=10, random_state=42)

feature_names = vectorizer.get_feature_names_out()
# Display the importance scores for the positive class
importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': perm_importance.importances_mean
}).sort_values(by='Importance', ascending=False)

print("\nPermutation Importance genuine:")
print(importance_df.head())

print("\nPermutation Importance deceptive:")
print(importance_df.tail())


Permutation Importance genuine:
       Feature  Importance
2005  elevator     0.02375
1020   chicago     0.02250
4617    prices     0.02000
3544  location     0.01625
5580     small     0.01375

Permutation Importance deceptive:
      Feature  Importance
3485     like    -0.01125
349   arrived    -0.01125
6319   travel    -0.01250
4372   people    -0.01250
2501    front    -0.01500


### Gridsearch

In [16]:
# parameters
param_grid = {
    'n_estimators': [50, 100],  # Number of trees in the forest
    'max_features': [ 'sqrt', 'log2'],  # Number of features to consider for the best split
    'max_depth': [None, 10, 20],  # Maximum depth of the tree
    'min_samples_split': [2, 5],  # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2],  # Minimum number of samples required to be at a leaf node
}

random_forest = RandomForestClassifier(criterion = 'gini', max_depth = None, random_state = 42)

# Set up GridSearchCV
grid_search = GridSearchCV(random_forest, param_grid, cv=10, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Best parameters and score
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Score:", grid_search.best_score_)

# Evaluate on test set
test_score = grid_search.score(X_test, y_test)
print("Test Set Score:", test_score)

Best Parameters: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 100}
Best Cross-Validation Score: 0.8359375
Test Set Score: 0.8125


### OOB score

In [54]:
from sklearn.model_selection import ParameterGrid

# parameters
param_grid = {
    'n_estimators': [50, 100],  # Number of trees in the forest
    'max_features': [ 'sqrt', 'log2'],  # Number of features to consider for the best split
    'max_depth': [None, 10, 20],  # Maximum depth of the tree
    'min_samples_split': [2, 5],  # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2],  # Minimum number of samples required to be at a leaf node
    'ccp_alpha': np.linspace(0, 0.1, 11) 
}


# Store results
results = []

# Loop through hyperparameter combinations
for params in ParameterGrid(param_grid):
    model = RandomForestClassifier(oob_score=True, random_state=42, **params)
    model.fit(X_train, y_train)
    
    # Record OOB score
    oob_score = model.oob_score_
    results.append({**params, 'oob_score': oob_score})

# Convert results to DataFrame for easier analysis
results_df = pd.DataFrame(results)

# Find the best hyperparameters based on OOB score
best_params = results_df.loc[results_df['oob_score'].idxmax()]
print("Best Hyperparameters:\n", best_params)

Best Hyperparameters:
 max_depth               10.0
max_features            sqrt
min_samples_leaf           2
min_samples_split          5
n_estimators             100
oob_score            0.83125
Name: 23, dtype: object


# Bigram

## Dataset Preprocessing

In [17]:
traindf = pd.read_csv("train_reviews.csv")
trainsentences = traindf["Review"].values
trainlabels = traindf["Label"].values
testdf = pd.read_csv("test_reviews.csv")
testsentences = testdf["Review"].values
testlabels = testdf["Label"].values 

with open("english", "r") as file:
    stop_words = set(file.read().splitlines())


def preprocess_text(text):
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = text.lower()
    text = re.sub(r'\d+', '', text) 
    words = text.split()   
    words = [word for word in words if word not in stop_words]    
    text = ' '.join(words).strip()
    
    return text

train_sentences_preprocessed = [preprocess_text(sentence) for sentence in trainsentences]
test_sentences_preprocessed = [preprocess_text(sentence) for sentence in testsentences]
train_df_preprocessed = pd.DataFrame({'Review': train_sentences_preprocessed,'Label': trainlabels})
test_df_preprocessed = pd.DataFrame({'Review': test_sentences_preprocessed,'Label': testlabels})

In [18]:
trainsentences = train_df_preprocessed["Review"].values
y_train = train_df_preprocessed["Label"].values

testsentences = test_df_preprocessed["Review"].values
y_test = test_df_preprocessed["Label"].values

# vectorizing
vectorizer = CountVectorizer(ngram_range=(2,2)) # ngram range for specifying unigrams and bigrams (1,1) - unigram, (2,2) - bigram, (1,2) - both
X_train = vectorizer.fit_transform(trainsentences)
X_test = vectorizer.transform(testsentences)

In [19]:
for i in y_test:
    if i != "Negative Deceptive" and i != "Negative Truthful":
        print(i)

## Multinomial naive Bayes

### Train and accuracy

In [20]:
# fit the model
multinomial_naive_bayes = MultinomialNB()
multinomial_naive_bayes.fit(X_train, y_train)
# predictions
y_pred = multinomial_naive_bayes.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

Accuracy: 0.7688
Precision: 0.7785
Recall: 0.7688
F1 Score: 0.7667


### Cross validation

In [21]:
scores = cross_val_score(multinomial_naive_bayes, X_test, y_test, cv=10, scoring = "accuracy") #try f1_macro
print("accuracy for splits: {}".format(scores))
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))

accuracy for splits: [0.8125 0.5625 0.6875 0.5625 0.625  0.625  0.875  0.5625 0.6875 0.8125]
0.68 accuracy with a standard deviation of 0.11


### Gridsearch

In [22]:
# parameters
param_grid = {
    'alpha': [0.1, 0.5, 1.0, 1.5, 2.0],  # Laplace smoothing parameter
    'fit_prior': [True, False]
}
multinomial_naive_bayes = MultinomialNB()

# Set up GridSearchCV
grid_search = GridSearchCV(multinomial_naive_bayes, param_grid, cv=10, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Best parameters and score
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Score:", grid_search.best_score_)

# Evaluate on test set
test_score = grid_search.score(X_test, y_test)
print("Test Set Score:", test_score)

Best Parameters: {'alpha': 0.1, 'fit_prior': True}
Best Cross-Validation Score: 0.7515625
Test Set Score: 0.7625


## Logistic Regression

### Train and accuracy

In [23]:
# fit the model
logistic_regression = LogisticRegression(penalty='l1', solver='liblinear', C=1.0)
logistic_regression.fit(X_train, y_train)
# predictions
y_pred = logistic_regression.predict(X_test)
accuracy = accuracy_score(y_test ,y_pred)
precision = precision_score(y_test ,y_pred,average='weighted')
recall = recall_score(y_test ,y_pred,average='weighted')
f1 = f1_score(y_test, y_pred,average='weighted')

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

Accuracy: 0.6625
Precision: 0.7098
Recall: 0.6625
F1 Score: 0.6423


### Feature importance

In [24]:
# Get the coefficients
coefficients = logistic_regression.coef_[0]
feature_names = vectorizer.get_feature_names_out()

# Create a DataFrame for better visualization
coef_df = pd.DataFrame({'feature': feature_names, 'coefficient': coefficients})

# Separate features for genuine and deceptive reviews
genuine_features = coef_df[coef_df['coefficient'] > 0].sort_values(by='coefficient', ascending=False)
deceptive_features = coef_df[coef_df['coefficient'] < 0].sort_values(by='coefficient')

# Display the results
print("Important Features for Genuine Reviews:")
print(genuine_features)

print("\nImportant Features for Deceptive Reviews:")
print(deceptive_features)

Important Features for Genuine Reviews:
               feature  coefficient
36694         th floor     2.482137
4121      booked hotel     1.818765
22268       many times     1.546209
21175   location worth     1.228648
23457      much better     1.122411
...                ...          ...
22699  michigan avenue     0.030777
17202        hot water     0.021508
11003      duvet cover     0.016028
25446        one night     0.007881
17687       hotel room     0.004241

[72 rows x 2 columns]

Important Features for Deceptive Reviews:
                  feature   coefficient
6315   chicago millennium -3.247623e+00
11070          east hotel -2.298146e+00
21711        luxury hotel -2.260378e+00
17292       hotel chicago -2.242835e+00
7896       conrad chicago -2.175113e+00
...                   ...           ...
9534           desk clerk -3.280306e-02
40527         waste money -2.084544e-02
2690            back room -1.165412e-02
17498          hotel last -4.714746e-03
31953          seem pa

### Gridsearch

In [25]:
# parameters
param_grid = {
    'C': [0.001, 0.01, 0.1, 1.0, 10]
}

logistic_regression = LogisticRegression(penalty='l1', solver='liblinear', C=1.0)

# Set up GridSearchCV
grid_search = GridSearchCV(logistic_regression, param_grid, cv=10, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Best parameters and score
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Score:", grid_search.best_score_)

# Evaluate on test set
test_score = grid_search.score(X_test, y_test)
print("Test Set Score:", test_score)

Best Parameters: {'C': 10}
Best Cross-Validation Score: 0.7390625
Test Set Score: 0.65


## Classification Trees

### Train and accuracy

In [26]:
# fit the model
decision_tree = DecisionTreeClassifier(criterion='gini', max_depth=None, random_state=42)
decision_tree.fit(X_train, y_train)
# predictions
y_pred = decision_tree.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

Accuracy: 0.6500
Precision: 0.6881
Recall: 0.6500
F1 Score: 0.6313


### Feature Importance

In [27]:
# Get feature importances
importances = decision_tree.feature_importances_
feature_names = vectorizer.get_feature_names_out()

# Create a DataFrame for better visualization
importance_df = pd.DataFrame({'feature': feature_names, 'importance': importances})
importance_df = importance_df.sort_values(by='importance', ascending=False)

# Get the top 5 important features for genuine reviews
top_5_genuine_review_terms = importance_df.head(5)  # or use .head(5) based on your label encoding
print(top_5_genuine_review_terms)

# Get the top 5 important features for deceptive reviews
top_5_genuine_review_terms = importance_df.tail(5)  # or use .head(5) based on your label encoding
print("\n", top_5_genuine_review_terms)

                  feature  importance
6264        chicago hotel    0.052528
17292       hotel chicago    0.052329
6315   chicago millennium    0.035386
21711        luxury hotel    0.028640
11070          east hotel    0.025242

                 feature  importance
14159     forward using         0.0
14160  forward vacation         0.0
14161      forward view         0.0
14162      forward wifi         0.0
42435        zoo second         0.0


### Gridsearch

In [28]:
# parameters
param_grid = {
    'criterion': ['gini', 'entropy'],  # Function to measure the quality of a split
    'max_depth': [None, 2, 4, 6, 8, 10],  # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4, 6]  # Minimum number of samples required to be at a leaf node
}


decision_tree = DecisionTreeClassifier(criterion='gini', max_depth=None, random_state=42)

# Set up GridSearchCV
grid_search = GridSearchCV(decision_tree, param_grid, cv=10, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Best parameters and score
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Score:", grid_search.best_score_)

# Evaluate on test set
test_score = grid_search.score(X_test, y_test)
print("Test Set Score:", test_score)

Best Parameters: {'criterion': 'entropy', 'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 2}
Best Cross-Validation Score: 0.7171875
Test Set Score: 0.6375


## Random Forest

### Train and accuracy

In [29]:
# fit the model
random_forest = RandomForestClassifier(criterion = 'gini', max_depth = None, random_state = 42)
random_forest.fit(X_train, y_train)
# predictions
y_pred = random_forest.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average = 'weighted')
recall = recall_score(y_test, y_pred, average = 'weighted')
f1 = f1_score(y_test, y_pred,average = 'weighted')

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

Accuracy: 0.5938
Precision: 0.7055
Recall: 0.5938
F1 Score: 0.5298


### Feature Importance

In [30]:
# Get feature importances
importances = random_forest.feature_importances_
feature_names = vectorizer.get_feature_names_out()

# Create a DataFrame for better visualization
importance_df = pd.DataFrame({'feature': feature_names, 'importance': importances})
importance_df = importance_df.sort_values(by='importance', ascending=False)

# Get the top 5 important features for genuine reviews
top_5_genuine_review_terms = importance_df.head(5)  # or use .head(5) based on your label encoding
print(top_5_genuine_review_terms)

                feature  importance
17292     hotel chicago    0.011261
6264      chicago hotel    0.010452
29003   recently stayed    0.008170
32746  sheraton chicago    0.004932
7896     conrad chicago    0.004806


### Gridsearch

In [31]:
# parameters
param_grid = {
    'n_estimators': [50, 100],  # Number of trees in the forest
    'max_features': [ 'sqrt', 'log2'],  # Number of features to consider for the best split
    'max_depth': [None, 10, 20],  # Maximum depth of the tree
    'min_samples_split': [2, 5],  # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2]  # Minimum number of samples required to be at a leaf node
}

random_forest = RandomForestClassifier(criterion = 'gini', max_depth = None, random_state = 42)

# Set up GridSearchCV
grid_search = GridSearchCV(random_forest, param_grid, cv=10, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Best parameters and score
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Score:", grid_search.best_score_)

# Evaluate on test set
test_score = grid_search.score(X_test, y_test)
print("Test Set Score:", test_score)

Best Parameters: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 5, 'n_estimators': 100}
Best Cross-Validation Score: 0.7625
Test Set Score: 0.68125


Statystical Analysis