# Imports

In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, make_scorer
import pandas as pd 
import string
import re
import numpy as np
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.inspection import permutation_importance
from sklearn.model_selection import ParameterGrid
from sklearn.preprocessing import LabelEncoder
from scipy.stats import lognorm
import string
import re
from sklearn.preprocessing import LabelEncoder


# Unigram

## Dataset Preprocessing

In [2]:
traindf = pd.read_csv("train_reviews.csv")
trainsentences = traindf["Review"].values
trainlabels = traindf["Label"].values
test_df = pd.read_csv("test_reviews.csv")
testsentences = test_df["Review"].values
testlabels = test_df["Label"].values 

with open("english", "r") as file:
    stop_words = set(file.read().splitlines())


def preprocess_text(text):
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = text.lower()
    text = re.sub(r'\d+', '', text) 
    words = text.split()   
    words = [word for word in words if word not in stop_words]    
    text = ' '.join(words).strip()
    
    return text

train_sentences_preprocessed = [preprocess_text(sentence) for sentence in trainsentences]
test_sentences_preprocessed = [preprocess_text(sentence) for sentence in testsentences]
train_df_preprocessed = pd.DataFrame({'Review': train_sentences_preprocessed,'Label': trainlabels})
test_df_preprocessed = pd.DataFrame({'Review': test_sentences_preprocessed,'Label': testlabels})

In [3]:
trainsentences = train_df_preprocessed["Review"].values
y_train = train_df_preprocessed["Label"].values

testsentences = test_df_preprocessed["Review"].values
y_test = test_df_preprocessed["Label"].values

# Encode labels as integers
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

# vectorizing
vectorizer = CountVectorizer(ngram_range=(1,1)) # ngram range for specifying unigrams and bigrams (1,1) - unigram, (2,2) - bigram, (1,2) - both
X_train = vectorizer.fit_transform(trainsentences)
X_test = vectorizer.transform(testsentences)

## Multinomial naive Bayes

### Train and accuracy without feature selection

In [22]:
# parameters
param_grid = {
    'alpha': [0.1, 0.5, 1.0, 1.5, 2.0],  # Laplace smoothing parameter
    'fit_prior': [True, False]
}
multinomial_naive_bayes = MultinomialNB()

# Set up GridSearchCV
grid_search = GridSearchCV(multinomial_naive_bayes, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Best parameters and score
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Score:", grid_search.best_score_)

# Evaluate on test set
test_score = grid_search.score(X_test, y_test)
print("Test Set Score:", test_score)

Best Parameters: {'alpha': 1.5, 'fit_prior': True}
Best Cross-Validation Score: 0.834375
Test Set Score: 0.88125


In [4]:
# fit the model
multinomial_naive_bayes = MultinomialNB( alpha = 1.5, fit_prior = True)
multinomial_naive_bayes.fit(X_train, y_train)
# predictions
y_pred = multinomial_naive_bayes.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='binary')
recall = recall_score(y_test, y_pred, average='binary')
f1 = f1_score(y_test, y_pred, average='binary')

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

Accuracy: 0.8812
Precision: 0.9296
Recall: 0.8250
F1 Score: 0.8742


### Train and accuracy with chi squared test for feature selection

In [6]:
# Perform Chi-squared feature selection
chi2_selector = SelectKBest(chi2, k=2000) #2000 features perform best
X_train_chi2 = chi2_selector.fit_transform(X_train, y_train)
X_test_chi2 = chi2_selector.transform(X_test)

# Train a Multinomial Naive Bayes model
nb_model = MultinomialNB()
nb_model.fit(X_train_chi2, y_train)

# Predict on the test set
y_pred = nb_model.predict(X_test_chi2)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)

# Get the feature names for the selected features
selected_feature_indices = chi2_selector.get_support(indices=True)
selected_feature_names = vectorizer.get_feature_names_out()[selected_feature_indices]
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"Selected Features: {selected_feature_names}")
print(f"Test Set Accuracy: {accuracy}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

Selected Features: ['abassador' 'accept' 'accepted' ... 'yunan' 'zone' 'zoo']
Test Set Accuracy: 0.8875
Precision: 0.8885
Recall: 0.8875
F1 Score: 0.8874


In [25]:
# parameters
param_grid = {
    'alpha': [0.1, 0.5, 1.0, 1.5, 2.0],  # Laplace smoothing parameter
    'fit_prior': [True, False]
}
multinomial_naive_bayes = MultinomialNB()

# Set up GridSearchCV
grid_search = GridSearchCV(multinomial_naive_bayes, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_chi2, y_train)

# Best parameters and score
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Score:", grid_search.best_score_)

# Evaluate on test set
test_score = grid_search.score(X_test_chi2, y_test)
print("Test Set Score:", test_score)

Best Parameters: {'alpha': 0.1, 'fit_prior': True}
Best Cross-Validation Score: 0.93125
Test Set Score: 0.825


In [7]:
# fit the model
multinomial_naive_bayes = MultinomialNB( alpha = 0.1, fit_prior = True)
multinomial_naive_bayes.fit(X_train_chi2, y_train)
# predictions
y_pred = multinomial_naive_bayes.predict(X_test_chi2)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

Accuracy: 0.8250
Precision: 0.8252
Recall: 0.8250
F1 Score: 0.8250


### Feature importance

In [27]:
# Get log probabilities
log_probs = multinomial_naive_bayes.feature_log_prob_

feature_names = chi2_selector.get_feature_names_out()

# Calculate log-odds ratio (Class 1 log-probs minus Class 0 log-probs)
log_odds = log_probs[1, :] - log_probs[0, :]  # Class 1 (Genuine) vs Class 0 (Deceptive)

# Convert to DataFrame
log_odds_df = pd.DataFrame(log_odds, index=feature_names, columns=['log_odds'])

# Sort by log-odds
deceptive_features = log_odds_df.sort_values(by='log_odds', ascending=True).head()  # More negative log-odds -> Deceptive
genuine_features = log_odds_df.sort_values(by='log_odds', ascending=False).head()  # More positive log-odds -> Genuine

# Print the top features for each class based on log-odds
print("Top features for Deceptive Reviews:")
print(deceptive_features)

print("\nTop features for Genuine Reviews:")
print(genuine_features)

Top features for Deceptive Reviews:
       log_odds
x4918 -4.966775
x4211 -4.813806
x5375 -4.633136
x1991 -4.528875
x5811 -4.412464

Top features for Genuine Reviews:
       log_odds
x4614  5.234258
x5633  4.857182
x2489  4.777775
x5794  4.777775
x6204  4.492844


## Logistic Regression

### Train and accuracy

In [28]:
'''To maintain a comparable structure WITHIN LOGISTIC REGRESSION truthful reviews sampled to fit a log-normal distribution 
aligned with the lengths of the deceptive reviews, as truthful reviews are generally longer on average. 

Makes sense because Lasso penalizes less informative features, focusing the model on discriminative words or phrases 
instead of review length. By aligning lengths, we reduce the potential noise from length variations. 
This approach also preserves the interpretability of logistic regression, making it easier to understand which words 
contribute most to classification.'''

# Separate reviews based on labels (assuming 0 = negative, 1 = positive)
negative_reviews_train = traindf[traindf['Label'] == 0]
positive_reviews_train = traindf[traindf['Label'] == 1]

# Calculate lengths of deceptive reviews and fit log-normal distribution
negative_lengths = negative_reviews_train['Review'].str.len()
shape, loc, scale = lognorm.fit(negative_lengths, floc=0)
target_lengths = lognorm.rvs(shape, loc=loc, scale=scale, size=len(negative_lengths)) # Generate target lengths from log-normal distribution
truthful_reviews_train = positive_reviews_train
truthful_lengths = truthful_reviews_train['Review'].str.len()
tolerance = 10
sampled_truthful_reviews = truthful_reviews_train[np.isclose(truthful_lengths, target_lengths[:, None], atol=tolerance).any(axis=0)]

# Combine sampled truthful and deceptive reviews for a structured dataset
traindf_log= pd.concat([sampled_truthful_reviews, negative_reviews_train])

train_sentences_preprocessed_loh = [preprocess_text(sentence) for sentence in trainsentences]
train_df_preprocessed_log = pd.DataFrame({'Review': train_sentences_preprocessed,'Label': trainlabels})
trainsentences_log = train_df_preprocessed_log["Review"].values
y_train_log = train_df_preprocessed_log["Label"].values

# Encode labels as integers
le = LabelEncoder()
y_train_log = le.fit_transform(y_train)
# vectorizing
vectorizer = CountVectorizer(ngram_range=(1,1)) # ngram range for specifying unigrams and bigrams (1,1) - unigram, (2,2) - bigram, (1,2) - both
X_train_log = vectorizer.fit_transform(trainsentences_log)


In [29]:
# parameters
param_grid = {
    'C': [0.001, 0.01, 0.1, 1.0, 5, 10]
}

logistic_regression = LogisticRegression(penalty='l1', solver='liblinear')

# Set up GridSearchCV
grid_search = GridSearchCV(logistic_regression, param_grid, cv=10, scoring='accuracy')
grid_search.fit(X_train_log, y_train_log)

# Best parameters and score
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Score:", grid_search.best_score_)

# Evaluate on test set
test_score = grid_search.score(X_test, y_test)
print("Test Set Score:", test_score)

Best Parameters: {'C': 10}
Best Cross-Validation Score: 0.8078125
Test Set Score: 0.80625


In [30]:
# fit the model
logistic_regression = LogisticRegression(penalty='l1', solver='liblinear', C=5)
logistic_regression.fit(X_train_log, y_train_log)
# predictions
y_pred = logistic_regression.predict(X_test)
accuracy = accuracy_score(y_test ,y_pred)
precision = precision_score(y_test ,y_pred,average='binary')
recall = recall_score(y_test ,y_pred,average='binary')
f1 = f1_score(y_test, y_pred, average='binary')

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

Accuracy: 0.8125
Precision: 0.8125
Recall: 0.8125
F1 Score: 0.8125


In [31]:
# Check number of non-zero coefficients
non_zero_coefficients = (logistic_regression.coef_ != 0).sum()
total_coefficients = logistic_regression.coef_.size
print(f"Non-zero coefficients: {non_zero_coefficients}/{total_coefficients}")


Non-zero coefficients: 251/6955


### Feature importance

In [32]:
# Get the coefficients
coefficients = logistic_regression.coef_[0]
feature_names = vectorizer.get_feature_names_out()

# Create a DataFrame for better visualization
coef_df = pd.DataFrame({'feature': feature_names, 'coefficient': coefficients})

# Separate features for genuine and deceptive reviews
genuine_features = coef_df[coef_df['coefficient'] > 0].sort_values(by='coefficient', ascending=False)
deceptive_features = coef_df[coef_df['coefficient'] < 0].sort_values(by='coefficient')

# Display the results
print("Important Features for Genuine Reviews:")
print(genuine_features.head())

print("\nImportant Features for Deceptive Reviews:")
print(deceptive_features.head())

Important Features for Genuine Reviews:
        feature  coefficient
5775       star     2.898416
6886      world     2.892918
6372        try     2.176058
1255  concierge     1.976754
878        cant     1.921111

Important Features for Deceptive Reviews:
        feature  coefficient
2327    finally    -2.419876
4836     recent    -2.414152
4617     prices    -2.284165
4918      relax    -2.163613
3992  neighbors    -2.079103


## Classification Trees

### Train and accuracy

In [None]:
# parameters
param_grid = {
    'criterion': ['gini', 'entropy'],  # Function to measure the quality of a split
    'max_depth': [None, 2, 4, 6, 8, 10],  # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4, 6],  # Minimum number of samples required to be at a leaf node
    'ccp_alpha': np.linspace(0, 0.1, 11) 
}


decision_tree = DecisionTreeClassifier(criterion='gini', max_depth=None, random_state=42)

# Set up GridSearchCV
grid_search = GridSearchCV(decision_tree, param_grid, cv=10, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Best parameters and score
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Score:", grid_search.best_score_)

# Evaluate on test set
test_score = grid_search.score(X_test, y_test)
print("Test Set Score:", test_score)

In [16]:
# fit the model
decision_tree = DecisionTreeClassifier(ccp_alpha= 0.0, criterion= 'gini', max_depth= 8, min_samples_leaf= 6, min_samples_split= 2, random_state=42)
decision_tree.fit(X_train, y_train)
# predictions
y_pred = decision_tree.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='binary')
recall = recall_score(y_test, y_pred, average='binary')
f1 = f1_score(y_test, y_pred, average='binary')

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

Accuracy: 0.5875
Precision: 0.5921
Recall: 0.5625
F1 Score: 0.5769


### Feature Importance

In [17]:
X_train_arr = X_train.toarray() 
X_test_arr = X_test.toarray()

perm_importance = permutation_importance(decision_tree, X_test_arr, y_test, n_repeats=10, random_state=42)

feature_names = vectorizer.get_feature_names_out()
# Display the importance scores for the positive class
importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': perm_importance.importances_mean
}).sort_values(by='Importance', ascending=False)

print("\nPermutation Importance:")
print(importance_df.head())

KeyboardInterrupt: 

## Random Forest

### Train and accuracy

In [4]:

# parameters
param_grid = {
    'n_estimators': [50, 100],  # Number of trees in the forest
    'max_features': [ 'sqrt', 'log2'],  # Number of features to consider for the best split
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20],  # Maximum depth of the tree
    'min_samples_split': [2, 5],  # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2]  # Minimum number of samples required to be at a leaf node
}


# Store results
results = []

# Loop through hyperparameter combinations
for params in ParameterGrid(param_grid):
    model = RandomForestClassifier(oob_score=True, random_state=42, **params)
    model.fit(X_train, y_train)
    
    # Record OOB score
    oob_score = model.oob_score_
    results.append({**params, 'oob_score': oob_score})

# Convert results to DataFrame for easier analysis
results_df = pd.DataFrame(results)

# Find the best hyperparameters based on OOB score
best_params = results_df.loc[results_df['oob_score'].idxmax()]
print("Best Hyperparameters:\n", best_params)

Best Hyperparameters:
 criterion             entropy
max_depth                20.0
max_features             sqrt
min_samples_leaf            2
min_samples_split           5
n_estimators              100
oob_score            0.834375
Name: 87, dtype: object


In [18]:
# fit the model
random_forest = RandomForestClassifier(criterion = 'entropy', max_depth = 10, max_features="sqrt", min_samples_leaf=2, min_samples_split=5, n_estimators=100, random_state = 42)
random_forest.fit(X_train, y_train)
# predictions
y_pred = random_forest.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average = 'binary')
recall = recall_score(y_test, y_pred, average = 'binary')
f1 = f1_score(y_test, y_pred,average = 'binary')

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

Accuracy: 0.7812
Precision: 0.8462
Recall: 0.6875
F1 Score: 0.7586


### Feature Importance

In [None]:
perm_importance = permutation_importance(random_forest, X_test_arr, y_test, n_repeats=10, random_state=42)

feature_names = vectorizer.get_feature_names_out()
# Display the importance scores for the positive class
importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': perm_importance.importances_mean
}).sort_values(by='Importance', ascending=False)

print("\nPermutation Importance:")
print(importance_df.head())


Permutation Importance genuine:
       Feature  Importance
1020   chicago    0.051250
3544  location    0.024375
1093   cleaned    0.016250
5585     smell    0.016250
5320    seemed    0.014375

Permutation Importance deceptive:
      Feature  Importance
2220     face   -0.006875
774    broken   -0.006875
3647     make   -0.008750
349   arrived   -0.009375
2457    found   -0.013125


# Bigram

## Dataset Preprocessing

In [129]:
traindf = pd.read_csv("train_reviews.csv")
trainsentences = traindf["Review"].values
trainlabels = traindf["Label"].values
testdf = pd.read_csv("test_reviews.csv")
testsentences = testdf["Review"].values
testlabels = testdf["Label"].values 

with open("english", "r") as file:
    stop_words = set(file.read().splitlines())


def preprocess_text(text):
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = text.lower()
    text = re.sub(r'\d+', '', text) 
    words = text.split()   
    words = [word for word in words if word not in stop_words]    
    text = ' '.join(words).strip()
    
    return text

train_sentences_preprocessed = [preprocess_text(sentence) for sentence in trainsentences]
test_sentences_preprocessed = [preprocess_text(sentence) for sentence in testsentences]
train_df_preprocessed = pd.DataFrame({'Review': train_sentences_preprocessed,'Label': trainlabels})
test_df_preprocessed = pd.DataFrame({'Review': test_sentences_preprocessed,'Label': testlabels})

In [130]:
trainsentences = train_df_preprocessed["Review"].values
y_train = train_df_preprocessed["Label"].values

testsentences = test_df_preprocessed["Review"].values
y_test = test_df_preprocessed["Label"].values

# Encode labels as integers
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_test = le.transform(y_test)

# vectorizing
vectorizer = CountVectorizer(ngram_range=(2,2)) # ngram range for specifying unigrams and bigrams (1,1) - unigram, (2,2) - bigram, (1,2) - both
X_train = vectorizer.fit_transform(trainsentences)
X_test = vectorizer.transform(testsentences)

## Multinomial naive Bayes

### Train and accuracy without feature selection

In [131]:
# parameters
param_grid = {
    'alpha': [0.1, 0.5, 1.0, 1.5, 2.0],  # Laplace smoothing parameter
    'fit_prior': [True, False]
}
multinomial_naive_bayes = MultinomialNB()

# Set up GridSearchCV
grid_search = GridSearchCV(multinomial_naive_bayes, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Best parameters and score
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Score:", grid_search.best_score_)

# Evaluate on test set
test_score = grid_search.score(X_test, y_test)
print("Test Set Score:", test_score)

Best Parameters: {'alpha': 0.1, 'fit_prior': True}
Best Cross-Validation Score: 0.740625
Test Set Score: 0.7625


In [132]:
# fit the model
multinomial_naive_bayes = MultinomialNB( alpha = 0.1, fit_prior = True)
multinomial_naive_bayes.fit(X_train, y_train)
# predictions
y_pred = multinomial_naive_bayes.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='binary')
recall = recall_score(y_test, y_pred, average='binary')
f1 = f1_score(y_test, y_pred, average='binary')

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

Accuracy: 0.7625
Precision: 0.8088
Recall: 0.6875
F1 Score: 0.7432


### Train and accuracy with chi squared test for feature selection

In [133]:
# Perform Chi-squared feature selection
chi2_selector = SelectKBest(chi2, k=2000) #2000 features perform best
X_train_chi2 = chi2_selector.fit_transform(X_train, y_train)
X_test_chi2 = chi2_selector.transform(X_test)

# Train a Multinomial Naive Bayes model
nb_model = MultinomialNB()
nb_model.fit(X_train_chi2, y_train)

# Predict on the test set
y_pred = nb_model.predict(X_test_chi2)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)

# Get the feature names for the selected features
selected_feature_indices = chi2_selector.get_support(indices=True)
selected_feature_names = vectorizer.get_feature_names_out()[selected_feature_indices]
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"Selected Features: {selected_feature_names}")
print(f"Test Set Accuracy: {accuracy}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

Selected Features: ['able relax' 'accommodate us' 'across street' ... 'young people'
 'youre going' 'youre traveler']
Test Set Accuracy: 0.725
Precision: 0.7370
Recall: 0.7250
F1 Score: 0.7215


In [134]:
# parameters
param_grid = {
    'alpha': [0.1, 0.5, 1.0, 1.5, 2.0],  # Laplace smoothing parameter
    'fit_prior': [True, False]
}
multinomial_naive_bayes = MultinomialNB()

# Set up GridSearchCV
grid_search = GridSearchCV(multinomial_naive_bayes, param_grid, cv=5, scoring='accuracy')
grid_search.fit(X_train_chi2, y_train)

# Best parameters and score
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Score:", grid_search.best_score_)

# Evaluate on test set
test_score = grid_search.score(X_test_chi2, y_test)
print("Test Set Score:", test_score)

Best Parameters: {'alpha': 0.1, 'fit_prior': True}
Best Cross-Validation Score: 0.9796875
Test Set Score: 0.6875


In [135]:
# fit the model
multinomial_naive_bayes = MultinomialNB( alpha = 0.1, fit_prior = True)
multinomial_naive_bayes.fit(X_train_chi2, y_train)
# predictions
y_pred = multinomial_naive_bayes.predict(X_test_chi2)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

Accuracy: 0.6875
Precision: 0.6918
Recall: 0.6875
F1 Score: 0.6857


### Feature importance

In [136]:
# Get log probabilities
log_probs = multinomial_naive_bayes.feature_log_prob_

feature_names = chi2_selector.get_feature_names_out()

# Calculate log-odds ratio (Class 1 log-probs minus Class 0 log-probs)
log_odds = log_probs[1, :] - log_probs[0, :]  # Class 1 (Genuine) vs Class 0 (Deceptive)

# Convert to DataFrame
log_odds_df = pd.DataFrame(log_odds, index=feature_names, columns=['log_odds'])

# Sort by log-odds
deceptive_features = log_odds_df.sort_values(by='log_odds', ascending=True).head()  # More negative log-odds -> Deceptive
genuine_features = log_odds_df.sort_values(by='log_odds', ascending=False).head()  # More positive log-odds -> Genuine

# Print the top features for each class based on log-odds
print("Top features for Deceptive Reviews:")
print(deceptive_features)

print("\nTop features for Genuine Reviews:")
print(genuine_features)

Top features for Deceptive Reviews:
        log_odds
x6315  -5.147383
x32746 -5.047798
x17780 -4.812805
x22810 -4.812805
x21711 -4.812805

Top features for Genuine Reviews:
        log_odds
x4121   5.079672
x14290  4.715334
x14821  4.715334
x9408   4.715334
x4914   4.598924


## Logistic Regression

### Train and accuracy

In [137]:
# parameters
param_grid = {
    'C': [0.001, 0.01, 0.1, 1.0, 5, 10]
}

logistic_regression = LogisticRegression(penalty='l1', solver='liblinear')

# Set up GridSearchCV
grid_search = GridSearchCV(logistic_regression, param_grid, cv=10, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Best parameters and score
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Score:", grid_search.best_score_)

# Evaluate on test set
test_score = grid_search.score(X_test, y_test)
print("Test Set Score:", test_score)

Best Parameters: {'C': 10}
Best Cross-Validation Score: 0.7421875
Test Set Score: 0.65


In [138]:
# fit the model
logistic_regression = LogisticRegression(penalty='l1', solver='liblinear', C=10)
logistic_regression.fit(X_train, y_train)
# predictions
y_pred = logistic_regression.predict(X_test)
accuracy = accuracy_score(y_test ,y_pred)
precision = precision_score(y_test ,y_pred,average='binary')
recall = recall_score(y_test ,y_pred,average='binary')
f1 = f1_score(y_test, y_pred, average='binary')

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

Accuracy: 0.6562
Precision: 0.6068
Recall: 0.8875
F1 Score: 0.7208


In [139]:
# Check number of non-zero coefficients
non_zero_coefficients = (logistic_regression.coef_ != 0).sum()
total_coefficients = logistic_regression.coef_.size
print(f"Non-zero coefficients: {non_zero_coefficients}/{total_coefficients}")


Non-zero coefficients: 375/42436


### Feature importance

In [140]:
# Get the coefficients
coefficients = logistic_regression.coef_[0]
feature_names = vectorizer.get_feature_names_out()

# Create a DataFrame for better visualization
coef_df = pd.DataFrame({'feature': feature_names, 'coefficient': coefficients})

# Separate features for genuine and deceptive reviews
genuine_features = coef_df[coef_df['coefficient'] > 0].sort_values(by='coefficient', ascending=False)
deceptive_features = coef_df[coef_df['coefficient'] < 0].sort_values(by='coefficient')

# Display the results
print("Important Features for Genuine Reviews:")
print(genuine_features.head())

print("\nImportant Features for Deceptive Reviews:")
print(deceptive_features.head())

Important Features for Genuine Reviews:
              feature  coefficient
30997      rooms also     4.239152
21175  location worth     4.157442
36694        th floor     4.118604
22268      many times     3.728791
31478       said room     3.379701

Important Features for Deceptive Reviews:
                  feature  coefficient
21711        luxury hotel    -6.431562
6315   chicago millennium    -6.207852
19359       james chicago    -6.142639
28835      really looking    -6.011738
36200   swissotel chicago    -5.719355


## Classification Trees

### Train and accuracy

In [141]:
# parameters
param_grid = {
    'criterion': ['gini', 'entropy'],  # Function to measure the quality of a split
    'max_depth': [None, 2, 4, 6, 8, 10],  # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4, 6],  # Minimum number of samples required to be at a leaf node
    'ccp_alpha': np.linspace(0, 0.1, 11) 
}


decision_tree = DecisionTreeClassifier(criterion='gini', max_depth=None, random_state=42)

# Set up GridSearchCV
grid_search = GridSearchCV(decision_tree, param_grid, cv=10, scoring='accuracy')
grid_search.fit(X_train, y_train)

# Best parameters and score
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Score:", grid_search.best_score_)

# Evaluate on test set
test_score = grid_search.score(X_test, y_test)
print("Test Set Score:", test_score)

KeyboardInterrupt: 

In [142]:
# fit the model
decision_tree = DecisionTreeClassifier(ccp_alpha= 0.0, criterion= 'gini', max_depth= 8, min_samples_leaf= 6, min_samples_split= 2, random_state=42)
decision_tree.fit(X_train, y_train)
# predictions
y_pred = decision_tree.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='binary')
recall = recall_score(y_test, y_pred, average='binary')
f1 = f1_score(y_test, y_pred, average='binary')

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

Accuracy: 0.6062
Precision: 0.5649
Recall: 0.9250
F1 Score: 0.7014


### Feature Importance

In [143]:
X_train_arr = X_train.toarray() 
X_test_arr = X_test.toarray()

perm_importance = permutation_importance(decision_tree, X_test_arr, y_test, n_repeats=10, random_state=42)

feature_names = vectorizer.get_feature_names_out()
# Display the importance scores for the positive class
importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': perm_importance.importances_mean
}).sort_values(by='Importance', ascending=False)

print("\nPermutation Importance:")
print(importance_df.head())

KeyboardInterrupt: 

## Random Forest

### Train and accuracy

In [144]:

# parameters
param_grid = {
    'n_estimators': [50, 100],  # Number of trees in the forest
    'max_features': [ 'sqrt', 'log2'],  # Number of features to consider for the best split
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20],  # Maximum depth of the tree
    'min_samples_split': [2, 5],  # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2]  # Minimum number of samples required to be at a leaf node
}


# Store results
results = []

# Loop through hyperparameter combinations
for params in ParameterGrid(param_grid):
    model = RandomForestClassifier(oob_score=True, random_state=42, **params)
    model.fit(X_train, y_train)
    
    # Record OOB score
    oob_score = model.oob_score_
    results.append({**params, 'oob_score': oob_score})

# Convert results to DataFrame for easier analysis
results_df = pd.DataFrame(results)

# Find the best hyperparameters based on OOB score
best_params = results_df.loc[results_df['oob_score'].idxmax()]
print("Best Hyperparameters:\n", best_params)

Best Hyperparameters:
 criterion            entropy
max_depth                NaN
max_features            sqrt
min_samples_leaf           2
min_samples_split          2
n_estimators             100
oob_score             0.7875
Name: 53, dtype: object


In [147]:
# fit the model
random_forest = RandomForestClassifier(criterion = 'entropy', max_depth = None, max_features="sqrt", min_samples_leaf=2, min_samples_split=2, n_estimators=100, random_state = 42)
random_forest.fit(X_train, y_train)
# predictions
y_pred = random_forest.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average = 'binary')
recall = recall_score(y_test, y_pred, average = 'binary')
f1 = f1_score(y_test, y_pred,average = 'binary')

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

Accuracy: 0.6937
Precision: 0.6281
Recall: 0.9500
F1 Score: 0.7562


### Feature Importance

In [96]:
perm_importance = permutation_importance(random_forest, X_test_arr, y_test, n_repeats=10, random_state=42)

feature_names = vectorizer.get_feature_names_out()
# Display the importance scores for the positive class
importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': perm_importance.importances_mean
}).sort_values(by='Importance', ascending=False)

print("\nPermutation Importance:")
print(importance_df.head())


Permutation Importance genuine:
       Feature  Importance
1020   chicago    0.051250
3544  location    0.024375
1093   cleaned    0.016250
5585     smell    0.016250
5320    seemed    0.014375

Permutation Importance deceptive:
      Feature  Importance
2220     face   -0.006875
774    broken   -0.006875
3647     make   -0.008750
349   arrived   -0.009375
2457    found   -0.013125
