# IMPORTS

In [14]:
import nltk
from nltk.corpus import movie_reviews as mr
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import string

In [15]:
import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning, message="overflow encountered in cast")

In [16]:
import nltk
import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

In [17]:
# Ensure we have the necessary datasets and tools from NLTK
nltk.download('movie_reviews')
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package movie_reviews to
[nltk_data]     /Users/rogerbaigess/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/rogerbaigess/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rogerbaigess/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [21]:

# Preprocessing function
def preprocess_documents():
    # Create a list of tuples where each tuple is a document represented by a list of words and its label
    documents = [(list(mr.words(fileid)), category)
                 for category in mr.categories()
                 for fileid in mr.fileids(category)]
    
    # Convert list of words to single string for each document
    documents = [(' '.join(document), category) for (document, category) in documents]
    
    # Remove punctuation and lowercase all words
    table = str.maketrans('', '', string.punctuation)
    documents = [(doc.translate(table).lower(), category) for (doc, category) in documents]
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    documents = [(' '.join([word for word in doc.split() if word not in stop_words]), category)
                 for (doc, category) in documents]
    
    return documents

# Preprocess the documents
processed_documents = preprocess_documents()

# Split the preprocessed documents into data and labels
data = [doc for (doc, _) in processed_documents]
labels = [category for (_, category) in processed_documents]

# Use CountVectorizer to convert data to a matrix of token counts
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(data)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, labels, test_size=0.2, random_state=42, stratify=labels
)

In [28]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from interpret.glassbox import ExplainableBoostingClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score


# Initialize models
rf_clf = RandomForestClassifier(random_state=42)
knn_clf = KNeighborsClassifier()
ebm_clf = ExplainableBoostingClassifier()


# List of models
classifiers = {
    "Random Forest": rf_clf,
    "K-Nearest Neighbors": knn_clf,
}

# Train and evaluate models
for model_name, model in classifiers.items():
    print(f"Training {model_name}...")
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    
    # Evaluation
    print(f"Evaluating {model_name}...")
    print("Confusion Matrix:")
    print(confusion_matrix(y_test, predictions))
    print("Classification Report:")
    print(classification_report(y_test, predictions))
    print("Accuracy Score:", accuracy_score(y_test, predictions))
    print("\n")


Training Random Forest...
Evaluating Random Forest...
Confusion Matrix:
[[162  38]
 [ 38 162]]
Classification Report:
              precision    recall  f1-score   support

         neg       0.81      0.81      0.81       200
         pos       0.81      0.81      0.81       200

    accuracy                           0.81       400
   macro avg       0.81      0.81      0.81       400
weighted avg       0.81      0.81      0.81       400

Accuracy Score: 0.81


Training K-Nearest Neighbors...
Evaluating K-Nearest Neighbors...
Confusion Matrix:
[[178  22]
 [159  41]]
Classification Report:
              precision    recall  f1-score   support

         neg       0.53      0.89      0.66       200
         pos       0.65      0.20      0.31       200

    accuracy                           0.55       400
   macro avg       0.59      0.55      0.49       400
weighted avg       0.59      0.55      0.49       400

Accuracy Score: 0.5475




In [27]:
from sklearn.preprocessing import LabelEncoder

# Convert labels to numeric values
encoder = LabelEncoder()
y_train_encoded = encoder.fit_transform(y_train)
y_test_encoded = encoder.transform(y_test)

# Initialize XGBoost classifier
xgb_clf = XGBClassifier(use_label_encoder=False, eval_metric='logloss')

# Fit the model
xgb_clf.fit(X_train, y_train_encoded)

# Predict the labels of the test set
xgb_pred = xgb_clf.predict(X_test)

# Convert the predictions back to the original labels
xgb_pred_labels = encoder.inverse_transform(xgb_pred.astype(int))

# Evaluation metrics
print("Confusion Matrix:")
print(confusion_matrix(y_test, xgb_pred_labels))
print("\nClassification Report:")
print(classification_report(y_test, xgb_pred_labels))
print("\nAccuracy Score:", accuracy_score(y_test, xgb_pred_labels))

Confusion Matrix:
[[164  36]
 [ 36 164]]

Classification Report:
              precision    recall  f1-score   support

         neg       0.82      0.82      0.82       200
         pos       0.82      0.82      0.82       200

    accuracy                           0.82       400
   macro avg       0.82      0.82      0.82       400
weighted avg       0.82      0.82      0.82       400


Accuracy Score: 0.82
