# IMPORTS

In [74]:
import nltk
from nltk.corpus import movie_reviews as mr
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import string

In [75]:
import warnings
warnings.filterwarnings("ignore", category=RuntimeWarning, message="overflow encountered in cast")

In [76]:
import nltk
import ssl

# If this Python build exposes ssl._create_unverified_context, install it as
# the factory for the default HTTPS context (presumably so the NLTK downloads
# in the next cell succeed on machines with certificate problems).
# NOTE(review): this switches off certificate verification for all code in
# this kernel that relies on ssl's default HTTPS context, not only NLTK.
if hasattr(ssl, "_create_unverified_context"):
    _create_unverified_https_context = ssl._create_unverified_context
    ssl._create_default_https_context = _create_unverified_https_context

In [77]:
# Ensure we have the necessary datasets and tools from NLTK.
# Every resource used further down must be listed here, otherwise a fresh
# environment fails with LookupError at first use.
nltk.download('movie_reviews')  # labelled review corpus (mr.words / mr.fileids)
nltk.download('punkt')          # tokenizer models used by word_tokenize
nltk.download('punkt_tab')      # tokenizer data read by recent NLTK releases
nltk.download('stopwords')      # stopwords.words('english')
nltk.download('wordnet')        # required by WordNetLemmatizer.lemmatize

[nltk_data] Downloading package movie_reviews to
[nltk_data]     /Users/rogerbaigess/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/rogerbaigess/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rogerbaigess/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [78]:
# Build the corpus: one (token list, category) pair per review file
documents = [
    (list(mr.words(fileid)), category)
    for category in mr.categories()
    for fileid in mr.fileids(category)
]

# Parallel lists: each review as a space-joined string, and its category label
data = [' '.join(pair[0]) for pair in documents]
labels = [pair[1] for pair in documents]

# Split into train and test sets: 20% held out, stratified on the labels,
# with a fixed seed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    data,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

In [79]:
import re
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

stop_words = set(stopwords.words('english'))

# These helpers are the same for every document, so they are built once here
# rather than on each call to improved_preprocess.
_DIGIT_RUN = re.compile(r'\d+')
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
_LEMMATIZER = WordNetLemmatizer()


def improved_preprocess(document):
    """Clean one document and return it as a single space-separated string.

    Steps, in order:
      1. remove every run of digits;
      2. delete the characters in ``string.punctuation`` and lowercase the text;
      3. tokenize with ``word_tokenize``;
      4. drop tokens found in the module-level ``stop_words`` set;
      5. lemmatize each remaining token with ``WordNetLemmatizer.lemmatize``
         using its default part of speech.

    No part-of-speech tagging or POS-based filtering is performed.

    Args:
        document: the raw text of one document (``str``).

    Returns:
        The surviving lemmatized tokens joined by single spaces.
    """
    # Remove numbers
    document = _DIGIT_RUN.sub('', document)

    # Remove punctuation and convert to lowercase
    document = document.translate(_PUNCTUATION_TABLE).lower()

    # Tokenize the document
    tokens = word_tokenize(document)

    # Remove stopwords first, then lemmatize only the tokens that are kept
    filtered_tokens = [
        _LEMMATIZER.lemmatize(word) for word in tokens if word not in stop_words
    ]

    # Combine the filtered tokens back into a single string
    return ' '.join(filtered_tokens)

# Apply the preprocessing to the training and test sets
data_train_processed = list(map(improved_preprocess, X_train))
data_test_processed = list(map(improved_preprocess, X_test))


In [80]:
# Use CountVectorizer to convert data to a matrix of token counts.
# The vocabulary is fitted on the training texts only and then reused to
# transform the test texts; min_df=5 sets the minimum document frequency a
# term needs to be kept in the vocabulary.
# NOTE(review): data_train_processed / data_test_processed are rebound here
# from lists of strings to the vectorizer's output, so re-running this cell
# without first re-running the preprocessing cell would feed the matrices
# back into the vectorizer.
vectorizer = CountVectorizer(min_df=5)
data_train_processed = vectorizer.fit_transform(data_train_processed)
data_test_processed = vectorizer.transform(data_test_processed)

In [84]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, f1_score


# One estimator per algorithm; only the arguments shown are overridden
rf_clf = RandomForestClassifier(random_state=42)
knn_clf = KNeighborsClassifier()
svm_clf = SVC()
# Created here but not added to the registry below
xgb_clf = XGBClassifier(use_label_encoder=False, eval_metric='logloss')


# Registry iterated by train_evaluate_models:
# display name -> (estimator, encode_tag)
classifiers = {
    display_name: (estimator, False)
    for display_name, estimator in (
        ("Random Forest", rf_clf),
        ("K-Nearest Neighbors", knn_clf),
        ("SVM", svm_clf),
    )
}

def train_evaluate_models():
    """Fit every registered classifier and print its test-set metrics.

    Reads these notebook-level names: ``classifiers`` (display name ->
    ``(estimator, encode_tag)``), ``data_train_processed``,
    ``data_test_processed``, ``y_train`` and ``y_test``.

    When ``encode_tag`` is true the estimator is fitted on integer-encoded
    labels (``LabelEncoder`` fitted on ``y_train``) and its predictions are
    decoded back to the original labels before scoring. When it is false the
    labels are passed through unchanged.

    Returns:
        None. For each model a confusion matrix, a classification report and
        the accuracy are printed.
    """
    # Imported locally so this cell does not depend on a later cell's import.
    from sklearn.preprocessing import LabelEncoder

    # Train and evaluate models
    for model_name, (model, encode_tag) in classifiers.items():
        print(f"Training {model_name}...")
        if encode_tag:
            # Fit on integer labels, then decode predictions so every metric
            # below compares against the original y_test labels.
            encoder = LabelEncoder()
            model.fit(data_train_processed, encoder.fit_transform(y_train))
            predictions = encoder.inverse_transform(
                model.predict(data_test_processed).astype(int)
            )
        else:
            model.fit(data_train_processed, y_train)
            predictions = model.predict(data_test_processed)

        # Evaluation
        print(f"Evaluating {model_name}...")
        print("Confusion Matrix:")
        print(confusion_matrix(y_test, predictions))
        print("Classification Report:")
        print(classification_report(y_test, predictions))
        print("Accuracy Score:", accuracy_score(y_test, predictions))
        print("\n")

train_evaluate_models()


Training Random Forest...
Evaluating Random Forest...
Confusion Matrix:
[[162  38]
 [ 25 175]]
Classification Report:
              precision    recall  f1-score   support

         neg       0.87      0.81      0.84       200
         pos       0.82      0.88      0.85       200

    accuracy                           0.84       400
   macro avg       0.84      0.84      0.84       400
weighted avg       0.84      0.84      0.84       400

Accuracy Score: 0.8425


Training K-Nearest Neighbors...
Evaluating K-Nearest Neighbors...
Confusion Matrix:
[[161  39]
 [106  94]]
Classification Report:
              precision    recall  f1-score   support

         neg       0.60      0.81      0.69       200
         pos       0.71      0.47      0.56       200

    accuracy                           0.64       400
   macro avg       0.65      0.64      0.63       400
weighted avg       0.65      0.64      0.63       400

Accuracy Score: 0.6375


Training SVM...
Evaluating SVM...
Confusion Matr

In [83]:
from sklearn.preprocessing import LabelEncoder

# Learn the label -> integer mapping from the training labels, then apply it
# to both splits
encoder = LabelEncoder().fit(y_train)
y_train_encoded = encoder.transform(y_train)
y_test_encoded = encoder.transform(y_test)

# Build the XGBoost classifier and train it on the encoded labels
xgb_clf = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
xgb_clf.fit(data_train_processed, y_train_encoded)

# Predict integer class ids for the test set and decode them back to the
# original labels so the metrics compare like with like
xgb_pred = xgb_clf.predict(data_test_processed)
xgb_pred_labels = encoder.inverse_transform(xgb_pred.astype(int))

# Evaluation metrics (confusion_matrix, classification_report and
# accuracy_score are imported in the model-comparison cell)
print("Confusion Matrix:")
print(confusion_matrix(y_test, xgb_pred_labels))
print("\nClassification Report:")
print(classification_report(y_test, xgb_pred_labels))
print("\nAccuracy Score:", accuracy_score(y_test, xgb_pred_labels))

Confusion Matrix:
[[168  32]
 [ 30 170]]

Classification Report:
              precision    recall  f1-score   support

         neg       0.85      0.84      0.84       200
         pos       0.84      0.85      0.85       200

    accuracy                           0.84       400
   macro avg       0.85      0.84      0.84       400
weighted avg       0.85      0.84      0.84       400


Accuracy Score: 0.845
