In [17]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline

# Load data
# The four CSVs are fetched in order: train features, train labels, then the
# "final" feature/label files, which this notebook uses as its test set.
X_train, y_train, X_test, y_test = (
    pd.read_csv(csv_url)
    for csv_url in (
        'https://github.com/msaricaumbc/DS_data/raw/master/ds602/movie/X_train.csv',
        'https://github.com/msaricaumbc/DS_data/raw/master/ds602/movie/y_train.csv',
        'https://github.com/msaricaumbc/DS_data/raw/master/ds602/movie/X_final.csv',
        'https://github.com/msaricaumbc/DS_data/raw/master/ds602/movie/y_final.csv',
    )
)



In [18]:
# Keep only a fraction of the training rows: the full training set is too
# large to grid-search over in reasonable time.
# NOTE(review): this cell rebinds X_train / y_train from themselves, so running
# it again without re-running the load cell subsamples the subsample.

subset_size = 0.1  # fraction of the training rows to keep
X_train, _, y_train, _ = train_test_split(
    X_train,
    y_train,
    train_size=subset_size,
    random_state=42,  # fixed seed so the same rows are drawn on every run
)



In [19]:
# Stack train and test rows so each encoder is fitted on every value it will
# later be asked to transform; LabelEncoder.transform raises on values it did
# not see during fit.
X_combined = pd.concat([X_train, X_test], axis=0)

#Encoding string values
# One fitted encoder is kept per column in `feature_encoders`, so every
# column's string -> integer mapping stays available afterwards (e.g. for
# inverse_transform). Refitting a single shared encoder would discard all
# mappings except the last one.
# NOTE(review): LabelEncoder assigns arbitrary integer codes; if any of these
# columns holds free text, the codes carry no ordinal meaning -- confirm this
# is the intended feature representation.
label_encoder = LabelEncoder()  # stays bound even when no column needs encoding
feature_encoders = {}
for column in X_combined.columns:
    column_dtype = X_combined[column].dtype
    # Accept both classic object columns and pandas' dedicated string dtypes;
    # comparing against 'object' alone would skip the latter and leave raw
    # strings in the frame.
    holds_text = (
        pd.api.types.is_object_dtype(column_dtype)
        or pd.api.types.is_string_dtype(column_dtype)
    )
    if not holds_text:
        continue
    label_encoder = LabelEncoder().fit(X_combined[column])
    X_train[column] = label_encoder.transform(X_train[column])
    X_test[column] = label_encoder.transform(X_test[column])
    feature_encoders[column] = label_encoder

# Encoding target labels
# The label frames are flattened to 1-D arrays first. The encoder learns its
# classes from the training labels only and then maps both splits with that
# same mapping.
train_labels_flat = y_train.values.ravel()
test_labels_flat = y_test.values.ravel()

label_encoder_y = LabelEncoder()
label_encoder_y.fit(train_labels_flat)
y_train_encoded = label_encoder_y.transform(train_labels_flat)
y_test_encoded = label_encoder_y.transform(test_labels_flat)



In [20]:
# Defining the models and parameter grids
# Each entry maps a display name to (estimator, param_grid). Grid keys carry
# the `clf__` prefix because each estimator is tuned as the pipeline step
# named 'clf'.
models = {
    'Logistic Regression': (LogisticRegression(), {'clf__C': [0.1, 1, 10]}),
    'SVM': (SVC(), {'clf__C': [0.1, 1, 10], 'clf__gamma': [0.01, 0.1, 1]}),
    'KNN': (KNeighborsClassifier(), {'clf__n_neighbors': [3, 5, 7, 9]}),
    # Empty grid: no hyperparameters are searched for Naive Bayes.
    'Naive Bayes': (MultinomialNB(), {}),
    # Seeded because the tree permutes features randomly at each split; without
    # a fixed random_state the fitted tree, the selected max_depth and the
    # reported accuracy can differ from run to run.
    'Decision Tree': (
        DecisionTreeClassifier(random_state=42),
        {'clf__max_depth': [None, 10, 20]},
    ),
}




In [21]:
# Performing GridSearchCV
# Every estimator is wrapped in a one-step pipeline whose step is named 'clf'
# so that the `clf__*` keys of its parameter grid resolve. Each fitted search
# object is stored under the model's display name for later scoring.
results = {}
for model_name, (model, param_grid) in models.items():
    grid_search = GridSearchCV(
        Pipeline(steps=[('clf', model)]),
        param_grid,
        cv=3,        # 3-fold cross-validation on the training subset
        n_jobs=-1,   # use all available cores
    )
    # fit() returns the search object itself, so it can be stored directly.
    results[model_name] = grid_search.fit(X_train, y_train_encoded)



In [22]:
# Report, for each tuned model, its accuracy on the held-out test data and the
# hyperparameters the search selected, followed by a blank separator line.
for model_name, grid_search in results.items():
    accuracy = grid_search.score(X_test, y_test_encoded)
    print(
        f'{model_name} - Test Accuracy: {accuracy}',
        'Best hyperparameters:',
        grid_search.best_params_,
        '',
        sep='\n',
    )

Logistic Regression - Test Accuracy: 0.4999
Best hyperparameters:
{'clf__C': 0.1}

SVM - Test Accuracy: 0.5331
Best hyperparameters:
{'clf__C': 10, 'clf__gamma': 1}

KNN - Test Accuracy: 0.5428
Best hyperparameters:
{'clf__n_neighbors': 3}

Naive Bayes - Test Accuracy: 0.5
Best hyperparameters:
{}

Decision Tree - Test Accuracy: 0.5419
Best hyperparameters:
{'clf__max_depth': None}



The KNN model achieved the highest test accuracy among all models, with an accuracy of 54.28%.

The SVM model followed with a test accuracy of 53.31%, slightly below both KNN and the Decision Tree.

The Decision Tree model performed comparably to KNN, with a test accuracy of 54.19%.

Logistic Regression and Naive Bayes had the lowest test accuracies, 49.99% and 50.00% respectively, indicating that they may not be as effective for this dataset.