Loading and preprocessing the data

In [11]:
import os
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.neural_network import MLPClassifier


In [3]:
def load_imdb_data(data_dir):
    """Load review texts and sentiment labels from ``data_dir/pos`` and ``data_dir/neg``.

    Only files whose names end in ``.txt`` are read (decoded as UTF-8).
    Reviews found under ``pos`` get label 1, reviews under ``neg`` get label 0.
    All ``pos`` reviews come first in the result, followed by all ``neg`` ones.

    Files inside each sub-directory are visited in sorted name order.
    ``os.listdir`` makes no ordering guarantee, so without the sort the order
    of the returned lists -- and therefore any seeded shuffle/split applied to
    them afterwards -- could differ between machines or runs.

    Args:
        data_dir: Directory that contains the ``pos`` and ``neg`` sub-directories.

    Returns:
        A ``(texts, labels)`` tuple of two parallel lists: the raw review
        strings and their integer labels (1 = pos, 0 = neg).

    Raises:
        OSError: Propagated from ``os.listdir``/``open``, e.g.
            ``FileNotFoundError`` when a sub-directory is missing.
    """
    texts = []
    labels = []

    for label_type in ['pos', 'neg']:
        # The label depends only on the sub-directory, so compute it once.
        label = 1 if label_type == 'pos' else 0
        dir_name = os.path.join(data_dir, label_type)
        # Sort for a deterministic, filesystem-independent ordering.
        for fname in sorted(os.listdir(dir_name)):
            if fname.endswith(".txt"):
                with open(os.path.join(dir_name, fname), encoding="utf8") as f:
                    texts.append(f.read())
                labels.append(label)

    return texts, labels

# Load the train and test splits. Each call reads the pos/ and neg/
# sub-directories of the given directory and returns parallel (texts, labels)
# lists. The paths are relative to the current working directory.
train_texts, train_labels = load_imdb_data("data/train")
test_texts, test_labels = load_imdb_data("data/test")

In [4]:
# Hold out 20% of the labelled training reviews as a validation set.
# random_state fixes the shuffle, so the same input order always yields
# the same partition.
X_train, X_val, y_train, y_val = train_test_split(
    train_texts,
    train_labels,
    test_size=0.2,
    random_state=42,
)

In [5]:
# 4. Preprocess with TF-IDF
# Lower-case the text, drop English stop words, and cap the vocabulary at
# 10,000 terms.
vectorizer = TfidfVectorizer(
    lowercase=True,
    stop_words='english',
    max_features=10000,
)

# Learn the vocabulary and IDF weights from the training portion only; the
# validation and test documents are transformed with that fitted vectorizer.
X_train_vec = vectorizer.fit_transform(X_train)
# NOTE(review): X_val_vec is not consumed by any cell visible here -- confirm
# it is used further down, otherwise the validation split only shrinks the
# training set.
X_val_vec = vectorizer.transform(X_val)
X_test_vec = vectorizer.transform(test_texts)

In [6]:
# 5. Train and Evaluate kNN
# k = 5 neighbours on the TF-IDF features; fit() returns the estimator itself,
# so construction and fitting are chained.
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train_vec, y_train)

# Score on the test split.
# NOTE(review): the validation split (X_val_vec / y_val) is not used in this
# cell -- confirm whether model comparison was meant to happen there instead.
y_pred_knn = knn.predict(X_test_vec)

print("kNN Results:")
print(classification_report(test_labels, y_pred_knn))

kNN Results:
              precision    recall  f1-score   support

           0       0.61      0.73      0.67     12500
           1       0.67      0.55      0.60     12500

    accuracy                           0.64     25000
   macro avg       0.64      0.64      0.63     25000
weighted avg       0.64      0.64      0.63     25000



In [7]:
# 6. Train and Evaluate Logistic Regression
# C=1.0 is the inverse regularisation strength; max_iter is raised to 1000
# iterations for the solver.
logreg = LogisticRegression(C=1.0, max_iter=1000).fit(X_train_vec, y_train)

# Predict on the test split and print per-class precision/recall/F1.
y_pred_logreg = logreg.predict(X_test_vec)
print("📊 Logistic Regression Results:")
print(classification_report(test_labels, y_pred_logreg))

📊 Logistic Regression Results:
              precision    recall  f1-score   support

           0       0.88      0.87      0.88     12500
           1       0.87      0.88      0.88     12500

    accuracy                           0.88     25000
   macro avg       0.88      0.88      0.88     25000
weighted avg       0.88      0.88      0.88     25000



In [18]:
# 7. Feedforward neural network (MLP classifier)
# One hidden layer of 50 ReLU units trained with Adam. random_state fixes the
# estimator's random seed.
mlp = MLPClassifier(
    # architecture
    hidden_layer_sizes=(50,),
    activation='relu',
    # optimisation / regularisation
    solver='adam',
    alpha=0.0005,
    max_iter=300,
    # With early_stopping=True scikit-learn holds out part of the training
    # data internally and stops once the validation score stops improving.
    early_stopping=True,
    random_state=42,
).fit(X_train_vec, y_train)

# Score on the test split.
y_pred_mlp = mlp.predict(X_test_vec)

print("MLP (Feedforward Neural Network) Results:")
print(classification_report(test_labels, y_pred_mlp))

MLP (Feedforward Neural Network) Results:
              precision    recall  f1-score   support

           0       0.85      0.89      0.87     12500
           1       0.88      0.85      0.87     12500

    accuracy                           0.87     25000
   macro avg       0.87      0.87      0.87     25000
weighted avg       0.87      0.87      0.87     25000

