In [1]:
import os
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Function to load reviews from dataset directory
def load_reviews(directory):
    """Load labelled reviews from the ``pos`` and ``neg`` sub-folders of ``directory``.

    Every regular file inside ``<directory>/pos`` and ``<directory>/neg`` is read
    as UTF-8 text. Files are visited in sorted filename order so that the
    returned corpus is identical from run to run and machine to machine
    (``os.listdir`` alone returns entries in arbitrary order).

    Args:
        directory: Path of the folder that contains the ``pos`` and ``neg``
            sub-folders.

    Returns:
        A ``(reviews, labels)`` tuple of two equally long lists. ``reviews``
        holds the full text of each file; ``labels`` holds ``1`` for files
        found under ``pos`` and ``0`` for files found under ``neg``. All
        ``pos`` entries come before all ``neg`` entries.

    Raises:
        FileNotFoundError: If either sub-folder does not exist.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    reviews, labels = [], []
    for label in ['pos', 'neg']:
        path = os.path.join(directory, label)
        target = 1 if label == 'pos' else 0
        # Sort for a deterministic corpus order across filesystems.
        for filename in sorted(os.listdir(path)):
            file_path = os.path.join(path, filename)
            # Skip sub-directories (e.g. notebook checkpoint folders) that
            # would otherwise make open() fail.
            if not os.path.isfile(file_path):
                continue
            with open(file_path, 'r', encoding='utf-8') as handle:
                reviews.append(handle.read())
            labels.append(target)
    return reviews, labels

# Tunable settings, gathered in one place so a reader can adjust them easily.
MAX_FEATURES = 5000   # upper bound on the TF-IDF vocabulary size
N_ESTIMATORS = 100    # number of trees in the random forest
SEED = 42             # fixed seed so the forest is reproducible

# Paths to dataset directories.
# Built with os.path.join instead of raw strings containing backslashes, so
# the notebook also runs on Linux/macOS; on Windows the resulting strings are
# unchanged.
DATA_ROOT = os.path.join("Movie review", "aclImdb")
train_dir = os.path.join(DATA_ROOT, "train")
test_dir = os.path.join(DATA_ROOT, "test")

# Load training and testing data (review texts plus 1/0 sentiment labels)
X_train_text, y_train = load_reviews(train_dir)
X_test_text, y_test = load_reviews(test_dir)

# Convert text into numerical feature vectors using TF-IDF.
# The vectorizer is fitted on the training texts only and then merely applied
# to the test texts, so no test-set vocabulary leaks into the features.
vectorizer = TfidfVectorizer(max_features=MAX_FEATURES, stop_words='english')
X_train = vectorizer.fit_transform(X_train_text)
X_test = vectorizer.transform(X_test_text)

# Train Random Forest Classifier
rf_model = RandomForestClassifier(n_estimators=N_ESTIMATORS, random_state=SEED)
rf_model.fit(X_train, y_train)

# Predict on test set
y_pred_rf = rf_model.predict(X_test)

# Evaluate Random Forest Model: overall accuracy followed by the per-class
# precision / recall / F1 table.
print("\nRandom Forest Classifier:")
print("Accuracy:", accuracy_score(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf))


Random Forest Classifier:
Accuracy: 0.84148
              precision    recall  f1-score   support

           0       0.83      0.86      0.84     12500
           1       0.86      0.82      0.84     12500

    accuracy                           0.84     25000
   macro avg       0.84      0.84      0.84     25000
weighted avg       0.84      0.84      0.84     25000

