In [None]:
import os
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

# Function to load reviews from dataset directory
def load_reviews(directory):
    """Load labelled review texts from a dataset directory.

    ``directory`` must contain two subdirectories, ``pos`` and ``neg``.
    Every regular file inside them is read as UTF-8 text and treated as
    one review.

    Args:
        directory: Path to the folder holding the ``pos`` and ``neg``
            subdirectories.

    Returns:
        A ``(reviews, labels)`` tuple of two parallel lists: the raw review
        strings and their integer labels (1 for ``pos``, 0 for ``neg``).
        All ``pos`` reviews come first, then all ``neg`` reviews; within
        each class the files are ordered by filename.

    Raises:
        FileNotFoundError: If ``pos`` or ``neg`` is missing under
            ``directory``.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    reviews, labels = [], []
    for label in ['pos', 'neg']:
        path = os.path.join(directory, label)
        target = 1 if label == 'pos' else 0  # 1 for positive, 0 for negative
        # os.listdir() returns entries in arbitrary order; sorting keeps the
        # sample order (and anything trained on it) reproducible across runs
        # and machines.
        for filename in sorted(os.listdir(path)):
            file_path = os.path.join(path, filename)
            # Skip nested directories and other non-regular entries, which
            # open() cannot read as a review.
            if not os.path.isfile(file_path):
                continue
            with open(file_path, 'r', encoding='utf-8') as file:
                reviews.append(file.read())
            labels.append(target)
    return reviews, labels

# Paths to dataset directories.
# Built with os.path.join so the separator matches the host OS; the previous
# hard-coded backslashes only resolved on Windows (where the resulting
# strings are unchanged).
dataset_root = os.path.join("Movie review", "aclImdb")
train_dir = os.path.join(dataset_root, "train")
test_dir = os.path.join(dataset_root, "test")

# Load training and testing data (raw review strings plus 0/1 labels)
X_train_text, y_train = load_reviews(train_dir)
X_test_text, y_test = load_reviews(test_dir)

# Convert text into numerical feature vectors using TF-IDF.
# The vocabulary is capped at 5000 terms and English stop words are dropped.
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
# Fit the vocabulary/IDF weights on the training split only, then reuse them
# for the test split so no test-set statistics leak into the features.
X_train = vectorizer.fit_transform(X_train_text)
X_test = vectorizer.transform(X_test_text)

# Train Decision Tree Classifier (fixed random_state for repeatable runs)
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)

# Predict on test set
y_pred_dt = dt_model.predict(X_test)

# Evaluate Decision Tree Model
print("\nDecision Tree Classifier:")
print("Accuracy:", accuracy_score(y_test, y_pred_dt))
print(classification_report(y_test, y_pred_dt))


Decision Tree Classifier:
Accuracy: 0.7128
              precision    recall  f1-score   support

           0       0.71      0.72      0.71     12500
           1       0.72      0.71      0.71     12500

    accuracy                           0.71     25000
   macro avg       0.71      0.71      0.71     25000
weighted avg       0.71      0.71      0.71     25000

