In [1]:
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# -------------------------
# 1. Load the Data Files
# -------------------------
# Read the training, validation, and test data from CSV files.
train_data = pd.read_csv('train_data.csv')
validation_data = pd.read_csv('validation_data.csv')
test_data = pd.read_csv('test_data.csv')

# -------------------------
# 2. Define Features and Target
# -------------------------
# Specify which columns will be used as features and which column is the target (success_level)
features = ['runtime', 'budget', 'genres', 'production_companies', 'production_countries', 'spoken_languages', 'director1', 'actor1Name']
target = 'success_level'

# Prepare training, validation, and test sets based on the selected features and target.
# Here, the data is already split into train, validation, and test datasets.
X_train = train_data[features]
y_train = train_data[target]
X_val = validation_data[features]
y_val = validation_data[target]
X_test = test_data[features]
y_test = test_data[target]

# -------------------------
# 3. Preprocessing for Numerical Features
# -------------------------
# Define which numerical features to scale and create a pipeline to apply StandardScaler.
numeric_features = ['runtime', 'budget']
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())  # Standardize the numerical features
])

# -------------------------
# 4. Preprocessing for Categorical Features
# -------------------------
# Define which categorical features to encode and create a pipeline for One-Hot Encoding.
categorical_features = ['genres', 'production_companies', 'production_countries', 'spoken_languages', 'director1', 'actor1Name']
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))  # One-Hot Encode categorical variables, ignoring unknown categories during transform
])

# -------------------------
# 5. Combine Preprocessing Steps
# -------------------------
# Use a ColumnTransformer to apply different preprocessing pipelines for numerical and categorical features.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),   # Apply numeric_transformer to numerical features
        ('cat', categorical_transformer, categorical_features)  # Apply categorical_transformer to categorical features
    ])

# -------------------------
# 6. Create the Random Forest Model Pipeline
# -------------------------
# Combine the preprocessor and the classifier in a single pipeline.
rf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))  # Random Forest classifier with fixed random state for reproducibility
])

# -------------------------
# 7. Hyperparameter Tuning Using GridSearchCV
# -------------------------
# Define a parameter grid for tuning the Random Forest model.
param_grid = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_depth': [None, 10, 20, 30],
    'classifier__min_samples_split': [2, 5, 10]
}

# Create a GridSearchCV object to find the best parameters using 5-fold cross validation on the validation data.
grid_search = GridSearchCV(rf_pipeline, param_grid, cv=5, scoring='accuracy', verbose=1)
grid_search.fit(X_val, y_val)  # Fit the GridSearch on the validation set

# Print the best parameters and the corresponding cross-validation score.
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score: {:.2f}".format(grid_search.best_score_))

# -------------------------
# 8. Evaluate the Model on the Test Data
# -------------------------
# Predict the target labels for the test data using the best found model.
y_test_pred = grid_search.predict(X_test)
# Compute and print the accuracy, numeric confusion matrix, and detailed classification report.
print(f'Test Accuracy: {accuracy_score(y_test, y_test_pred)}')
print(confusion_matrix(y_test, y_test_pred))
print(classification_report(y_test, y_test_pred))

Fitting 5 folds for each of 36 candidates, totalling 180 fits
Best parameters: {'classifier__max_depth': None, 'classifier__min_samples_split': 5, 'classifier__n_estimators': 100}
Best cross-validation score: 0.55
Test Accuracy: 0.5316600114090131
[[  3   2 288]
 [  2  15 505]
 [  5  19 914]]
              precision    recall  f1-score   support

 Blockbuster       0.30      0.01      0.02       293
        Flop       0.42      0.03      0.05       522
     Success       0.54      0.97      0.69       938

    accuracy                           0.53      1753
   macro avg       0.42      0.34      0.25      1753
weighted avg       0.46      0.53      0.39      1753

