In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV  # For splitting data and hyperparameter tuning
from sklearn.preprocessing import StandardScaler, OneHotEncoder  # For scaling numerical data and encoding categorical data
from sklearn.compose import ColumnTransformer  # To combine transformations for different feature types
from sklearn.pipeline import Pipeline  # To create a pipeline that chains data transformations and model fitting
from sklearn.svm import SVC  # Support Vector Classifier (SVM)
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report  # For model evaluation

# -----------------------------------------
# 1. Load the Data
# -----------------------------------------
# Read the combined dataset that includes a column 'dataset' indicating train, validation, or test split
data = pd.read_csv('combined_df.csv')

# -----------------------------------------
# 2. Define Features and Target
# -----------------------------------------
# List the features to be used for modeling and specify the target variable
features = ['runtime', 'budget', 'genres', 'production_companies', 'production_countries', 
            'spoken_languages', 'director1', 'actor1Name']
target = 'success_level'

# Split the data into training, validation, and test sets based on the 'dataset' column
train_data = data[data['dataset'] == 'train']
validation_data = data[data['dataset'] == 'validation']
test_data = data[data['dataset'] == 'test']

# Separate features (X) and target (y) for each dataset
X_train = train_data[features]
y_train = train_data[target]
X_val = validation_data[features]
y_val = validation_data[target]
X_test = test_data[features]
y_test = test_data[target]

# -----------------------------------------
# 3. Preprocessing Pipelines
# -----------------------------------------
# Define the numerical features and create a pipeline for scaling them
numeric_features = ['runtime', 'budget']
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())  # Standardize numerical features by removing the mean and scaling to unit variance
])

# Define the categorical features and create a pipeline for one-hot encoding them
categorical_features = ['genres', 'production_companies', 'production_countries', 'spoken_languages', 
                        'director1', 'actor1Name']
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))  # Convert categorical variables into a one-hot numeric array
])

# Combine the numerical and categorical pipelines into a single preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),      # Apply numeric_transformer to numeric_features
        ('cat', categorical_transformer, categorical_features) # Apply categorical_transformer to categorical_features
    ]
)

# -----------------------------------------
# 4. Create the SVM Model Pipeline
# -----------------------------------------
# Build a pipeline that first preprocesses the data and then fits a Support Vector Classifier with a linear kernel.
svm_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),  # Apply the combined preprocessing steps
    ('classifier', SVC(kernel='linear', random_state=42))  # SVM classifier with a linear kernel and fixed random state
])

# -----------------------------------------
# 5. Hyperparameter Tuning via GridSearchCV
# -----------------------------------------
# Set up a parameter grid for tuning hyperparameters of the SVM model
param_grid = {
    'classifier__C': [0.1, 1, 10],  # Regularization parameter values
    'classifier__gamma': ['scale', 'auto'],  # Gamma parameter for kernel function (included for experimentation)
}

# Create a GridSearchCV object to search for the best hyperparameters using 5-fold cross-validation,
# optimizing for accuracy, and with verbosity to track progress.
grid_search = GridSearchCV(svm_pipeline, param_grid, cv=5, scoring='accuracy', verbose=1)
# Fit the grid search on the validation data (you can also fit on training, but here we use validation)
grid_search.fit(X_val, y_val)

# Print the best hyperparameters and the corresponding cross-validation accuracy score
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score: {:.2f}".format(grid_search.best_score_))

# -----------------------------------------
# 6. Evaluate the SVM Model on Test Data
# -----------------------------------------
# Use the best estimator found by GridSearchCV to predict on the test set
y_test_pred = grid_search.predict(X_test)
# Print the test accuracy
print(f'Test Accuracy: {accuracy_score(y_test, y_test_pred)}')
# Print the numeric confusion matrix and detailed classification report for test results
print(confusion_matrix(y_test, y_test_pred))
print(classification_report(y_test, y_test_pred))

Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best parameters: {'classifier__C': 0.1, 'classifier__gamma': 'scale'}
Best cross-validation score: 0.47
Test Accuracy: 0.46891043924700515
[[ 75  70 263]
 [ 16 216 432]
 [ 26 124 531]]
              precision    recall  f1-score   support

 Blockbuster       0.64      0.18      0.29       408
        Flop       0.53      0.33      0.40       664
     Success       0.43      0.78      0.56       681

    accuracy                           0.47      1753
   macro avg       0.53      0.43      0.41      1753
weighted avg       0.52      0.47      0.44      1753

