In [None]:
import pandas as pd
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder


In [None]:
# Read the training and test CSV files (paths are relative to the working directory).
train_df, test_df = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))


In [None]:
# Split the training frame into the feature matrix and the target column.
# 'Id' is dropped together with the label so it is never used as a feature.
X_train = train_df.drop(columns=['Id', 'HitOrFlop'])
y_train = train_df['HitOrFlop']

# Columns routed to each preprocessing branch of the ColumnTransformer below.
categorical_features = ['Actor1', 'Actor2', 'Actor3', 'Director', 'Genre', 'Language']
numerical_features = ['Budget', 'Runtime', 'LanguagesDubbedCount', 'TrailerViewCount']

# Numerical columns are standardised; categorical columns are one-hot encoded.
# handle_unknown='ignore' makes the encoder emit an all-zero row for a category
# it did not see while fitting, instead of raising. Without it, any value that
# is absent from the fitted data (in a cross-validation fold, a hold-out split,
# or test.csv) aborts transform/predict with a ValueError.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ]
)


In [None]:
# Two-stage estimator: column-wise preprocessing followed by a random forest
# with a fixed seed so repeated runs build the same trees.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42)),
])


In [None]:
# Hyperparameter grid for the random forest. The 'classifier__' prefix routes
# each setting to the pipeline step named 'classifier'.
param_grid = dict(
    classifier__n_estimators=[100, 200, 300],
    classifier__max_depth=[None, 10, 20, 30],
    classifier__min_samples_split=[2, 5, 10],
    classifier__min_samples_leaf=[1, 2, 4],
)


In [None]:
# Exhaustive search over param_grid with 5-fold cross-validation, scored by accuracy.
# The grid holds 3 * 4 * 3 * 3 = 108 combinations, i.e. 540 pipeline fits plus the
# final refit, so n_jobs=-1 spreads the independent fits across all available cores.
# The forest's random_state is fixed, so parallel execution does not change the scores.
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train, y_train)


In [None]:
# Keep a handle on the winning pipeline, then report the settings that produced it.
best_model = grid_search.best_estimator_
print(f"Best parameters: {grid_search.best_params_}")


Best parameters: {'classifier__max_depth': None, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 200}


In [None]:
# Hold out 20% of the training data as a sanity check for the tuned configuration.
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

# Fit an unfitted copy instead of best_model itself. best_model is the estimator
# GridSearchCV refit on all of X_train; calling .fit() on it here would overwrite
# that fit with one that has only seen 80% of the rows, and every later cell that
# predicts with best_model would silently use the weaker model.
validation_model = clone(best_model)
validation_model.fit(X_train_split, y_train_split)
y_pred = validation_model.predict(X_val_split)

# NOTE: the hyperparameters were selected by grid_search on all of X_train, which
# includes these held-out rows, so the numbers below are an optimistic estimate.
print(f'Accuracy: {accuracy_score(y_val_split, y_pred)}')
print(f'Confusion Matrix:\n {confusion_matrix(y_val_split, y_pred)}')
print(f'Classification Report:\n {classification_report(y_val_split, y_pred)}')


Accuracy: 1.0
Confusion Matrix:
 [[70  0]
 [ 0 70]]
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        70
           1       1.00      1.00      1.00        70

    accuracy                           1.00       140
   macro avg       1.00      1.00      1.00       140
weighted avg       1.00      1.00      1.00       140



In [None]:
# Drop the 'Id' column from the test frame, then predict a label for every remaining row.
X_test = test_df.drop(labels=['Id'], axis='columns')
predictions = best_model.predict(X_test)


In [None]:
# Pair each test 'Id' with its predicted label and write the result without the index column.
submission = test_df[['Id']].assign(HitOrFlop=predictions)
submission.to_csv('submission.csv', index=False)
