In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt 

In [2]:
# Load the movie dataset; the first CSV row (index 0) supplies the column names.
df = pd.read_csv(
    "../Dataset/Movie_classification.csv",
    header=0,
)

In [3]:
# Mean of the Time_taken column (missing values are skipped by pandas' default).
df.loc[:, 'Time_taken'].mean()

157.39149797570855

In [4]:
# Replace missing Time_taken values with the column mean.
# The filled Series is assigned back to the column on purpose: calling
# fillna(inplace=True) on the df['Time_taken'] selection is chained assignment,
# which emits a FutureWarning on pandas 2.2 and no longer updates `df` once
# copy-on-write is the default (pandas 3.0), leaving the NaNs in place.
df['Time_taken'] = df['Time_taken'].fillna(df['Time_taken'].mean())

In [5]:
# One-hot encode the 3D_available and Genre columns; drop_first=True omits the
# first level of each so only k-1 indicator columns are produced per column.
df = pd.get_dummies(
    df,
    columns=['3D_available', 'Genre'],
    drop_first=True,
)

In [6]:
# Feature matrix: every column except the Start_Tech_Oscar target.
X = df.loc[:, ~df.columns.isin(['Start_Tech_Oscar'])]

In [7]:
# Target vector: the Start_Tech_Oscar column.
y = df.loc[:, 'Start_Tech_Oscar']

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score

# Baseline forest: 1000 trees, n_jobs=-1 to use all processors, seeded for repeatability.
rf = RandomForestClassifier(
    random_state=42,
    n_jobs=-1,
    n_estimators=1000,
)

In [11]:
# Train the baseline forest on the training split (fit returns the estimator itself).
rf.fit(X=X_train, y=y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1000,
                       n_jobs=-1, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [12]:
# Confusion matrix of the baseline forest on the held-out test split
# (rows are true classes, columns are predicted classes).
confusion_matrix(y_true=y_test, y_pred=rf.predict(X_test))

array([[25, 19],
       [18, 40]], dtype=int64)

In [13]:
# Fraction of test rows the baseline forest classifies correctly.
accuracy_score(y_true=y_test, y_pred=rf.predict(X_test))

0.6372549019607843

<b>Using grid search to tune the random forest hyperparameters</b>

In [19]:
from sklearn.model_selection import GridSearchCV

In [15]:
# Base estimator handed to the grid search: 250 trees with a fixed seed.
rf_cls = RandomForestClassifier(random_state=42, n_estimators=250)

In [24]:
# Search grid: 7 max_features values x 3 min_samples_split values = 21 candidates.
params_grid = dict(
    max_features=list(range(4, 11)),
    min_samples_split=[2, 3, 10],
)

In [25]:
# 5-fold cross-validated search over params_grid, scored by accuracy,
# with n_jobs=-1 to run candidates on all processors.
grid_search = GridSearchCV(
    estimator=rf_cls,
    param_grid=params_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)

In [26]:
# Run the search on the training split only; the test split stays untouched.
grid_search.fit(X=X_train, y=y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=250, n_jobs=None,
                                              oob_score=False, random_state=42,
                                              verbose=0, warm_start=False),
             iid='warn', n_jobs=-1,
             param_grid={'max_f

In [27]:
# Parameter combination from params_grid with the best mean cross-validated accuracy.
grid_search.best_params_

{'max_features': 7, 'min_samples_split': 3}

In [28]:
# Keep the winning model. No refit argument was passed to GridSearchCV, so with
# its default (refit=True) this estimator was refit on the full training split.
cv_cls = grid_search.best_estimator_

In [29]:
# Test-set accuracy of the tuned forest, for comparison with the baseline.
accuracy_score(y_true=y_test, y_pred=cv_cls.predict(X_test))

0.6372549019607843

In [30]:
# Confusion matrix of the tuned forest on the held-out test split
# (rows are true classes, columns are predicted classes).
confusion_matrix(y_true=y_test, y_pred=cv_cls.predict(X_test))

array([[27, 17],
       [20, 38]], dtype=int64)