Read the Bollywood (Indian movies in the Hindi language) dataset, which contains cast and crew information together with box-office gross figures.  
First we need to load the full dataset from the CSV file. Later we can choose one of the gross value fields as the target for prediction.


In [None]:
import numpy as np
import pandas as pd

# Load the whole dataset; the first CSV column becomes the row index.
full_dataframe = pd.read_csv(
    filepath_or_buffer="../input/bollywood-movie-dataset/Movie.csv",
    index_col=0,
)

# Preview the first ten rows.
full_dataframe.head(n=10)

Data preparation steps:
1. Drop the 'Title' column. In this task I did not take this column into account, since that would require text processing algorithms.
2. Clean up the currency columns and convert them to a numeric data type.
3. Fix the 'Release Date' column and split it into Year, Month and Day features.
4. Use one-hot encoding to encode all comma-separated columns, like 'Director', into thousands of feature columns.
5. Use one-hot encoding to encode all 'Actors' columns. The seven 'Actors' columns have the same meaning and should be encoded in one operation.
6. Use one-hot encoding (pd.get_dummies) to encode simple string columns such as 'Genre' and 'Production Banner'.

In [None]:
def get_object_cols(df):
    """Return the names of the columns of `df` whose dtype is ``object``.

    Args:
        df: DataFrame to inspect.

    Returns:
        A list of column names, in the order they appear in `df.columns`.
    """
    object_cols = []
    for cname in df.columns:
        if df[cname].dtype == "object":
            object_cols.append(cname)
    return object_cols

def encode_comma_separated_cols(df, max_cols=10000, random_state=None):
    """Expand the comma-separated text columns of `df` into per-value count columns.

    For each of the hard-coded columns ('Dialogue', 'Director', 'Directors',
    'Lyrics', 'Music', 'Producer', 'Screenplay', 'Story') every cell is split
    on "," and each distinct token becomes a column named "<col> <token>"
    holding how many times the token occurs in that row (0 when absent).
    Tokens are used exactly as produced by the split; no whitespace is
    stripped here.

    Behaviour worth knowing:
      * The first token column is discarded by ``.iloc[:, 1:]``.
        NOTE(review): which token that is depends on the column order pandas
        produces - confirm it drops a placeholder and not a real name.
      * At most `max_cols` token columns are kept, picked at random; the
        sampling also shuffles the column order.
      * The source column is replaced only when more than one token column
        is kept; otherwise it is left in `df` untouched.
      * The number of kept columns is printed for every source column.

    Args:
        df: DataFrame containing all of the columns listed above.
        max_cols: Upper bound on the number of token columns kept per
            source column.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            selected columns and their order are reproducible. ``None``
            (the default) keeps the sampling unseeded.

    Returns:
        A new DataFrame; the input `df` is not modified.
    """
    comma_separated_cols = ['Dialogue', 'Director', 'Directors', 'Lyrics', 'Music', 'Producer', 'Screenplay', 'Story']
    for col in comma_separated_cols:
        # `fillna(0).astype('int64')` replaces the deprecated
        # `fillna(0, downcast='infer')`: the counts are whole numbers, so the
        # resulting integer columns are the same.
        oh_cols = (df[col].str.split(pat=",", expand=True)
            .apply(pd.Series.value_counts, axis=1)
            .iloc[:, 1:]
            .fillna(0)
            .astype('int64')
            .add_prefix(col + ' '))

        oh_cols_count = min(oh_cols.shape[1], max_cols)
        oh_cols = oh_cols.sample(oh_cols_count, axis=1, random_state=random_state)

        print("Count of %s features: %d" % (col, oh_cols_count))

        if oh_cols_count > 1:
            df = df.drop(col, axis=1)
            df = pd.concat([df, oh_cols], axis=1)

    return df

def encode_actors_cols(df, max_cols=10000, random_state=None):
    """Replace the seven 'Actors N' columns with one count column per actor.

    The columns 'Actors 1' ... 'Actors 7' are treated as one pool: for every
    row the values across the seven columns are counted, and each distinct
    value becomes a column named "Actor <value>" holding that row's count
    (0 when absent). Missing values are not counted.

    Behaviour worth knowing:
      * The first resulting column is discarded by ``.iloc[:, 1:]``.
        NOTE(review): which value that is depends on the column order pandas
        produces - confirm it drops a placeholder and not a real actor.
      * At most `max_cols` actor columns are kept, picked at random; the
        sampling also shuffles the column order.
      * The number of kept columns is printed.

    Args:
        df: DataFrame containing the columns 'Actors 1' ... 'Actors 7'.
        max_cols: Upper bound on the number of actor columns kept.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            selected columns and their order are reproducible. ``None``
            (the default) keeps the sampling unseeded.

    Returns:
        A new DataFrame without the 'Actors N' columns and with the actor
        count columns appended; the input `df` is not modified.
    """
    actors_cols = ['Actors ' + str(x) for x in range(1, 8)]

    # `fillna(0).astype('int64')` replaces the deprecated
    # `fillna(0, downcast='infer')`: the counts are whole numbers, so the
    # resulting integer columns are the same.
    oh_actors = (df[actors_cols]
        .apply(pd.Series.value_counts, axis=1)
        .iloc[:, 1:]
        .fillna(0)
        .astype('int64')
        .add_prefix('Actor '))

    oh_cols_count = min(oh_actors.shape[1], max_cols)
    oh_actors = oh_actors.sample(oh_cols_count, axis=1, random_state=random_state)

    print("Count of %s features: %d" % ('Actors', oh_cols_count))

    df = df.drop(actors_cols, axis=1)
    df = pd.concat([df, oh_actors], axis=1)

    return df

def encode_object_cols(df):
    """One-hot encode every remaining object-dtype column of `df`.

    The object columns (as reported by `get_object_cols`) are passed to
    ``pd.get_dummies``, removed from the frame, and the resulting dummy
    columns are appended on the right.

    Args:
        df: DataFrame to encode.

    Returns:
        A new DataFrame; the input `df` is not modified. When `df` has no
        object columns an unchanged copy is returned.
    """
    object_cols = get_object_cols(df)

    # Nothing to encode: skip get_dummies, which would otherwise be asked to
    # encode an empty column selection.
    if not object_cols:
        return df.copy()

    oh_cols = pd.get_dummies(df[object_cols])

    df = df.drop(object_cols, axis=1)
    df = pd.concat([df, oh_cols], axis=1)

    return df

def pre_processing(df):
    """Turn the raw movie table into an all-numeric feature table.

    Steps, in order:
      1. Drop the 'Title' column.
      2. Convert the currency columns to numbers: a leading '$', every ','
         and every '-' are removed, and cells left empty become NaN.
      3. Fill missing 'Budget' values with the mean budget.
      4. Convert 'Runtime' to a number and split 'Release Date' into
         'Release Year', 'Release Month' and 'Release Day'.
      5. Strip surrounding whitespace from all remaining object columns.
      6. Encode the comma-separated columns, the 'Actors N' columns and any
         remaining object columns.

    Args:
        df: Raw DataFrame containing the columns referenced above. The first
            step rebinds `df` to a new frame, so the caller's object is not
            modified by the assignments that follow.

    Returns:
        The prepared DataFrame.
    """
    # Drop useless column
    df = df.drop('Title', axis=1)

    # Fixing currency columns
    currency_cols = ['Budget', 'First Day', 'First Week', 'First Weekend', 'India Gross', "Overseas Gross", 'Worldwide Gross']

    for col in currency_cols:
        # Removing every '-' turns a lone "-" cell into "", which then
        # becomes NaN; it also removes any minus sign inside a value.
        cleaned = (df[col].str.lstrip('$')
                   .str.replace(',', '', regex=False)
                   .str.replace('-', '', regex=False)
                   .replace("", np.nan))
        df[col] = pd.to_numeric(cleaned)

    # Assign the filled column back instead of calling fillna(inplace=True)
    # on `df.Budget`: the in-place call acts on an intermediate object and
    # can leave `df` unchanged when pandas Copy-on-Write is active.
    df['Budget'] = df['Budget'].fillna(df['Budget'].mean())

    # Fixing time and date columns
    # rstrip(' min') removes any trailing run of the characters ' ', 'm',
    # 'i', 'n', which leaves just the digits of values such as "120 min".
    df['Runtime'] = pd.to_numeric(df['Runtime'].str.rstrip(' min'))

    # Parse the release dates once and reuse the result for all three parts.
    release_dates = pd.DatetimeIndex(df['Release Date'])
    df['Release Year'] = release_dates.year
    df['Release Month'] = release_dates.month
    df['Release Day'] = release_dates.day
    df = df.drop('Release Date', axis=1)

    for col in get_object_cols(df):
        df[col] = df[col].str.strip()

    df = encode_comma_separated_cols(df)
    df = encode_actors_cols(df)
    df = encode_object_cols(df)

    return df

# Run the preparation pipeline on a copy, so `full_dataframe` keeps the data as loaded from the CSV.
df = pre_processing(full_dataframe.copy())

Let's choose 'India Gross' as the prediction target and split the prepared data into training and test datasets.

In [None]:
from sklearn.model_selection import train_test_split

target_cols = ['First Day', 'First Week', 'First Weekend', 'India Gross', "Overseas Gross", 'Worldwide Gross']

# Remove rows with missing target, separate target from predictors
i_gross_df = df.dropna(axis=0, subset=['India Gross'])

# All box-office columns are removed from the predictors, not only the chosen target
X_full = i_gross_df.drop(target_cols, axis=1)
y_full = i_gross_df['India Gross']

# Hold out 10% of the rows for testing. The fixed seed makes the split, and
# therefore the scores computed from it, the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X_full, y_full, test_size=0.1, random_state=42)

Some preparation has been made here for feature selection using LinearSVC, but I disabled feature selection because it didn't bring significant results.

In [None]:
from sklearn.svm import LinearSVC
#from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel

def select_features_l1(X, y):
    """Return the columns of `X` kept by an L1-penalised LinearSVC.

    A ``LinearSVC(C=0.01, penalty="l1", dual=False)`` is fitted on `X` and
    `y`, and ``SelectFromModel`` then keeps the features it selects from that
    fitted model.

    NOTE(review): LinearSVC is a classifier, while this notebook passes a
    continuous target ('India Gross') - confirm that `fit` accepts it before
    enabling feature selection.

    Args:
        X: Feature DataFrame.
        y: Target values aligned with the rows of `X`.

    Returns:
        The labels of the columns of `X` that were kept and whose values are
        not constant.
    """
    lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(X, y)
    # The selector must wrap the model fitted above (the previous code
    # referenced an undefined `logistic` variable here).
    selector = SelectFromModel(lsvc, prefit=True)

    X_new = selector.transform(X)

    # Get back the kept features as a DataFrame with dropped columns as all 0s
    selected_features = pd.DataFrame(selector.inverse_transform(X_new),
                                     index=X.index,
                                     columns=X.columns)

    # Dropped columns are all zeros, so their variance is 0
    selected_columns = selected_features.columns[selected_features.var() != 0]
    return selected_columns

# Feature selection is switched off; set the flag to True to run it.
is_select_features = False
if is_select_features:
    print("Feature selection")

    selected = select_features_l1(X_train, y_train)

    # Columns of the training data that the selector did not keep
    dropped_columns = X_train.columns.drop(selected)

    # Apply the same column reduction to the training and the held-out data
    selected_features_train = X_train.drop(dropped_columns, axis=1)
    selected_features_valid = X_test.drop(dropped_columns, axis=1)

    print('Selected features:', list(selected_features_train.columns))

This code finds the best parameters for RandomForestRegressor using GridSearchCV. Skip this code to speed up notebook execution.

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# The hyperparameter search is switched off; set the flag to True to run it.
find_best_params = False
if find_best_params:
    # Only the first `n_samples` rows of the training split take part in the search.
    n_samples = 1000

    # Candidate values tried for each hyperparameter.
    parameter_grid = dict(
        max_depth=[10, 15, 20],
        n_estimators=[50, 100, 200],
    )

    rf_reg = RandomForestRegressor()
    grid_searcher = GridSearchCV(estimator=rf_reg, param_grid=parameter_grid, verbose=2)
    grid_searcher.fit(X_train[:n_samples], y_train[:n_samples])

    # Estimator refitted with the best parameter combination found.
    rf_reg_best = grid_searcher.best_estimator_

    print('Best params = ', rf_reg_best.get_params())

Use the best hyperparameters for the regressor that I found in the previous step.

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Using best parameters
# Using best parameters. The fixed seed makes the fitted forest, and the
# scores computed from it, the same on every run.
rf_reg_params = {'n_estimators': 100,
                 'max_depth': 15,
                 'random_state': 42}

rf_reg = RandomForestRegressor(**rf_reg_params)

rf_reg.fit(X_train, y_train)

# Score on the rows the model was trained on
score = rf_reg.score(X_train, y_train)
print("Training score: ", score)

In [None]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

# Evaluate the random forest on the held-out rows
test_pred = rf_reg.predict(X_test)

test_score = mean_absolute_error(y_test, test_pred)
print("Test MAE score:", test_score)

# Root mean squared error. The square root is taken explicitly because the
# `squared=False` keyword is not accepted by newer scikit-learn releases.
# The label says RMSE since that is the value being printed.
test_score = np.sqrt(mean_squared_error(y_test, test_pred))
print("Test RMSE score:", test_score)

test_score = r2_score(y_test, test_pred)
print("Test R2 score:", test_score)

We've got an R2 score of 0.68. This is greater than 0.5 and means that **our model is not perfect but acceptable for solving the task**.  
Let's try another model, GradientBoostingRegressor.

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# The fixed seed makes the fitted model, and the scores computed from it,
# the same on every run.
gb_reg_params = {'n_estimators': 500,
                 'random_state': 42}

gb_reg = GradientBoostingRegressor(**gb_reg_params)
gb_reg.fit(X_train, y_train)

# Score on the rows the model was trained on
score = gb_reg.score(X_train, y_train)
print("Training score: ", score)

In [None]:
# Evaluate the gradient boosting model on the held-out rows
test_pred = gb_reg.predict(X_test)

test_score = mean_absolute_error(y_test, test_pred)
print("Test MAE score:", test_score)

# Root mean squared error. The square root is taken explicitly because the
# `squared=False` keyword is not accepted by newer scikit-learn releases.
# The label says RMSE since that is the value being printed.
test_score = np.sqrt(mean_squared_error(y_test, test_pred))
print("Test RMSE score:", test_score)

test_score = r2_score(y_test, test_pred)
print("Test R2 score:", test_score)

We've got an R2 score of 0.78! This result is much closer to 0.8 and means that our **model performs well for this task**.  
Next we will visualize the results for GradientBoostingRegressor. To do that, we first compute the test set deviance and then plot it against the boosting iterations.

In [None]:
import matplotlib.pyplot as plt

# Test-set deviance after each boosting stage
test_score = np.zeros((gb_reg_params['n_estimators'],), dtype=np.float64)
for i, y_pred in enumerate(gb_reg.staged_predict(X_test)):
    # Mean squared error, computed directly because the estimator's `loss_`
    # callable is not available in current scikit-learn releases. This is
    # the model's own loss only because `gb_reg` is created without a `loss`
    # argument, i.e. with the default squared-error loss.
    test_score[i] = np.mean((np.asarray(y_test) - y_pred) ** 2)

# 1-based stage numbers shared by both curves
boosting_iterations = np.arange(gb_reg_params['n_estimators']) + 1

fig = plt.figure(figsize=(6, 6))
plt.subplot(1, 1, 1)
plt.title('Deviance')
plt.plot(boosting_iterations, gb_reg.train_score_, 'b-',
         label='Training Set Deviance')
plt.plot(boosting_iterations, test_score, 'r-',
         label='Test Set Deviance')
plt.legend(loc='upper right')
plt.xlabel('Boosting Iterations')
plt.ylabel('Deviance')
fig.tight_layout()
plt.show()

Try using XGBRegressor instead of GradientBoostingRegressor.

In [None]:
from xgboost import XGBRegressor

# Trying XGBRegressor
# Fit an XGBoost regressor created with no arguments
xgb_reg = XGBRegressor().fit(X_train, y_train)

# Score on the rows the model was trained on
score = xgb_reg.score(X_train, y_train)
print("Training score: ", score)

In [None]:
# Evaluate the XGBoost model on the held-out rows
test_pred = xgb_reg.predict(X_test)

test_score = mean_absolute_error(y_test, test_pred)
print("Test MAE score:", test_score)

# Root mean squared error. The square root is taken explicitly because the
# `squared=False` keyword is not accepted by newer scikit-learn releases.
# The label says RMSE since that is the value being printed.
test_score = np.sqrt(mean_squared_error(y_test, test_pred))
print("Test RMSE score:", test_score)

test_score = r2_score(y_test, test_pred)
print("Test R2 score:", test_score)

Obviously GradientBoostingRegressor is better suited for this task.