In [1]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split,cross_val_predict,cross_val_score
from sklearn.preprocessing import StandardScaler
from scipy.sparse import hstack
import pandas as pd
import numpy as np
from mlxtend.feature_selection import SequentialFeatureSelector
from sklearn.metrics import classification_report,
from sklearn.feature_selection import SelectKBest, chi2
import statsmodels.api as sm
from sklearn.model_selection import GridSearchCV
import warnings
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

In [2]:
warnings.filterwarnings('ignore')

In [3]:
data = pd.read_csv("combined_df.csv")
data.head()

Unnamed: 0,id,title,vote_average,vote_count,status,release_date,revenue,runtime,adult,budget,...,dir_flop,actor_total,actor_blockbuster,actor_success,actor_flop,dir_act_total,dir_act_blockbuster,dir_act_success,dir_act_flop,dataset
0,42816,The Champ,7.023,43,Released,11/21/1931,1600000,86,False,356000,...,,1.0,,1.0,,,,,,test
1,15467,Kismat Konnection,4.972,53,Released,7/18/2008,11000000,153,False,4180000,...,,8.0,1.0,6.0,,,,,,test
2,623010,Timecrowave,0.0,0,Released,5/9/2018,1000,13,False,500,...,,2.0,,2.0,,2.0,,2.0,,test
3,258832,Convention City,1.0,1,Released,12/14/1933,522000,69,False,239000,...,1.0,1.0,,,,,,,,test
4,45272,Country Strong,6.195,230,Released,12/22/2010,20601987,117,False,15000000,...,1.0,1.0,,,,,,,,test


In [4]:
data['dataset'].unique()

array(['test', 'train', 'validation', nan], dtype=object)

In [5]:
max_popularity = data['popularity'].max()
max_popularity

1175.267

In [6]:
data['popularity_pct'] = (data['popularity']*100)/max_popularity

In [7]:
columns_to_display = ['popularity', 'popularity_pct']  # Specify the columns you want to display
data[columns_to_display].head()
data['popularity_pct'].min()

0.0

In [8]:
data['avg_rating'] = (data['vote_average']+ (data['popularity_pct']/10))/2
data.head()

Unnamed: 0,id,title,vote_average,vote_count,status,release_date,revenue,runtime,adult,budget,...,actor_blockbuster,actor_success,actor_flop,dir_act_total,dir_act_blockbuster,dir_act_success,dir_act_flop,dataset,popularity_pct,avg_rating
0,42816,The Champ,7.023,43,Released,11/21/1931,1600000,86,False,356000,...,,1.0,,,,,,test,0.514692,3.537235
1,15467,Kismat Konnection,4.972,53,Released,7/18/2008,11000000,153,False,4180000,...,1.0,6.0,,,,,,test,0.396165,2.505808
2,623010,Timecrowave,0.0,0,Released,5/9/2018,1000,13,False,500,...,,2.0,,2.0,,2.0,,test,0.051052,0.002553
3,258832,Convention City,1.0,1,Released,12/14/1933,522000,69,False,239000,...,,,,,,,,test,0.111294,0.505565
4,45272,Country Strong,6.195,230,Released,12/22/2010,20601987,117,False,15000000,...,,,,,,,,test,1.003432,3.147672


In [9]:
train_data = data[data['dataset'] == 'train']
validate_data = data[data['dataset'] == 'validation']
test_data = data[data['dataset'] == 'test']

In [10]:
train_data['success_level'].value_counts()

success_level
Success        3177
Flop           3074
Blockbuster    1929
Name: count, dtype: int64

In [11]:
train_data.columns

Index(['id', 'title', 'vote_average', 'vote_count', 'status', 'release_date',
       'revenue', 'runtime', 'adult', 'budget', 'imdb_id', 'original_language',
       'original_title', 'overview', 'popularity', 'tagline', 'genres',
       'production_companies', 'production_countries', 'spoken_languages',
       'keywords', 'budget_millions', 'revenue_millions', 'profit', 'Action',
       'Adventure', 'Animation', 'Comedy', 'Crime', 'Documentary', 'Drama',
       'Family', 'Fantasy', 'History', 'Horror', 'Music', 'Mystery', 'Romance',
       'Science Fiction', 'TV Movie', 'Thriller', 'War', 'Western',
       'production_companies_20th Century Fox',
       'production_companies_Columbia Pictures',
       'production_companies_Metro-Goldwyn-Mayer',
       'production_companies_New Line Cinema',
       'production_companies_Paramount',
       'production_companies_Universal Pictures',
       'production_companies_Walt Disney Pictures',
       'production_companies_Warner Bros. Pictures',
  

In [12]:
validate_data.shape

(1752, 99)

In [13]:
test_data.shape

(1753, 99)

In [14]:
data[data['dataset'].isnull()]

Unnamed: 0,id,title,vote_average,vote_count,status,release_date,revenue,runtime,adult,budget,...,actor_blockbuster,actor_success,actor_flop,dir_act_total,dir_act_blockbuster,dir_act_success,dir_act_flop,dataset,popularity_pct,avg_rating
11685,214674,Zone Pro Site: The Moveable Feast,6.3,18,Released,8/16/2013,11074988,145,False,2500803,...,,,,,,,,,0.219099,3.160955


In [33]:
train_data.dtypes

id                   int64
title               object
vote_average       float64
vote_count           int64
status              object
                    ...   
dir_act_success    float64
dir_act_flop       float64
dataset             object
popularity_pct     float64
avg_rating         float64
Length: 99, dtype: object

In [27]:


# List of text columns to keep
columns_to_keep = ['actor1Name','dir1Name','success_level']

# Identify all columns with text (object) data type
text_columns = train_data.select_dtypes(include=['object']).columns

# Drop text columns except the ones in the keep list
columns_to_drop = [col for col in text_columns if col not in columns_to_keep]
train_filtered = train_data.drop(columns=columns_to_drop)


In [35]:
train_filtered.dtypes

id                       int64
vote_average           float64
vote_count               int64
revenue                  int64
runtime                  int64
                        ...   
dir_act_blockbuster    float64
dir_act_success        float64
dir_act_flop           float64
popularity_pct         float64
avg_rating             float64
Length: 74, dtype: object

In [65]:
X_noy =  train_filtered.drop('success_level', axis=1)
#X_without_text_columns = X_noy.select_dtypes(exclude=['object'])
Y = train_data[['success_level']]
#X_without_text_columns.head()
X = X_noy.drop(['id','revenue','budget_millions','revenue_millions','ordering_x','ordering_y','roi','vote_average',	'vote_count','popularity','revenue'], axis=1)
#X_train = X.fillna(X.mean())
#[X.select_dtypes(include=['number']).columns] = X.select_dtypes(include=['number']).fillna(X.mean())
# Select numeric columns only
numeric_cols = X.select_dtypes(include=['number']).columns

# Fill missing values in numeric columns with the mean of each column
X[numeric_cols] = X[numeric_cols].fillna(X[numeric_cols].mean())
X.head()

Unnamed: 0,runtime,adult,budget,profit,Action,Adventure,Animation,Comedy,Crime,Documentary,...,actor_total,actor_blockbuster,actor_success,actor_flop,dir_act_total,dir_act_blockbuster,dir_act_success,dir_act_flop,popularity_pct,avg_rating
1753,88,False,30000000,-17048912,1,0,0,0,0,0,...,7.0,1.0,2.0,4.0,1.0,1.384421,1.216352,1.0,1.056015,3.199801
1754,130,False,1288000,6304465,0,0,0,0,0,0,...,5.0,1.0,2.0,1.0,1.0,1.0,1.216352,1.12952,1.260139,4.020007
1755,89,False,17000000,-4159158,0,0,0,0,0,0,...,2.0,3.136387,3.294106,1.0,1.0,1.384421,1.216352,1.0,0.938085,2.539404
1756,130,False,10000000,22613173,0,0,0,0,0,0,...,20.0,4.0,4.0,7.0,1.0,1.384421,1.0,1.12952,1.144676,3.587734
1757,101,False,25000000,34192128,0,0,0,1,0,0,...,4.0,1.0,1.0,2.579003,1.0,1.384421,1.0,1.12952,1.52008,3.029504


In [67]:
#X.columns
X["actor1Name"].astype("category")
X["dir1Name"].astype("category")

1753            Kevin Hooks
1754       Alfred Hitchcock
1755          Robert Harmon
1756           Jeff Nichols
1757          Mark Helfrich
               ...         
9928            Amal Neerad
9929         Clint Eastwood
9930    Jean-Jacques Annaud
9931          Patty Jenkins
9932         Hany Abu-Assad
Name: dir1Name, Length: 8180, dtype: category
Categories (4003, object): ['A. Bhimsingh', 'A. Karunakaran', 'A.L. Vijay', 'A.R. Murugadoss', ..., 'Édouard Molinaro', 'Émile Gaudreault', 'Ömer Faruk Sorak', 'Ümit Utku']

In [37]:
# def stepwise_selection(X_train, y_train, threshold_in=0.05, threshold_out=0.05):
#     """
#     Perform stepwise regression: Both forward and backward selection.
    
#     X_train : Training dataset with the intercept
#     y_train : Target variable
#     threshold_in : Entry p-value threshold for a feature to enter the model
#     threshold_out : Exit p-value threshold for a feature to be removed
#     """
#     initial_features = X_train.columns.tolist()
#     included = list(initial_features)
#     while True:
#         changed = False
#         # Forward step: Add features that improve the model
#         excluded = list(set(initial_features) - set(included))
#         new_pval = pd.Series(index=excluded)
#         for new_col in excluded:
#             model = sm.Logit(y_train, X_train[included + [new_col]]).fit(disp=False)
#             new_pval[new_col] = model.pvalues[new_col]
#         min_pval = new_pval.min()
#         if min_pval < threshold_in:
#             best_feature = new_pval.idxmin()
#             included.append(best_feature)
#             changed = True

#         # Backward step: Remove features that have a high p-value
#         model = sm.Logit(y_train, X_train[included]).fit(disp=False)
#         pvalues = model.pvalues[1:]  # exclude intercept
#         max_pval = pvalues.max()
#         if max_pval > threshold_out:
#             worst_feature = pvalues.idxmax()
#             included.remove(worst_feature)
#             changed = True
        
#         if not changed:
#             break
#     return included

In [39]:
# selected_features = stepwise_selection(X, Y)
# print(f"Selected features: {selected_features}")

In [41]:
#X = X.drop(['vote_average', 'vote_count','popularity', 'profit','popularity_pct', 'avg_rating','roi'], axis=1)

In [None]:
label_encoder = LabelEncoder()
Y_numeric = label_encoder.fit_transform(Y)
xgb = XGBClassifier(objective='multi:softmax', num_class=3, eval_metric='mlogloss', use_label_encoder=False,enable_categorical=True, random_state=42)
sfs = SequentialFeatureSelector(xgb,
                                k_features=10,
                                forward=True,
                                scoring='accuracy',
                               # max_iter=100,
                                cv=5)
selected_features = sfs.fit(X, Y_numeric)
print(selected_features.k_feature_names_)

In [50]:
print(selected_features.k_feature_names_)

('vote_count', 'adult', 'profit', 'Animation', 'Crime', 'Documentary', 'Family', 'Fantasy', 'roi', 'dir_act_success')


In [69]:
categorical_cols = X.select_dtypes(include=['object']).columns

label_encoder = LabelEncoder()

# Apply LabelEncoder to each categorical column
for col in categorical_cols:
    X[col] = label_encoder.fit_transform(X[col])

xgb = XGBClassifier(objective='multi:softmax', num_class=3, eval_metric='mlogloss', use_label_encoder=False,enable_categorical=True, random_state=42)
xgb.fit(X, Y_numeric)
# Must use JSON/UBJSON for serialization, otherwise the information is lost.
y_train_pred = xgb.predict(X)
xgb.save_model("XGBoost-model.json")
xgb.feature_importances_

array([6.0960173e-04, 0.0000000e+00, 4.4676932e-03, 7.7459849e-02,
       1.6994993e-03, 7.1584817e-04, 0.0000000e+00, 5.4120412e-04,
       5.3751038e-04, 2.7502964e-03, 6.0804788e-04, 4.6617333e-03,
       1.0842740e-03, 0.0000000e+00, 1.8810080e-03, 6.7159941e-04,
       4.9250980e-04, 1.6818445e-03, 2.5845615e-03, 0.0000000e+00,
       1.5440304e-04, 0.0000000e+00, 0.0000000e+00, 0.0000000e+00,
       0.0000000e+00, 8.6966524e-05, 0.0000000e+00, 0.0000000e+00,
       3.6602495e-03, 0.0000000e+00, 0.0000000e+00, 8.1356976e-04,
       0.0000000e+00, 6.4255076e-04, 0.0000000e+00, 1.1356123e-03,
       7.8088872e-04, 2.0516974e-04, 0.0000000e+00, 0.0000000e+00,
       8.1524166e-04, 3.5374510e-04, 2.1593127e-04, 7.2017475e-04,
       1.3889993e-03, 0.0000000e+00, 2.7346707e-04, 5.7529093e-04,
       6.7200117e-02, 1.2295599e-03, 1.6853495e-03, 1.4506375e-04,
       8.6126835e-05, 7.4702362e-04, 1.1338197e-04, 5.7276653e-04,
       7.1923324e-04, 5.7210006e-02, 3.6650676e-01, 3.8699645e

In [71]:
train_accuracy = accuracy_score(y_train_pred, Y_numeric)

print(f"Training accuracy: {train_accuracy * 100:.2f}%")

Training accuracy: 100.00%


In [74]:


# List of text columns to keep
columns_to_keep = ['actor1Name','dir1Name','success_level']

# Identify all columns with text (object) data type
text_columns = validate_data.select_dtypes(include=['object']).columns

# Drop text columns except the ones in the keep list
columns_to_drop = [col for col in text_columns if col not in columns_to_keep]
validate_filtered = validate_data.drop(columns=columns_to_drop)


In [75]:
X_val_noy =  validate_filtered.drop('success_level', axis=1)
#X_without_text_columns = X_noy.select_dtypes(exclude=['object'])
Y_val = validate_data[['success_level']]
#X_without_text_columns.head()
X_val = X_val_noy.drop(['id','revenue','budget_millions','revenue_millions','ordering_x','ordering_y','roi','vote_average',	'vote_count','popularity','revenue'], axis=1)
#X_train = X.fillna(X.mean())
#[X.select_dtypes(include=['number']).columns] = X.select_dtypes(include=['number']).fillna(X.mean())
# Select numeric columns only
numeric_cols = X_val.select_dtypes(include=['number']).columns

# Fill missing values in numeric columns with the mean of each column
X_val[numeric_cols] = X_val[numeric_cols].fillna(X_val[numeric_cols].mean())
X_val.head()
X_val["actor1Name"].astype("category")
X_val["dir1Name"].astype("category")

9933          Kari Paljakka
9934            Trey Parker
9935         H. Gordon Boos
9936     John Francis Daley
9937                    NaN
                ...        
11680        James McTeigue
11681      Philippe Lacheau
11682    Payut Ngaokrachang
11683       Jacques Audiard
11684          Nancy Meyers
Name: dir1Name, Length: 1752, dtype: category
Categories (1262, object): ['A. Edward Sutherland', 'A. Raja', 'Aanand L. Rai', 'Aaron Blaise', ..., 'Álex de la Iglesia', 'Éric Tessier', 'Ömer Ali Kazma', 'Ülkü Erakalin']

In [79]:
categorical_cols = X_val.select_dtypes(include=['object']).columns

label_encoder = LabelEncoder()

# Apply LabelEncoder to each categorical column
for col in categorical_cols:
    X_val[col] = label_encoder.fit_transform(X_val[col])
#label_encoder = LabelEncoder()
Y_val_numeric = label_encoder.fit_transform(Y_val)
#xgb = XGBClassifier(objective='multi:softmax', num_class=3, eval_metric='mlogloss', use_label_encoder=False,enable_categorical=True, random_state=42)
#xgb.fit(X, Y_numeric)
# Must use JSON/UBJSON for serialization, otherwise the information is lost.
#y_train_pred = xgb.predict(X)
#xgb.save_model("XGBoost-model.json")
#xgb.feature_importances_

In [81]:
param_grid = {
    'learning_rate': [0.01, 0.05, 0.1, 0.2],  # eta
    'n_estimators': [50, 100, 200, 300],      # Number of boosting rounds
    'max_depth': [3, 6, 9],                    # Maximum depth of each tree
    'min_child_weight': [1, 3, 5],              # Minimum sum of instance weight in a child
    'subsample': [0.6, 0.8, 1.0],              # Fraction of samples for fitting each tree
    'colsample_bytree': [0.6, 0.8, 1.0],       # Fraction of features used for each tree
    'gamma': [0, 0.1, 0.2],                   # Minimum loss reduction required to make further partitions
    'scale_pos_weight': [1, 2, 5]              # Adjusts weights for imbalanced datasets
}

# Set up GridSearchCV
grid_search = GridSearchCV(estimator=xgb, param_grid=param_grid, cv=3, scoring='accuracy', n_jobs=-1, verbose=1)

# Fit the model
grid_search.fit(X_val, Y_val_numeric)

# Get the best parameters
print("Best parameters found: ", grid_search.best_params_)

# Get the best score
print("Best accuracy found: ", grid_search.best_score_)

# Evaluate on the test set (optional)
#best_model = grid_search.best_estimator_
#y_pred = best_model.predict(X_test)

Fitting 3 folds for each of 11664 candidates, totalling 34992 fits
Best parameters found:  {'colsample_bytree': 0.8, 'gamma': 0.1, 'learning_rate': 0.2, 'max_depth': 9, 'min_child_weight': 3, 'n_estimators': 100, 'scale_pos_weight': 1, 'subsample': 1.0}
Best accuracy found:  0.9748858447488584


In [83]:
best_model = grid_search.best_estimator_
#y_pred = best_model.predict(X_test)

In [85]:


# List of text columns to keep
columns_to_keep = ['actor1Name','dir1Name','success_level']

# Identify all columns with text (object) data type
text_columns = test_data.select_dtypes(include=['object']).columns

# Drop text columns except the ones in the keep list
columns_to_drop = [col for col in text_columns if col not in columns_to_keep]
test_filtered = test_data.drop(columns=columns_to_drop)

In [99]:
X_test_noy =  test_filtered.drop('success_level', axis=1)
#X_without_text_columns = X_noy.select_dtypes(exclude=['object'])
Y_test = test_data[['success_level']]
Y_test_numeric = label_encoder.fit_transform(Y_test)
#X_without_text_columns.head()
X_test = X_test_noy.drop(['id','revenue','budget_millions','revenue_millions','ordering_x','ordering_y','roi','vote_average',	'vote_count','popularity','revenue'], axis=1)
#X_train = X.fillna(X.mean())
#[X.select_dtypes(include=['number']).columns] = X.select_dtypes(include=['number']).fillna(X.mean())
# Select numeric columns only
numeric_cols = X_test.select_dtypes(include=['number']).columns

# Fill missing values in numeric columns with the mean of each column
X_test[numeric_cols] = X_test[numeric_cols].fillna(X_test[numeric_cols].mean())
X_test.head()
X_test["actor1Name"].astype("category")
X_test["dir1Name"].astype("category")

0                    King Vidor
1                    Aziz Mirza
2                      Bob Rose
3                   Archie Mayo
4                   Shana Feste
                 ...           
1748            Gianluca Leuzzi
1749                        NaN
1750                 Fred Niblo
1751               Renny Harlin
1752    Rakeysh Omprakash Mehra
Name: dir1Name, Length: 1753, dtype: category
Categories (1249, object): ['A. Sarkunam', 'Aaron Norris', 'Abbas Alibhai Burmawalla', 'Abbas Tyrewala', ..., 'Ángel de la Cruz', 'Çetin Inanç', 'Éric Besnard', 'Øystein Stene']

In [103]:
X_test.shape
#Y_test_numeric.shape

(1753, 63)

In [105]:
categorical_cols = X_test.select_dtypes(include=['object']).columns

label_encoder = LabelEncoder()

# Apply LabelEncoder to each categorical column
for col in categorical_cols:
    X_test[col] = label_encoder.fit_transform(X_test[col])

#xgb = XGBClassifier(objective='multi:softmax', num_class=3, eval_metric='mlogloss', use_label_encoder=False,enable_categorical=True, random_state=42)
#xgb.fit(X, Y_numeric)
# Must use JSON/UBJSON for serialization, otherwise the information is lost.
#y_train_pred = xgb.predict(X)
#xgb.save_model("XGBoost-model.json")
#xgb.feature_importances_

In [107]:
y_test_pred = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test_pred, Y_test_numeric)

print(f"test accuracy: {test_accuracy * 100:.2f}%")

test accuracy: 96.75%


In [130]:
xgb.fit(X_train, Y_numeric)
# Make predictions on the training set
y_train_pred = xgb.predict(X_train)

# Calculate the accuracy on the training data
train_accuracy = accuracy_score(y_train_pred, Y_numeric)

print(f"Training accuracy: {train_accuracy * 100:.2f}%")

Training accuracy: 98.96%


In [None]:
confusion_matrix(y_test, y_pred)