In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA
from sklearn.preprocessing import PowerTransformer, MinMaxScaler

In [4]:
# Read the raw movie table (the first CSV column becomes the index), then
# discard rows whose DomesticGross is exactly 0 and remove the 'Success' column.
df = pd.read_csv('data/provided_data_with_budget.csv', index_col=0)
df = (
    df
    .drop(index=df.index[df['DomesticGross'] == 0])
    .drop(columns='Success')
)

In [5]:
# Unfitted preprocessing objects; they are fitted on the training split later.

# Compress the genre columns down to three principal components.
pca = PCA(n_components=3)

# Rescale the movie runtime.
minmax = MinMaxScaler()

# Independent Box-Cox transformers: one for the gross, one for the budget.
boxcox_gross = PowerTransformer(method='box-cox')
boxcox_production = PowerTransformer(method='box-cox')

In [6]:
# Hold out 20% of the rows; the fixed seed keeps the split repeatable.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=0,
)

In [92]:
# Hand-written sample input: raw runtime, raw production budget and a
# 0/1 genre indicator tuple.
data = {
    'runtime': 0.0,
    'production': 1.0,
    'genre': (
        1, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    ),
}

In [93]:
# Fit each preprocessing object on the training split only.
boxcox_production.fit(train_df[['ProductionBudget']].values)
boxcox_gross.fit(train_df[['DomesticGross']].values)
minmax.fit(train_df[['Runtime (Minutes)']].values)
pca.fit(train_df.iloc[:, 5:25])

# Push the sample through the fitted transformers. Each transformer expects a
# 2-D (1, n_features) row, and [0] takes that single row back out.
data['runtime'] = minmax.transform(np.reshape(data['runtime'], (1, -1)))[0]
data['production'] = boxcox_production.transform(np.reshape(data['production'], (1, -1)))[0]
# Store each of the three PCA components as its own one-element tuple.
data['c1'], data['c2'], data['c3'] = (
    (component,)
    for component in pca.transform(np.reshape(data['genre'], (1, -1)))[0]
)



In [94]:
# Remove the raw genre entry from the sample dict; the PCA components
# c1..c3 stored by the previous cell take its place.
del data['genre']

In [167]:
# Display the sample dict after transformation.
data

{'runtime': array([-0.54545455]),
 'production': array([-3.76939142]),
 'c1': (0.37021620390228055,),
 'c2': (-0.28834081365018527,),
 'c3': (0.0038748148703996415,)}

In [213]:
# Wrapping the dict directly gives an object-dtype array that holds the dict
# itself (see the output), not a numeric feature row.
np.array( data )

array({'runtime': array([0.28099174]), 'production': array([-2.13846619]), 'c1': (0.3099110330230638,), 'c2': (-0.14651587136887356,), 'c3': (0.33874869231061694,)},
      dtype=object)

In [95]:
# Build a one-row DataFrame from the dict; every value holds a single element.
pd.DataFrame(data)

Unnamed: 0,runtime,production,c1,c2,c3
0,-0.545455,-3.769391,0.370216,-0.288341,0.003875


# Create a function that builds the full genre indicator vector from a list of genre names


In [99]:
# Labels of the columns at positions 5..24, the same slice the PCA is fitted on.
genres = np.asarray(train_df.columns[5:25])

In [100]:
# Display the extracted column labels.
genres

array(['Action', 'Adventure', 'Aniimation', 'Biography', 'Comedy',
       'Crime', 'Drama', 'Family', 'Fantasy', 'History', 'Horror',
       'Music', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Sport',
       'Thriller', 'War', 'Western'], dtype=object)

In [109]:
# Genres requested for this trial lookup.
given = ['Action', 'War']

In [150]:

# Position of the first match for every requested genre, as plain Python ints
# (an absent genre raises IndexError).
idxs = [int(np.flatnonzero(genres == wanted)[0]) for wanted in given]

In [151]:
# Display the positions found for the requested genres.
idxs

[0, 18]

In [143]:
# Sanity check: indexing back with the positions should give the requested labels.
genres[idxs]

array(['Action', 'War'], dtype=object)

In [146]:
# No dtype is passed, so the result takes the dtype of `genres`; the display
# of genre_ip further down reports dtype=object.
genre_ip= np.zeros_like(genres)

In [148]:
# Switch on the slots of the requested genres.
genre_ip[idxs]=1

In [149]:
# Display the resulting indicator vector.
genre_ip

array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
      dtype=object)

In [161]:
def full_genres( given ):
    """Turn a collection of genre names into a 0/1 indicator vector.

    Matching ignores letter case and surrounding whitespace, so
    ``['Musical', 'Drama']`` and ``['musical', ' drama ']`` give the same
    result.

    Parameters
    ----------
    given : iterable of str
        Genre names to switch on. An empty iterable yields an all-zero vector.

    Returns
    -------
    numpy.ndarray
        Integer array with one slot per known genre (20 slots), holding 1 for
        every requested genre and 0 elsewhere.

    Raises
    ------
    ValueError
        If a name is not one of the known genres.
    """
    # Slot order follows the genre columns of the training data, and
    # 'aniimation' deliberately mirrors the spelling of the dataset column.
    genres= np.array( ['action', 'adventure', 'aniimation', 'biography', 'comedy',
       'crime', 'drama', 'family', 'fantasy', 'history', 'horror',
       'music', 'musical', 'mystery', 'romance', 'sci-fi', 'sport',
       'thriller', 'war', 'western'] )
    genre_ip= np.zeros_like(genres, dtype=int)

    for name in given:
        # Normalise first: the reference list is all lowercase.
        wanted = name.strip().lower()
        hits = np.where(genres == wanted)[0]
        if hits.size == 0:
            raise ValueError(
                f"unknown genre {name!r}; expected one of: {', '.join(genres)}"
            )
        genre_ip[hits[0]] = 1

    return genre_ip

In [162]:
# Try the helper with capitalised genre names.
given = ['Musical', 'Drama']
full_genres(given)

array([0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0])

# Let's put this all together

In [225]:
def full_genres( given ):
    """Encode genre names as a 0/1 indicator vector with one slot per genre.

    Names are matched after stripping surrounding whitespace and lowercasing.

    Parameters
    ----------
    given : str or iterable of str
        Genre name(s) to switch on. A single string is treated as one genre
        rather than being iterated character by character. An empty iterable
        yields an all-zero vector.

    Returns
    -------
    numpy.ndarray
        Integer array of length 20 with 1 in the slot of every requested genre.

    Raises
    ------
    ValueError
        If a name is not a known genre; the message lists the valid names.
    """
    # Slot order follows the genre columns of the training data, and
    # 'aniimation' deliberately mirrors the spelling of the dataset column.
    genres = ('action', 'adventure', 'aniimation', 'biography', 'comedy',
              'crime', 'drama', 'family', 'fantasy', 'history', 'horror',
              'music', 'musical', 'mystery', 'romance', 'sci-fi', 'sport',
              'thriller', 'war', 'western')
    slot_of = {name: slot for slot, name in enumerate(genres)}

    if isinstance(given, str):
        given = [given]

    genre_ip = np.zeros(len(genres), dtype=int)
    for raw_name in given:
        key = raw_name.strip().lower()
        if key not in slot_of:
            raise ValueError(
                f"unknown genre {raw_name!r}; expected one of: {', '.join(genres)}"
            )
        genre_ip[slot_of[key]] = 1

    return genre_ip

def predict_gross( data,
                   raw_csv='data/provided_data_with_budget.csv',
                   train_csv='S:/Movie_success_predictor/data/train_budget_numerical.csv',
                   random_state=0 ):
    """Predict the domestic gross of one movie.

    The preprocessing objects are re-fitted on the training split of
    `raw_csv`, and a 10-tree random forest is re-trained on `train_csv`, on
    every call. The forest's prediction is mapped back to the original scale
    with the inverse Box-Cox transform of the gross.

    Parameters
    ----------
    data : dict
        Must contain 'runtime' and 'production' (numbers or numeric strings,
        e.g. straight from ``input()``) and 'genre' (genre names accepted by
        ``full_genres``). The dict is NOT modified.
    raw_csv : str, optional
        CSV with the raw movie table used to fit the transformers.
    train_csv : str, optional
        CSV with the model's training table; it must contain a
        'DomesticGross' column.
        NOTE(review): the default is an absolute, machine-specific path that
        is kept for backward compatibility; pass a relative path elsewhere.
    random_state : int or None, optional
        Seed for the random forest so repeated calls give the same prediction.
        Pass None for an unseeded forest.

    Returns
    -------
    float
        Predicted gross on the original scale.

    Raises
    ------
    ValueError
        If 'runtime' or 'production' cannot be converted to float.
    KeyError
        If a required key is missing from `data`.
    """
    # Validate and convert the inputs before any file is read. Strings coming
    # from input() must not be handed to scikit-learn as text.
    runtime = float(data['runtime'])
    production = float(data['production'])
    genre_vector = full_genres(data['genre'])

    # Rebuild the same training split the transformers are meant to be fitted on.
    raw = pd.read_csv(raw_csv, index_col=0)
    raw = raw.drop(index=raw.index[raw['DomesticGross'] == 0]).drop(columns='Success')
    fit_df, _ = train_test_split(raw, test_size=0.2, random_state=0)

    boxcox_production = PowerTransformer(method='box-cox')
    boxcox_gross = PowerTransformer(method='box-cox')
    minmax = MinMaxScaler()
    pca = PCA(n_components=3)

    boxcox_production.fit(fit_df.loc[:, 'ProductionBudget'].values.reshape(-1, 1))
    boxcox_gross.fit(fit_df.loc[:, 'DomesticGross'].values.reshape(-1, 1))
    minmax.fit(fit_df.loc[:, 'Runtime (Minutes)'].values.reshape(-1, 1))
    # Positional slice: columns 5..24 are used as the genre indicators.
    pca.fit(fit_df.iloc[:, 5:25])

    model = RandomForestRegressor(n_estimators=10, random_state=random_state)
    # presumably train_csv already holds transformed features whose column
    # names and order match the dict built below; verify against the file.
    train = pd.read_csv(train_csv, index_col=0)
    model.fit(X=train.drop('DomesticGross', axis=1), y=train.DomesticGross)

    # Build the feature row in a fresh dict so the caller's `data` stays intact
    # and the function can be called again with the same dict.
    features = {
        'runtime': minmax.transform([[runtime]])[0],
        'production': boxcox_production.transform([[production]])[0],
    }
    components = pca.transform(genre_vector.reshape(1, -1).tolist())
    features['c1'], features['c2'], features['c3'] = zip(*components)

    # The prediction is undone with the gross transformer's inverse.
    predicted = model.predict(pd.DataFrame(features)).reshape(1, -1)
    return boxcox_gross.inverse_transform(predicted).tolist()[0][0]

In [226]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA
from sklearn.preprocessing import PowerTransformer, MinMaxScaler
    
# Collect one movie description interactively and print the predicted gross.
data = {}
# input() returns text; convert the numeric fields here so a typo fails
# immediately with a clear ValueError instead of deep inside scikit-learn.
data['runtime'] = float(input('Enter runtime\n'))
data['production'] = float(input('Enter production price\n'))
# 'aniimation' is kept as-is because it is the spelling the genre lookup expects.
print('\nEnter genres carefully from here: \naction, adventure, aniimation, biography,\ncomedy, crime, drama, family, \nfantasy, history, horror, music, \nmusical, mystery, romance, sci-fi, \nsport,thriller, war, western\n' )
# Up to three genres; surrounding whitespace is removed and blank answers are
# dropped so movies with fewer than three genres can be entered.
data['genre'] = [
    genre
    for genre in (input('Enter genre (blank to skip)\n').strip() for i in range(3))
    if genre
]

print( f'\nPredicted gross= {np.floor(predict_gross(data))} $')

Enter runtime
 140
Enter production price
 6000000



Enter genres carefully from here: 
action, adventure, aniimation, biography,
comedy, crime, drama, family, 
fantasy, history, horror, musis, 
musical, mystery, romance, sci-fi, 
sport,thriller, war, western



Enter genre
 horror
Enter genre
 drama
Enter genre
 crime



Predicted gross= 6736912.116856329 $
