In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA
from sklearn.preprocessing import PowerTransformer, MinMaxScaler

In [2]:
df = pd.read_csv('D:\projects\Movies_success_predictor\data\movie_success_rate_with_budgets.csv',
                 index_col=0)
df.drop(index=df.query('DomesticGross==0').index,
        inplace=True)
df.drop('Success', axis=1, inplace=True)

In [3]:
boxcox_production = PowerTransformer(method='box-cox')
boxcox_gross = PowerTransformer(method='box-cox')
# %% loading a minmax scaler for time of the movie
minmax = MinMaxScaler()

# %% loading a PCA to reduce the genre col
pca = PCA(n_components=3)

In [4]:
train_df, test_df = train_test_split(df, test_size=0.2, random_state=0)

In [5]:
data={}
data['runtime']= 0.0
data['production']= 1.0
data['genre']= 1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0

In [6]:
boxcox_production.fit( train_df.loc[:, 'ProductionBudget'].values.reshape(-1, 1) )
boxcox_gross.fit( train_df.loc[:, 'DomesticGross'].values.reshape(-1, 1) )
minmax.fit( train_df.loc[:, 'Runtime (Minutes)'].values.reshape(-1, 1) )
pca.fit( train_df.iloc[:, 5:25] )

data['runtime']= minmax.transform( np.array( data['runtime'] ).reshape(1, -1) )[0]
data['production']= boxcox_production.transform( np.array( data['production']).reshape(1, -1) )[0]
data['c1'], data['c2'], data['c3'] = zip( *pca.transform( np.array( data['genre']).reshape(1, -1) ))

In [19]:
import pickle
pickle.dump(boxcox_production, open('production_scale.sav', 'wb' ))
pickle.dump(boxcox_gross, open('gross_scale.sav', 'wb' ))
pickle.dump(minmax, open('runtime_scale.sav', 'wb' ))
pickle.dump(pca, open('genres_pca.sav', 'wb' ))

In [9]:
del data['genre']

In [10]:
data

{'runtime': array([-0.54545455]),
 'production': array([-3.76939142]),
 'c1': (-217272.22650070605,),
 'c2': (-29.11028045247712,),
 'c3': (-115.03716876610791,)}

In [11]:
np.array( data )

array({'runtime': array([-0.54545455]), 'production': array([-3.76939142]), 'c1': (-217272.22650070605,), 'c2': (-29.11028045247712,), 'c3': (-115.03716876610791,)},
      dtype=object)

In [12]:
pd.DataFrame(data)

Unnamed: 0,runtime,production,c1,c2,c3
0,-0.545455,-3.769391,-217272.226501,-29.11028,-115.037169


# create a function to fill other genres


In [13]:
genres= train_df.columns[10:30].values

In [14]:
genres

array(['Action', 'Adventure', 'Aniimation', 'Biography', 'Comedy',
       'Crime', 'Drama', 'Family', 'Fantasy', 'History', 'Horror',
       'Music', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Sport',
       'Thriller', 'War', 'Western'], dtype=object)

In [15]:
given= ['Action','War']

In [16]:
np.where(genres == 'War' )

(array([18], dtype=int64),)

In [35]:
idxs= [ np.where(genres == g )[0].tolist()[0] for g in given ]

In [20]:
pickle.dump(genres, open('genres_vector.sav', 'wb' ))

In [36]:
idxs

[0, 18]

In [37]:
genres[idxs]

array(['Action', 'War'], dtype=object)

In [38]:
genre_ip= np.zeros_like(genres)

In [39]:
genre_ip[idxs]=1

In [40]:
genre_ip

array([1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
      dtype=object)

In [45]:
def full_genres( given  ):
    give= [cat.lower() for cat in given]
    genres= np.array( ['action', 'adventure', 'aniimation', 'biography', 'comedy',
       'crime', 'drama', 'family', 'fantasy', 'history', 'horror',
       'music', 'musical', 'mystery', 'romance', 'sci-fi', 'sport',
       'thriller', 'war', 'western'] )
    genre_ip= np.zeros_like(genres, dtype=int)
    
    idx= [ np.where(genres == g )[0].tolist()[0] for g in given ] # returns the idx for the given genres
    
    genre_ip[idx]=1
    
    return genre_ip

In [46]:
given= ['Musical' , 'Drama']

full_genres( given )

IndexError: list index out of range

# let's put this all together 

In [21]:
def full_genres( given ):  
    genres= np.array( ['action', 'adventure', 'aniimation', 'biography', 'comedy',
       'crime', 'drama', 'family', 'fantasy', 'history', 'horror',
       'music', 'musical', 'mystery', 'romance', 'sci-fi', 'sport',
       'thriller', 'war', 'western'] )
    genre_ip= np.zeros_like(genres, dtype=int)
    
    idx= [ np.where(genres == g.lower() )[0].tolist()[0] for g in given ] # returns the idx for the given genres
    
    genre_ip[idx]=1
    
    return genre_ip

def predict_gross( data ):

    df = pd.read_csv('D:\projects\Movies_success_predictor\data\provided_data_with_budget.csv',
                 index_col=0)
    df.drop(index=df.query('DomesticGross==0').index,
            inplace=True)
    df.drop('Success', axis=1, inplace=True)
    df, _ = train_test_split(df, test_size=0.2, random_state=0)
    
    boxcox_production = PowerTransformer(method='box-cox')
    boxcox_gross = PowerTransformer(method='box-cox')
    minmax = MinMaxScaler()
    pca = PCA(n_components=3)
    
    boxcox_production.fit( df.loc[:, 'ProductionBudget'].values.reshape(-1, 1) )
    boxcox_gross.fit( df.loc[:, 'DomesticGross'].values.reshape(-1, 1) )
    minmax.fit( df.loc[:, 'Runtime (Minutes)'].values.reshape(-1, 1) )
    pca.fit( df.iloc[:, 5:25] )
    
    model= RandomForestRegressor(10)
    
    df= pd.read_csv('D:/projects/Movies_success_predictor/data/train_budget_numerical.csv', index_col=0 )
    model.fit( X=df.drop('DomesticGross', axis=1),  y=df.DomesticGross)
    pickle.dump(model, open('model.sav', 'wb' ))
    data['runtime']= minmax.transform( np.array( data['runtime'] ).reshape(1, -1) )[0]
    data['production']= boxcox_production.transform( np.array( data['production']).reshape(1, -1) )[0]
    data['c1'], data['c2'], data['c3'] = zip( *pca.transform( full_genres( data['genre'] ).reshape(1, -1).tolist() ) )
    
    #zip( *pca.transform( np.array( data['genre']).reshape(1, -1) ))

    del data['genre']
    
    return boxcox_gross.inverse_transform( model.predict( pd.DataFrame(data)).reshape(1, -1)  ).tolist()[0][0]

In [22]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.decomposition import PCA
from sklearn.preprocessing import PowerTransformer, MinMaxScaler
    
data={}
data['runtime']= input('Enter runtime\n')
data['production']= input('Enter production price\n')
print('\nEnter genres carefully from here: \naction, adventure, aniimation, biography,\ncomedy, crime, drama, family, \nfantasy, history, horror, musis, \nmusical, mystery, romance, sci-fi, \nsport,thriller, war, western\n' )
data['genre']= [input('Enter genre\n') for i in range(3)]

print( f'\nPredicted gross= {np.floor(predict_gross(data))} $')


Enter genres carefully from here: 
action, adventure, aniimation, biography,
comedy, crime, drama, family, 
fantasy, history, horror, musis, 
musical, mystery, romance, sci-fi, 
sport,thriller, war, western



IndexError: list index out of range