In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import f_regression
from sklearn.feature_selection import f_classif
from sklearn.preprocessing import OrdinalEncoder

from sklearn.preprocessing import PowerTransformer
from sklearn.compose import TransformedTargetRegressor

from sklearn.model_selection import train_test_split

from sklearn.linear_model import HuberRegressor, LinearRegression
from sklearn import metrics

In [None]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Goal: 

Predict the planned cost (budget) based on the characteristics of the production ('company','country','director','genre','runtime', 'year','rating','star','writer')

# 2- EDA (Exploratory Data Analysis) and Data Wrangling 



In [None]:
# Load the raw movies table (the file is latin1-encoded).
dataset = pd.read_csv('/kaggle/input/movies/movies.csv',encoding='latin1')
# Small, seeded preview: stays readable and is reproducible on Restart & Run All.
dataset.sample(5, random_state=31)

In [None]:
dataset.info()

## 2.1- Eliminating irrelevant features by empirical inference

*   The name is information that obviously has no influence on the budget
*   The score, votes and gross are information that cannot be obtained for future predictions
*   'Released' is a date containing year, month and day. Both the year and the month of release may explain some trend; since the year already exists as a separate feature, let's discard the year and day from the 'released' feature and keep only the month.













In [None]:
dataset_fs = dataset.drop(['name','score','gross', 'votes'],axis=1)
dataset_fs.info()

In [None]:
# Parse the release date and keep only its month number.
# The month is stored with object dtype so that the later object-dtype column
# selection treats it as a categorical feature rather than a numeric one.
# NOTE(review): this cell overwrites 'released' in place, so it looks non-idempotent —
# re-running it would try to parse the month numbers as dates. Confirm before re-running.
dataset_fs['released'] = pd.to_datetime(dataset_fs['released'])
dataset_fs['released']=(dataset_fs['released'].dt.month).astype('object')

In [None]:
dataset_fs.info()

## 2.2- Eliminating records with budget equal to '0'

In [None]:
# Blank out every column of the rows whose budget is 0 so that dropna() removes them.
dataset_fs[dataset_fs['budget']==0.0]= np.nan
# dropna() drops any row containing at least one NaN, so rows with missing values
# in other columns are removed here as well, not only the zero-budget rows.
dataset_fs=dataset_fs.dropna()

In [None]:
dataset_fs.sample(10)

## 2.3- Checking linear correlation of numeric features

In [None]:
# Linear (Pearson) correlation between the numeric columns.
# The numeric columns are selected explicitly: since pandas 2.0 DataFrame.corr()
# no longer drops non-numeric columns silently and raises on the categorical ones.
plt.figure(figsize=(10,7))
sns.heatmap(dataset_fs.select_dtypes(include='number').corr(),annot=True)

In [None]:
sns.pairplot(dataset_fs[['budget','runtime','year']])

It is possible to verify a positive average trend line for budget x runtime and budget x year, both with a heteroscedastic distribution. These features may be better exploited by the model in combination with other features (interaction effects). After grouping the low-frequency categories of the high-cardinality features (item 2.5), we will be able to validate this better.

 The budget and runtime distributions need to be transformed to approximate a symmetrical normal curve and to handle outliers more effectively. We will do this after the train/test split to avoid data leakage: the target will be treated with a Yeo-Johnson transformation and the runtime feature with a Box-Cox transformation.

## 2.4- Treating outliers with IQR (not applied: the method is very aggressive and ended up damaging the model, so outliers are instead handled with the Yeo-Johnson and Box-Cox transformations)


In [None]:
dataset_fs['budget'].plot(kind='box')

In [None]:
def exclui_outliers(DataFrame, col_name):
  """Return the rows of ``DataFrame`` whose ``col_name`` value lies within the IQR fences.

  The fences are ``Q1 - 1.5*IQR`` and ``Q3 + 1.5*IQR`` (both inclusive), where Q1 and Q3
  are the 25th and 75th percentiles of the column. The input frame is not modified.
  """
  values = DataFrame[col_name]
  q1, q3 = values.quantile(.25), values.quantile(.75)
  spread = q3 - q1
  lower_fence = q1 - 1.5*spread
  upper_fence = q3 + 1.5*spread

  # Series.between is inclusive on both ends, matching the >= / <= comparison.
  keep = values.between(lower_fence, upper_fence)
  return DataFrame[keep]

In [None]:
#dataset_fs = exclui_outliers(dataset_fs, 'budget')
#dataset_fs['budget'].plot(kind='box')

## 2.5- Checking strength of association between numeric target and categorical features

### Functions

In [None]:
def group_low_freq_cats(DataFrame, col_name, threshold=0.01, name='others'):
  """Collapse the rare categories of ``col_name`` into a single label.

  Parameters
  ----------
  DataFrame : pandas.DataFrame
      Input frame; it is copied, never modified.
  col_name : str
      Categorical column to regroup.
  threshold : float
      Categories whose relative frequency (among non-null values) is less than or
      equal to this value are regrouped.
  name : str
      Label assigned to the regrouped categories (previously ignored: the label was
      hard-coded to 'others' regardless of this argument).

  Returns
  -------
  pandas.DataFrame
      A copy of the input with the rare categories replaced by ``name``.
  """
  df = DataFrame.copy()
  cat_freq = df[col_name].value_counts()
  rel_freq = cat_freq/cat_freq.sum()
  rare_cats = rel_freq[rel_freq <= threshold].index
  df.loc[df[col_name].isin(rare_cats),col_name]=name
  return df

In [None]:
def val_couts_cols (Dataframe,cols):
  """Print the number of distinct non-null categories of each column in ``cols``,
  followed by the total number of rows of ``Dataframe``."""
  for col in cols:
    n_categories = Dataframe[col].nunique()
    print('coluna: {0}, categorias: {1}'.format(col, n_categories))
  n_rows = len(Dataframe)
  print('Total samples: ' + str(n_rows))

In [None]:
def feature_selection(Dataset, feature, target ,in_out, method='na'):
  """Score the association between one feature and the target.

  Parameters
  ----------
  Dataset : pandas.DataFrame
      Frame holding both columns.
  feature, target : str
      Column names of the feature and of the target.
  in_out : str
      Kind of (feature, target) pair:
        'cat_cat' - categorical feature, categorical target (chi2 or mutual information)
        'num_num' - numeric feature, numeric target (f_regression)
        'num_cat' - numeric feature, categorical target (ANOVA F-value)
        'cat_num' - categorical feature, numeric target (ANOVA F-value)
  method : str
      Only used for 'cat_cat': 'chi2' selects chi2, anything else mutual_info_classif.

  Returns
  -------
  The ``scores_`` array of the fitted SelectKBest (one element, since a single
  feature is scored), or an empty list for an unknown ``in_out``.
  """
  def _ordinal_codes(column):
    # Ordinal-encode a column vector and return the codes as a 1-D array.
    return OrdinalEncoder().fit_transform(column).ravel()

  X = np.array(Dataset[feature]).reshape(-1,1)
  y = np.array(Dataset[target]).reshape(-1,1)

  if in_out == 'cat_cat':
    score_func = chi2 if method == 'chi2' else mutual_info_classif
    inputs, labels = _ordinal_codes(X).reshape(-1,1), _ordinal_codes(y)
  elif in_out == 'num_num':
    score_func = f_regression
    inputs, labels = X, y.ravel()
  elif in_out == 'num_cat':
    score_func = f_classif
    inputs, labels = X, _ordinal_codes(y)
  elif in_out == 'cat_num':
    # One-way ANOVA: the categories define the groups and the numeric target is the
    # measured quantity. f_classif(X, y) expects the measurements in X and the group
    # labels in y, so the roles of feature and target are swapped here. Passing the
    # ordinal codes as X (as before) tested arbitrary code values against one class
    # per distinct target value, which does not measure the intended association.
    score_func = f_classif
    inputs, labels = y, _ordinal_codes(X)
  else:
    # Unknown pair type: nothing to score.
    return []

  fs = SelectKBest(score_func=score_func, k='all')
  fs.fit(inputs, labels)
  return fs.scores_

In [None]:
def get_col_type(df,col_type):
  """Return the names of the columns of ``df`` whose dtype equals ``col_type``.

  The result is a Series of column names indexed by the position of each column
  in ``df``; columns of any other dtype are left out.
  """
  dtype_table = df.dtypes.reset_index()
  dtype_table.columns = ['col', 'type']

  def _name_if_match(row):
    # Keep the column name when the dtype matches, otherwise mark it for removal.
    if row['type'] == col_type:
      return row['col']
    return np.nan

  matches = dtype_table.apply(_name_if_match, axis=1)
  return matches.dropna()

In [None]:
def boxplot_by_col(df,cat_cols,target):
  """Draw one boxplot of ``target`` per categorical column, stacked vertically.

  Within each plot the categories are ordered by the median of ``target`` and a red
  point plot of those medians is overlaid on the boxes.

  Parameters
  ----------
  df : pandas.DataFrame
      Data to plot.
  cat_cols : iterable of str
      Categorical columns, one subplot each. An empty iterable draws nothing.
  target : str
      Numeric column plotted on the y axis.
  """
  cat_cols = list(cat_cols)
  if not cat_cols:
    # plt.subplots cannot create a grid with zero rows.
    return

  # squeeze=False always yields a 2-D array of axes, so a single column works too
  # (without it plt.subplots returns a bare Axes that cannot be flattened).
  fig, axes = plt.subplots(len(cat_cols), 1, figsize=(25, 18), squeeze=False)
  for var, subplot in zip(cat_cols, axes.ravel()):
    subplot.set_xlabel(var,fontsize=18)
    # Median of the target per category, ascending; computed once and reused.
    medians = df.groupby(var)[target].quantile(0.5).sort_values()
    sns.boxplot(x=var, y=target, data=df, ax=subplot,order=medians.index)
    sns.pointplot(x=medians.index,y=medians,ax=subplot,color='r')
  plt.tight_layout(pad=3)

In [None]:
def remove_incoherence(DataFrame,expression, replace_val, columns=None):
  """Replace every match of a regular expression in the selected columns.

  Parameters
  ----------
  DataFrame : pandas.DataFrame
      Input frame; it is never modified.
  expression : str
      Regular expression to look for.
  replace_val : str or NaN
      Replacement. A missing value (np.nan / None) replaces the whole cell with NaN.
  columns : list of str, optional
      Columns to clean. ``None`` or an empty list means all columns.

  Returns
  -------
  pandas.DataFrame
      A cleaned copy. If a column still contains matches after the maximum number of
      passes (e.g. the replacement itself matches ``expression``), a warning is
      emitted and an empty DataFrame is returned.
  """
  import warnings  # local import keeps this cell self-contained

  if columns is None or len(columns)==0:
    columns = DataFrame.columns
  columns = list(columns)

  cleaned = DataFrame.copy()

  if pd.api.types.is_scalar(replace_val) and pd.isna(replace_val):
    # Series.str.replace does not accept NaN, so DataFrame.replace is used instead.
    # Only the requested columns are touched (previously every column was replaced).
    cleaned[columns] = DataFrame[columns].replace(expression, np.nan, regex=True)
    return cleaned

  max_passes = 101  # same number of replacement passes the original loop allowed
  for col in columns:
    series = DataFrame[col]
    # With regex groups a single pass may not replace every occurrence, so the
    # replacement is repeated until no match is left.
    for _ in range(max_passes):
      series = series.str.replace(expression, replace_val, regex=True)
      # str.contains may emit a UserWarning when the expression has unused groups.
      if not series.str.contains(expression, na=False).any():
        break
    else:
      warnings.warn(
        "remove_incoherence: column {0!r} still matches the expression after {1} passes; "
        "returning an empty DataFrame".format(col, max_passes))
      return pd.DataFrame([])
    cleaned[col] = series
  return cleaned

### Obtaining categorical features

In [None]:
cat_cols = get_col_type(dataset_fs, 'object')
cat_cols

### Count categories by feature

In [None]:
val_couts_cols(dataset_fs,cat_cols)

### Association strength feature x target (ANOVA F-value)

In [None]:
# Score every categorical column against the numeric target 'budget' and keep the
# scores so that their mean can be used as a reference value in the next cell.
fs_scores =[]
for x in cat_cols:
  fs_score = feature_selection(dataset_fs, x, 'budget','cat_num')
  print('coluna: {0}, fs_score: {1}'.format(x,fs_score))
  fs_scores.append(fs_score)

In [None]:
np.mean(fs_scores)

### Frequency analysis for features with high cardinality

In [None]:
dataset_fs['company'].value_counts()

In [None]:
# Group the companies that each represent at most 0.5% of the data set into a single
# 'others' category, then show the resulting category frequencies.
dataset_fs = group_low_freq_cats(dataset_fs,'company',threshold=0.005)
dataset_fs['company'].value_counts()

In [None]:
feature_selection(dataset_fs, 'company', 'budget','cat_num')

The reduction of categories did not change the fs_score.

In [None]:
dataset_fs['director'].value_counts()

In [None]:
(group_low_freq_cats(dataset_fs,'director',threshold=0.005))['director'].value_counts()

After grouping, practically every record falls into the 'others' category, so the feature is basically a constant and will be eliminated.

In [None]:
dataset_fs=dataset_fs.drop('director',axis=1)

In [None]:
dataset_fs['star'].value_counts()

In [None]:
(group_low_freq_cats(dataset_fs,'star',threshold=0.005))['star'].value_counts()

After grouping, practically every record falls into the 'others' category, so the feature is basically a constant and will be eliminated.

In [None]:
dataset_fs=dataset_fs.drop('star',axis=1)

In [None]:
(group_low_freq_cats(dataset_fs,'writer',threshold=0.005))['writer'].value_counts()

After grouping, practically every record falls into the 'others' category, so the feature is basically a constant and will be eliminated.

In [None]:
dataset_fs=dataset_fs.drop('writer',axis=1)

Conclusion: basically all features with an fs_score below the average were eliminated (except for country, which already had a low number of categories).

### Excluding feature rating inconsistencies

In [None]:
# Merge the three spellings of "no rating" found in the 'rating' column into a single 'Others' label.
dataset_fs=remove_incoherence(dataset_fs,r'UNRATED|NOT RATED|Not specified', 'Others', columns=['rating'])

### Graphically analyzing categorical feature x target distribution

In [None]:
cat_cols = get_col_type(dataset_fs, 'object')
cat_cols

In [None]:
val_couts_cols(dataset_fs,cat_cols)

In [None]:
boxplot_by_col(dataset_fs,cat_cols,'budget')

We can observe a slight variation in the median of the target across the categories of these features;
possibly these features help to explain the numeric features with heteroscedastic distribution. Let's analyze:

* Company




In [None]:
dataset_fs['company'].unique()

In [None]:
# Budget over the years for Universal Pictures comedies only.
is_universal = dataset_fs['company']=='Universal Pictures'
is_comedy = dataset_fs['genre']=='Comedy'
df = dataset_fs[is_universal & is_comedy]
sns.scatterplot(x=df['year'],y=df['budget'],hue=df['company'])

As you can see, there is a correlation (with some level of dispersion) between year x budget when we isolate these features from other features like company and genre.

* Country

In [None]:
dataset_fs['country'].unique()

In [None]:
# Budget over the years for Walt Disney Pictures productions from the USA.
from_usa = dataset_fs['country']=='USA'
is_disney = dataset_fs['company']=='Walt Disney Pictures'
df = dataset_fs[from_usa & is_disney]
sns.scatterplot(x=df['year'],y=df['budget'],hue=df['company'])

In [None]:
# Budget over the years for Warner Bros. productions from the USA.
from_usa = dataset_fs['country']=='USA'
is_warner = dataset_fs['company']=='Warner Bros.'
df = dataset_fs[from_usa & is_warner]
sns.scatterplot(x=df['year'],y=df['budget'],hue=df['company'])

Even when restricted by other features, the data for the most frequent category (USA) remains very dispersed. If this feature impairs the performance of the model, it will be eliminated.

* Genre

In [None]:
dataset_fs['genre'].unique()

In [None]:
# Budget against runtime for PG-13 action movies only.
is_action = dataset_fs['genre']=='Action'
is_pg13 = dataset_fs['rating']=='PG-13'
df = dataset_fs[is_action & is_pg13]
sns.scatterplot(x=df['runtime'],y=df['budget'],hue=df['rating'])

In [None]:
# Budget against runtime for R-rated biographies only.
is_biography = dataset_fs['genre']=='Biography'
is_rated_r = dataset_fs['rating']=='R'
df = dataset_fs[is_biography & is_rated_r]
sns.scatterplot(x=df['runtime'],y=df['budget'],hue=df['rating'])

As you can see, there is a well-defined correlation between runtime x budget when we isolate these features from genre and rating.

## 2.6- Dummies for categorical features

In [None]:
dataset_fs_f=dataset_fs.copy()

In [None]:
#as noted in the previous analysis, a significant improvement in the R2 score was expected when dropping this feature
dataset_fs_f = dataset_fs_f.drop(['country'],axis=1) 

In [None]:
company_dummies = pd.get_dummies(dataset_fs_f['company'],drop_first=True)
dataset_fs_f=pd.concat([dataset_fs_f.drop('company',axis=1),company_dummies],axis=1)

In [None]:
genre_dummies = pd.get_dummies(dataset_fs_f['genre'],drop_first=True)
dataset_fs_f=pd.concat([dataset_fs_f.drop('genre',axis=1),genre_dummies],axis=1)

In [None]:
rating_dummies = pd.get_dummies(dataset_fs_f['rating'],drop_first=True)
dataset_fs_f=pd.concat([dataset_fs_f.drop('rating',axis=1),rating_dummies],axis=1)

In [None]:
month_dummies = pd.get_dummies(dataset_fs_f['released'],drop_first=True)
dataset_fs_f=pd.concat([dataset_fs_f.drop('released',axis=1),month_dummies],axis=1)

In [None]:
dataset_fs_f.info()

# 3- Splitting Dataset train/test



In [None]:
# Feature matrix: every column except the target, as a float array.
# The explicit dtype matters because recent pandas versions create boolean dummy
# columns; mixed with the numeric columns, `.values` would then return an object
# array, which breaks the in-place numeric transformation and the histograms below.
X = dataset_fs_f.drop('budget',axis=1).to_numpy(dtype=float)
X.shape

In [None]:
dataset_fs_f.info()

In [None]:
y = dataset_fs_f['budget'].values
y.shape

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=31, test_size=0.20)

# 4- 'box-cox' transformation (features)

The transformation is fitted on the training data only, and the parameters learned from that distribution are then applied to the test data.

In [None]:
def plot_hists_scatters(*args,cols=['none'],type_plot='scatter',target=[]):
  """Plot one histogram or scatter plot per positional array.

  Parameters
  ----------
  *args : array-like
      One array per plot. A single array is drawn on the current figure; several
      arrays are drawn side by side on a new figure.
  cols : list of str
      Plot titles. A single title is reused for every plot.
  type_plot : str
      'scatter' (default) or 'hist'.
  target : array-like
      y values for the scatter plots. When it is empty and ``type_plot`` is
      'scatter', 'No target' is printed and nothing is drawn.
  """
  target_missing = np.array_equal(target,[])
  if target_missing and type_plot == 'scatter':
    print('No target')
    return

  if len(args)==1:
    plt.title(cols[0],fontsize=18)
    if type_plot  == 'scatter':
      sns.scatterplot(x=args[0],y=target)
    else:
      sns.histplot(args[0])
    return

  fig, axes = plt.subplots(1, len(args), figsize=(10, 4))
  for pos, (arg, subplot) in enumerate(zip(args, axes.flatten())):
    # A single title is shared by all plots; otherwise titles are matched by position.
    title = cols[0] if len(cols) == 1 else cols[pos]
    subplot.set_title(title,fontsize=18)
    if type_plot == 'hist':
      sns.histplot(arg, ax=subplot)
    else:
      sns.scatterplot(x=arg,y=target, ax=subplot)
  plt.tight_layout(pad=3)

In [None]:
# Box-Cox transformation of the runtime feature.
# The column position is looked up by name (X holds the columns of dataset_fs_f
# without 'budget') instead of assuming that runtime is column 0.
runtime_idx = dataset_fs_f.drop('budget',axis=1).columns.get_loc('runtime')
norm_box = PowerTransformer(method='box-cox')
# Fit on the training split only; the test split reuses the fitted parameters.
X_train[:,runtime_idx]= norm_box.fit_transform(X_train[:,runtime_idx].reshape(-1, 1)).ravel()
X_test[:,runtime_idx] = norm_box.transform(X_test[:,runtime_idx].reshape(-1, 1)).ravel()

In [None]:
plot_hists_scatters(X_train[:,0],X_test[:,0],type_plot='hist',cols=['runtime (train)', 'runtime (test)'])

# 5- Training
 




## 5.1- Linear regression with yeo-johnson transformation of target (sklearn) - Best result

In [None]:
regr = TransformedTargetRegressor(regressor=LinearRegression(),transformer=PowerTransformer(method='yeo-johnson')) 
regr.fit(X_train,y_train)

In [None]:
#regr = TransformedTargetRegressor(regressor=HuberRegressor(),transformer=PowerTransformer())
#regr.fit(X_train,y_train)

## 5.2- Deep Learning (keras) - low results

In [None]:
#from tensorflow.keras.models import Sequential
#from tensorflow.keras.layers import Input,Dense,Dropout

In [None]:
'''def r_square(y_true, y_pred):
    from keras import backend as K
    SS_res =  K.sum(K.square(y_true - y_pred)) 
    SS_tot = K.sum(K.square(y_true - K.mean(y_true))) 
    return ( 1 - SS_res/(SS_tot + K.epsilon()) ) '''

In [None]:
'''model = Sequential()
model.add(Input((X.shape[1],)))
model.add(Dense(64,activation='relu'))
model.add(Dense(64,activation='relu'))
model.add(Dense(1))

model.compile(optimizer='adam',loss='mse',metrics=['mae',r_square])'''

In [None]:
#model.fit(x=X_train,y=y_train, validation_data=(X_test,y_test),epochs=100 )
#losses = pd.DataFrame(model.history.history)

In [None]:
#losses.plot(y=['mae','val_mae'])

In [None]:
#losses.plot(y=['r_square'])

In [None]:
#losses.tail(1)


## 5.3- Gradient Boosting Regressor with yeo-johnson transformation of target (sklearn) - Result slightly worse than linear regression

In [None]:
# from sklearn.ensemble import GradientBoostingRegressor

In [None]:
#regr = TransformedTargetRegressor(regressor=GradientBoostingRegressor(),transformer=PowerTransformer(method='yeo-johnson')) # Melhor performance, utilizando Transformação 
#regr.fit(X_train,y_train)

# 6- Results

## 6.1- Metrics

In [None]:
# Predictions on the held-out split and their mean absolute error.
y_pred = regr.predict(X_test)
mae = metrics.mean_absolute_error(y_test, y_pred)

# regr.score is reported for both splits so that train and test can be compared.
print('R2_score train: ',regr.score(X_train,y_train))
print('R2_score test: ', regr.score(X_test,y_test))
print('MAE:', mae)

In [None]:
# Side-by-side table of actual vs. predicted values (predictions cast to float32).
result = pd.DataFrame([y_test,y_pred.astype('f')]).T
result.columns=['Test','Predicted']
result.head(4)

## 6.2- Conclusion

By eliminating categorical features with low strength of association with the target and numerical features with low correlation, and by applying data transformations, an R2 score of 0.55 (test) was obtained, which is a reasonable value considering the points below.

Possible points for improvement:

- Since we are predicting the planned budget and not the realized cost, the target does not reflect the actual spending of the project and possibly carries a lot of noise (estimates have a certain degree of randomness, since they were guessed by whoever prepared the budget).

- Obtain other features with greater association strength or high correlation with target.