In [2]:
!pip install pycaret

Collecting pycaret
  Using cached pycaret-2.3.1-py3-none-any.whl (261 kB)
Collecting wordcloud
  Using cached wordcloud-1.8.1-cp38-cp38-manylinux1_x86_64.whl (371 kB)
Collecting textblob
  Using cached textblob-0.15.3-py2.py3-none-any.whl (636 kB)
Collecting scikit-learn==0.23.2
  Using cached scikit_learn-0.23.2-cp38-cp38-manylinux1_x86_64.whl (6.8 MB)
Collecting pyLDAvis
  Using cached pyLDAvis-3.3.1-py2.py3-none-any.whl
Collecting Boruta
  Using cached Boruta-0.3-py3-none-any.whl (56 kB)
Collecting pyod
  Using cached pyod-0.8.8-py3-none-any.whl
Collecting kmodes>=0.10.1
  Using cached kmodes-0.11.0-py2.py3-none-any.whl (18 kB)
Collecting mlflow
  Using cached mlflow-1.17.0-py3-none-any.whl (14.2 MB)
Collecting numpy==1.19.5
  Using cached numpy-1.19.5-cp38-cp38-manylinux2010_x86_64.whl (14.9 MB)
Collecting yellowbrick>=1.0.1
  Using cached yellowbrick-1.3.post1-py3-none-any.whl (271 kB)
Collecting umap-learn
  Using cached umap_learn-0.5.1-py3-none-any.whl
Collecting pandas-profili

Collecting multimethod==1.4
  Using cached multimethod-1.4-py2.py3-none-any.whl (7.3 kB)
Collecting imagehash
  Using cached ImageHash-4.2.0-py2.py3-none-any.whl (295 kB)
Collecting scipy<=1.5.4
  Downloading scipy-1.5.4-cp38-cp38-manylinux1_x86_64.whl (25.8 MB)
[K     |████████████████████████████████| 25.8 MB 2.2 MB/s eta 0:00:01
[?25hCollecting retrying>=1.3.3
  Using cached retrying-1.3.3-py3-none-any.whl
Collecting typing-extensions>=3.7.4.3
  Downloading typing_extensions-3.10.0.0-py3-none-any.whl (26 kB)
Collecting blis<0.8.0,>=0.4.0
  Using cached blis-0.7.4-cp38-cp38-manylinux2014_x86_64.whl (9.8 MB)
Collecting murmurhash<1.1.0,>=0.28.0
  Using cached murmurhash-1.0.5-cp38-cp38-manylinux2014_x86_64.whl (20 kB)
Collecting preshed<3.1.0,>=3.0.2
  Using cached preshed-3.0.5-cp38-cp38-manylinux2014_x86_64.whl (130 kB)
Collecting srsly<1.1.0,>=1.0.2
  Using cached srsly-1.0.5-cp38-cp38-manylinux2014_x86_64.whl (186 kB)
Collecting cymem<2.1.0,>=2.0.2
  Using cached cymem-2.0.5-cp3

In [3]:
import argparse
import ast

import numpy as np
import pandas as pd
from pycaret import *
from pycaret.regression import compare_models, setup, tune_model

In [1]:
def text_to_dict(df, columns=None):
    """Parse stringified Python literals in the given columns into real objects.

    Each cell of the selected columns is converted with ``ast.literal_eval``.
    Missing cells become an empty dict so that downstream code can iterate
    over every cell without a null check.

    The function is safe to call more than once on the same frame: cells that
    already hold a parsed container are passed through unchanged instead of
    being fed to ``literal_eval`` again (which would raise).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to convert. It is modified in place.
    columns : iterable of str, optional
        Columns to parse. Defaults to the eight metadata columns listed in
        the function body.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.

    Raises
    ------
    KeyError
        If one of the requested columns is absent from ``df``.
    ValueError, SyntaxError
        Propagated from ``ast.literal_eval`` when a cell is not a valid literal.
    """
    if columns is None:
        columns = ['belongs_to_collection', 'genres', 'spoken_languages', 'production_companies',
                   'production_countries', 'Keywords', 'cast', 'crew']

    def _parse(value):
        # Already-parsed containers must be checked first: pd.isna() on a
        # list returns an array, whose truth value is ambiguous.
        if isinstance(value, (dict, list, tuple, set)):
            return value
        if pd.isna(value):
            return {}
        return ast.literal_eval(value)

    for column in columns:
        df[column] = df[column].apply(_parse)
    return df

def _first_director(crew):
    """Return the name of the first crew entry whose job is 'Director', else None."""
    return next((member['name'] for member in crew if member['job'] == 'Director'), None)


def preprocess(df):
    """Build the modelling frame from a raw movie table.

    Steps, in order:

    1. Parse the stringified metadata columns via ``text_to_dict``.
    2. Convert ``release_date`` to datetime and derive ``release_year`` and
       ``release_month`` from it.
    3. Add ``director``: the first crew member with job ``'Director'``
       (``None`` when there is none).
    4. Treat a ``budget`` of 0 as missing.
    5. Add one 0/1 indicator column per known genre name.
    6. Drop ``genres`` and the columns listed in ``cols_to_drop``.

    ``belongs_to_collection`` is parsed in step 1 but is neither used nor
    dropped here, so it is returned in its parsed form.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw table. It is copied first, so the caller's frame is left intact
        and the function can be re-run on the same input.

    Returns
    -------
    pandas.DataFrame
        A new frame with the derived columns added and the raw ones removed.

    Raises
    ------
    KeyError
        If an expected column is missing from ``df``.
    """
    # A sorted tuple rather than a set: set iteration order for strings can
    # change between interpreter sessions, which would reorder the indicator
    # columns from run to run.
    genres = ('Action', 'Adventure', 'Animation', 'Comedy', 'Crime', 'Documentary', 'Drama',
              'Family', 'Fantasy', 'History', 'Horror', 'Music', 'Mystery', 'Romance',
              'Science Fiction', 'TV Movie', 'Thriller', 'War', 'Western')
    cols_to_drop = ['backdrop_path', 'poster_path', 'homepage', 'popularity', 'imdb_id', 'video',
                    'status', 'original_title', 'overview', 'production_companies',
                    'production_countries', 'spoken_languages', 'title', 'tagline', 'Keywords',
                    'cast', 'crew']

    # Copy before parsing: text_to_dict writes into the frame it is given.
    df = text_to_dict(df.copy())

    df['release_date'] = pd.to_datetime(df['release_date'])
    df['release_year'] = df['release_date'].dt.year
    df['release_month'] = df['release_date'].dt.month

    df['director'] = df['crew'].apply(_first_director)

    # Zero budgets become NaN so they are handled as missing values.
    df['budget'] = df['budget'].mask(df['budget'] == 0)

    # Collect each row's genre names once, then build every indicator column
    # positionally. This avoids label-based writes, which would touch several
    # rows at once if the index contained duplicate labels.
    row_genres = [{entry['name'] for entry in entries} for entries in df['genres']]
    for genre in genres:
        df[genre] = pd.Series([int(genre in names) for names in row_genres],
                              index=df.index, dtype='int64')

    return df.drop(columns=['genres', *cols_to_drop])

In [5]:
# Read both tab-separated splits and run each through the same preprocessing.
train_data, test_data = (
    preprocess(pd.read_csv(path, sep="\t")) for path in ("train.tsv", "test.tsv")
)

In [5]:
# session_id pins the random seed; without it each run draws a fresh one and
# the tables below cannot be reproduced. 6468 is the seed reported by the
# recorded run of this cell.
pycar = setup(
    data=train_data,
    test_data=test_data,
    target='revenue',
    session_id=6468,
)

Unnamed: 0,Description,Value
0,session_id,6468
1,Target,revenue
2,Original Data,"(5215, 30)"
3,Missing Values,True
4,Numeric Features,5
5,Categorical Features,23
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(5215, 3372)"


In [6]:
# Rank the candidate regressors (leaderboard shown below) and keep the best three.
top3 = compare_models(n_select=3)
print(top3)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
lightgbm,Light Gradient Boosting Machine,29103785.9281,4125631795408029.5,63450141.9869,0.7902,2.5124,17712.863,0.254
rf,Random Forest Regressor,28683905.058,4560845311887492.0,66520044.8785,0.773,2.2999,12421.0607,13.605
et,Extra Trees Regressor,29085808.6639,4629244939421503.0,67127415.465,0.7714,2.239,10069.5311,25.13
gbr,Gradient Boosting Regressor,32419101.6324,4653245940349745.0,67377404.1738,0.7658,2.7943,27572.3845,4.922
ridge,Ridge Regression,41055939.6,5390658657740390.0,72737720.4,0.7279,3.0911,67578.9858,0.398
llar,Lasso Least Angle Regression,38707742.1668,5661332348953735.0,74574134.7624,0.7132,3.0759,59501.0599,0.76
en,Elastic Net,41029427.2,5691858808628838.0,74874536.0,0.7105,3.1579,71636.9272,7.659
lr,Linear Regression,42118950.0,5880088944757965.0,76135064.8,0.7004,3.1825,71880.2372,11.442
omp,Orthogonal Matching Pursuit,40304508.7901,7051740350689754.0,83083218.0672,0.6476,2.9816,50126.5649,0.567
lasso,Lasso Regression,47606966.4,7299539102020403.0,84783821.6,0.6311,3.1485,71600.5837,6.629


[LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              importance_type='split', learning_rate=0.1, max_depth=-1,
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
              random_state=6468, reg_alpha=0.0, reg_lambda=0.0, silent=True,
              subsample=1.0, subsample_for_bin=200000, subsample_freq=0), RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=-1, oob_score=False,
                      random_state=6468, verbose=0, warm_start=False), ExtraTreesRegressor(bootstrap=False, ccp_alpha=0.

In [7]:
# Tune each of the three selected models in turn, keeping their order.
tuned_models = list(map(tune_model, top3))

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,30337012.3364,4422992520186097.0,66505582.6242,0.8183,2.1391,4938.2216
1,28112779.1872,4408985989869383.0,66400195.7066,0.7316,2.5243,10473.7285
2,26770701.9956,3127297646303049.0,55922246.4347,0.8103,2.5829,20742.5734
3,34089485.41,6105204990553132.0,78135811.1915,0.8142,2.632,20780.2411
4,30918473.2771,4657890117174237.0,68248737.1105,0.8019,2.3348,9791.0642
5,33106259.9776,6597600770447728.0,81225616.467,0.7438,2.6275,18460.5834
6,29159417.9752,3879593718463856.0,62286384.6957,0.783,2.6127,12417.4209
7,31303089.1305,3535698522860145.0,59461739.9919,0.6924,2.7697,13551.8918
8,28466037.1435,2890436381779827.0,53762778.7766,0.8411,2.7715,75038.3652
9,27957174.718,2536937570478373.0,50368021.308,0.8574,2.6168,44988.4434


IntProgress(value=0, description='Processing: ', max=7)

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE


Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


WorkerInterrupt: 