In [7]:
# Load Libaries Needed
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
import category_encoders as ce

In [8]:
X_train=pd.read_csv('X_train.csv',index_col='Rank')
X_test=pd.read_csv('X_test.csv',index_col='Rank')
y_train=pd.read_csv('y_train.csv',index_col='Rank')
y_test=pd.read_csv('y_test.csv',index_col='Rank')

In [9]:
# Cat Boost Encoding 
cat_features= ['Genre', 'Publisher','Device']       
target_enc = ce.CatBoostEncoder(cols=cat_features)
target_enc.fit(X_train[cat_features], y_train['Global_Sales'])

# Transform the features, rename the columns with _target suffix, and join to dataframe
train_TE = X_train.join(target_enc.transform(X_train[cat_features]).add_suffix('_target'))
test_TE = X_test.join(target_enc.transform(X_test[cat_features]).add_suffix('_target'))

  elif pd.api.types.is_categorical(cols):


In [10]:
X_train=train_TE.drop(columns=cat_features)
X_test=test_TE.drop(columns=cat_features)

In [11]:
y_train=np.reshape(y_train.values,len(y_train))
y_test=np.reshape(y_test.values,len(y_test))

In [12]:
bool_features=['FIFA', 'Mario', 'Call of Duty', 'Grand Theft Auto', 'Pokemon', 'Halo',
       'Wii', 'NBA']
num_features=['Year','NA_Sales As a percentage of Total',
       'EU_Sales As a percentage of Total',
       'JP_Sales As a percentage of Total',
       'Other_Sales As a percentage of Total','Genre_target','Publisher_target','Device_target']

In [13]:
# Create Pipeline 
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler

# Preprocessing for numerical data
numerical_transformer = Pipeline(steps=[('imputer',SimpleImputer(strategy='median')),('normalize',MinMaxScaler())])
bool_transformer=SimpleImputer(strategy='constant')

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, num_features),('bool',bool_transformer,bool_features)])

In [14]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val  = train_test_split(X_train, y_train, test_size=0.25, random_state=1) # 0.25 x 0.9 = 0.22

In [15]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error
def score(model):
    my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', model)])
    my_pipeline.fit(X_train,y_train)
    pred=my_pipeline.predict(X_val)
    MAE=mean_absolute_error(pred,y_val)
    print(MAE)

In [16]:
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor,VotingRegressor

In [17]:
# The best models
best_xgb=XGBRegressor(random_state=1,booster='gbtree',eta=0.30,gamma=0,max_depth=6,min_child_weight=1,
                      max_delta_step=0,subsample=1,reg_lambda=1,n_estimators=366,tree_method='auto')
best_rf=RandomForestRegressor(n_estimators=1000,max_features='sqrt', bootstrap = False,
                              max_depth=100,min_samples_split =2,min_samples_leaf = 1,random_state=1)

In [18]:
vr=VotingRegressor(estimators=[('rf',best_rf),('xgb',best_xgb)])
score(vr)

0.3067203086684404


In [20]:
for i in [[1,1],[2,1],[1,2]]:
    vr=VotingRegressor(estimators=[('rf',best_rf),('xgb',best_xgb)],weights=i)
    score(vr)

0.3067203086684404
0.3070206391717407
0.30979026410662674


In [21]:
# Best Model 
best_vr=VotingRegressor(estimators=[('rf',best_rf),('xgb',best_xgb)],weights=[1,1])