## **EDA + flaml ensemble**

In [None]:
# Mount Google Drive; the dataset and submission paths used further down
# all live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Install the third-party packages this notebook needs: FLAML (imported below
# as AutoML) and CatBoost (requested later via the "catboost" estimator name).
!pip install flaml
!pip install catboost

In [None]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import sklearn
import sys
# import pandas_profiling
import seaborn as sns
import random as rn
import os
import scipy.stats as stats
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

from sklearn.model_selection import GridSearchCV, cross_val_score, RepeatedKFold
from sklearn import metrics

from sklearn.linear_model import ElasticNet
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
# from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from mlxtend.regressor import StackingCVRegressor

from collections import Counter
from flaml import AutoML

import warnings
%matplotlib inline
warnings.filterwarnings(action='ignore')

# reproducibility
def set_seed(seed_num=42):
    """Seed the random number generators used by this notebook.

    Seeds NumPy's global generator and the standard-library ``random``
    module (imported as ``rn``), and exports ``PYTHONHASHSEED``.

    Args:
        seed_num: Seed value to apply. Defaults to 42, the value that was
            previously hard-coded inside the function.

    Note:
        The interpreter reads ``PYTHONHASHSEED`` at start-up, so the
        environment assignment only influences processes launched after
        this call, not the hashing of the running interpreter.
    """
    # TensorFlow is not imported in this notebook; add
    # tf.random.set_seed(seed_num) here if it ever is.
    np.random.seed(seed_num)
    rn.seed(seed_num)
    os.environ['PYTHONHASHSEED'] = str(seed_num)

seed_num = 42
# Apply the seeding helper up front; it was defined but never invoked.
set_seed()

train = pd.read_csv('/content/drive/MyDrive/Forecasting_price/dataset/train.csv')
test = pd.read_csv('/content/drive/MyDrive/Forecasting_price/dataset/test.csv')

train.drop(['id'], axis=1, inplace=True)
test.drop(['id'], axis=1, inplace=True)

# Work on copies so the raw frames stay available.
df_train = train.copy()
df_test = test.copy()

# The first whitespace-separated token of the title is used as the brand.
df_train['company'] = df_train['title'].apply(lambda x: x.split()[0])
df_test['company'] = df_test['title'].apply(lambda x: x.split()[0])

# Rank brands by their mean training target (ascending) and encode each brand
# as 10 * rank.
comp_list = (
    df_train[['company', 'target']]
    .groupby(['company'], as_index=False)
    .mean()
    .sort_values(by='target', ascending=True, ignore_index=True)
)
company_codes = {br: 10 * i for i, br in enumerate(comp_list.company)}

# Brands that occur only in the test set have no rank; they are encoded as 0.
only_test_com = sorted(set(df_test['company']) - set(company_codes))
if only_test_com:
    print(only_test_com)

# Map the 'company' column only. A frame-wide DataFrame.replace would also
# overwrite any other cell that happens to equal a brand name (for example a
# title consisting of just the brand) and costs one full pass per brand.
df_train['company'] = df_train['company'].map(company_codes)
df_test['company'] = df_test['company'].map(company_codes).fillna(0).astype(int)

def location_fix(x):
    """Normalise the ``location`` column of ``x`` in place.

    Values are lower-cased; any value containing one of the known state
    names is then collapsed to that bare name. States are checked in the
    order lagos, ogun, abuja, abia, so a value mentioning several of them
    resolves to the first one in that order. Missing values are left
    untouched instead of raising.

    Args:
        x: DataFrame with a string ``location`` column. Modified in place.

    Returns:
        None.
    """
    x['location'] = x['location'].str.lower()

    for state in ('lagos', 'ogun', 'abuja', 'abia'):
        # Literal substring test; na=False keeps missing entries out of the
        # mask so they pass through unchanged.
        mentions_state = x['location'].str.contains(state, regex=False, na=False)
        x['location'] = x['location'].mask(mentions_state, state)

location_fix(df_train)
location_fix(df_test)

# Test locations that never occur in the training data are folded into
# 'lagos'. Note that isin() treats a missing test value as unseen unless the
# training column also contains a missing value.
unseen_location = ~df_test['location'].isin(df_train['location'])
only_test_com = df_test.loc[unseen_location, 'location'].unique().tolist()
if only_test_com:
    print(only_test_com)
    df_test.loc[unseen_location, 'location'] = 'lagos'

def engine_fix(df):
    """Recode the ``engine`` column of ``df`` in place to numeric codes.

    The eight recognised engine layout strings are mapped to 10..80 as
    listed in ``layout_codes``; every other value is set to 90.

    Args:
        df: DataFrame with an ``engine`` column. Modified in place.

    Returns:
        None.
    """
    layout_codes = {
        "2-cylinder(I2)": 10,
        "3-cylinder(I3)": 20,
        "5-cylinder(I5)": 30,
        "4-cylinder(I4)": 40,
        "6-cylinder(I6)": 50,
        "6-cylinder(V6)": 60,
        "4-cylinder(H4)": 70,
        "8-cylinder(V8)": 80,
    }

    # Flag everything outside the known layouts first, while the column
    # still holds the original strings.
    unrecognised = ~df['engine'].isin(list(layout_codes))
    df.loc[unrecognised, 'engine'] = 90

    for layout, code in layout_codes.items():
        df.loc[df['engine'] == layout, 'engine'] = code

# Recode the engine column of both frames, training frame first.
for frame in (df_train, df_test):
    engine_fix(frame)

def color_handling(x):
    """Clean the raw ``paint`` strings of ``x`` in place.

    Lower-cases each value, removes every literal ``.`` and strips
    surrounding whitespace. Missing values stay missing.

    Args:
        x: DataFrame with a string ``paint`` column. Modified in place.

    Returns:
        None.
    """
    paint = x['paint'].str.lower()
    # regex=False makes the dot a literal on every pandas version; as a
    # regular expression "." would match any character.
    paint = paint.str.replace(".", "", regex=False)
    # Strip last so whitespace exposed by removing a dot ("grey .") goes too.
    x['paint'] = paint.str.strip()

# Clean the raw paint strings of both frames, training frame first.
for frame in (df_train, df_test):
    color_handling(frame)

def color_fix(x):
    """Collapse free-text ``paint`` values of ``x`` into canonical colours.

    Each rule is a ``(substring, canonical colour)`` pair. Rules run in the
    order listed, and a value containing the substring is replaced by the
    canonical colour, so earlier rules win (for example "ash grey" becomes
    "grey" because the grey rule precedes the ash rule). Values matching no
    rule, and missing values, are left unchanged.

    Args:
        x: DataFrame with a string ``paint`` column. Modified in place.

    Returns:
        None.
    """
    # Order is significant; several entries are misspellings seen in the
    # data ("gery", "sliver", "whine", "blac").
    rules = (
        ('blue', 'blue'),
        ('navy', 'blue'),
        ('indigo', 'blue'),
        ('red', 'red'),
        ('green', 'green'),
        ('golf', 'green'),
        ('grey', 'grey'),
        ('gery', 'grey'),
        ('gray', 'grey'),
        ('ash', 'ash'),
        ('brown', 'brown'),
        ('silver', 'silver'),
        ('sliver', 'silver'),
        ('blac', 'black'),
        ('gold', 'gold'),
        ('whine', 'red'),
        ('wine', 'red'),
        ('white', 'white'),
        ('milk', 'cream'),
        ('maroon', 'red'),
    )

    for fragment, canonical in rules:
        # Literal substring test; na=False keeps missing entries out of the
        # mask so they pass through unchanged.
        matches = x['paint'].str.contains(fragment, regex=False, na=False)
        x['paint'] = x['paint'].mask(matches, canonical)

color_fix(df_train)
color_fix(df_test)

# Flag each car as chromatic / achromatic from its cleaned paint value.
# This must read the normalised df_train / df_test columns: the raw
# train / test frames still hold the uncleaned strings, which the lowercase
# names below would mostly fail to match.
chromatic_paints = ['red', 'blue', 'brown', 'gold', 'green', 'orange', 'purple', 'yellow']
df_train['colorType'] = np.where(df_train['paint'].isin(chromatic_paints), 'chromatic', 'achromatic')
df_test['colorType'] = np.where(df_test['paint'].isin(chromatic_paints), 'chromatic', 'achromatic')

# Years outside 1900..2022 are replaced by 0; missing years compare False on
# both sides and are therefore kept as they are.
df_train['year'] = df_train['year'].mask((df_train['year'] < 1900) | (df_train['year'] > 2022), 0)
df_test['year'] = df_test['year'].mask((df_test['year'] < 1900) | (df_test['year'] > 2022), 0)

# Manual overrides of three training targets.
# NOTE(review): presumably corrections of data-entry errors - confirm the source.
df_train.loc[341, 'target'] = 33015000
df_train.loc[569, 'target'] = 29015000
df_train.loc[736, 'target'] = 60015000

train_data = df_train.copy()
test_data = df_test.copy()

# Train on the natural log of the target; predictions are mapped back with
# np.exp before the submission is written.
train_data['target'] = np.log(train_data['target'])
cat_fts2 = ['title', 'location', 'isimported', 'transmission', 'fuel', 'paint', 'colorType']

# Drop training rows whose title is not text, since they cannot be one-hot
# encoded together with the string titles.
# NOTE(review): this replaces a hard-coded drop of row 827 that followed a
# lookup of title == 400, presumably a title overwritten by a numeric brand
# code - confirm that was the only affected row.
has_text_title = train_data['title'].map(lambda value: isinstance(value, str))
train_data = train_data[has_text_title].reset_index(drop=True)

for col in cat_fts2:
    # The encoder's default output is a sparse matrix; densify it explicitly
    # rather than passing `sparse=False`, which newer scikit-learn releases
    # no longer accept.
    onehot_encoder = OneHotEncoder(handle_unknown="ignore")

    transformed = onehot_encoder.fit_transform(train_data[col].to_numpy().reshape(-1, 1)).toarray()
    # get_feature_names() was superseded by get_feature_names_out(); use
    # whichever this scikit-learn version provides.
    if hasattr(onehot_encoder, "get_feature_names_out"):
        encoded_columns = onehot_encoder.get_feature_names_out()
    else:
        encoded_columns = onehot_encoder.get_feature_names()

    onehot_df = pd.DataFrame(transformed, columns=encoded_columns)
    train_data = pd.concat([train_data, onehot_df], axis=1).drop(col, axis=1)

    # Categories unseen during fit encode as an all-zero row (handle_unknown="ignore").
    test_transformed = onehot_encoder.transform(test_data[col].to_numpy().reshape(-1, 1)).toarray()
    test_onehot_df = pd.DataFrame(test_transformed, columns=encoded_columns)
    test_data = pd.concat([test_data, test_onehot_df], axis=1).drop(col, axis=1)

train_x = train_data.drop(['target'], axis=1)
train_y = train_data['target']
test_x = test_data

print(train_x.shape)
print(train_y.shape)
print(test_x.shape)

# Plain NumPy copies, used for the stacking regressor further down.
train_xx = np.array(train_x)
train_yy = np.array(train_y)
test_xx = np.array(test_x)

# Settings shared by every FLAML search below.
MODEL_TIME_BUDGET = 60*5
MODEL_METRIC = 'mae'
MODEL_TASK = "regression"

# Run one AutoML search per estimator family, in this order. Each search is
# restricted to a single estimator and gets its own time budget.
automl_by_estimator = {}
for estimator_name in ("lgbm", "catboost", "xgboost", "rf", "extra_tree"):
    MODEL_LIST = [estimator_name]
    params = {
        "time_budget": MODEL_TIME_BUDGET,
        "metric": MODEL_METRIC,
        "estimator_list": MODEL_LIST,
        "task": MODEL_TASK,
        "seed": seed_num,
    }
    automl_by_estimator[estimator_name] = AutoML()
    automl_by_estimator[estimator_name].fit(train_x, train_y, **params)

# Keep the individual names used by the ensembling code.
auto_lgbm = automl_by_estimator["lgbm"]
auto_cat = automl_by_estimator["catboost"]
auto_xgb = automl_by_estimator["xgboost"]
auto_rf = automl_by_estimator["rf"]
auto_ext = automl_by_estimator["extra_tree"]

# Pull the estimator object out of each AutoML search result.
lightgbm, xgboost, catboost, randomforest, extratree = (
    searcher.model.estimator
    for searcher in (auto_lgbm, auto_xgb, auto_cat, auto_rf, auto_ext)
)

# Stack the five estimators; the XGBoost estimator doubles as meta-regressor
# and also receives the original features (use_features_in_secondary=True).
base_regressors = (lightgbm, xgboost, catboost, randomforest, extratree)
stack_reg = StackingCVRegressor(
    regressors=base_regressors,
    meta_regressor=xgboost,
    use_features_in_secondary=True,
).fit(train_xx, train_yy)

# Collect the six predictions (five AutoML models plus the stacked model).
# They are on the log scale because the target was log-transformed.
member_predictions = [
    auto_lgbm.predict(test_x),
    auto_xgb.predict(test_x),
    auto_cat.predict(test_x),
    auto_rf.predict(test_x),
    auto_ext.predict(test_x),
    stack_reg.predict(test_xx),
]

# Equal-weight average, accumulated left to right, then mapped back with exp.
prediction = member_predictions[0]
for member_prediction in member_predictions[1:]:
    prediction = prediction + member_prediction
prediction = np.exp(prediction / 6)

# Fill the sample submission's target column and write the result.
submission = pd.read_csv('/content/drive/MyDrive/Forecasting_price/dataset/sample_submission.csv')
submission['target'] = prediction

submission.to_csv('/content/drive/MyDrive/Forecasting_price/submit.csv', index=False)