Lasso, Ridge, RandomForest, ElasticNet, GradientBoostingRegressor, LGBM (dart), XGB (otherBoosting), AdaBoostRegressor

# 1. Import

In [1]:
# Basic Library
import os
import pandas as pd
import numpy as np
import pickle
from itertools import permutations, combinations
from tqdm.auto import tqdm
import warnings
warnings.filterwarnings('ignore')

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Preprocessing
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, QuantileTransformer

# HP Tuning
import optuna
from optuna import Trial
from optuna.samplers import TPESampler
from optuna.visualization import plot_contour, plot_optimization_history
from optuna.visualization import plot_parallel_coordinate, plot_slice, plot_param_importances

# Modeling
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_absolute_error
from lightgbm import LGBMRegressor
import xgboost as xgb

In [2]:
def create_dir(dir):
    """Make sure the directory ``dir`` exists and report what was done.

    Creates ``dir`` (including missing parents) when it is absent and prints
    which of the two cases applied. Returns nothing.
    """
    if os.path.exists(dir):
        print("Directory already existed :", dir)
        return
    os.makedirs(dir)
    print("Created Directory :", dir)
# Output locations used by this notebook, relative to the notebook's working directory.
for output_dir in ("../pickle", "../model", "../submission"):
    create_dir(output_dir)

Directory already existed : ../pickle
Directory already existed : ../model
Directory already existed : ../submission


In [3]:
# Read the raw CSVs; the 'id' column is removed from train and test right after
# loading, while the sample submission is kept as read.
train = pd.read_csv("../data/train.csv").drop('id', axis=1)
test = pd.read_csv("../data/test.csv").drop('id', axis=1)
submission = pd.read_csv("../data/sample_submission.csv")

# 2. Preprocessing

In [4]:
# New feature: whole weight minus (shucked weight + shell weight).
train['Water Weight'] = train['Whole Weight'] - (train['Shucked Weight'] + train['Shell Weight'])
test['Water Weight'] = test['Whole Weight'] - (test['Shucked Weight'] + test['Shell Weight'])

# Values below 0.0005 (including negative differences) are raised to 0.0005.
# clip(lower=...) leaves larger values and NaN untouched.
WATER_WEIGHT_FLOOR = 0.0005
train['Water Weight'] = train['Water Weight'].clip(lower=WATER_WEIGHT_FLOOR)
test['Water Weight'] = test['Water Weight'].clip(lower=WATER_WEIGHT_FLOOR)

In [5]:
# Swap every exact 0.0 in either frame for 0.01 (applies to all columns, not a
# selected subset). Presumably this keeps the later log transforms and ratio
# features away from log(0) / division by zero — confirm intent.
train, test = (frame.replace(0.0, 0.01) for frame in (train, test))

In [6]:
# Shared scaler instances used by feature_num_scaler below. That function
# re-fits each of them once per numeric column, so after it returns each
# instance only holds the fit for the last column processed.
scaler1 = StandardScaler()
scaler2 = MinMaxScaler()
scaler3 = QuantileTransformer()
def feature_num_scaler(train_df, test_df):
    """Append scaled and log-transformed copies of each numeric column, in place.

    For every name in the module-level ``num_cols`` list, six new columns are
    added to both frames, in this order:

    * ``<col>#scaler1`` / ``#scaler2`` / ``#scaler3`` - output of the
      module-level ``scaler1`` / ``scaler2`` / ``scaler3`` objects, fitted on
      ``train_df`` only and then applied to both frames; exact 0.0 results are
      replaced by 0.001.
    * ``<col>#log2`` / ``#log`` / ``#log10`` - element-wise logarithms; exact
      0.0 results are replaced by 0.006 / 0.004 / 0.002 respectively.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Training frame; used to fit the scalers. Modified in place.
    test_df : pandas.DataFrame
        Frame transformed with the scalers fitted on ``train_df``. Modified in
        place.

    Returns
    -------
    None

    Notes
    -----
    The scalers are fitted on the ``train_df`` argument rather than on the
    global ``train`` frame, so the function behaves correctly for any frame
    passed in. The logarithms are taken as-is: non-positive inputs produce
    ``-inf`` / ``NaN``.
    """
    # (column suffix, scaler instance), in the order the columns are appended.
    scaler_specs = (
        ('#scaler1', scaler1),
        ('#scaler2', scaler2),
        ('#scaler3', scaler3),
    )
    # (column suffix, log function, value substituted for an exact 0.0 result).
    log_specs = (
        ('#log2', np.log2, 0.006),
        ('#log', np.log, 0.004),
        ('#log10', np.log10, 0.002),
    )

    for num_col in num_cols:
        for suffix, scaler in scaler_specs:
            new_col = num_col + suffix
            # Fit on training data only, then reuse the fit for the test frame.
            scaler.fit(train_df[[num_col]])
            train_df[new_col] = scaler.transform(train_df[[num_col]])
            test_df[new_col] = scaler.transform(test_df[[num_col]])
            train_df[new_col] = train_df[new_col].replace(0.0, 0.001)
            test_df[new_col] = test_df[new_col].replace(0.0, 0.001)

        for suffix, log_fn, zero_fill in log_specs:
            new_col = num_col + suffix
            train_df[new_col] = log_fn(train_df[num_col])
            test_df[new_col] = log_fn(test_df[num_col])
            train_df[new_col] = train_df[new_col].replace(0.0, zero_fill)
            test_df[new_col] = test_df[new_col].replace(0.0, zero_fill)

In [7]:
def feature_cat_generation(df):
    """Add per-category aggregate columns for every (categorical, numeric) pair.

    For each name in the module-level ``cat_cols`` and each name in the
    module-level ``num_cols``, the numeric column is grouped by the categorical
    one, reduced with each statistic listed below, and the per-category result
    is mapped back onto the rows as a new column named
    ``<cat_col><tag><num_col>``.

    The frame is modified in place and also returned. Statistics are computed
    from ``df`` itself.

    Note that the tags for median, skew and the percentiles have no trailing
    ``#`` separator; the names are kept exactly as they are so that existing
    column names do not change.
    """
    # (name tag, reducer applied to the grouped numeric column), in the order
    # in which the resulting columns are appended.
    group_stats = (
        ("#mean#", lambda grouped: grouped.mean()),
        ("#std#", lambda grouped: grouped.std(ddof = 1)),
        ("#var#", lambda grouped: grouped.var(ddof = 1)),
        ("#max#", lambda grouped: grouped.max()),
        ("#min#", lambda grouped: grouped.min()),
        ("#ptp#", lambda grouped: grouped.agg(np.ptp)),
        ("#median", lambda grouped: grouped.median()),
        ("#skew", lambda grouped: grouped.skew()),
        ("#percentile_10", lambda grouped: grouped.agg(lambda x: np.percentile(x, 10))),
        ("#percentile_50", lambda grouped: grouped.agg(lambda x: np.percentile(x, 50))),
        ("#percentile_90", lambda grouped: grouped.agg(lambda x: np.percentile(x, 90))),
    )

    for cat_col in cat_cols:
        for num_col in num_cols:
            for tag, reduce_group in group_stats:
                per_category = reduce_group(df.groupby(cat_col)[num_col])
                df[cat_col + tag + num_col] = df[cat_col].map(per_category)

    return df

In [8]:
# Partition the current columns of `train` by dtype: object columns are treated
# as categorical, float64 columns as numeric. Columns of any other dtype end up
# in neither list.
cat_cols = [name for name in train.columns if train[name].dtypes=='object']
num_cols = [name for name in train.columns if train[name].dtypes=='float64']

In [9]:
# Adds the '#scaler1/2/3' and '#log2/#log/#log10' columns to both frames in
# place, for every column currently listed in num_cols.
feature_num_scaler(train, test)

In [10]:
# Rebuild the column lists from the current state of `train`, so that num_cols
# also picks up any float64 columns added since the lists were last computed.
cat_cols = [column for column in train.columns if train[column].dtypes=='object']
num_cols = [column for column in train.columns if train[column].dtypes=='float64']

In [11]:
# Ratio and product features for every ordered pair of distinct numeric columns.
# permutations(..., 2) yields (a, b) and (b, a) separately, in the same order as
# two nested loops that skip a == b. Consequently both 'a*b' and 'b*a' are
# created even though they hold the same values.
for numerator, denominator in permutations(num_cols, 2):
    ratio_name = numerator+'/'+denominator
    product_name = numerator+'*'+denominator
    train[ratio_name] = train[numerator] / train[denominator]
    train[product_name] = train[numerator] * train[denominator]
    test[ratio_name] = test[numerator] / test[denominator]
    test[product_name] = test[numerator] * test[denominator]

In [12]:
# Adds the per-category aggregate columns to each frame in place; the function
# also returns the frame, so the second call is what the notebook displays.
# NOTE(review): the aggregates for `test` are computed from `test` itself, not
# from `train` - confirm this is intended.
feature_cat_generation(train)
feature_cat_generation(test)

Unnamed: 0,Gender,Lenght,Diameter,Height,Whole Weight,Shucked Weight,Viscra Weight,Shell Weight,Water Weight,Lenght#scaler1,...,Gender#std#Water Weight#log10,Gender#var#Water Weight#log10,Gender#max#Water Weight#log10,Gender#min#Water Weight#log10,Gender#ptp#Water Weight#log10,Gender#medianWater Weight#log10,Gender#skewWater Weight#log10,Gender#percentile_10Water Weight#log10,Gender#percentile_50Water Weight#log10,Gender#percentile_90Water Weight#log10
0,F,0.595,0.470,0.155,1.1210,0.4515,0.1780,0.1550,0.5145,0.600176,...,0.223656,0.050022,-0.011887,-1.619789,1.607902,-0.551294,-0.840462,-0.865505,-0.551294,-0.315963
1,M,0.580,0.450,0.150,0.9270,0.2760,0.1815,0.3600,0.2910,0.475366,...,0.306467,0.093922,-0.059235,-3.301030,3.241795,-0.569441,-2.229112,-0.985480,-0.569441,-0.348239
2,I,0.260,0.205,0.070,0.0970,0.0415,0.0190,0.0305,0.0250,-2.187241,...,0.431620,0.186296,-0.150581,-3.301030,3.150449,-1.000000,-1.916843,-1.593460,-1.000000,-0.671315
3,M,0.590,0.460,0.130,1.1020,0.4550,0.2055,0.3300,0.3170,0.558572,...,0.306467,0.093922,-0.059235,-3.301030,3.241795,-0.569441,-2.229112,-0.985480,-0.569441,-0.348239
4,F,0.595,0.465,0.140,1.1130,0.5175,0.2440,0.3050,0.2905,0.600176,...,0.223656,0.050022,-0.011887,-1.619789,1.607902,-0.551294,-0.840462,-0.865505,-0.551294,-0.315963
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2919,I,0.170,0.105,0.035,0.0340,0.0120,0.0085,0.0050,0.0170,-2.936099,...,0.431620,0.186296,-0.150581,-3.301030,3.150449,-1.000000,-1.916843,-1.593460,-1.000000,-0.671315
2920,I,0.435,0.345,0.115,0.4180,0.2220,0.0735,0.1060,0.0900,-0.731128,...,0.431620,0.186296,-0.150581,-3.301030,3.150449,-1.000000,-1.916843,-1.593460,-1.000000,-0.671315
2921,I,0.570,0.450,0.135,0.7940,0.3815,0.1415,0.2450,0.1675,0.392159,...,0.431620,0.186296,-0.150581,-3.301030,3.150449,-1.000000,-1.916843,-1.593460,-1.000000,-0.671315
2922,I,0.460,0.350,0.120,0.4885,0.1930,0.1050,0.1550,0.1405,-0.523112,...,0.431620,0.186296,-0.150581,-3.301030,3.150449,-1.000000,-1.916843,-1.593460,-1.000000,-0.671315


In [13]:
# "ce" variant: copies of the frames in which every column that is object-typed
# in train is converted to pandas 'category' dtype in both frames.
# Each frame is converted independently, so its categories come from its own values.
train_ce, test_ce = train.copy(), test.copy()

object_cols = [col for col in train_ce.columns if train_ce[col].dtypes=='object']
for col in object_cols:
    train_ce[col] = train_ce[col].astype('category')
    test_ce[col] = test_ce[col].astype('category')

In [14]:
# "ohe" variant: one-hot encoded copies of the frames. Each frame is encoded
# independently, so the dummy columns come from the values present in that frame.
train_ohe = pd.get_dummies(train.copy())
test_ohe = pd.get_dummies(test.copy())

In [15]:
def reduce_mem_usage(data, allow_float16=True):
    """Downcast numeric columns of ``data`` in place to a narrower dtype.

    Only columns whose dtype is one of int8/16/32/64 or float32/64 are touched;
    everything else (object, category, bool, ...) is left as is. For each such
    column the first candidate dtype whose representable range strictly
    contains the column's min and max is chosen:

    * integer columns: int8, int16, int32, int64 (left unchanged if none fits);
    * float columns: float16, float32, otherwise float64.

    The choice is based on value *range* only. float16 keeps roughly three
    significant decimal digits, so casting to it can round values noticeably.
    Pass ``allow_float16=False`` to make float32 the narrowest float target.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to shrink. Modified in place.
    allow_float16 : bool, default True
        Whether float columns may be downcast to float16.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, after downcasting.

    Side effects
    ------------
    Prints the memory usage before and after, and the percentage saved.
    """
    numerics = ['int8', 'int16', 'int32', 'int64', 'float32', 'float64']
    int_targets = (np.int8, np.int16, np.int32, np.int64)
    float_targets = (np.float16, np.float32) if allow_float16 else (np.float32,)

    start_memory = data.memory_usage().sum() / 1024**2
    for col in data.columns:
        col_type = data[col].dtypes
        if col_type not in numerics:
            continue

        c_min = data[col].min()
        c_max = data[col].max()

        if str(col_type)[:3] == 'int':
            fitting = [target for target in int_targets
                       if c_min > np.iinfo(target).min and c_max < np.iinfo(target).max]
            if fitting:
                data[col] = data[col].astype(fitting[0])
        else:
            # NaN or infinite extremes fail every range check and fall through
            # to float64.
            fitting = [target for target in float_targets
                       if c_min > np.finfo(target).min and c_max < np.finfo(target).max]
            data[col] = data[col].astype(fitting[0] if fitting else np.float64)

    end_memory = data.memory_usage().sum() / 1024**2
    # Guard against a zero starting size (e.g. an empty frame) so the report
    # never raises ZeroDivisionError.
    if start_memory:
        reduction = 100 * (start_memory - end_memory) / start_memory
    else:
        reduction = 0.0
    print('Memory optimization from {:5.2f}MB to {:5.2f}MB ({:.1f}% reduction)'
          .format(start_memory, end_memory, reduction))
    return data

In [16]:
# Downcast the numeric columns of the category-encoded frames; the function
# skips columns whose dtype is not in its int/float list and prints the
# before/after memory usage.
train_ce = reduce_mem_usage(train_ce)
test_ce = reduce_mem_usage(test_ce)

Memory optimization from 65.32MB to 16.33MB (75.0% reduction)
Memory optimization from 152.41MB to 38.11MB (75.0% reduction)


In [17]:
# Same downcasting for the one-hot encoded frames; prints the before/after
# memory usage for each.
train_ohe = reduce_mem_usage(train_ohe)
test_ohe = reduce_mem_usage(test_ohe)

Memory optimization from 65.32MB to 16.33MB (75.0% reduction)
Memory optimization from 152.42MB to 38.11MB (75.0% reduction)


In [18]:
# Persist the category-encoded frames, each to its own file: train_ce goes to
# train_ce.csv and test_ce goes to test_ce.csv.
# CSV stores values as text, so the 'category' dtype has to be re-applied after
# loading these files.
train_ce.to_csv('../data/train_ce.csv', index=False)
test_ce.to_csv('../data/test_ce.csv', index=False)

In [19]:
# Persist the one-hot encoded frames, without the index column.
for ohe_frame, csv_path in (
    (train_ohe, '../data/train_ohe.csv'),
    (test_ohe, '../data/test_ohe.csv'),
):
    ohe_frame.to_csv(csv_path, index=False)