# Regression with categorical data starter pack

## Introduction
Greetings from the Kaggle bot! This is an automatically-generated kernel with starter code demonstrating how to read in the data and begin exploring. If you're inspired to dig deeper, click the blue "Fork Notebook" button at the top of this kernel to begin editing.

## Exploratory Analysis
To begin this exploratory analysis, first import libraries and define functions for plotting the data using matplotlib. Depending on the data, not all plots will be made. (Hey, I'm just a simple kerneling bot, not a Kaggle Competitions Grandmaster!)

# Imports

In [None]:
import os  # accessing directory structure

# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

In [None]:
import numpy as np 
import pandas as pd 
import pandas_profiling
import pandas_summary as ps
import shap


# Data processing, metrics and modeling
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.decomposition import PCA
import collections

# Lgbm
import lightgbm as lgb

# Hyper_opt
from hyperopt import hp
from hyperopt import fmin, tpe, Trials, space_eval, STATUS_OK, STATUS_RUNNING
import hyperopt.pyll
from hyperopt.pyll import scope

# Suppr warning
import warnings
warnings.filterwarnings("ignore")

# Plots
import matplotlib
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib import rcParams

# Others
import shap
import datetime
from tqdm import tqdm_notebook
import sys
import pickle
import re
import json

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 1000)
pd.set_option('display.width', 1000)
pd.set_option('use_inf_as_na', True)

warnings.simplefilter('ignore')
matplotlib.rcParams['figure.dpi'] = 100
sns.set()
%matplotlib inline

### First things first: reduce the DataFrame's memory footprint as soon as the data is read.

In [None]:
def _smallest_fitting_dtype(lo, hi, candidates):
    """Return the first dtype in ``candidates`` whose range contains [lo, hi], else None."""
    for candidate in candidates:
        info = np.finfo if np.issubdtype(candidate, np.floating) else np.iinfo
        limits = info(candidate)
        # Inclusive bounds: a value equal to the type's min/max is still representable.
        # A NaN bound (empty or all-NaN column) fails every comparison -> None.
        if limits.min <= lo and hi <= limits.max:
            return candidate
    return None


def reduce_mem_usage(df, use_float16=True):
    """Downcast the columns of a dataframe in place to reduce memory usage.

    Object (and pandas string) columns become ``category``. Signed integer,
    unsigned integer and float NumPy columns are cast to the narrowest dtype of
    the same kind whose range holds the column's min and max. Every other
    column (bool, datetime, category, nullable extension dtypes, ...) is left
    untouched, and a column is never widened.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimize; it is modified in place.
    use_float16 : bool, default True
        Allow float16 as a target. The check is range-based only, so float16
        (and float32) can lose precision; pass False to stop at float32.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # Candidate dtypes per NumPy dtype kind, ordered from narrowest to widest.
    float_candidates = (np.float16, np.float32) if use_float16 else (np.float32,)
    candidates_by_kind = {
        'i': (np.int8, np.int16, np.int32, np.int64),
        'u': (np.uint8, np.uint16, np.uint32, np.uint64),
        'f': float_candidates,
    }

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object or isinstance(col_type, pd.StringDtype):
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy int/uint/float columns have a meaningful numeric range.
        if not isinstance(col_type, np.dtype) or col_type.kind not in candidates_by_kind:
            continue

        target = _smallest_fitting_dtype(
            df[col].min(), df[col].max(), candidates_by_kind[col_type.kind]
        )
        # Cast only when it actually shrinks the column.
        if target is not None and np.dtype(target).itemsize < col_type.itemsize:
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [None]:
# Read the train and test CSVs and downcast their columns right away with reduce_mem_usage.
train_df = reduce_mem_usage(pd.read_csv('../input/regression-with-categorical-data/train.csv'))
test_df = reduce_mem_usage(pd.read_csv('../input/regression-with-categorical-data/test.csv'))

Good optimization

## EDA

In [None]:
train_df.shape, test_df.shape

## With this amount of data, gradient boosting is a reasonable choice.

In [None]:
train_df.head()

In [None]:
train_df.tail()

In [None]:
test_df.head()

In [None]:
test_df.tail()

We use pandas_summary to describe the columns. This also shows at a glance whether reduce_mem_usage worked correctly: in some cases downcasting produces inf values, and if something went wrong the infs will appear in the min and max rows.

In [None]:
dfs = ps.DataFrameSummary(train_df)
print('categoricals: ', dfs.categoricals.tolist())
print('numerics: ', dfs.numerics.tolist())
dfs.summary()

In [None]:
dfs = ps.DataFrameSummary(test_df)
print('categoricals: ', dfs.categoricals.tolist())
print('numerics: ', dfs.numerics.tolist())
dfs.summary()

# Target:

In [None]:
train_df['target'].hist();

In [None]:
train_df[train_df['target']>25000]['target'].hist();

### Log it

In [None]:
# Replace the target with log1p(target); predictions are mapped back with np.expm1 later on.
train_df['target'] = np.log1p(train_df['target'])

In [None]:
train_df['target'].hist();

In [None]:
# Keep the (log1p-transformed) target as y and remove it from the feature frame.
y = train_df['target']
train_df.drop('target', axis=1, inplace=True)

## Much better!

The id column is unlikely to carry any leaked information, so we drop it. Strictly speaking, it would be more correct to index the data frame by id, but we are not building a pipeline here, so we skip that step.

In [None]:
train_df.drop('id', axis=1, inplace=True)
test_df.drop('id', axis=1, inplace=True)

##### Let's check the columns in the test and train

In [None]:
train_df.columns.tolist() == test_df.columns.tolist()

## Let's look at the distributions, correlations, and other characteristics of the samples.

In [None]:
pandas_profiling.ProfileReport(train_df)

In [None]:
pandas_profiling.ProfileReport(test_df)

## We will not do any feature engineering. Hypothesis: the number of features is excessive. We will look for the top features and exclude those that hurt quality.

In [None]:
cat = train_df.select_dtypes(include=['category']).columns.tolist()

In [None]:
# Hold out 25% of the rows for validation; the seed is fixed so the split is reproducible.
# The resulting frames keep their original (shuffled) index labels, so use .iloc for positional access.
X_train, X_val, y_train, y_val = train_test_split(train_df, y, test_size=0.25, random_state=777)

## We will use a more modern method: hyperparameters are tuned with hyperopt rather than grid search.

In [None]:
%%time
# The larger p is, the less we want to penalize the gap between train and test
# (the penalty in score() is |rmse_train - rmse_val| ** (2 * p)).
p = 0.8
# k - number of iterations (passed to hyperopt's fmin as max_evals)
k = 30

# 3-fold shuffled split with a fixed seed; note this is a plain KFold despite the name.
skf = KFold(n_splits=3, shuffle=True, random_state=7)

def score(params):
    """Hyperopt objective: cross-validate LightGBM with ``params`` on X_train.

    Uses the module-level ``skf``, ``X_train``, ``y_train``, ``cat`` and ``p``.
    For every fold a booster is trained and its best train/val RMSE recorded.
    The returned loss is the mean validation RMSE plus a penalty for the
    train/validation gap, ``((rmse_train - rmse_val) ** 2) ** p``.

    Returns a dict with ``loss``, ``status``, the mean train and validation
    RMSE, and the mean best iteration across folds.
    """
    print('Training with params:')
    print(params)

    fold_scores = []
    fold_best_iters = []
    for fit_idx, holdout_idx in skf.split(X_train, y_train):
        fit_set = lgb.Dataset(
            X_train.iloc[fit_idx, :],
            label=y_train.iloc[fit_idx],
            categorical_feature=cat,
        )
        holdout_set = lgb.Dataset(
            X_train.iloc[holdout_idx, :],
            label=y_train.iloc[holdout_idx],
            categorical_feature=cat,
            reference=fit_set,
        )
        model = lgb.train(
            params,
            fit_set,
            valid_sets=[fit_set, holdout_set],
            valid_names=['train', 'val'],
            num_boost_round=5000,
            verbose_eval=False,
            categorical_feature=cat,
        )
        fold_scores.append([model.best_score['train']['rmse'], model.best_score['val']['rmse']])
        fold_best_iters.append(model.best_iteration)

    mean_best_iter = int(np.mean(fold_best_iters))
    print('best iter:', mean_best_iter, 'all iter:', fold_best_iters)

    rmse_train, rmse_val = list(np.mean(fold_scores, axis=0))
    gap_penalty = np.power(np.square(rmse_train - rmse_val), p)
    loss = +rmse_val + gap_penalty
    print("\t rmse train {0}, rmse test {1}, dif {2}, \n\t final score {3} \n\n".format(rmse_train, rmse_val, gap_penalty, loss))
    return {
        'loss': loss,
        'status': STATUS_OK,
        'mean_rmse_train': rmse_train,
        'mean_rmse_test': rmse_val,
        'best_iter': mean_best_iter,
    }

def optimize(trials):
    """Run a TPE search over the LightGBM space and print the best point.

    ``trials`` is the hyperopt ``Trials`` object that collects every evaluation;
    the number of evaluations is the module-level ``k``. ``score`` is the
    objective being minimized.
    """
    # Searched dimensions.
    max_bin = scope.int(hp.uniform('max_bin', 100, 2500))
    num_leaves = scope.int(hp.uniform('num_leaves', 20, 200))
    lambda_l1 = hp.quniform('lambda_l1', 0, 8, 0.25)
    learning_rate = hp.quniform('learning_rate', 0.01 , 0.05, 0.005)
    # NOTE(review): LightGBM documents that bagging needs a non-zero bagging_freq;
    # none is set here, so confirm this dimension actually affects training.
    bagging_fraction = hp.quniform('bagging_fraction', 0.3, 0.9, 0.1)

    # max_depth is fixed at -1 rather than searched; the remaining entries are constants too.
    space = {
        'max_depth': -1,
        'max_bin': max_bin,
        'num_leaves': num_leaves,
        'lambda_l1': lambda_l1,
        'learning_rate': learning_rate,
        'bagging_fraction': bagging_fraction,
        'metric': ('rmse',),
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'nthread': 8,
        'early_stopping_rounds': 20,
        'silent':1,
    }
    best = fmin(score, space, algo=tpe.suggest, trials=trials, max_evals=k)
    print(best)

trials = Trials()
optimize(trials)

# Let's train a model with selected hyperparameters.

In [None]:
# trials.best_trial['misc']['vals'] maps every searched hyperparameter to a list holding
# the sampled value. Unwrap all of them into a fresh dict, so that no parameter is left
# as a one-element list and the dict stored inside `trials` is not mutated.
params = {name: values[0] for name, values in trials.best_trial['misc']['vals'].items()}

# Fixed settings for the final fit.
params['max_depth'] = -1
params['boosting_type'] = 'gbdt'
params['objective'] = 'regression'
params['metric'] = ('l1', 'l2')
params['nthread'] = 8
params['early_stopping_rounds'] = 100
params['silent'] = 1

# The raw sampled values are floats; these two parameters are cast to integers.
params['num_leaves'] = int(params['num_leaves'])
params['max_bin'] = int(params['max_bin'])
params

In [None]:
train_ds = lgb.Dataset(X_train, label=y_train, categorical_feature=cat, )
val_ds = lgb.Dataset(X_val, label=y_val, categorical_feature=cat, reference=train_ds)

In [None]:
%%time
# Fit on the training split and monitor both splits, logging every 100 rounds.
# early_stopping_rounds=100 is passed here and is also present in params.
booster = lgb.train(params, train_ds, num_boost_round=10000, valid_sets=[train_ds, val_ds], valid_names=['train', 'valid'], verbose_eval=100, categorical_feature=cat, early_stopping_rounds=100)

Feature importance by LightGBM gain:

In [None]:
# Share of the total gain contributed by each feature, ranked from most to least important.
gain = booster.feature_importance(importance_type='gain')
total = sum(gain)
tmp = (
    pd.DataFrame({'Name': list(X_train.columns), 'Value': gain / total})
    .sort_values('Value', ascending=False)
)
# 1-based rank as the index, so a label slice tmp.loc[:n] selects the top-n features.
tmp.index = range(1, len(tmp) + 1)
tmp.head(10)

## Choosing the number of features by gain
At each iteration we drop the bundle of lowest-gain features and retrain LightGBM. We care about both the gap between the train and validation scores and the validation metric itself.

In [None]:
%%time
# Backward feature elimination: start from all features ranked by gain in `tmp`,
# cross-validate, then drop the 5 lowest-ranked features and repeat until at most
# 10 would remain. Results are collected in `f`.
# NOTE: the scores read below are LightGBM's 'l1' metric (params['metric'] is
# ('l1', 'l2')), even though the table columns and printed labels say "rmse".
i = 0
f = pd.DataFrame(columns=['Number_of_cols', 'rmse_test', 'rmse_train', 'rmse_diff'])
k = tmp.shape[0]
while k > 10:
    # tmp has a 1-based rank index, so the label slice :k keeps the top-k features.
    columns = list(tmp.loc[:k, 'Name'])
    X_top = X_train[columns]
    cat_2 = X_top.select_dtypes(include=['category']).columns.tolist()
    w = []
    for train_index, val_index in skf.split(X_top, y_train):
        # KFold yields positional indices, and X_train keeps the shuffled labels from
        # train_test_split, so rows must be taken with .iloc (as is done for y_train).
        x_train_2, x_valid_2 = X_top.iloc[train_index], X_top.iloc[val_index]
        y_train_2, y_valid_2 = y_train.iloc[train_index], y_train.iloc[val_index]
        train_data_2 = lgb.Dataset(x_train_2, label=y_train_2, categorical_feature=cat_2)
        val_data_2 = lgb.Dataset(x_valid_2, label=y_valid_2, categorical_feature=cat_2, reference=train_data_2)
        gbm_2 = lgb.train(params,
                        train_data_2,
                        valid_sets = [train_data_2, val_data_2],
                        valid_names=['train', 'val'],
                        num_boost_round = 5000,
                        verbose_eval = False,
                        categorical_feature=cat_2
                       )
        w.append([gbm_2.best_score['train']['l1'], gbm_2.best_score['val']['l1']])
    res = list(np.mean(w, axis=0))

    # Each row of w is [train, val]: index 0 is the train score, index 1 the validation score.
    rmse_train = res[0]
    rmse_test = res[1]
    # Same sign convention (validation minus train) in the table and in the log line.
    rmse_diff = rmse_test - rmse_train
    f.loc[i, :] = k, rmse_test, rmse_train, rmse_diff
    print('n columns ', k, 'rmse_test ', rmse_test, 'rmse_train ', rmse_train, 'diff ', rmse_diff)
    i+=1
    k-=5

# HOW MUCH NOISE !!!!

Choose the optimal number of features by the difference and the validation metric. More than half of the features do not carry informational content.

In [None]:
tmp.head(55)

In [None]:
column = tmp.head(55)['Name'].tolist()

### Now we narrow the hyperparameter search ranges and remove the insignificant features

In [None]:
%%time
# The larger p is, the less we want to penalize the gap between train and test
# (the penalty in score() is |rmse_train - rmse_val| ** (2 * p)).
p = 0.8
# k - number of iterations (passed to hyperopt's fmin as max_evals)
k = 250

# Categorical columns among the selected features only.
cat = X_train[column].select_dtypes(include=['category']).columns.tolist()

# 3-fold shuffled split with a fixed seed; note this is a plain KFold despite the name.
skf = KFold(n_splits=3, shuffle=True, random_state=7)

def score(params):
    """Hyperopt objective: cross-validate LightGBM with ``params`` on the selected features.

    Uses the module-level ``skf``, ``X_train``, ``y_train``, ``column``, ``cat``
    and ``p``. For every fold a booster is trained on ``X_train[column]`` and
    its best train/val RMSE recorded. The returned loss is the mean validation
    RMSE plus a penalty for the train/validation gap,
    ``((rmse_train - rmse_val) ** 2) ** p``.

    Returns a dict with ``loss``, ``status``, the mean train and validation
    RMSE, and the mean best iteration across folds.
    """
    print('Training with params:')
    print(params)

    fold_scores = []
    fold_best_iters = []
    for fit_idx, holdout_idx in skf.split(X_train, y_train):
        fit_set = lgb.Dataset(
            X_train[column].iloc[fit_idx, :],
            label=y_train.iloc[fit_idx],
            categorical_feature=cat,
        )
        holdout_set = lgb.Dataset(
            X_train[column].iloc[holdout_idx, :],
            label=y_train.iloc[holdout_idx],
            categorical_feature=cat,
            reference=fit_set,
        )
        model = lgb.train(
            params,
            fit_set,
            valid_sets=[fit_set, holdout_set],
            valid_names=['train', 'val'],
            num_boost_round=5000,
            verbose_eval=False,
            categorical_feature=cat,
        )
        fold_scores.append([model.best_score['train']['rmse'], model.best_score['val']['rmse']])
        fold_best_iters.append(model.best_iteration)

    mean_best_iter = int(np.mean(fold_best_iters))
    print('best iter:', mean_best_iter, 'all iter:', fold_best_iters)

    rmse_train, rmse_val = list(np.mean(fold_scores, axis=0))
    gap_penalty = np.power(np.square(rmse_train - rmse_val), p)
    loss = +rmse_val + gap_penalty
    print("\t rmse train {0}, rmse test {1}, dif {2}, \n\t final score {3} \n\n".format(rmse_train, rmse_val, gap_penalty, loss))
    return {
        'loss': loss,
        'status': STATUS_OK,
        'mean_rmse_train': rmse_train,
        'mean_rmse_test': rmse_val,
        'best_iter': mean_best_iter,
    }

def optimize(trials):
    """Run a TPE search over the narrowed LightGBM space and print the best point.

    ``trials`` is the hyperopt ``Trials`` object that collects every evaluation;
    the number of evaluations is the module-level ``k``. ``score`` is the
    objective being minimized.
    """
    # Searched dimensions.
    max_bin = scope.int(hp.uniform('max_bin', 1000, 2000))
    num_leaves = scope.int(hp.uniform('num_leaves', 8, 60))
    lambda_l1 = hp.quniform('lambda_l1', 0.5, 2.25, 0.125)
    learning_rate = hp.quniform('learning_rate', 0.03 , 0.06, 0.005)
    # NOTE(review): LightGBM documents that bagging needs a non-zero bagging_freq;
    # none is set here, so confirm this dimension actually affects training.
    bagging_fraction = hp.quniform('bagging_fraction', 0.6, 0.9, 0.01)

    # max_depth is fixed at -1 rather than searched; the remaining entries are constants too.
    space = {
        'max_depth': -1,
        'max_bin': max_bin,
        'num_leaves': num_leaves,
        'lambda_l1': lambda_l1,
        'learning_rate': learning_rate,
        'bagging_fraction': bagging_fraction,
        'metric': ('rmse',),
        'boosting_type': 'gbdt',
        'objective': 'regression',
        'nthread': 8,
        'early_stopping_rounds': 20,
        'silent':1,
    }
    best = fmin(score, space, algo=tpe.suggest, trials=trials, max_evals=k)
    print(best)

trials = Trials()
optimize(trials)

The error is clearly reduced!

In [None]:
# trials.best_trial['misc']['vals'] maps every searched hyperparameter to a list holding
# the sampled value. Unwrap all of them into a fresh dict, so that no parameter is left
# as a one-element list and the dict stored inside `trials` is not mutated.
params = {name: values[0] for name, values in trials.best_trial['misc']['vals'].items()}

# Fixed settings for the final fit.
params['max_depth'] = -1
params['boosting_type'] = 'gbdt'
params['objective'] = 'regression'
params['metric'] = ('l1', 'l2')
params['nthread'] = 8
params['early_stopping_rounds'] = 100
params['silent'] = 1

# The raw sampled values are floats; these two parameters are cast to integers.
params['num_leaves'] = int(params['num_leaves'])
params['max_bin'] = int(params['max_bin'])
params

In [None]:
train_ds = lgb.Dataset(X_train[column], label=y_train, categorical_feature=cat, )
val_ds = lgb.Dataset(X_val[column], label=y_val, categorical_feature=cat, reference=train_ds)

In [None]:
%%time
# Fit on the training split (selected features only) and monitor both splits, logging every 100 rounds.
# early_stopping_rounds=100 is passed here and is also present in params.
booster = lgb.train(params, train_ds, num_boost_round=10000, valid_sets=[train_ds, val_ds], valid_names=['train', 'valid'], verbose_eval=100, categorical_feature=cat, early_stopping_rounds=100)

# Feature importance

In [None]:
shap.initjs()
explainer = shap.TreeExplainer(booster)
shap_values = explainer.shap_values(X_val[column])

In [None]:
shap.summary_plot(shap_values, X_val[column], plot_type='bar', max_display=30)

# Predict

Do not forget to exponentiate the predictions, i.e. to invert the log1p transform applied to the target.

In [None]:
# The model was fit on log1p(target), so expm1 maps its outputs back to the original target scale.
pred_train = np.expm1(booster.predict(X_train[column]))
pred_val = np.expm1(booster.predict(X_val[column]))
pred_test = np.expm1(booster.predict(test_df[column]))

# Metrics

In [None]:
from sklearn import metrics

# Train

In [None]:
# y_train holds log1p(target), while pred_train was already mapped back with expm1.
# Bring the ground truth back to the original scale so both sides are comparable.
y_train_orig = np.expm1(y_train)
print(metrics.mean_absolute_error(y_train_orig, pred_train))
print(metrics.mean_squared_error(y_train_orig, pred_train))
print(metrics.mean_squared_log_error(y_train_orig, pred_train))
print(metrics.median_absolute_error(y_train_orig, pred_train))

# Val

In [None]:
# y_val holds log1p(target), while pred_val was already mapped back with expm1.
# Bring the ground truth back to the original scale so both sides are comparable.
y_val_orig = np.expm1(y_val)
print(metrics.mean_absolute_error(y_val_orig, pred_val))
print(metrics.mean_squared_error(y_val_orig, pred_val))
print(metrics.mean_squared_log_error(y_val_orig, pred_val))
print(metrics.median_absolute_error(y_val_orig, pred_val))

# Here you should agree on value buckets with the business and compute a metric the business understands: the percentage of predictions that fall into the correct bucket. You can also compute the percentage of predictions that land within a 20% interval of the true value.

### Check out my guide to linear regressions:
https://www.kaggle.com/podsyp/complete-linear-model-guide