In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Let's begin
**From the competition introduction, the goal is to estimate the loss, which is a continuous value, so this project is a REGRESSION problem.**

**Steps:**
1. Data Visualization
    * Import Data
    * Object Data Barplot
    * Numerical Data Distplot
    * Goal Data Normal Distribution
2. Correlation
    * Find the highly correlated columns
3. Data Preparation
    * Delete highly correlated columns
    * Adjust Goal Data
4. Modeling
    * LGBMRegressor
    * Export Result
5. Hyperopt
    * Find the optimized params
    * Learning Curve

# Data Visualization

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Read the competition's train and test tables, then display both shapes
# as a ((rows, cols), (rows, cols)) pair.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/allstate-claims-severity/train.csv",
        "/kaggle/input/allstate-claims-severity/test.csv",
    )
)
train.shape, test.shape

In [None]:
# Preview the first five rows of the training table.
train.head(n=5)

In [None]:
# Split the feature columns by position.
# NOTE(review): assumes column 0 is `id`, columns 1..116 are the categorical
# features, and the last column is `loss` — confirm against train.head().
train_cat = train.iloc[:, slice(1, 117)]
train_cont = train.iloc[:, slice(117, -1)]

In [None]:
# One count plot per categorical column on a 30 x 4 grid, with the bars
# ordered by category label rather than by frequency.
plt.figure(figsize=(16, 150))
for position, col in enumerate(train_cat.columns, start=1):
    ax = plt.subplot(30, 4, position)
    levels = train_cat[col].value_counts().sort_index().index
    # Pass the column as `x=`: current seaborn treats the first positional
    # argument as `data`, so a bare Series is no longer read as the x variable.
    sns.countplot(x=train_cat[col], order=levels, ax=ax)
plt.tight_layout()

In [None]:
# Histogram + KDE for every continuous column on a 4 x 4 grid.
plt.figure(figsize=(16, 12))
for position, col in enumerate(train_cont.columns, start=1):
    ax = plt.subplot(4, 4, position)
    # `sns.distplot` is deprecated; a density-normalised histplot with a KDE
    # overlay is the documented replacement and draws the same kind of figure.
    sns.histplot(train_cont[col], kde=True, stat="density", ax=ax)
plt.tight_layout()

In [None]:
# Distribution of the target on the log1p scale.
# `sns.distplot` is deprecated; a density-normalised histplot with a KDE
# overlay is the documented replacement.
sns.histplot(np.log1p(train['loss']), kde=True, stat="density")

* For categorical columns, some are so heavily imbalanced that they might not be useful for the analysis
* For numerical columns, a normal distribution is usually regarded as the ideal case; some columns are clearly skewed, heavy-tailed (high kurtosis), or otherwise irregular
* For numerical columns, each one needs a further check -- a correlation study

#  Correlation

In [None]:
# Pairwise correlation of the numeric columns (continuous features and `loss`).
# The object-typed categorical columns are filtered out explicitly: newer
# pandas no longer drops them silently and `.corr()` raises on non-numeric data.
corr = train.drop(columns='id').select_dtypes(include='number').corr()
plt.figure(figsize=(10, 8))
sns.heatmap(corr, annot=True, fmt='.2f', linewidths=0.5)

In [None]:
# Scan the upper triangle of the correlation matrix and keep every pair whose
# absolute correlation reaches the threshold, stored as
# [coefficient, row position, column position].
threshold = 0.8
n_cols = len(corr)
high_corr = [
    [corr.iloc[i, j], i, j]
    for i in range(n_cols)
    for j in range(i + 1, n_cols)
    if abs(corr.iloc[i, j]) >= threshold
]

In [None]:
# Scatter plot for every highly correlated pair.
# The positions stored in `high_corr` index into `corr`, so the column names
# are looked up on `corr.columns` and the data is taken from `train`; indexing
# `train_cont.columns` would go out of range for a pair that involves `loss`.
for _coef, i, j in high_corr:
    x_name, y_name = corr.columns[i], corr.columns[j]
    # `height` replaces the old `size` keyword of pairplot.
    sns.pairplot(train[[x_name, y_name]], x_vars=[x_name], y_vars=[y_name], height=6)

* The five pairs are (1,9), (1,10), (6, 10), (6, 13), (11, 12)
* The plots above confirm that all five pairs are highly correlated
* Some of these columns need to be removed. Columns 1 and 6 are each paired with two other columns, so I choose to remove 1 and 6, plus one of 11 or 12

# Data Preparation

In [None]:
# Stack train and test into a single frame so that the column-level
# transformations applied next leave both sets with the same columns
# (apart from `loss`). cont1, cont6 and cont11 are removed here.
dataset = pd.concat([train, test]).drop(columns=['cont1', 'cont6', 'cont11'])

In [None]:
# One-hot encode the combined frame, then split it back by row position:
# train was concatenated first, so the first len(train) rows are the train rows.
# NOTE(review): the `id` column is never dropped, so it is passed to the model
# as a feature downstream — confirm that this is intended.
n_train = len(train)
dataset = pd.get_dummies(dataset)
df_train = dataset.iloc[:n_train]
df_test = dataset.iloc[n_train:].drop(columns='loss')

In [None]:
# Train on the log1p-transformed target (predictions are mapped back with
# expm1 where they are scored or exported), then remove the target column
# from the feature frame.
y = df_train['loss'].pipe(np.log1p)
df_train = df_train.drop(columns='loss')

# Modeling 

In [None]:
from sklearn.linear_model import LinearRegression,Ridge, Lasso
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

In [None]:
# Hold out 30% of the training rows for validation; the seed is fixed so the
# split is repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    df_train,
    y,
    test_size=0.3,
    random_state=0,
)

In [None]:
# xgb = XGBRegressor(learning_rate=0.3, n_estimators=500)
# xgb.fit(x_train, y_train)
# mean_absolute_error(np.expm1(y_test), np.expm1(xgb.predict(x_test)))

In [None]:
# xgb=XGBRegressor(seed=18, objective='reg:linear', n_jobs=-1, verbosity=0,
#                        colsample_bylevel=0.764115402027029, colsample_bynode=0.29243734009596956, 
#                        colsample_bytree= 0.7095719673041723, gamma= 4.127534050725986, learning_rate= 0.02387231810322894, 
#                        max_depth=14, min_child_weight=135, n_estimators=828,reg_alpha=0.3170105723222332, 
#                        reg_lambda= 0.3660379465131937, subsample=0.611471430211575)
# xgb.fit(x_train, y_train)
# mean_absolute_error(np.expm1(y_test), np.expm1(xgb.predict(x_test)))

In [None]:
# LightGBM regressor with an L1 (absolute error) objective; the remaining
# values are fixed hyper-parameters for this baseline model.
lgb_params = {
    'objective': 'regression_l1',
    'random_state': 18,
    'subsample_freq': 1,
    'colsample_bytree': 0.3261853512759363,
    'min_child_samples': 221,
    'n_estimators': 2151,
    'num_leaves': 45,
    'reg_alpha': 0.9113713668943361,
    'reg_lambda': 0.8220990333713991,
    'subsample': 0.49969995651550947,
    'max_bin': 202,
    'learning_rate': 0.02959820893211799,
}
lgb = LGBMRegressor(**lgb_params)

In [None]:
# Fit on the training split, then report the hold-out MAE after mapping both
# the targets and the predictions back with expm1.
lgb.fit(x_train, y_train)
holdout_true = np.expm1(y_test)
holdout_pred = np.expm1(lgb.predict(x_test))
mean_absolute_error(holdout_true, holdout_pred)
# mean_absolute_error(np.expm1(y_test), (np.expm1(lgb.predict(x_test))+np.expm1(xgb.predict(x_test)))/2)

In [None]:
# Predict on the test rows, undo the log1p transform, and pair each
# prediction with its `id` for the submission table.
test_loss = np.expm1(lgb.predict(df_test))
sub = pd.DataFrame({'id': df_test['id'], 'loss': test_loss})

In [None]:
# Write the submission file to the working directory, without the index column.
sub.to_csv(path_or_buf='sub.csv', index=False)

# Try hyperopt

In [None]:
from sklearn.model_selection import learning_curve

In [None]:
from hyperopt import hp, fmin, Trials, tpe, pyll

In [None]:
def f(params):
    """Hyperopt objective: fit an LGBMRegressor with `params` and score it.

    The model is trained on the notebook-level `x_train` / `y_train` split and
    evaluated on `x_test` / `y_test`. Both the targets and the predictions are
    passed through `np.expm1` before the error is computed.

    Args:
        params: Keyword arguments forwarded unchanged to `LGBMRegressor`.

    Returns:
        The mean absolute error on the hold-out split (lower is better).
    """
    model = LGBMRegressor(**params)
    model.fit(x_train, y_train)
    actual = np.expm1(y_test)
    predicted = np.expm1(model.predict(x_test))
    return mean_absolute_error(actual, predicted)
#     return -cross_val_score(LGBMRegressor(**params), df_train, y, cv=10).mean()

# Search space handed to hyperopt. Discrete parameters are drawn with
# hp.choice from a range of candidates, continuous ones with hp.uniform,
# and the learning rate is drawn log-uniformly between 0.005 and 0.2.
space = dict(
    subsample_freq=hp.choice('subsample_freq', range(1, 5)),
    colsample_bytree=hp.uniform('colsample_bytree', 0.2, 0.5),
    min_child_samples=hp.choice('min_child_samples', range(200, 250, 5)),
    n_estimators=hp.choice('n_estimators', range(1000, 3000, 100)),
    num_leaves=hp.choice('num_leaves', range(20, 50, 5)),
    reg_alpha=hp.uniform('reg_alpha', 0.70, 1),
    reg_lambda=hp.uniform('reg_lambda', 0.70, 1),
    subsample=hp.uniform('subsample', 0.3, 0.6),
    max_bin=hp.choice('max_bin', range(150, 250, 5)),
    learning_rate=hp.loguniform('learning_rate', np.log(0.005), np.log(0.2)),
)

In [None]:
# Run 20 TPE evaluations of the objective over the search space; `trial`
# records every evaluation and `best` holds the result returned by fmin.
trial = Trials()
best = fmin(fn=f, space=space, algo=tpe.suggest, max_evals=20, trials=trial)

In [None]:
# For hp.choice parameters `fmin` reports the *index* of the selected option,
# not its value. `space_eval` maps `best` back onto `space`, so every
# parameter holds the value that was actually sampled and no hand-decoding
# is needed.
#
# Values that were previously copied in by hand, kept for reference
# (note that max_bin=85 is not one of the options in the search space):
#   {'colsample_bytree': 0.2, 'learning_rate': 0.013636902671116896,
#    'max_bin': 85, 'min_child_samples': 205, 'n_estimators': 2000,
#    'num_leaves': 35, 'reg_alpha': 0.9579863172141052,
#    'reg_lambda': 0.8783040346489164, 'subsample': 0.5899650955658289,
#    'subsample_freq': 2}
from hyperopt import space_eval

params = space_eval(space, best)
params

In [None]:
# Refit with the tuned parameters on all training rows, predict the test
# rows, undo the log1p transform, and overwrite the submission file.
lgb = LGBMRegressor(**params)
lgb.fit(df_train, y)
tuned_loss = np.expm1(lgb.predict(df_test))
sub = pd.DataFrame({'id': df_test['id'], 'loss': tuned_loss})
sub.to_csv('sub.csv', index=False)

Learning Curve--to see whether it's ideal or not

In [None]:
# Cross-validated learning curve for the tuned model, using every available
# core. Sizes, folds and scoring are left at scikit-learn's defaults.
train_size, train_score, test_score = learning_curve(
    estimator=LGBMRegressor(**params),
    X=df_train,
    y=y,
    n_jobs=-1,
)

In [None]:
# Mean and standard deviation across the CV folds for each training-set size.
train_mean = train_score.mean(axis=1)
train_std = train_score.std(axis=1)
test_mean = test_score.mean(axis=1)
test_std = test_score.std(axis=1)

# Plot both curves with a +/- 1 std band. Each line is labelled and a legend
# is drawn so the training and cross-validation curves can be told apart.
plt.figure(figsize=(10, 8))
plt.plot(train_size, train_mean, 'o-', linewidth=3, label='Training score')
plt.fill_between(train_size, train_mean+train_std, train_mean-train_std, alpha=0.1)
plt.plot(train_size, test_mean, 'o-', linewidth=3, label='Cross-validation score')
plt.fill_between(train_size, test_mean+test_std, test_mean-test_std, alpha=0.1)
plt.title('Learning Curve', size=20)
plt.xlabel('Training Examples')
plt.ylabel('Score')
plt.legend(loc='best')

The curves look pretty reasonable