In [64]:
import pandas as pd
import numpy as np
import math
from sklearn import preprocessing
from sklearn.linear_model import ElasticNet, Lasso,  BayesianRidge, LassoLarsIC
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import lightgbm as lgb
from sklearn import linear_model
from sklearn.model_selection import cross_val_predict
from sklearn.linear_model import Ridge, RidgeCV, LassoCV

In [65]:
full_data = pd.read_csv("Feature_Engineered_Dataset_Nov17.csv")
full_data.shape

(17352, 36)

In [66]:
full_data = full_data.drop(["Unnamed: 0"], axis=1)
full_data.head()

Unnamed: 0,Vehicle_Usage_Farm,Vehicle_Usage_Pleasure,Vehicle_Symbol,Vehicle_Make_Year,Vehicle_Performance_High,Vehicle_Performance_Sports,Vehicle_Anti_Theft_Device_Alarm Only,Vehicle_Anti_Theft_Device_Passive Disabling-Vehicle Recovery,Vehicle_Passive_restraint_Y,Vehicle_Physical_Damage_Limit,...,EEA_Multi_Auto_Policies_IndicatorY,EEA_Agency_TypePreferred,EEA_Packaged_Policy_IndicatorY,EEA_Full_Coverage_IndicatorN,Annual_Premium,Loss_Amount,Vehicle_Symbol_max,Vehicle_Symbol_min,Vehicle_Make_Year_max,Vehicle_Make_Year_min
0,87,533,11.0,1998.0,12,4,65,310,746,46174955,...,869,271,468,371,347525.47,980597.39,26,1,2007,1947
1,96,561,11.0,1998.0,18,0,84,336,754,48849957,...,894,247,508,363,351081.67,1475654.08,25,3,2007,1967
2,94,534,11.0,1999.0,13,1,69,324,731,49174961,...,884,269,493,338,355515.92,818847.38,25,1,2007,1965
3,95,513,11.0,1998.0,15,1,76,317,732,49059972,...,882,251,470,391,336650.79,1192576.47,24,1,2007,1942
4,88,555,12.0,1999.0,15,3,83,340,738,46424957,...,889,235,486,356,350637.67,1112127.21,27,1,2007,1955


In [135]:
# Shuffle the data before spliting
full_data=full_data.sample(frac=1, random_state=20).reset_index(drop=True)
full_data.shape

(17352, 35)

In [136]:
full_data["Loss_Amount"].tail()

17347     361346.02
17348    4085405.05
17349      90597.53
17350     328440.54
17351    4796704.24
Name: Loss_Amount, dtype: float64

In [137]:
train = full_data[:-1]
print(train.shape)
y_train = pd.DataFrame(train["Loss_Amount"])
print(y_train.shape)
train = train.drop(["Loss_Amount"], axis=1)
print(train.shape)

(17351, 35)
(17351, 1)
(17351, 34)


In [138]:
test = full_data[-1:]
print(test.shape)
y_test = pd.DataFrame(test["Loss_Amount"])
print(y_test.shape)
test = test.drop(["Loss_Amount"], axis=1)
print(test.shape)

(1, 35)
(1, 1)
(1, 34)


In [139]:
#Training Data
train_premium = train["Annual_Premium"].sum()
train_loss = y_train["Loss_Amount"].sum()
print("Train Annual Premium = ", train_premium)
print("Train Loss Amount = ", train_loss)
train_loss_ratio = (train_loss/train_premium)
print("Train Loss Ratio = ", train_loss_ratio)
print("Train Loss log Ratio = ", math.log(train_loss_ratio))

# Testing Data
test_premium = test["Annual_Premium"].sum()
test_loss = y_test["Loss_Amount"].sum()
print("Test Annual Premium = ", test_premium)
print("Test Loss Amount = ", test_loss)
test_loss_ratio = (test_loss/test_premium)
print("Test Loss Ratio = ", test_loss_ratio)
print("Test Loss log Ratio = ", math.log(test_loss_ratio))

Train Annual Premium =  17363088452.25
Train Loss Amount =  12185866290.5
Train Loss Ratio =  0.7018259639701883
Train Loss log Ratio =  -0.35406982026460493
Test Annual Premium =  1748638.25
Test Loss Amount =  4796704.24
Test Loss Ratio =  2.7431083816220996
Test Loss log Ratio =  1.0090917232500212


In [140]:
# Scaling the training Dataset
# We have train, y_train, test and y_test

scaler = preprocessing.StandardScaler().fit(train)
train = scaler.transform(train)

# scale test dataset
test = scaler.transform(test)

print('Means = ', scaler.mean_)
print('Standard Deviations = ', scaler.scale_)

Means =  [2.75186041e+02 1.59868693e+03 1.10497090e+01 1.99813567e+03
 3.78666936e+01 7.60054176e+00 2.23938102e+02 9.37614835e+02
 2.19309244e+03 1.43259032e+08 3.33538885e+03 1.73207325e+03
 7.21244136e+02 2.23311337e+02 3.32255057e+03 1.21058729e+02
 1.36146043e+02 7.93945652e+02 2.89939312e+02 1.42754884e+02
 1.23984381e+02 4.12628667e+01 3.57377673e+02 1.18657524e+03
 1.87315601e+02 2.63546349e+03 7.68080111e+02 1.40382191e+03
 1.15261345e+03 1.00069670e+06 2.59318771e+01 8.80006916e-01
 2.00714869e+03 1.94765547e+03]
Standard Deviations =  [1.27592087e+03 7.37440700e+03 2.14036305e-01 3.58149738e-01
 1.72443650e+02 3.53369190e+01 1.02331295e+03 4.24637733e+03
 1.00125208e+04 6.59029461e+08 1.53473564e+04 7.92013832e+03
 3.28092836e+03 1.01302811e+03 1.52884213e+04 5.41792807e+02
 6.17807165e+02 3.65232070e+03 1.31079936e+03 6.50736055e+02
 5.55890057e+02 1.85630391e+02 1.60958900e+03 5.41045127e+03
 8.58350035e+02 1.21462862e+04 3.53697922e+03 6.41348695e+03
 5.43152745e+03 4.540

## Modeling and Predictions

In [160]:
l1_cv = LassoCV(cv=50, max_iter=10000)

#100 Regularization coefficients evenly spaced between 0.1 and 1000
l1_cv.alphas = tuple(np.linspace(0.1,1000,100))

l1_cv.fit(train, y_train)

  y = column_or_1d(y, warn=True)


LassoCV(alphas=(0.1, 10.2, 20.3, 30.4, 40.5, 50.6, 60.699999999999996, 70.8, 80.89999999999999, 90.99999999999999, 101.1, 111.19999999999999, 121.29999999999998, 131.39999999999998, 141.5, 151.6, 161.7, 171.79999999999998, 181.89999999999998, 192.0, 202.1, 212.2, 222.29999999999998, 232.39999999999998, 242.....8, 888.9, 899.0, 909.1, 919.2, 929.3, 939.4, 949.5, 959.6, 969.6999999999999, 979.8, 989.9, 1000.0),
    copy_X=True, cv=50, eps=0.001, fit_intercept=True, max_iter=10000,
    n_alphas=100, n_jobs=1, normalize=False, positive=False,
    precompute='auto', random_state=None, selection='cyclic', tol=0.0001,
    verbose=False)

In [161]:
res = l1_cv.predict(test)

In [162]:
res_pred = l1_cv.predict(train)
train_loss - res_pred.sum()

0.0

In [163]:
res.sum()

4798057.604141111

In [164]:
test_loss

4796704.24

In [165]:
test_loss - res.sum()

-1353.3641411103308

In [166]:
math.log((res.sum()/test_premium))

1.0093738280426299

In [141]:
l2_cv = RidgeCV(cv=None, store_cv_values=True)

#100 Regularization coefficients evenly spaced between 0.1 and 1000
l2_cv.alphas = tuple(np.linspace(0.1,10000,100))
l2_cv.fit(train, y_train)

RidgeCV(alphas=(0.1, 101.1090909090909, 202.1181818181818, 303.1272727272727, 404.1363636363636, 505.1454545454545, 606.1545454545454, 707.1636363636363, 808.1727272727272, 909.1818181818181, 1010.190909090909, 1111.1999999999998, 1212.2090909090907, 1313.2181818181816, 1414.2272727272725, 1515.236363636363...4.954545454546, 9595.963636363636, 9696.972727272727, 9797.981818181817, 9898.990909090908, 10000.0),
    cv=None, fit_intercept=True, gcv_mode=None, normalize=False,
    scoring=None, store_cv_values=True)

In [142]:
res_l2 = l2_cv.predict(test)

In [143]:
res_l2.sum()

4750205.335147528

In [144]:
test_loss - res_l2.sum()

46498.904852472246

In [145]:
math.log((res_l2.sum()/test_premium))

0.99935050339636

In [146]:
model_xgb = xgb.XGBRegressor(colsample_bytree=0.2, gamma=0.0, 
                             learning_rate=0.05, max_depth=6, 
                             min_child_weight=1.5, n_estimators=7200,
                             reg_alpha=0.9, reg_lambda=0.6,
                             subsample=0.2,seed=42, silent=1,
                             random_state =7)

In [147]:
model_xgb.fit(train,y_train)

KeyboardInterrupt: 

In [24]:
res_xgb = model_xgb.predict(test)

In [25]:
res_xgb.sum()

48327.84

In [26]:
test_loss - res_xgb.sum()

168613.36015625

In [27]:
math.log((res_xgb.sum()/test_premium))

-3.523579157073891

In [148]:
model_lgb = lgb.LGBMRegressor(objective='regression',num_leaves=5,
                              learning_rate=0.05, n_estimators=720,
                              max_bin = 55, bagging_fraction = 0.8,
                              bagging_freq = 5, feature_fraction = 0.2319,
                              feature_fraction_seed=9, bagging_seed=9,
                              min_data_in_leaf =6, min_sum_hessian_in_leaf = 11)

In [149]:
model_lgb.fit(train,y_train)

  y = column_or_1d(y, warn=True)


LGBMRegressor(bagging_fraction=0.8, bagging_freq=5, bagging_seed=9,
       boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
       feature_fraction=0.2319, feature_fraction_seed=9,
       importance_type='split', learning_rate=0.05, max_bin=55,
       max_depth=-1, min_child_samples=20, min_child_weight=0.001,
       min_data_in_leaf=6, min_split_gain=0.0, min_sum_hessian_in_leaf=11,
       n_estimators=720, n_jobs=-1, num_leaves=5, objective='regression',
       random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
       subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [150]:
res_lgb = model_lgb.predict(test)

In [151]:
res_lgb.sum()

3044209.723071189

In [152]:
test_loss - res_lgb.sum()

1752494.516928811

In [153]:
math.log((res_lgb.sum()/test_premium))

0.554403992656315

In [154]:
GBoost = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05,
                                   max_depth=4, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10, 
                                   loss='huber', random_state =5)

In [155]:
GBoost.fit(train,y_train)

  y = column_or_1d(y, warn=True)


GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.05, loss='huber', max_depth=4,
             max_features='sqrt', max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=15, min_samples_split=10,
             min_weight_fraction_leaf=0.0, n_estimators=3000,
             presort='auto', random_state=5, subsample=1.0, verbose=0,
             warm_start=False)

In [156]:
res_gboost = GBoost.predict(test)

In [157]:
res_gboost.sum()

3162810.6103648366

In [158]:
test_loss - res_gboost.sum()

1633893.6296351636

In [159]:
math.log((res_gboost.sum()/test_premium))

0.5926237237788684

In [100]:
KRR = KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5)

In [None]:
KRR.fit(train,y_train)



In [None]:
res_KRR = KRR.predict(test)

In [None]:
res_KRR.sum()

In [None]:
test_loss - res_KRR.sum()

In [None]:
math.log((res_KRR.sum()/test_premium))

In [40]:
ENet = make_pipeline(RobustScaler(), ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3))

In [41]:
ENet.fit(train,y_train)



Pipeline(memory=None,
     steps=[('robustscaler', RobustScaler(copy=True, quantile_range=(25.0, 75.0), with_centering=True,
       with_scaling=True)), ('elasticnet', ElasticNet(alpha=0.0005, copy_X=True, fit_intercept=True, l1_ratio=0.9,
      max_iter=1000, normalize=False, positive=False, precompute=False,
      random_state=3, selection='cyclic', tol=0.0001, warm_start=False))])

In [42]:
res_ENet = ENet.predict(test)

In [43]:
res_ENet.sum()

-859877.0967846594

In [44]:
test_loss - res_ENet.sum()

1076818.2967846594

In [45]:
math.log((res_ENet.sum()/test_premium))

ValueError: math domain error

In [None]:
Average = (res_ENet.sum()+res_KRR.sum()+res_gboost.sum()+res_lgb.sum()+res_xgb.sum()+res_l2.sum()+res.sum())/7

In [None]:
math.log(test_loss_ratio)

In [None]:
math.log((Average/test_premium))

In [None]:
Average-test_loss

In [None]:
a = np.array([90,20,30])
b = np.array([40,50,60])
sum = a + b
sum = sum/2
print(sum)
sum = sum/np.array([2,2,2])
print(sum)

[5,7,9]