In [None]:
# Import Libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import KNNImputer
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
import sklearn.preprocessing as pre
from sklearn.tree import DecisionTreeRegressor
from sklearn.inspection import permutation_importance
from sklearn.metrics import mean_squared_error
from catboost import CatBoostRegressor
import time

## Data Loading

In [None]:
# Read the competition's train and test CSV files into DataFrames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/ml-lab-exam/traindata_SJC.csv",
        "../input/ml-lab-exam/testdata_SJC.csv",
    )
)

In [None]:
# Report (rows, columns) for both frames.
for label, frame in (("Train data :", train), ("Test data :", test)):
    print(label, frame.shape)

## Data preprocessing 

In [None]:
# Summarise column dtypes and non-null counts of the training frame.
train.info()

### Missing value imputation

In [None]:
# Missing values per column in the training frame, as text and as a bar chart.
train_null_counts = train.isnull().sum()
print(train_null_counts)
train_null_counts.plot.bar()

In [None]:
# Missing values per column in the test frame, as text and as a bar chart.
test_null_counts = test.isnull().sum()
print(test_null_counts)
test_null_counts.plot.bar()

In [None]:
# Frequency of each MaritalStatus category in the training frame.
train['MaritalStatus'].value_counts()

In [None]:
# Replace missing MaritalStatus values with the category 'U' (unknown).
# Assign the result back instead of calling fillna(inplace=True) on a column
# selection: that is a chained in-place operation, which does not modify the
# parent DataFrame when pandas Copy-on-Write is active.
train['MaritalStatus'] = train['MaritalStatus'].fillna('U')
test['MaritalStatus'] = test['MaritalStatus'].fillna('U')

In [None]:
# Impute missing WeeklyWages with the TRAINING mean in both frames, so the
# test set is filled with the same statistic the model sees during training.
# Results are assigned back (no chained inplace fillna) so the update is
# applied regardless of pandas' Copy-on-Write setting.
weekly_wages_mean = train['WeeklyWages'].mean()
train['WeeklyWages'] = train['WeeklyWages'].fillna(weekly_wages_mean)
test['WeeklyWages'] = test['WeeklyWages'].fillna(weekly_wages_mean)

In [None]:
# Impute missing HoursWorkedPerWeek with the TRAINING mean.
# The test frame is filled as well (it was previously left untouched) so
# both frames go through the same imputation step.
hours_worked_mean = train['HoursWorkedPerWeek'].mean()
train['HoursWorkedPerWeek'] = train['HoursWorkedPerWeek'].fillna(hours_worked_mean)
test['HoursWorkedPerWeek'] = test['HoursWorkedPerWeek'].fillna(hours_worked_mean)

In [None]:
# Re-check the per-column missing-value counts of the test frame after imputation.
print(test.isnull().sum())

In [None]:
# Optional: work on a 1000-row sample instead of the full frame.
#data=train.sample(1000)
# `data` is bound to the same DataFrame object as `train` (no copy is made),
# so later in-place changes to `data` are also visible through `train`.
data=train

In [None]:
# `data_test` is bound to the same DataFrame object as `test` (no copy is made).
data_test=test

In [None]:
# (rows, columns) of the working training frame.
data.shape

### Data transformation

In [None]:
# Parse DateTimeOfAccident into pandas datetimes in both frames.
for frame in (data, data_test):
    frame['DateTimeOfAccident'] = pd.to_datetime(frame['DateTimeOfAccident'])

In [None]:
# Parse DateReported into pandas datetimes in both frames.
for frame in (data, data_test):
    frame['DateReported'] = pd.to_datetime(frame['DateReported'])

In [None]:
# Preview the first rows after the datetime conversion.
data.head()

In [None]:
# Calendar year in which the accident happened.
for frame in (data, data_test):
    frame['Yearofaccident'] = frame['DateTimeOfAccident'].dt.year

In [None]:
# Hour of day (0-23) at which the accident happened.
for frame in (data, data_test):
    frame['Hourofaccident'] = frame['DateTimeOfAccident'].dt.hour

In [None]:
# Derive day-of-week, month and year features from DateTimeOfAccident.
# Note: Acc_Day is the day of the WEEK (Monday=0), not the day of the month.
for frame in (data, data_test):
    accident_ts = frame['DateTimeOfAccident'].dt
    frame['Acc_Day'] = accident_ts.dayofweek
    frame['Acc_Month'] = accident_ts.month
    frame['Acc_Year'] = accident_ts.year

In [None]:
# Calculate DayOfReportedDelay as DateReported minus DateTimeOfAccident.
# Each frame must use its OWN accident timestamps: the test delay previously
# subtracted the training frame's DateTimeOfAccident, which pairs unrelated
# rows by index label.
data['DayOfReportedDelay']=data.DateReported-data.DateTimeOfAccident
data_test['DayOfReportedDelay']=data_test.DateReported-data_test.DateTimeOfAccident

In [None]:
# Convert the timedelta column to a whole number of days.
# `.dt.days` replaces `.astype('<m8[D]')`, which pandas >= 2.0 rejects
# (only s/ms/us/ns timedelta resolutions are supported there).
# `.dt.days` is the floor of the delay in days.
data['DayOfReportedDelay']=data['DayOfReportedDelay'].dt.days
data_test['DayOfReportedDelay']=data_test['DayOfReportedDelay'].dt.days

In [None]:
# Store the reporting delay as a 64-bit integer in both frames.
for frame in (data, data_test):
    frame['DayOfReportedDelay'] = frame['DayOfReportedDelay'].astype('int64')

In [None]:
# Column dtypes and non-null counts after feature engineering.
data.info()

In [None]:
# Preview the first rows including the newly derived columns.
data.head()

## EDA

#### Univariate

In [None]:
# Univariate overview: one row per numeric column with a box plot, a
# distribution plot and a value-vs-row-index scatter plot.
sns.set(style="darkgrid")
univariate_cols = [
    "UltimateIncurredClaimCost",
    "InitialIncurredCalimsCost",
    "Age",
    "WeeklyWages",
    "HoursWorkedPerWeek",
    "Hourofaccident",
]
fig, ax = plt.subplots(len(univariate_cols), 3, figsize = (18, 13))
for row, col in enumerate(univariate_cols):
    sns.boxplot(x=data[col], ax=ax[row, 0])
    sns.distplot(data[col], ax=ax[row, 1])
    # x/y are passed by keyword: seaborn >= 0.12 no longer accepts them
    # positionally.
    sns.scatterplot(x=data.index, y=data[col], ax=ax[row, 2])

plt.tight_layout()

In [None]:
# Share of each MaritalStatus category.
data['MaritalStatus'].value_counts().plot.pie(autopct="%.1f%%")

In [None]:
# Share of each Gender category.
data['Gender'].value_counts().plot.pie(autopct="%.1f%%")

In [None]:
# Share of part-time versus full-time workers.
data['PartTimeFullTime'].value_counts().plot.pie(autopct="%.1f%%")

In [None]:
# Accident counts by day of week, month and year (top row of a 2x3 grid).
fig = plt.figure(figsize=(15, 10))
ax1, ax2, ax3 = (fig.add_subplot(2, 3, slot) for slot in (1, 2, 3))
for axis, col in zip((ax1, ax2, ax3), ('Acc_Day', 'Acc_Month', 'Acc_Year')):
    sns.countplot(data=data, x=col, ax=axis)
# The current axes is the last one created (Acc_Year); rotate its tick labels.
plt.xticks(rotation=90)

#### Bivariate

In [None]:
# List the columns whose dtype is still 'object'.
data.select_dtypes(include=('object')).columns

In [None]:
# Top row: class counts for each categorical feature.
# Bottom row: claim-cost box plot per category, y-axis limited to [10, 200000].
fig = plt.figure(figsize=(15, 10))
categorical_cols = ('Gender', 'MaritalStatus', 'PartTimeFullTime')

ax1, ax2, ax3 = (fig.add_subplot(2, 3, slot) for slot in (1, 2, 3))
for axis, col in zip((ax1, ax2, ax3), categorical_cols):
    sns.countplot(data=data, x=col, ax=axis)

ax4, ax5, ax6 = (fig.add_subplot(2, 3, slot) for slot in (4, 5, 6))
for axis, col in zip((ax4, ax5, ax6), categorical_cols):
    axis.set_ylim(10, 200000)
    sns.boxplot(data=data, x=col, y='UltimateIncurredClaimCost', ax=axis)

In [None]:
#data1=data.drop(data[data.UltimateIncurredClaimCost==data.UltimateIncurredClaimCost[data.Gender=='F'].max()].index)

In [None]:
# Claim cost against hour of accident.
# x/y are passed by keyword: seaborn >= 0.12 rejects them as positionals.
sns.scatterplot(x=data.Hourofaccident, y=data.UltimateIncurredClaimCost)

In [None]:
# Mean claim cost per accident year (seaborn's default bar estimator).
# x/y are passed by keyword: seaborn >= 0.12 rejects them as positionals.
fig = plt.figure(figsize=(15,5))
sns.barplot(x=data.Yearofaccident, y=data.UltimateIncurredClaimCost)

In [None]:
# Pearson correlation and scatter of reporting delay against claim cost.
# x/y are passed by keyword: seaborn >= 0.12 rejects them as positionals.
print('Correlation :',data.DayOfReportedDelay.corr(data.UltimateIncurredClaimCost))
sns.scatterplot(x=data.DayOfReportedDelay, y=data.UltimateIncurredClaimCost)

In [None]:
# Pearson correlation and scatter of initial estimate against ultimate claim cost.
# x/y are passed by keyword: seaborn >= 0.12 rejects them as positionals.
print('Correlation :',data.InitialIncurredCalimsCost.corr(data.UltimateIncurredClaimCost))
sns.scatterplot(x=data.InitialIncurredCalimsCost, y=data.UltimateIncurredClaimCost)

In [None]:
# Weekly wages against hours worked, coloured by part-time/full-time status.
# x/y are passed by keyword: seaborn >= 0.12 rejects them as positionals.
sns.scatterplot(x=data.HoursWorkedPerWeek, y=data.WeeklyWages, hue=data.PartTimeFullTime)

In [None]:
# Pearson correlation and scatter of weekly wages against claim cost.
# x/y are passed by keyword: seaborn >= 0.12 rejects them as positionals.
print('Correlation :',data.WeeklyWages.corr(data.UltimateIncurredClaimCost))
sns.scatterplot(x=data.WeeklyWages, y=data.UltimateIncurredClaimCost, hue=data.PartTimeFullTime)

In [None]:
# Counts of DependentChildren and DependentsOther, side by side.
fig = plt.figure(figsize = (20,5))

# The column name is passed as x=...: in seaborn >= 0.12 the first positional
# argument is `data`, so a positional column name collides with data=data.
ax1 = fig.add_subplot(1,2,1)
sns.countplot(x='DependentChildren',data=data,ax=ax1)
ax2 = fig.add_subplot(1,2,2)
sns.countplot(x='DependentsOther',data=data,ax=ax2)

#### Multivariate 

In [None]:
# Show the column names currently present in the training frame.
data.columns

In [None]:
# Pairwise relationships between worker attributes and claim cost, split by Gender.
worker_cols = [
    'Age', 'Gender', 'DependentChildren', 'DependentsOther',
    'WeeklyWages', 'HoursWorkedPerWeek', 'DaysWorkedPerWeek',
    'UltimateIncurredClaimCost',
]
sns.pairplot(data[worker_cols], hue='Gender')

In [None]:
# Pairwise relationships between accident/claim attributes and claim cost, split by Gender.
accident_cols = [
    'Hourofaccident', 'Gender', 'DayOfReportedDelay',
    'Acc_Day', 'Acc_Month', 'Acc_Year',
    'InitialIncurredCalimsCost', 'UltimateIncurredClaimCost',
]
sns.pairplot(data[accident_cols], hue='Gender')

#### Correlation

In [None]:
# Heat map of pairwise Pearson correlations between numeric columns.
sns.set(style="darkgrid")
fig = plt.figure(figsize=(15,5))
# Restrict to numeric/boolean columns explicitly: since pandas 2.0
# DataFrame.corr() no longer drops non-numeric columns silently and raises
# on the string and datetime columns still present in `data`.
numeric_data = data.select_dtypes(include=['number', 'bool'])
sns.heatmap(numeric_data.corr(), annot=True)

### Outlier imputation

`InitialIncurredCalimsCost`, `HoursWorkedPerWeek` and `WeeklyWages` contain outliers, so a log transformation (`log1p`) is applied to reduce their influence.

In [None]:
# Compare the raw and log1p-scaled distribution of InitialIncurredCalimsCost
# (training data), then store the log-scaled values.
plt.figure(figsize = (14, 7))
plt.subplot(1, 2, 1)
sns.distplot(data['InitialIncurredCalimsCost'])
plt.subplot(1, 2, 2)
plt.title('Log Scale')
sns.distplot(np.log1p(data['InitialIncurredCalimsCost']))
# Apply the transform to BOTH frames: the column is also used as a feature of
# the test set, which was previously left on the raw scale.
# NOTE: this cell is not idempotent - re-running it applies log1p again.
data['InitialIncurredCalimsCost']=np.log1p(data['InitialIncurredCalimsCost'])
data_test['InitialIncurredCalimsCost']=np.log1p(data_test['InitialIncurredCalimsCost'])

In [None]:
# Inspect HoursWorkedPerWeek for implausibly large values before dropping them.
hours_worked = data['HoursWorkedPerWeek']
print(hours_worked.describe())
hours_worked.plot.box()

In [None]:
# Count and preview the rows reporting 400 or more working hours per week.
extreme_hours = data[data['HoursWorkedPerWeek'] >= 400.000]
print(f"There are {len(extreme_hours)} entries with HoursWorkedPerWeek\n")
extreme_hours.head(2)

In [None]:
# Drop the two rows (index labels 4653 and 6113) by label.
# presumably these are the >= 400 HoursWorkedPerWeek rows previewed above - TODO confirm.
# errors='ignore' makes the cell safe to re-run: once the labels are gone a
# plain drop would raise KeyError.
EXTREME_HOURS_ROWS = [4653, 6113]
data.drop(EXTREME_HOURS_ROWS, inplace = True, errors = 'ignore')

In [None]:
# Compare the raw and log1p-scaled distribution of HoursWorkedPerWeek
# (training data), then store the log-scaled values in both frames.
hours_col = 'HoursWorkedPerWeek'
plt.figure(figsize=(14, 7))
plt.subplot(1, 2, 1)
sns.distplot(data[hours_col])
plt.subplot(1, 2, 2)
plt.title('Log Scale')
sns.distplot(np.log1p(data[hours_col]))
for frame in (data, data_test):
    frame[hours_col] = np.log1p(frame[hours_col])

In [None]:
# Compare the raw and log1p-scaled distribution of WeeklyWages
# (training data), then store the log-scaled values in both frames.
wages_col = 'WeeklyWages'
plt.figure(figsize=(14, 7))
plt.subplot(1, 2, 1)
sns.distplot(data[wages_col])
plt.subplot(1, 2, 2)
plt.title('Log Scale')
sns.distplot(np.log1p(data[wages_col]))
for frame in (data, data_test):
    frame[wages_col] = np.log1p(frame[wages_col])

## Data Normalization

In [None]:
#dummy_train = pd.get_dummies(data.drop(['ClaimDescription', 'ClaimNumber', 'DateTimeOfAccident', 'DateReported', 'InitialIncurredCalimsCost'], axis = 1))

In [None]:
# Label encoder instance used to turn categorical columns into integer codes.
le=pre.LabelEncoder()

In [None]:
# Integer-encode the categorical columns.
# For columns that are categorical in BOTH frames the encoder is fitted once
# on the combined values, so a given category maps to the same integer in
# train and test. (Fitting a fresh encoder per frame produced unrelated codes
# whenever the two frames did not contain the same set of categories.)
train_cat_cols = data.select_dtypes(include=('object','category')).columns
test_cat_cols = data_test.select_dtypes(include=('object','category')).columns
shared_cat_cols = train_cat_cols.intersection(test_cat_cols)

for x in shared_cat_cols:
    le.fit(pd.concat([data[x], data_test[x]], ignore_index=True))
    data[x]=le.transform(data[x])
    data_test[x]=le.transform(data_test[x])
# Columns that are categorical in only one frame are encoded on their own.
for x in train_cat_cols.difference(shared_cat_cols):
    data[x]=le.fit_transform(data[x])
for x in test_cat_cols.difference(shared_cat_cols):
    data_test[x]=le.fit_transform(data_test[x])

In [None]:
# Show the column names available for building the modelling frames.
data.columns

In [None]:
# Build the modelling frames. The test frame has the same columns, in the same
# order, as the training frame minus the target 'UltimateIncurredClaimCost'.
leading_cols = [
    'Age', 'Gender', 'MaritalStatus', 'DependentChildren', 'DependentsOther',
    'WeeklyWages', 'PartTimeFullTime', 'HoursWorkedPerWeek', 'DaysWorkedPerWeek',
    'ClaimDescription', 'InitialIncurredCalimsCost',
]
trailing_cols = [
    'Yearofaccident', 'Hourofaccident', 'DayOfReportedDelay',
    'Acc_Day', 'Acc_Month', 'Acc_Year',
]
data_clean = data[leading_cols + ['UltimateIncurredClaimCost'] + trailing_cols]
data_clean_test = data_test[leading_cols + trailing_cols]

In [None]:
#data_clean.UltimateIncurredClaimCost>data_clean.UltimateIncurredClaimCost.mean+data_clean.UltimateIncurredClaimCost.std

In [None]:
# 80th percentile of the target column.
np.quantile(data_clean['UltimateIncurredClaimCost'],0.80)

In [None]:
# All values above the 80th percentile were treated as outliers and removed (filter currently disabled)
#data_clean=data_clean[data_clean.UltimateIncurredClaimCost<np.quantile(data_clean['UltimateIncurredClaimCost'],0.80)]

In [None]:
# Display the cleaned training frame (features plus target).
data_clean

In [None]:
# Target vector.
y = data_clean['UltimateIncurredClaimCost']
# Feature frame. Despite its name, `df_test` holds the TRAINING features.
df_test = data_clean.drop(['UltimateIncurredClaimCost'], axis=1)

# Both frames must expose the same features in the same order, because the
# scaler below works positionally on the underlying arrays.
assert df_test.columns.tolist() == data_clean_test.columns.tolist(), \
    "train and test feature columns differ"

# Fit the min-max scaler on the training features only and reuse the fitted
# ranges for the test set. Scaling each frame independently (as done before)
# maps the same raw value to different scaled values in train and test.
# Test values outside the training range fall outside [0, 1]; that is expected.
scaler = pre.MinMaxScaler()
X = scaler.fit_transform(df_test.values)
X_test = scaler.transform(data_clean_test.values)

X=pd.DataFrame(X,columns=df_test.columns.tolist())
X_clean_test=pd.DataFrame(X_test,columns=data_clean_test.columns.tolist())


In [None]:
# Display the scaled test-set feature frame.
X_clean_test

## Feature selection

In [None]:
# Column names of the cleaned training frame (candidate features plus target).
data_clean.columns

In [None]:
# Hold out 33% of the labelled rows for validation (fixed seed).
# Note: this rebinds X_test to the validation split.
split_parts = train_test_split(X, y, test_size=0.33, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Fit a CatBoost regressor with default hyper-parameters and time the fit.
fit_started = time.time()
CGB = CatBoostRegressor(logging_level='Silent')
CGB.fit(X_train, y_train)
fit_seconds = time.time() - fit_started
print("time (sec):" + "%6.0f" % fit_seconds)

# Mean squared error on the validation split.
validation_pred = CGB.predict(X_test)
result = mean_squared_error(y_test, validation_pred)
print("MSE:" + "%6.2f" % result)

In [None]:
# Horizontal bar chart of the (up to) 20 largest CatBoost feature importances.
importances = pd.Series(CGB.feature_importances_, index=X_test.columns)
importances.nlargest(20).plot(kind='barh')
plt.show()

In [None]:
# Keep only the selected features ('DependentsOther' and 'PartTimeFullTime'
# are dropped).
# The subset is applied to every frame the models consume: previously only `X`
# was reduced, while X_train / X_test / X_clean_test kept all columns, so the
# selection had no effect on the fitted models.
selected_features = [
    'Age', 'Gender',
    'MaritalStatus', 'DependentChildren', 'WeeklyWages',
    'HoursWorkedPerWeek', 'DaysWorkedPerWeek',
    'ClaimDescription', 'InitialIncurredCalimsCost',
    'Yearofaccident', 'Hourofaccident',
    'DayOfReportedDelay', 'Acc_Day', 'Acc_Month', 'Acc_Year',
]
X = X[selected_features]
X_train = X_train[selected_features]
X_test = X_test[selected_features]
X_clean_test = X_clean_test[selected_features]

## Modeling

In [None]:
# Plain linear-regression estimator; it is only instantiated here, not fitted.
from sklearn.linear_model import LinearRegression
lr=LinearRegression()

In [None]:
# print('#_____________________Linear___________________# ')
# lr.fit(X_train,y_train)
# y_pred=lr.predict(X_test)
# print(lr.score(X_train,y_train))
# print(lr.score(X_test, y_test))
# print(np.sqrt(mean_squared_error(y_test, y_pred)))

In [None]:
# predlin=lr.predict(X_clean_test)

In [None]:
from mlxtend.regressor import StackingCVRegressor
from sklearn.linear_model import Ridge, Lasso
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

# Candidate estimators for the stacking ensemble.
# The stochastic learners get an explicit seed so that a fresh
# "Restart & Run All" reproduces the same scores and predictions.
MODEL_SEED = 42
xgb = XGBRegressor(random_state=MODEL_SEED)
lgbm = LGBMRegressor(random_state=MODEL_SEED)
rf = RandomForestRegressor(random_state=MODEL_SEED)
ridge = Ridge()
lasso = Lasso()
svr = SVR(kernel='linear')

In [None]:
# print('#_____________________Ridge___________________# ')
# ridge = Ridge()
# ridge.fit(X_train,y_train)
# print("Train data :",ridge.score(X_train,y_train))
# print("Test data :",ridge.score(X_test,y_test))
# pred = ridge.predict(X_test)
# print(np.sqrt(mean_squared_error(y_test, pred)))

In [None]:
# print('#_____________________Lasso___________________# ')
# lasso = Lasso()
# lasso.fit(X_train,y_train)
# print("Train data :",lasso.score(X_train,y_train))
# print("Test data :",lasso.score(X_test,y_test))
# pred = lasso.predict(X_test)
# print(np.sqrt(mean_squared_error(y_test, pred)))

In [None]:
# print('#_____________________SVR___________________# ')
# svr = SVR(kernel='linear')
# svr.fit(X_train,y_train)
# print("Train data :",svr.score(X_train,y_train))
# print("Test data :",svr.score(X_test,y_test))
# pred = svr.predict(X_test)
# print(np.sqrt(mean_squared_error(y_test, pred)))

In [None]:
# print('#_____________________Random Forest___________________# ')
# rf = RandomForestRegressor()
# rf.fit(X_train,y_train)
# print("Train data :",rf.score(X_train,y_train))
# print("Test data :",rf.score(X_test,y_test))
# pred = rf.predict(X_test)
# print(np.sqrt(mean_squared_error(y_test, pred)))

In [None]:
# predrf = rf.predict(X_clean_test)

In [None]:
# print('#__________________XG Booster______________________# ')
# xgb = XGBRegressor()
# xgb.fit(X_train,y_train)
# print("Train data :",xgb.score(X_train,y_train))
# print("Test data :",xgb.score(X_test,y_test))
# pred = xgb.predict(X_test)
# print(np.sqrt(mean_squared_error(y_test, pred)))

In [None]:
# pred1 = xgb.predict(X_clean_test)

In [None]:
# print('#__________________Light Booster______________________# ')
# lgbm = LGBMRegressor()
# lgbm.fit(X_train,y_train)
# print("Train data :",lgbm.score(X_train,y_train))
# print("Test data :",lgbm.score(X_test,y_test))
# pred = lgbm.predict(X_test)
# print(np.sqrt(mean_squared_error(y_test, pred)))

In [None]:
# predict=lgbm.predict(X_clean_test)

In [None]:
# Stack the base learners with 12-fold out-of-fold predictions. XGBoost is the
# meta-learner and also receives the original features
# (use_features_in_secondary=True).
base_learners = (ridge, lasso, rf, lgbm, xgb)
stack = StackingCVRegressor(
    regressors=base_learners,
    meta_regressor=xgb,
    cv=12,
    use_features_in_secondary=True,
    store_train_meta_features=True,
    shuffle=False,
    random_state=42,
)

stack.fit(X_train, y_train)

In [None]:
# Score the stacked model on the training and validation splits, then report
# the validation RMSE.
print('#__________________Stacking______________________# ')
train_score = stack.score(X_train, y_train)
print("Train data :", train_score)
validation_score = stack.score(X_test, y_test)
print("Test data :", validation_score)
pred = stack.predict(X_test)
validation_rmse = np.sqrt(mean_squared_error(y_test, pred))
print(validation_rmse)

In [None]:
# Display the scaled test-set feature frame that will be scored.
X_clean_test

In [None]:
# Predict the competition test set with the fitted stacking model.
pred_stack = stack.predict(X_clean_test)

Compared with the other models, stacking gives the highest score and the best (lowest) RMSE.

In [None]:
# Fill the sample submission with the stacking predictions and save it.
submission_template = pd.read_csv('../input/ml-lab-exam/sample_submission_csv.csv')
sub = submission_template.assign(UltimateIncurredClaimCost=pred_stack)
sub.to_csv('submission_linear.csv', index=False)
sub.head(5)