In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Data Loading

In [None]:
# Load the raw training CSV from the Kaggle input directory.
train_csv = '../input/ml-lab-exam/traindata_SJC.csv'
data = pd.read_csv(train_csv)

In [None]:
# Print column dtypes and non-null counts of the raw training frame.
data.info()

In [None]:
# Preview the first rows of the raw training frame.
data.head()

# Preprocessing

In [None]:
# The raw data is misaligned: most columns load under positional
# "Unnamed: N" names. Map those to their real names, then drop the two
# unwanted leading rows (index labels 0 and 1).
column_names = {
    "Unnamed: 0": "ClaimNumber",
    "Unnamed: 1": "DateTimeOfAccident",
    "Unnamed: 3": "Age",
    "Unnamed: 4": "Gender",
    "Unnamed: 5": "MaritalStatus",
    "Unnamed: 6": "DependentChildren",
    "Unnamed: 8": "WeeklyWages",
    "Unnamed: 9": "PartTimeFullTime",
    "Unnamed: 10": "HoursWorkedPerWeek",
    "Unnamed: 12": "ClaimDescription",
    "Unnamed: 13": "InitialIncurredCalimsCost",
    "Unnamed: 14": "UltimateIncurredClaimCost",
}
df = data.rename(columns=column_names).drop([0, 1])

In [None]:
# Dtypes and non-null counts after renaming the columns and dropping rows 0 and 1.
df.info()

### Converting object values to numeric

In [None]:
# Coerce the numeric-looking columns to numbers, one column at a time.
# errors='coerce' turns any value that cannot be parsed into NaN.
numeric_cols = [
    'InitialIncurredCalimsCost',
    'UltimateIncurredClaimCost',
    'DependentChildren',
    'Age',
    'HoursWorkedPerWeek',
    'WeeklyWages',
]
for col in numeric_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')

In [None]:
import sklearn.preprocessing as pre
import sklearn.model_selection as ms

### Label encoding and outlier treatment

In [None]:
# Single LabelEncoder instance; it is re-fitted for every column it is applied to.
le=pre.LabelEncoder()

In [None]:
# Label-encode only the categorical (string) columns.
# Age, WeeklyWages and HoursWorkedPerWeek were already converted to numbers;
# pushing them through astype(str) + LabelEncoder would replace each value with
# its rank in *string* sort order (e.g. "100.0" before "20.0") and would turn
# NaN into an ordinary category, making the later mean/median imputation a no-op.
# NOTE(review): assumes Age has no missing values once it stays numeric — confirm.
list_df = ['Gender', 'MaritalStatus', 'PartTimeFullTime', 'ClaimDescription']

for x in list_df:
    # astype(str) maps missing values to the literal "nan", which gets its own code.
    df[x] = le.fit_transform(df[x].astype(str))

In [None]:
# UltimateIncurredClaimCost has many high outliers, so keep only the claims
# strictly below its 80th percentile.
# The quantile is computed on non-missing values: np.quantile returns NaN when
# the input contains NaN, and comparing against NaN would drop every row.
cost_cap = np.quantile(df['UltimateIncurredClaimCost'].dropna(), 0.80)
# .copy() makes the filtered frame independent, so later column assignments
# do not act on a slice of the unfiltered frame.
df = df[df['UltimateIncurredClaimCost'] < cost_cap].copy()

In [None]:
# Dtypes and non-null counts after encoding and the outlier filter.
df.info()

In [None]:
# Replace missing weekly wages with the column mean.
# Assign the result back instead of calling fillna(inplace=True) on the column
# selection: that form is chained assignment and does not update `df` when
# pandas Copy-on-Write is active.
df['WeeklyWages'] = df['WeeklyWages'].fillna(df['WeeklyWages'].mean())


In [None]:
# Replace missing marital-status entries with the placeholder "s".
marital_status = df['MaritalStatus']
df['MaritalStatus'] = marital_status.fillna("s")

In [None]:
# Replace missing hours-worked values with the column median.
# Assign the result back instead of calling fillna(inplace=True) on the column
# selection: that form is chained assignment and does not update `df` when
# pandas Copy-on-Write is active.
df['HoursWorkedPerWeek'] = df['HoursWorkedPerWeek'].fillna(df['HoursWorkedPerWeek'].median())

In [None]:
# Per-column count of remaining missing values (sanity check after imputation).
df.isna().sum()

#### The ultimate and initial claim costs are strongly right-skewed, so we apply a basic log transform, log(x + 1)

In [None]:
# Skewness of every numeric column.
# Restrict to numeric dtypes first: the frame still holds string columns
# (e.g. ClaimNumber), and DataFrame.skew raises on those in pandas >= 2.0.
print(df.select_dtypes(include='number').skew())

In [None]:
# np.log1p applies log(1 + x) element-wise; it is used to tame the right skew
# of both claim-cost columns.
df["LogUltimateIncurredClaimCost"] = np.log1p(df["UltimateIncurredClaimCost"])
df["LogInitialIncurredCalimsCost"] = np.log1p(df["InitialIncurredCalimsCost"])

# Overlay the two log-cost histograms on one set of axes.
# sns.histplot replaces the deprecated sns.distplot(kde=False); alpha=0.4
# keeps both distributions visible where they overlap.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df["LogUltimateIncurredClaimCost"], bins=100, color='r',
             alpha=0.4, label='Ultimate', ax=ax)
sns.histplot(df["LogInitialIncurredCalimsCost"], bins=100, color='g',
             alpha=0.4, label='Initial', ax=ax)
ax.set_xlabel('claim costs (log)')
ax.legend()
plt.show()

# EDA

In [None]:
# Pairwise correlations between the numeric columns.
# Non-numeric columns are excluded explicitly: DataFrame.corr raises on string
# columns in pandas >= 2.0 instead of silently skipping them.
corrmat = df.select_dtypes(include='number').corr()

# Draw the heatmap on explicit axes.
fig, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corrmat, annot=True, square=True, cmap='plasma', ax=ax)
ax.set_title('Correlation between numeric columns')
plt.show()

In [None]:
# Box plots of the log ultimate claim cost grouped by each candidate driver.
# plt.subplots returns (figure, axes); unpack both and hand the axes to seaborn
# (the previous code bound the whole tuple to `ax` and never used it).
for col in ['Age', 'DependentChildren', 'DaysWorkedPerWeek']:
    fig, ax = plt.subplots(figsize=(14, 5))
    sns.boxplot(x=col, y='LogUltimateIncurredClaimCost', data=df, ax=ax)
    ax.set_title(f'LogUltimateIncurredClaimCost by {col}')
    plt.show()

In [None]:
# List the column names currently present in df.
df.columns

In [None]:
# Univariate box plots, one figure per column, to inspect spread and outliers.
# A loop replaces five copy-pasted cells; each plot gets its own axes and a
# title, and plt.show() suppresses the Axes repr in the cell output.
for col in ['WeeklyWages', 'HoursWorkedPerWeek', 'PartTimeFullTime',
            'DaysWorkedPerWeek', 'UltimateIncurredClaimCost']:
    fig, ax = plt.subplots()
    sns.boxplot(data=df, x=col, ax=ax)
    ax.set_title(f'Distribution of {col}')
    plt.show()

In [None]:
# Violin plots of the log initial claim cost, one panel per PartTimeFullTime value.
sns.catplot(
    x='LogInitialIncurredCalimsCost',
    col='PartTimeFullTime',
    kind='violin',
    data=df,
)

In [None]:
# Violin plots of DaysWorkedPerWeek, one panel per MaritalStatus value.
sns.catplot(
    x='DaysWorkedPerWeek',
    col='MaritalStatus',
    kind='violin',
    data=df,
)

### It is clear from the above plots that the number of outliers is high

In [None]:
# Pair plot of df to visualise the relationships between features.
sns.pairplot(df)


In [None]:
# Collect every non-object column whose name does not contain "Cost" and plot
# a histogram for each.
# The builtin `object` replaces `np.object`, an alias that was removed in
# NumPy 1.24 and now raises AttributeError.
num_list = [c for c in df.columns if df[c].dtype != object and "Cost" not in c]

# plot histograms
for name in num_list:
    fig, ax = plt.subplots(figsize=(10, 5))
    # Drop missing values first: a NaN in the data makes the histogram range
    # non-finite and the call fails.
    values = df[name].dropna()
    # One bin per distinct value, capped at 70 and never fewer than 1.
    nbins = max(1, min(values.nunique(), 70))
    ax.hist(values, bins=nbins)
    ax.set_xlabel(name)
    plt.show()


# Model creation

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
import time

In [None]:
# List the available columns before choosing model features.
df.columns

In [None]:
from sklearn.feature_selection import RFE

In [None]:
# Predictor columns fed to the models; the target is the raw ultimate claim cost.
feature_cols = [
    'Age',
    'Gender',
    'MaritalStatus',
    'DependentChildren',
    'WeeklyWages',
    'PartTimeFullTime',
    'InitialIncurredCalimsCost',
]
X = df.loc[:, feature_cols]
y = df.loc[:, 'UltimateIncurredClaimCost']
X.head()

In [None]:
# Show the dimensions of the feature matrix and of the target vector.
for arr in (X, y):
    print(arr.shape)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, random_state=30, test_size=0.3
)
for part in (x_train, x_test, y_train, y_test):
    print(part.shape)

In [None]:
# standard scaling
from sklearn.preprocessing import StandardScaler

# creating a standard scaler
sc = StandardScaler()
# Learn the per-feature mean and standard deviation from the training split only.
x_train = sc.fit_transform(x_train)
# Apply those same training statistics to the hold-out split. Re-fitting on
# x_test (fit_transform) would scale the two splits differently and leak
# hold-out information into preprocessing.
x_test = sc.transform(x_test)

### Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression
# Linear-regression model with default settings.
lr=LinearRegression()

In [None]:
# Fit on the training split, then report the model's score on both splits
# followed by the hold-out RMSE.
lr.fit(x_train, y_train)
y_pred = lr.predict(x_test)
for features, target in ((x_train, y_train), (x_test, y_test)):
    print(lr.score(features, target))
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print('RMSE :', rmse)

### LGBMRegressor

In [None]:
from lightgbm import LGBMRegressor

# Hyper-parameters for the LightGBM regressor: many shallow trees with a small
# learning rate, plus row and feature subsampling.
lgbm_params = dict(
    objective='regression',
    num_leaves=4,
    learning_rate=0.01,
    n_estimators=10000,
    max_bin=200,
    bagging_fraction=0.75,
    bagging_freq=5,
    bagging_seed=7,
    feature_fraction=0.2,
    feature_fraction_seed=7,
    verbose=1,
)
lgbm = LGBMRegressor(**lgbm_params)

# Train on the training split and score on the hold-out split.
lgbm_model = lgbm.fit(x_train, y_train)
lg_vpreds = lgbm_model.predict(x_test)
lgbm_rmse = np.sqrt(mean_squared_error(y_test, lg_vpreds))
print(f"LGBM RMSE: {lgbm_rmse}")

### XGBRegressor

In [None]:
# Hyper-parameters for the XGBoost regressor: depth-3 trees, small learning
# rate, row and column subsampling, fixed seed.
xgb_params = dict(
    learning_rate=0.01,
    n_estimators=10000,
    max_depth=3,
    min_child_weight=0,
    gamma=0,
    subsample=0.7,
    colsample_bytree=0.7,
    objective='reg:squarederror',
    nthread=1,
    scale_pos_weight=1,
    seed=27,
    reg_alpha=0.00006,
)
xgb = XGBRegressor(**xgb_params)

# Train on the training split and score on the hold-out split.
xgb_model = xgb.fit(x_train, y_train)
xg_vpreds = xgb_model.predict(x_test)
xgb_rmse = np.sqrt(mean_squared_error(y_test, xg_vpreds))
print(f"XGBOOST RMSE: {xgb_rmse}")


### Blending linear regression and XGBRegressor (simple average of their predictions)

In [None]:
# Blend: simple average of the linear-regression and XGBoost predictions on
# the hold-out split.
predictions = 0.5 * (lr.predict(x_test) + xgb_model.predict(x_test))

# y_test keeps the row labels of df, so use them to look up the real claim
# identifiers. The previous code stored the target values themselves in the
# 'ClaimNumber' column.
df_test_pred = pd.DataFrame({
    'ClaimNumber': df.loc[y_test.index, 'ClaimNumber'],
    'UltimateIncurredClaimCost': predictions,
})
df_test_pred.to_csv('subm-v3-blend-LX.csv', index=False)
df_test_pred.head()

In [None]:
# RMSE of the averaged (LR + XGB) predictions on the hold-out split.
np.sqrt(mean_squared_error(y_test,predictions))

### Importing the test data and preprocessing it for testing

In [None]:
# Load the competition test set from the Kaggle input directory.
test_csv = '../input/ml-lab-exam/testdata_SJC.csv'
df_test = pd.read_csv(test_csv)

In [None]:
# Preview the first rows of the test frame.
df_test.head()

In [None]:
# (rows, columns) of the test frame.
df_test.shape

In [None]:
import sklearn.preprocessing as pre
import sklearn.model_selection as ms

In [None]:
# Fresh LabelEncoder for the test frame; it is re-fitted for every column it is applied to.
le=pre.LabelEncoder()

In [None]:
# WeeklyWages and HoursWorkedPerWeek are quantities, not categories: keep them
# numeric (coercing unparsable entries to NaN) so the mean/median imputation
# that follows has something to fill and WeeklyWages keeps its numeric meaning
# as a model feature. Label-encoding their string form would replace each value
# with its rank in string sort order.
for col in ['WeeklyWages', 'HoursWorkedPerWeek']:
    df_test[col] = pd.to_numeric(df_test[col], errors='coerce')

# Label-encode only the categorical columns.
list_df = ['Gender', 'MaritalStatus', 'PartTimeFullTime']

for x in list_df:
    # NOTE(review): the encoder is re-fitted on the test data, so the integer
    # codes match the training codes only if both files contain the same set
    # of category values — confirm.
    df_test[x] = le.fit_transform(df_test[x].astype(str))

In [None]:
# Replace missing weekly wages in the test frame with the column mean.
# Assign the result back instead of calling fillna(inplace=True) on the column
# selection: that form is chained assignment and does not update `df_test`
# when pandas Copy-on-Write is active.
df_test['WeeklyWages'] = df_test['WeeklyWages'].fillna(df_test['WeeklyWages'].mean())


In [None]:
# Replace missing marital-status entries in the test frame with the placeholder "s".
marital_status_test = df_test['MaritalStatus']
df_test['MaritalStatus'] = marital_status_test.fillna("s")

In [None]:
# Replace missing hours-worked values in the test frame with the column median.
# Assign the result back instead of calling fillna(inplace=True) on the column
# selection: that form is chained assignment and does not update `df_test`
# when pandas Copy-on-Write is active.
df_test['HoursWorkedPerWeek'] = df_test['HoursWorkedPerWeek'].fillna(df_test['HoursWorkedPerWeek'].median())

In [None]:
# Per-column count of remaining missing values in the test frame.
df_test.isna().sum()

In [None]:
# log(1 + x) of the initial claim cost, stored as a new test-frame column.
initial_cost_test = df_test["InitialIncurredCalimsCost"]
df_test["LogInitialIncurredCalimsCost"] = np.log1p(initial_cost_test)

In [None]:
feature_cols = ['Age', 'Gender', 'MaritalStatus', 'DependentChildren',
                'WeeklyWages', 'PartTimeFullTime', 'InitialIncurredCalimsCost']

# The models were fitted on standardised features (x_train went through `sc`),
# so the test features must go through the same scaler before prediction;
# feeding raw values would put them on a different scale from the one the
# models were trained on. The result is wrapped back into a DataFrame so the
# preview below still works.
X1 = pd.DataFrame(
    sc.transform(df_test[feature_cols]),
    columns=feature_cols,
    index=df_test.index,
)

X1.head()

In [None]:
# Linear-regression predictions for the test-set features.
lr.predict(X1)

In [None]:
 xgb_model.predict(X1)

#### Comparing the three models, we found that simple linear regression tends to produce a lower RMSE than the other two

In [None]:
# Build the submission: average the linear-regression and XGBoost predictions
# for the test set and write them into the sample-submission template.
sub = pd.read_csv('../input/ml-lab-exam/sample_submission_csv.csv')
blend = 0.5 * (lr.predict(X1) + xgb_model.predict(X1))
sub['UltimateIncurredClaimCost'] = blend
sub.to_csv('Stacked.csv', index=False)

# Sanity check: mean predicted cost.
print(np.mean(sub['UltimateIncurredClaimCost']))
# The preview is now the last expression so the notebook renders it; in the
# middle of the cell its result was silently discarded.
sub.head(5)