## PREPROCESSING : MISSING VALUE TREATMENT

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

In [None]:
# Load the training and test CSVs from the Kaggle input directory.
data = pd.read_csv("../input/mlexam/traindata_SJC.csv")
data_test=pd.read_csv("../input/mlexam/testdata_SJC.csv")

In [None]:
# Preview the first rows of the training frame.
data.head()

In [None]:
# (rows, columns) of the raw training frame.
data.shape

In [None]:
# Column dtypes and non-null counts of the training frame (shows which columns have gaps).
data.info()

In [None]:
# Column dtypes and non-null counts of the test frame.
data_test.info()

In [None]:
# Impute missing values: categorical columns get their most frequent value,
# numeric columns get their own mean.
# The result is assigned back rather than using chained `fillna(inplace=True)`,
# which pandas deprecates because it may modify a temporary copy.
data['MaritalStatus'] = data['MaritalStatus'].fillna(data['MaritalStatus'].mode()[0])
data['WeeklyWages'] = data['WeeklyWages'].fillna(data['WeeklyWages'].mean())
# Hours are imputed from the hours column itself; this line previously used
# the WeeklyWages mean, which put wage amounts into an hours column.
data['HoursWorkedPerWeek'] = data['HoursWorkedPerWeek'].fillna(data['HoursWorkedPerWeek'].mean())
data_test['MaritalStatus'] = data_test['MaritalStatus'].fillna(data_test['MaritalStatus'].mode()[0])

In [None]:
# Remove the two date/time columns from both frames; they are not used as
# features anywhere in the rest of this notebook.
date_columns = ['DateReported', 'DateTimeOfAccident']
data = data.drop(columns=date_columns)
data_test = data_test.drop(columns=date_columns)

In [None]:
# Re-check dtypes and non-null counts of the training frame at this point.
data.info()

## CHECKING FOR MISSING VALUES

In [None]:
# Boolean mask of the training frame: True where a cell is missing.
data.isnull()

In [None]:
# Non-null counts per column of the training frame.
data.info()

In [None]:
# Total number of missing cells in the training frame (0 means fully imputed).
# `.sum().count()` only returned the number of columns, whatever the data held.
data.isnull().sum().sum()

In [None]:
# Total number of missing cells in the test frame.
# `.sum().count()` only returned the number of columns, whatever the data held.
data_test.isnull().sum().sum()

In [None]:
# (rows, columns) of the training frame at this point.
data.shape

In [None]:
# Summary statistics of the training frame's numeric columns.
data.describe()

In [None]:
# Dtypes and non-null counts of the training frame before visualisation.
data.info()

## DATA VISUALISATION

In [None]:
# Keep only the non-object (numeric) columns for the numeric plots.
# When cells run top to bottom this is taken before WeeklyWages/Age are binned
# into labels, so numdf still holds their numeric values.
numdf=data.select_dtypes(exclude=object)
numdf.columns

In [None]:
# Set the figure size BEFORE drawing: rcParams only apply to figures created
# afterwards, so setting it after the plot left this figure at the old size.
plt.rcParams["figure.figsize"] = [10,10]
# Box plots of the numeric columns; the two claim-cost columns are left out of this plot.
sns.boxplot(data=numdf.drop(["InitialIncurredCalimsCost","UltimateIncurredClaimCost"],axis=1))
plt.xticks(rotation=90)
plt.show()

In [None]:
# Discretise WeeklyWages and Age into three equal-width bands.
#
# The bin edges are learned on the training column and then applied to the
# TEST column's own values. The earlier version assigned the binned *training*
# series to data_test (aligned by row index), which discarded the real test
# values and replaced them with training rows' labels.
bin_labels = ['Low', 'Medium', 'High']
for col in ['WeeklyWages', 'Age']:
    train_binned, edges = pd.cut(data[col], bins=3, labels=bin_labels, retbins=True)
    # Open the outer edges so test values outside the training range still
    # land in 'Low' / 'High' instead of becoming NaN.
    edges[0], edges[-1] = -np.inf, np.inf
    # Fill any gaps in the test column with the training mean before binning,
    # so no NaN label reaches the label encoder. This must run before data[col]
    # is overwritten with labels below.
    test_values = data_test[col].fillna(data[col].mean())
    data_test[col] = pd.cut(test_values, bins=edges, labels=bin_labels).astype('object')
    data[col] = train_binned.astype('object')

In [None]:
# Heatmap of the pairwise correlations between the numeric columns in numdf.
sns.heatmap(numdf.corr())

In [None]:
# Pairwise scatter plots of the numeric columns, with KDE curves on the diagonal.
sns.pairplot(data=numdf,diag_kind='kde')

In [None]:
# Dtypes after binning: WeeklyWages and Age are now object (label) columns.
data.info()

In [None]:
# Set the figure size first; rcParams changed after plotting do not resize
# a figure that already exists.
plt.rcParams["figure.figsize"] = [5,5]
# Bar chart of how many training rows fall in each Gender category.
data["Gender"].value_counts().plot.bar()
plt.show()

In [None]:
# Set the figure size first; rcParams changed after plotting do not resize
# a figure that already exists.
plt.rcParams["figure.figsize"] = [5,5]
# Bar chart of how many training rows fall in each MaritalStatus category.
data["MaritalStatus"].value_counts().plot.bar()
plt.show()

In [None]:
# Violin plot of the target (UltimateIncurredClaimCost), one panel per MaritalStatus value.
sns.catplot(data=data,col="MaritalStatus",y='UltimateIncurredClaimCost',kind="violin")

In [None]:
# Violin plot of the target (UltimateIncurredClaimCost), one panel per Gender value.
sns.catplot(data=data,col="Gender",y='UltimateIncurredClaimCost',kind="violin")

## DATA ENCODING 

In [None]:
import sklearn.preprocessing as pre

In [None]:
# Shared LabelEncoder used to turn the object columns into integer codes.
le=pre.LabelEncoder()
# NOTE(review): `sc` is not referenced again in the visible part of this notebook.
sc=pre.StandardScaler()

In [None]:
# Encode on copies so the raw train/test frames keep their original labels.
df_label, df_label_test = data.copy(), data_test.copy()

In [None]:
# Object-typed (categorical/text) columns of each frame; cat_df.columns drives
# the encoding loops in the following cells.
cat_df=data.select_dtypes(include="object")
# NOTE(review): cat_df1 is not referenced again in the visible part of this notebook.
cat_df1=data_test.select_dtypes(include="object")

In [None]:
# Display the categorical columns of the training frame.
cat_df

In [None]:
# Replace every object column of the training copy with integer label codes.
# The single encoder is re-fitted per column, so it ends up fitted on the last one.
for column in cat_df.columns:
    codes = le.fit_transform(df_label[column])
    df_label[column] = codes

In [None]:
# Encode the test copy with the SAME label -> code mapping as the training data.
# Re-fitting the encoder on the test column (as before) numbers the categories
# by whatever values occur in the test set, so one category could receive
# different codes in train and test.
for x in cat_df.columns:
    # `data` still holds the un-encoded training labels; LabelEncoder assigns
    # codes by sorted unique value, so this reproduces the training mapping.
    le.fit(data[x])
    code_of = {label: code for code, label in enumerate(le.classes_)}
    # Labels never seen in training (e.g. the per-claim ClaimNumber and
    # ClaimDescription values, which are dropped before modelling) map to -1
    # instead of raising.
    df_label_test[x] = df_label_test[x].map(code_of).fillna(-1).astype(int)

In [None]:
# Shape of the encoded training frame (encoding replaces values, not columns).
df_label.shape

In [None]:
# Display the label-encoded training frame.
df_label

## FEATURE SELECTION

In [None]:
import sklearn.preprocessing as pre
import sklearn.model_selection as ms

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.inspection import permutation_importance
import shap
from matplotlib import pyplot as plt

In [None]:
# df_label1=df_label.drop(["ClaimNumber","ClaimDescription","UltimateIncurredClaimCost","HoursWorkedPerWeek"],axis=1)
# X = pre.minmax_scale(df_label1)
# y= df_label['UltimateIncurredClaimCost']
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=123456)

In [None]:
# Test feature frame: drop the columns that are excluded from the model inputs.
df_label_test1 = df_label_test.drop(["ClaimNumber","ClaimDescription","HoursWorkedPerWeek"],axis=1)

# Scale the test rows with the min/max learned from the TRAINING features.
# Calling minmax_scale on the test frame alone rescales each column by the
# test set's own range, so the same raw value gets different scaled values in
# train and test and the fitted models see shifted inputs.
train_features = df_label.drop(
    ["ClaimNumber","ClaimDescription","UltimateIncurredClaimCost","HoursWorkedPerWeek"],
    axis=1,
)
feature_scaler = pre.MinMaxScaler().fit(train_features)
# Index by the training column order so each test column lines up with the
# feature the scaler (and later the models) were fitted on.
X_test = feature_scaler.transform(df_label_test1[train_features.columns])


In [None]:
# Training design matrix and target.
# Identifier/text columns, the target itself and HoursWorkedPerWeek are left
# out of the inputs; the remaining columns are min-max scaled to [0, 1].
excluded_columns = [
    "ClaimNumber",
    "ClaimDescription",
    "UltimateIncurredClaimCost",
    "HoursWorkedPerWeek",
]
df_label1 = df_label.drop(columns=excluded_columns)
y_train = df_label['UltimateIncurredClaimCost']
X_train = pre.minmax_scale(df_label1)

In [None]:
# Fit a decision-tree regressor (fixed seed) on the full training matrix; it is
# used below only to rank features by importance.
dtr = DecisionTreeRegressor( random_state=123456)
dtr.fit(X_train, y_train)

In [None]:
# Per-feature importance scores from the fitted tree, in the column order of df_label1.
importance=dtr.feature_importances_
importance

In [None]:
# Print every importance score next to its positional feature index
# (the index refers to the column order of the training matrix).
for index_and_score in enumerate(importance):
    print("Feature: %0d, Score : %0.5f" % index_and_score)

In [None]:
# Feature indices ordered from least to most important (argsort is ascending).
sorted_idx = dtr.feature_importances_.argsort()

# Set the figure size BEFORE drawing; rcParams changed after plt.bar() do not
# resize the figure that was already created.
plt.rcParams["figure.figsize"] = [20,20]
plt.bar(df_label1.columns[sorted_idx], dtr.feature_importances_[sorted_idx])
# The model is a DecisionTreeRegressor, so the label says regression rather
# than classification.
plt.xlabel("CART Regression Feature Importance")
plt.xticks(rotation=90)
# NOTE(review): the y-axis is capped at 0.01, so any bar above that is clipped;
# presumably this is meant to make the low-importance features comparable.
plt.ylim(0,0.01)
plt.tick_params(axis='x', which='major', labelsize=12,length=10)
plt.show()

In [None]:
# Feature names ordered from least to most important (same ascending order as sorted_idx).
list_feature=df_label1.columns[sorted_idx].tolist()
list_feature

## HYPERPARAMETER TUNING AND MODELLING

In [None]:
import sklearn.preprocessing as pre
import sklearn.model_selection as ms
import sklearn.linear_model as lm
import xgboost as xg
from sklearn.ensemble import RandomForestRegressor 
from sklearn.metrics import mean_squared_error

In [None]:
# Gradient-boosted trees with a squared-error objective.
# 'reg:squarederror' is the current name of the objective formerly spelled
# 'reg:linear'; the old spelling is deprecated in XGBoost.
xgb = xg.XGBRegressor(
    max_depth=8,
    min_child_weight=20,
    eta=0.8,
    subsample=1,
    colsample_bytree=1,
    objective='reg:squarederror',
    n_jobs=4,
    random_state=123,
    n_estimators=5,
)

In [None]:
# L1-regularised linear regression.
# The `normalize` argument was dropped: it was removed from scikit-learn's
# linear models (passing it raises TypeError on current releases), and the
# value used here (False) was the default, so the model is unchanged.
lasso = lm.Lasso(
    alpha=0.9,
    fit_intercept=True,
    precompute=False,
    copy_X=True,
    max_iter=10000,
    tol=0.0001,
    warm_start=False,
    positive=False,
    random_state=12345,
    selection='random',
)


In [None]:
# L2-regularised linear regression.
# The `normalize` argument was dropped: it was removed from scikit-learn's
# linear models (passing it raises TypeError on current releases), and the
# value used here (False) was the default, so the model is unchanged.
ridge = lm.Ridge(
    alpha=1.0,
    fit_intercept=True,
    copy_X=True,
    max_iter=None,
    tol=0.001,
    solver='auto',
    random_state=None,
)

In [None]:
# Random forest with 10 trees and a fixed seed for reproducible fits.
rfr=RandomForestRegressor(n_estimators=10,random_state=1234)

In [None]:
# Lasso: fit on the full training matrix and predict the test rows.
# The held-out split is commented out in this notebook, so only the training
# R^2 is computed (and, as before, it is not printed for this model).
y_pred_lasso = lasso.fit(X_train, y_train).predict(X_test)
train_score = lasso.score(X_train, y_train)

In [None]:
# XGBoost: fit on the full training matrix and predict the test rows.
# The held-out split is commented out in this notebook, so only the training
# R^2 is reported.
y_pred_xgb = xgb.fit(X_train, y_train).predict(X_test)
train_score = xgb.score(X_train, y_train)
print("\nThe train score is:" ,train_score)

In [None]:
# Random forest: fit on the full training matrix and predict the test rows.
# The held-out split is commented out in this notebook, so only the training
# R^2 is reported.
y_pred_rfr = rfr.fit(X_train, y_train).predict(X_test)
train_score = rfr.score(X_train, y_train)
print("\nThe train score is:" ,train_score)

In [None]:
# Ridge: fit on the full training matrix and predict the test rows.
# The held-out split is commented out in this notebook, so only the training
# R^2 is reported.
y_pred_ridge = ridge.fit(X_train, y_train).predict(X_test)
train_score = ridge.score(X_train, y_train)
print("\nThe train score is:" ,train_score)

In [None]:
#grid search for linear regression hyper parameter tuning

In [None]:
# Base estimator (default settings) handed to the grid search below.
lr1=lm.LinearRegression()

In [None]:
 # Search space for the LinearRegression grid search.
# "normalize" was dropped from the grid: the argument no longer exists on
# LinearRegression in current scikit-learn, so every candidate using it fails.
# NOTE(review): copy_X and n_jobs do not change the fitted coefficients, so
# only fit_intercept can actually move the cross-validated score.
params = {
    "fit_intercept": [True, False],
    "copy_X": [True, False],
    "n_jobs": [2, 3, 4, 5, 6, 7, 8],
}
params

In [None]:
# Exhaustive search over `params`, using GridSearchCV's default cross-validation
# and the estimator's own score method.
grid=ms.GridSearchCV(lr1,params)

In [None]:
# Run the search: fits one model per parameter combination and CV fold.
grid.fit(X_train,y_train)

In [None]:
# Tabulate the per-candidate cross-validation results.
pd.DataFrame(grid.cv_results_)

In [None]:
# Parameter combination with the best mean cross-validated score.
grid.best_params_

In [None]:
# Mean cross-validated score of the best candidate.
grid.best_score_

In [None]:
# Final linear regression with hard-coded settings.
# `normalize=False` was dropped: the argument was removed from scikit-learn's
# LinearRegression (passing it raises TypeError on current releases) and False
# was the default, so the fitted model is unchanged.
lr = lm.LinearRegression(copy_X=True, fit_intercept=True, n_jobs=2)

In [None]:
# Linear regression: fit on the full training matrix and predict the test rows.
# The held-out split is commented out in this notebook, so only the training
# R^2 is reported.
y_pred_lr = lr.fit(X_train, y_train).predict(X_test)
train_score = lr.score(X_train, y_train)
print("\nThe train score is:" ,train_score)

In [None]:
# Build the submission file from the sample template, using the XGBoost predictions.
df_csv=pd.read_csv("../input/mlexam/sample_submission_csv.csv")
# NOTE(review): assumes the template's rows are in the same order as data_test — confirm.
df_csv["UltimateIncurredClaimCost"]=y_pred_xgb
# index=False keeps the DataFrame index out of the written CSV.
df_csv.to_csv("Sample Submission_xgb.csv", index = False)