In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

###### 

# Data Loading

In [None]:
# Read the training CSV and the test CSV from the Kaggle input directory.
df = pd.read_csv("../input/insurance-prediction/train_SJC.csv")
df_test = pd.read_csv("../input/insurance-prediction/Test_SJC.csv")

In [None]:
# Preview the first rows of the raw training frame.
df.head()

# Data Preprocessing

In [None]:
# Map the placeholder "Unnamed: N" headers onto descriptive column names.
# Columns that are not listed keep their existing names.
# The spelling "InitialIncurredCalimsCost" is relied on by later cells, so it is
# kept exactly as-is here.
column_names = {
    "Unnamed: 0": "ClaimNumber",
    "Unnamed: 1": "DateTimeOfAccident",
    "Unnamed: 3": "Age",
    "Unnamed: 4": "Gender",
    "Unnamed: 5": "MaritalStatus",
    "Unnamed: 6": "DependentChildren",
    "Unnamed: 8": "WeeklyWages",
    "Unnamed: 9": "PartTimeFullTime",
    "Unnamed: 10": "HoursWorkedPerWeek",
    "Unnamed: 12": "ClaimDescription",
    "Unnamed: 13": "InitialIncurredCalimsCost",
    "Unnamed: 14": "UltimateIncurredClaimCost",
}
# rename() returns a new frame by default, so `df` itself is left untouched.
df1 = df.rename(columns=column_names)

In [None]:
# Discard the first row of df1 (selected by its index label) and preview the result.
first_row_label = df1.index[0]
df2 = df1.drop(index=first_row_label)
df2.head()

In [None]:
# Column dtypes and non-null counts before any cleaning.
df2.info()

In [None]:
# Summary statistics of df2.
df2.describe()

In [None]:
# Number of missing values per column.
df2.isnull().sum()

In [None]:
# Bar chart of the number of missing values in each column of df2.
missing_counts = df2.isnull().sum()

fig, ax = plt.subplots(figsize=(14, 6))
# Pass the data by keyword: seaborn >= 0.12 no longer accepts x / y positionally.
sns.barplot(x=missing_counts.index, y=missing_counts.values, ax=ax)
ax.set_title(" Missing Values")

# Write each bar's count just above the top of the bar.
for bar in ax.patches:
    bar_height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width() / 2.0, bar_height + 2,
            int(bar_height), fontsize=14, ha='center', va='bottom')

ax.set_xlabel("Columns")
ax.set_ylabel("Missing values count")
plt.xticks(rotation=90)
plt.show()

In [None]:
# Frequency of each MaritalStatus value (missing values are not counted).
df2['MaritalStatus'].value_counts()

In [None]:
# Most frequent MaritalStatus value(s).
df2['MaritalStatus'].mode()

In [None]:
# Fill missing MaritalStatus entries with the literal "S".
# NOTE(review): presumably "S" is the mode shown by the previous cell — confirm,
# since the value is hard-coded rather than computed.
df2['MaritalStatus']=df2['MaritalStatus'].fillna("S")

In [None]:
# Frequency of each distinct WeeklyWages value.
df2['WeeklyWages'].value_counts()

In [None]:
# Frequency of each distinct HoursWorkedPerWeek value.
df2['HoursWorkedPerWeek'].value_counts()

In [None]:
# Fill the remaining missing values with the per-column median computed by
# DataFrame.median().
# NOTE(review): which columns median() covers depends on the pandas version and on
# the column dtypes at this point (the explicit numeric casts only happen in later
# cells). On pandas >= 2.0 median() no longer skips non-numeric columns silently —
# confirm this cell runs and fills the intended columns on the target version.
df2 = df2.fillna(df2.median())

In [None]:
# Re-check the missing-value counts after the fills above.
df2.isnull().sum()

In [None]:
# Column dtypes and non-null counts after filling missing values.
df2.info()

In [None]:
# Frequency of each Gender value.
df2['Gender'].value_counts()

In [None]:
# Cast the target column to float64.
df2['UltimateIncurredClaimCost']=df2['UltimateIncurredClaimCost'].astype(np.float64)
# Natural log of the target, kept as a separate Series; df2 itself is not changed.
# NOTE(review): this Series is not referenced again in the visible part of the
# notebook — confirm whether it is still needed.
log_UltimateIncurredClaimCost=np.log(df2['UltimateIncurredClaimCost'])

In [None]:
# Check dtypes after casting the target column.
df2.info()

In [None]:
# Cast the numeric feature columns to explicit dtypes in one pass.
# (The original cast InitialIncurredCalimsCost twice; once is enough.)
train_dtypes = {
    'HoursWorkedPerWeek': np.float64,
    'InitialIncurredCalimsCost': np.float64,
    'Age': np.int32,
    'WeeklyWages': np.float64,
    'DependentChildren': np.float64,
}
df2 = df2.astype(train_dtypes)

In [None]:
# Keep only the rows whose ultimate claim cost lies strictly below the 85th
# percentile of that column, trimming the high-cost tail.
COST_QUANTILE_CAP = 0.85
cost_cap = np.quantile(df2['UltimateIncurredClaimCost'], COST_QUANTILE_CAP)
# .copy() detaches the filtered frame so the column assignments made in later
# cells do not trigger pandas' SettingWithCopyWarning.
df2 = df2[df2['UltimateIncurredClaimCost'] < cost_cap].copy()

In [None]:
# Row count and dtypes after removing the high-cost rows.
df2.info()

In [None]:
# Frequency of each DependentChildren value.
df2['DependentChildren'].value_counts()

In [None]:
# Box plot of HoursWorkedPerWeek to inspect its spread and outliers.
sns.boxplot(data=df2,x='HoursWorkedPerWeek')

# EDA

In [None]:
# Pairwise correlation of the numeric columns. df2 still holds string columns at
# this point (they are label-encoded in a later cell), and pandas >= 2.0 raises on
# those instead of skipping them, so the numeric columns are selected explicitly.
df2.select_dtypes(include=['number', 'bool']).corr()

In [None]:
# Bar plot of UltimateIncurredClaimCost for each DependentChildren value.
fig, ax = plt.subplots(figsize=(20, 15))
sns.barplot(data=df2, x='DependentChildren', y='UltimateIncurredClaimCost', ax=ax)
plt.show()

In [None]:
# Annotated heatmap of the correlations between the numeric columns of df2.
fig, ax = plt.subplots(figsize=(10, 10))
# String columns are still present here; select numeric ones explicitly so corr()
# also works on pandas >= 2.0.
numeric_corr = df2.select_dtypes(include=['number', 'bool']).corr()
# Draw on the axes created above instead of relying on the implicit current axes.
sns.heatmap(numeric_corr, cmap='YlGnBu', square=True, annot=True, ax=ax)
# The original referenced `plt.show` without calling it.
plt.show()

In [None]:
# (max, min, median) of DependentChildren, displayed as a tuple.
df2['DependentChildren'].max(), df2['DependentChildren'].min(), df2['DependentChildren'].median()

In [None]:
# Row counts per DependentChildren value, split by Gender.
sns.countplot(data=df2,hue='Gender',x='DependentChildren')

In [None]:
# Violin plot of WeeklyWages, one panel per Gender value.
sns.catplot(data=df2,x='WeeklyWages',col='Gender',kind='violin')

In [None]:
# Scatter plot of the target against weekly hours worked.
fig, ax = plt.subplots(figsize=(12, 10))
sns.scatterplot(data=df2, x='HoursWorkedPerWeek', y='UltimateIncurredClaimCost', ax=ax)
plt.show()

In [None]:
# Replace WeeklyWages with log1p(WeeklyWages), in place on df2.
# NOTE(review): re-running this cell applies the log again to already-transformed
# values. Also confirm that df_test receives the same transform before prediction.
df2['WeeklyWages']=np.log1p(df2['WeeklyWages'])

In [None]:
# Box plot of Age, one panel per Gender value.
sns.catplot(data=df2,x='Age',col='Gender',kind='box')

In [None]:

# Number of rows per Gender value.
sns.countplot(data=df2,x='Gender')

<h3>Male records dominate the dataset</h3>

In [None]:
# Bar plot of UltimateIncurredClaimCost for each DependentsOther value.
fig, ax = plt.subplots(figsize=(12, 10))
sns.barplot(data=df2, x='DependentsOther', y='UltimateIncurredClaimCost', ax=ax)
plt.show()

In [None]:
# Bar plot of UltimateIncurredClaimCost for each Age value; the tick labels are
# rotated so the many age values stay readable.
fig, ax = plt.subplots(figsize=(15, 10))
sns.barplot(data=df2, x='Age', y='UltimateIncurredClaimCost', ax=ax)
plt.xticks(rotation=90)
plt.show()

In [None]:
from sklearn.preprocessing import LabelEncoder
# Encoder instance used to turn the categorical columns into integer codes.
le = LabelEncoder()

In [None]:
# Integer-encode the categorical columns of df2.
# Each column gets its own LabelEncoder, kept in `label_encoders`, so the fitted
# value -> code mapping stays available afterwards (for inspection via `.classes_`,
# for `inverse_transform`, or for encoding other data with the same codes).
# The original refitted a single encoder four times, which discarded every
# mapping except the last one.
#
# Codes expected by the original author — verify against
# label_encoders[column].classes_:
#   Gender:           0 = Female, 1 = Male
#   MaritalStatus:    0 = Married, 1 = Single, 2 = unknown
#   PartTimeFullTime: 0 = FullTime, 1 = PartTime
label_encoders = {}
for column in ['Gender', 'MaritalStatus', 'PartTimeFullTime', 'ClaimDescription']:
    encoder = LabelEncoder()
    df2[column] = encoder.fit_transform(df2[column])
    label_encoders[column] = encoder


In [None]:
# Preview df2 after the categorical columns have been encoded.
df2.head()

In [None]:
# Predictor columns, in the order they are fed to the models.
feature_cols = [
    'Age',
    'Gender',
    'MaritalStatus',
    'DependentChildren',
    'DependentsOther',
    'WeeklyWages',
    'PartTimeFullTime',
    'HoursWorkedPerWeek',
    'DaysWorkedPerWeek',
    'ClaimDescription',
    'InitialIncurredCalimsCost',
]
# Feature matrix and regression target.
X = df2[feature_cols]
y = df2['UltimateIncurredClaimCost']
X.head()

In [None]:
# Print the shape of the feature matrix, then of the target.
for part in (X, y):
    print(part.shape)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed random_state makes the split
# reproducible.
X_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=30
)
# Shapes, in the order: train features, hold-out features, train target, hold-out target.
for part in (X_train, x_test, y_train, y_test):
    print(part.shape)

# Scaling

In [None]:

from sklearn.preprocessing import StandardScaler

# Standardise the features. The scaler is fitted on the training split only and
# then re-used, unchanged, on the hold-out split. The original called
# fit_transform on x_test as well, which refitted the scaler on hold-out data and
# put the two splits on different scales.
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
x_test = sc.transform(x_test)

# Creating model

In [None]:
#reg.fit(X_train, y_train)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
# Ordinary least-squares model with default settings.
lr=LinearRegression()

In [None]:
# Fit the linear model, then report its score on the training split, its score on
# the hold-out split, and the hold-out RMSE.
lr.fit(X_train, y_train)
y_pred = lr.predict(x_test)
for features, target in ((X_train, y_train), (x_test, y_test)):
    print(lr.score(features, target))
rmse_lr = np.sqrt(mean_squared_error(y_test, y_pred))
print('RMSE :', rmse_lr)

In [None]:
# Ridge regression with default settings: report train / hold-out score and the
# hold-out RMSE. Note that y_pred is overwritten with the ridge predictions.
from sklearn.linear_model import Ridge, Lasso

ridge = Ridge()
ridge.fit(X_train, y_train)
train_score = ridge.score(X_train, y_train)
test_score = ridge.score(x_test, y_test)
print("Train:", train_score)
print("Test:", test_score)
y_pred = ridge.predict(x_test)
rmse_ridge = np.sqrt(mean_squared_error(y_test, y_pred))
print(rmse_ridge)

In [None]:
# Display the ridge predictions for the hold-out split.
y_pred

In [None]:
pip install xgboost

In [None]:
from xgboost import XGBRegressor

In [None]:
# Gradient-boosted regressor: a large number of shallow trees with a small
# learning rate and row / column subsampling.
xgb_params = dict(
    learning_rate=0.01,
    n_estimators=10000,
    max_depth=3,
    min_child_weight=0,
    gamma=0,
    subsample=0.7,
    colsample_bytree=0.7,
    objective='reg:squarederror',
    nthread=1,
    scale_pos_weight=1,
    seed=27,
    reg_alpha=0.00006,
)
Val = XGBRegressor(**xgb_params)

# Train on the scaled training split and report the RMSE on the hold-out split.
xgb_model = Val.fit(X_train, y_train)
preds_n = xgb_model.predict(x_test)
rmse_xgb = np.sqrt(mean_squared_error(y_test, preds_n))
print((f"RMSE: {rmse_xgb}"))

In [None]:
# Display the test frame loaded at the top of the notebook.
df_test

In [None]:
# Frequency of each MaritalStatus value in the test frame.
df_test['MaritalStatus'].value_counts()

In [None]:
# Number of missing values per column in the test frame.
df_test.isnull().sum()

In [None]:
# Most frequent MaritalStatus value(s) in the test frame.
df_test['MaritalStatus'].mode()

In [None]:
# Fill missing MaritalStatus entries in the test frame with the literal "S", the
# same fill value used for the training frame.
df_test['MaritalStatus']=df_test['MaritalStatus'].fillna("S")

In [None]:
# Re-check missing values in the test frame after the fill.
df_test.isnull().sum()

In [None]:
# Frequency of each Gender value in the test frame.
df_test['Gender'].value_counts()

In [None]:
# Cast the numeric feature columns of the test frame to the same dtypes used for
# the training frame, in one pass.
# (The original cast InitialIncurredCalimsCost twice; once is enough.)
test_dtypes = {
    'HoursWorkedPerWeek': np.float64,
    'InitialIncurredCalimsCost': np.float64,
    'Age': np.int32,
    'WeeklyWages': np.float64,
    'DependentChildren': np.float64,
}
df_test = df_test.astype(test_dtypes)

In [None]:
# Check the test-frame dtypes after casting.
df_test.info()

In [None]:
from sklearn.preprocessing import LabelEncoder
# Fresh encoder instance for the test frame; this rebinds `le`.
le = LabelEncoder()

In [None]:
# Integer-encode the categorical columns of the test frame, refitting `le` on each
# column in turn (afterwards `le` holds only the ClaimDescription classes).
# NOTE(review): the encoder is fitted on df_test itself, so a category is only
# guaranteed the same integer code as in training when both frames contain the
# same set of values for that column — confirm, especially for ClaimDescription.
#
# Codes expected by the original author — verify against the data:
#   Gender:           0 = Female, 1 = Male
#   MaritalStatus:    0 = Married, 1 = Single, 2 = unknown
#   PartTimeFullTime: 0 = FullTime, 1 = PartTime
for column in ('Gender', 'MaritalStatus', 'PartTimeFullTime', 'ClaimDescription'):
    df_test[column] = le.fit_transform(df_test[column])


In [None]:
# Same predictor columns, in the same order, as used for training.
cols = [
    'Age',
    'Gender',
    'MaritalStatus',
    'DependentChildren',
    'DependentsOther',
    'WeeklyWages',
    'PartTimeFullTime',
    'HoursWorkedPerWeek',
    'DaysWorkedPerWeek',
    'ClaimDescription',
    'InitialIncurredCalimsCost',
]
# Feature matrix for the test frame.
X_new = df_test[cols]
X_new.head()

In [None]:
# Shape of the test feature matrix.
print(X_new.shape)

In [None]:
from sklearn.preprocessing import StandardScaler

# Put the test features through the same transformations the training features
# received, so the fitted models see inputs on the scale they were trained on:
#   1. WeeklyWages was replaced by log1p(WeeklyWages) on the training frame, so
#      the same transform is applied here;
#   2. the features are standardised with the existing scaler `sc` from the
#      scaling step, instead of fitting a fresh scaler on other data.
# The original cell re-fitted a new scaler on the already-scaled X_train / x_test
# (neither is used after this point in the visible notebook) and left X_new
# untransformed.
# Rebuilding from df_test[cols] keeps this cell safe to re-run.
X_new_prepared = df_test[cols].copy()
X_new_prepared['WeeklyWages'] = np.log1p(X_new_prepared['WeeklyWages'])
X_new = sc.transform(X_new_prepared)

In [None]:
# Predictions of the linear regression model for the test features.
y_pred_n=lr.predict(X_new)

In [None]:
# Display the linear-model predictions.
y_pred_n

In [None]:
# Predictions of the XGBoost model for the test features; these are the values
# written to the submission file below.
preds = xgb_model.predict(X_new)

In [None]:
# Display the XGBoost predictions.
preds

In [None]:
# Write the XGBoost predictions (`preds`) into the sample-submission template and
# save it. The output file name is kept as-is even though it mentions "linears".
sub = pd.read_csv('../input/insurance-prediction/sample_submission.csv')
sub['UltimateIncurredClaimCost'] = preds
sub.to_csv('submission_linears.csv', index=False)

# Sanity check: mean predicted cost.
print(np.mean(sub['UltimateIncurredClaimCost']))
# Preview the submission. A bare expression is only rendered when it is the last
# statement of the cell; in the original it sat mid-cell and was silently discarded.
sub.head(5)