## Import libraries

In [None]:
import pandas as pd # data analytical library
import numpy as np #fast linear algebra
import matplotlib.pyplot as plt #visualization
import seaborn as sns #statistical visualization

## Reading the dataset

In [None]:
df=pd.read_csv("train_SJC.csv")

##### To display first 5 rows of the dataset

In [None]:
df.head()

##### Most of the column headers are unnamed, so rename them using a dictionary

In [None]:
# Replace the auto-generated "Unnamed: N" headers with descriptive column names.
# rename() returns a new frame by default, so the result is re-bound to df.
df=df.rename(columns={
    "Unnamed: 0":"ClaimNumber",
    "Unnamed: 1":"DateTimeOfAccident",
    "Unnamed: 3":"Age",
    "Unnamed: 4":"Gender",
    "Unnamed: 5":"MaritalStatus",
    "Unnamed: 6":"DependentChildren",
    "Unnamed: 8":"WeeklyWages",
    "Unnamed: 9":"PartTimeFullTime",
    "Unnamed: 10":"HoursWorkedPerWeek",
    "Unnamed: 12":"ClaimDescription",
    "Unnamed: 13":"InitialIncurredCalimsCost",
    "Unnamed: 14":'UltimateIncurredClaimCost',
})

##### The header row is repeated as the first data row, so remove it with the drop function and display the first 5 rows

In [None]:
df=df.drop(df.index[0])
df.head()

## Pre-processing

##### Analysing basic descriptions of the dataset

#### Descriptive statistics using describe function

In [None]:
df.describe() # displays only for numeric columns

#### To find the shape/size of the data

In [None]:
df.shape # the dataframe has 36176 rows and 17 columns

#### To find a concise summary of the dataframe using info function

In [None]:
df.info()

#### To check the datatypes of the features

In [None]:
df.dtypes # few numeric datatypes are classified as string

##### Converting some features that are classified as object data type to numeric for better analysis

In [None]:
# The repeated header row made every column load as object dtype; cast the
# numeric features back. The wage/hours columns are forced to float64, the
# others take whatever numeric dtype pd.to_numeric infers.
df = (
    df
    .astype({'WeeklyWages': 'float64', 'HoursWorkedPerWeek': 'float64'})
    .assign(
        Age=lambda frame: pd.to_numeric(frame['Age']),
        DependentChildren=lambda frame: pd.to_numeric(frame['DependentChildren']),
        InitialIncurredCalimsCost=lambda frame: pd.to_numeric(frame['InitialIncurredCalimsCost']),
        UltimateIncurredClaimCost=lambda frame: pd.to_numeric(frame['UltimateIncurredClaimCost']),
    )
)

##### To check if the datatypes are changed

In [None]:
df.dtypes

## Missing value analysis and treatment

### Finding which columns have missing values

In [None]:
df.isnull().sum()

#### checking the type of data of missing value columns

In [None]:
df['MaritalStatus'].value_counts() # categorical

In [None]:
df['WeeklyWages'].value_counts() # continuous

In [None]:
df['HoursWorkedPerWeek'].value_counts() # continuous

#### Treatment

##### Classifying the missing marital status as unknown

In [None]:
df['MaritalStatus'] = df['MaritalStatus'].fillna('U')

##### Performing mean imputation on HoursWorkedPerWeek(continuous) data

In [None]:
sns.distplot(df['HoursWorkedPerWeek']) # Mean because data is not skewed

In [None]:
df['HoursWorkedPerWeek']=df['HoursWorkedPerWeek'].fillna(df['HoursWorkedPerWeek'].mean())

##### Classifying the missing WeeklyWages with median

In [None]:
sns.distplot(df['WeeklyWages']) # Median because data is skewed slightly

In [None]:
df['WeeklyWages']=df['WeeklyWages'].fillna(df['WeeklyWages'].median())

##### Verify if there are no missing values

In [None]:
df.isnull().sum()

In [None]:
df.dtypes

## Data Transformation

#### Transforming DateTimeOfAccident to datetime datatype

In [None]:
df['DateTimeOfAccident']

In [None]:
df['DateTimeOfAccident']=pd.to_datetime(df['DateTimeOfAccident'])

In [None]:
df.dtypes 

##### Retaining only the year of accident from date-time

In [None]:
df['YearOfAccident']=pd.to_datetime(df['DateTimeOfAccident']).dt.year

In [None]:
df['YearOfAccident']

##### to check if the yearOfAccident column is created

In [None]:
df.head() 

#### Binning

##### Binning the age column

In [None]:
df['Age_Bin']=pd.cut(df['Age'],bins=3)
df['Age_Bin']

In [None]:
df['Age_Bin']=pd.cut(df['Age'],bins=[12,35,57,80] , labels=['Youth','Adult','Senior_Citizen'])
df['Age_Bin']

##### Binning InitialIncurredCalimsCost into ratings based on the cost

In [None]:
# Bucket the initial claim estimate into five ordered rating labels ('1'..'5').
# NOTE(review): pd.cut leaves values outside the outer edges (<= 0 or > 18000)
# as NaN; confirm that is intended, since this runs before outlier removal.
df['Rating'] = pd.cut(
    df['InitialIncurredCalimsCost'],
    bins=[0,3500,7000,9500,12000,18000],
    labels=['1','2','3','4','5'],
)
df['Rating']

## Exploratory Data Analysis

#### Statistical Analysis

In [None]:
cat_data=df.select_dtypes(include=object)
cat_data.columns

In [None]:
num_data=df.select_dtypes(exclude=object)
num_data.columns

In [None]:
num_data.describe()

In [None]:
cat_data.describe()

In [None]:
df.apply(pd.Series.nunique)

### Univariate Analysis

##### Analysing the counts of the different discrete features using count plots

In [None]:
# Discrete features to summarise, one count plot each.
univariate=[
    'Age_Bin',
    'Gender',
    'MaritalStatus',
    'DependentChildren',
    'DependentsOther',
    'PartTimeFullTime',
    'DaysWorkedPerWeek',
]
for col in univariate:
    # A fresh 6x4 figure per feature so the plots do not overlap.
    fig, ax = plt.subplots(figsize=(6,4))
    sns.countplot(x=col, data=df, ax=ax)

##### From the above plots we can infer the following
1. Youth have applied the most for insurance
2. Number of Males who applied for insurance are way higher than females
3. The marital status for most of the insurance holders are single
4. Most of them do not have children or dependencies
5. full time workers have applied more for the policy
6. Among those who applied, most of them work 5 days a week


##### Viewing the distribution of target variable using dist plot

In [None]:
# Three views of the target: histogram, density estimate and box plot.
df['UltimateIncurredClaimCost'].hist(bins=25)
plt.show()
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; it is
# kept here so the figure stays unchanged.
sns.distplot(df['UltimateIncurredClaimCost'],color='g')
plt.show()
# Pass the series as x= explicitly: a positional argument is read as `x` by
# older seaborn releases but as `data` by newer ones, so the orientation of
# the box would otherwise depend on the installed version.
sns.boxplot(x=df['UltimateIncurredClaimCost'],color='r')

##### From the above 3 plots (histogram, distplot and boxplot) we can infer that the target variable is highly positively skewed

### Bivariate Analysis

In [None]:
log_UltimateIncurredClaimCost=np.log(df['UltimateIncurredClaimCost'])

In [None]:
sns.boxplot(x = 'Gender', y = log_UltimateIncurredClaimCost,  data =df)

##### The average incurred claim cost is higher for females than for males

Bar plots

In [None]:
sns.barplot(x='PartTimeFullTime',y='UltimateIncurredClaimCost',data=df)

##### Part time workers have got higher insurance

In [None]:
sns.barplot(x='DaysWorkedPerWeek',y='UltimateIncurredClaimCost',data=df)

##### people working 6 days a week have got higher insurance

In [None]:
sns.barplot(x='Age_Bin',y='UltimateIncurredClaimCost',data=df)

##### Senior citizens have got highest insurance, as the age increases the insurance also increases

In [None]:
plt.figure(figsize=(15,6))
sns.barplot(x='YearOfAccident',y='UltimateIncurredClaimCost',data=df)

##### As the year increases the amount given as insurance also increases

### Multivariate analysis

In [None]:
sns.pairplot(df)

In [None]:
# Restrict to numeric columns first: df still holds string, datetime and
# categorical columns, and DataFrame.corr() raises on those in pandas >= 2.0.
corrmat = df.select_dtypes(include='number').corr()
top_corr_features = corrmat.index
plt.figure(figsize=(20,20))
#plot heat map
# corrmat already is the correlation of exactly these columns, so it is
# plotted directly instead of being recomputed.
g=sns.heatmap(corrmat,annot=True,cmap="RdYlGn")

In [None]:
plt.figure(figsize = (10,10))
# Correlation of every numeric feature with the target only. Non-numeric
# columns are excluded up front because DataFrame.corr() raises on them in
# pandas >= 2.0.
sns.heatmap(df.select_dtypes(include='number').corr()[['UltimateIncurredClaimCost']] ,annot = True)

##### From the above 3 plots we can infer that the most important feature for predicting UltimateIncurredClaimCost is InitialIncurredCalimsCost

### Outlier Analysis and treatment

In [None]:
df.plot.box(figsize=(25,6))

In [None]:
# Four passes of 3-sigma trimming. In each pass the initial-cost column is
# filtered first and the ultimate-cost column second; mean and standard
# deviation are recomputed on the rows that survived the previous filter.
for i in range(4):
    for cost_column in ('InitialIncurredCalimsCost', 'UltimateIncurredClaimCost'):
        limit = 3*df[cost_column].std()
        centre = df[cost_column].mean()
        lower_limit = centre-limit
        upper_limit = centre+limit

        # Strict inequalities: rows exactly on a limit are dropped as well.
        inside = (df[cost_column]>lower_limit)&(df[cost_column]<upper_limit)
        df = df[inside]

In [None]:
df.plot.box(figsize=(25,6))

## Model Building

##### Separating the response and predictor variables

In [None]:
response=df['UltimateIncurredClaimCost']
response 

In [None]:
features=['ClaimNumber', 'DateReported', 'Age', 'Gender',
          'MaritalStatus', 'DependentChildren', 'DependentsOther', 'WeeklyWages',
          'PartTimeFullTime', 'HoursWorkedPerWeek', 'DaysWorkedPerWeek',
          'ClaimDescription', 'InitialIncurredCalimsCost']
pred=df[features]
pred

## Linear Regression

In [None]:
def linear_regression(data):
    """Score linear regression models on leave-three-columns-out feature subsets.

    Every column of ``data`` except ``UltimateIncurredClaimCost`` is used as a
    candidate feature. Columns whose dtype is not float/int are label-encoded,
    then each column is min-max scaled. The scaling is fitted on the full
    frame *before* the train/test split, so test rows influence it.

    Consecutive groups of three columns are dropped in turn and a model is
    fitted on the remaining columns with a fixed 70/30 split. A trailing group
    of fewer than three columns is never dropped.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the ``UltimateIncurredClaimCost`` target column.

    Returns
    -------
    pandas.io.formats.style.Styler
        Captioned table with one row per feature subset: the kept and dropped
        columns, train and test R^2 and the test RMSE.

    Notes
    -----
    Sets the global pandas option ``display.max_colwidth`` to ``None``.
    """
    from sklearn.linear_model import LinearRegression
    import sklearn.preprocessing as pre
    from sklearn.preprocessing import LabelEncoder
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import mean_squared_error

    label_en=LabelEncoder()
    assignment_dict=[]

    X_scale=data.drop(['UltimateIncurredClaimCost'],axis='columns')
    # Anything that is not float/int (strings, datetimes, categoricals) is
    # turned into integer codes; the single encoder is re-fitted per column.
    cat_df=X_scale.select_dtypes(exclude=[float,int]).columns
    for i in cat_df :
        X_scale[str(i)]=label_en.fit_transform(X_scale[str(i)])
    X_scale=X_scale.apply(pre.minmax_scale)
    Y=data['UltimateIncurredClaimCost']

    # Keep the full encoded/scaled frame; X_scale is re-bound to each subset below.
    X=X_scale.copy()

    n=len(X_scale.columns)

    upper_index=3
    lower_index=0

    while upper_index<=n:

        dropped_cols=X.columns[lower_index:upper_index]
        X_scale=X.drop(dropped_cols,axis='columns')
        x_train,x_test,y_train,y_test=train_test_split(X_scale,Y,test_size=0.3,random_state=1234456)

        glm=LinearRegression()
        glm.fit(x_train,y_train)
        y_pred=glm.predict(x_test)

        assignment_dict.append({

            'features':X_scale.columns,
            'Number of features':len(X_scale.columns),
            'dropped_features':dropped_cols,
            'train_score':glm.score(x_train,y_train),
            'test_score':glm.score(x_test,y_test),
            # Root of the MSE instead of squared=False, which newer
            # scikit-learn releases no longer accept.
            'rmse' : mean_squared_error(y_test, y_pred) ** 0.5
        })

        upper_index+=3
        lower_index+=3

    dataframe=pd.DataFrame(assignment_dict)
    assignment=dataframe.style.set_caption("Train and test scores for different features")
    # Show the full feature lists instead of truncating them in the table.
    pd.set_option('display.max_colwidth', None)

    return assignment

linear_regression(df)

Choosing ['ClaimNumber', 'DateTimeOfAccident', 'DateReported', 'Age', 'Gender', 'MaritalStatus', 'DependentChildren', 'DependentsOther', 'WeeklyWages', 'ClaimDescription', 'InitialIncurredCalimsCost', 'YearOfAccident', 'Age_Bin'] gives better score of {0.686777,0.692278} for train and test scores with rmse value 1897.786500

## Random forest

In [None]:
def random_forest_regression(data):
    """Score random forest regressors on leave-three-columns-out feature subsets.

    Every column of ``data`` except ``UltimateIncurredClaimCost`` is used as a
    candidate feature. Columns whose dtype is not float/int are label-encoded,
    then each column is min-max scaled. The scaling is fitted on the full
    frame *before* the train/test split, so test rows influence it.

    Consecutive groups of three columns are dropped in turn and a forest is
    fitted on the remaining columns with a fixed 70/30 split. A trailing group
    of fewer than three columns is never dropped.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the ``UltimateIncurredClaimCost`` target column.

    Returns
    -------
    pandas.io.formats.style.Styler
        Captioned table with one row per feature subset: the kept and dropped
        columns, train and test R^2 and the test RMSE.

    Notes
    -----
    Sets the global pandas option ``display.max_colwidth`` to ``None``.
    """
    from sklearn.metrics import mean_squared_error
    from sklearn.ensemble import RandomForestRegressor
    import sklearn.preprocessing as pre
    from sklearn.preprocessing import LabelEncoder
    from sklearn.model_selection import train_test_split

    # One seed for both the split and the forest so repeated runs give the
    # same scores.
    seed=1234456

    label_en=LabelEncoder()
    assignment_dict=[]

    X_scale=data.drop(['UltimateIncurredClaimCost'],axis='columns')
    # Anything that is not float/int (strings, datetimes, categoricals) is
    # turned into integer codes; the single encoder is re-fitted per column.
    cat_df=X_scale.select_dtypes(exclude=[float,int]).columns
    for i in cat_df :
        X_scale[str(i)]=label_en.fit_transform(X_scale[str(i)])
    X_scale=X_scale.apply(pre.minmax_scale)
    Y=data['UltimateIncurredClaimCost']

    # Keep the full encoded/scaled frame; X_scale is re-bound to each subset below.
    X=X_scale.copy()

    n=len(X_scale.columns)

    upper_index=3
    lower_index=0

    while upper_index<=n:

        dropped_cols=X.columns[lower_index:upper_index]
        X_scale=X.drop(dropped_cols,axis='columns')
        x_train,x_test,y_train,y_test=train_test_split(X_scale,Y,test_size=0.3,random_state=seed)

        rfr= RandomForestRegressor(random_state=seed)
        rfr.fit(x_train,y_train)
        y_pred=rfr.predict(x_test)

        assignment_dict.append({

            'features':X_scale.columns,
            'Number of features':len(X_scale.columns),
            'dropped_features':dropped_cols,
            'train_score':rfr.score(x_train,y_train),
            'test_score':rfr.score(x_test,y_test),
            # Root of the MSE instead of squared=False, which newer
            # scikit-learn releases no longer accept.
            'rmse' : mean_squared_error(y_test, y_pred) ** 0.5
        })

        upper_index+=3
        lower_index+=3

    dataframe=pd.DataFrame(assignment_dict)
    assignment=dataframe.style.set_caption("Train and test scores for different features")
    # Show the full feature lists instead of truncating them in the table.
    pd.set_option('display.max_colwidth', None)

    return assignment
random_forest_regression(df)

Choosing ['ClaimNumber', 'DateTimeOfAccident', 'DateReported', 'Age', 'Gender', 'MaritalStatus', 'DependentChildren', 'DependentsOther', 'WeeklyWages', 'ClaimDescription', 'InitialIncurredCalimsCost', 'YearOfAccident', 'Age_Bin'] gives better score of {0.963530,0.749708} for train and test scores with rmse value 1662.737423

# SVR

In [None]:
def support_vector_regression(data):
    """Score RBF support vector regressors on leave-three-columns-out feature subsets.

    Every column of ``data`` except ``UltimateIncurredClaimCost`` is used as a
    candidate feature. Columns whose dtype is not float/int are label-encoded,
    then each column is min-max scaled. The scaling is fitted on the full
    frame *before* the train/test split, so test rows influence it.

    Consecutive groups of three columns are dropped in turn and an SVR is
    fitted on the remaining columns with a fixed 70/30 split. A trailing group
    of fewer than three columns is never dropped.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the ``UltimateIncurredClaimCost`` target column.

    Returns
    -------
    pandas.io.formats.style.Styler
        Captioned table with one row per feature subset: the kept and dropped
        columns, train and test R^2 and the test RMSE.

    Notes
    -----
    Sets the global pandas option ``display.max_colwidth`` to ``None``.
    """
    from sklearn.svm import SVR
    import sklearn.preprocessing as pre
    from sklearn.preprocessing import LabelEncoder
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import mean_squared_error

    label_en=LabelEncoder()
    assignment_dict=[]

    X_scale=data.drop(['UltimateIncurredClaimCost'],axis='columns')
    # Anything that is not float/int (strings, datetimes, categoricals) is
    # turned into integer codes; the single encoder is re-fitted per column.
    cat_df=X_scale.select_dtypes(exclude=[float,int]).columns
    for i in cat_df :
        X_scale[str(i)]=label_en.fit_transform(X_scale[str(i)])
    X_scale=X_scale.apply(pre.minmax_scale)
    Y=data['UltimateIncurredClaimCost']

    # Keep the full encoded/scaled frame; X_scale is re-bound to each subset below.
    X=X_scale.copy()

    n=len(X_scale.columns)

    upper_index=3
    lower_index=0

    while upper_index<=n:

        dropped_cols=X.columns[lower_index:upper_index]
        X_scale=X.drop(dropped_cols,axis='columns')
        x_train,x_test,y_train,y_test=train_test_split(X_scale,Y,test_size=0.3,random_state=1234456)

        regressor = SVR(kernel = 'rbf')
        regressor.fit(x_train, y_train)
        y_pred = regressor.predict(x_test)

        assignment_dict.append({

            'features':X_scale.columns,
            'Number of features':len(X_scale.columns),
            'dropped_features':dropped_cols,
            'train_score':regressor.score(x_train,y_train),
            'test_score':regressor.score(x_test,y_test),
            # Reported so SVR can be compared with the other two models on
            # the same metric.
            'rmse' : mean_squared_error(y_test, y_pred) ** 0.5
        })

        upper_index+=3
        lower_index+=3

    dataframe=pd.DataFrame(assignment_dict)
    assignment=dataframe.style.set_caption("Train and test scores for different features")
    # Show the full feature lists instead of truncating them in the table.
    pd.set_option('display.max_colwidth', None)

    return assignment
support_vector_regression(df)

#### SVR gives a very high RMSE value, hence it is ignored

## To optimize the Random forest regressor

Chose Random Forest as it gives lower RMSE score

#### Encoding the categorical variables to numeric using LabelEncoder

In [None]:
import sklearn.preprocessing as pre
le=pre.LabelEncoder()

In [None]:
# Take a real copy: plain assignment only creates a second name for the same
# frame, so the in-place label encoding below would also change the "copy".
df_copy=df.copy()

In [None]:
for x in df.select_dtypes(include='object').columns:
    df[x]=le.fit_transform(df[x])


In [None]:
df.head()

In [None]:
response=df['UltimateIncurredClaimCost']
response 

### Feature importance

In [None]:
from sklearn.ensemble import ExtraTreesRegressor

import matplotlib.pyplot as plt

# `pred` was sliced out of df before the label encoding above, so it still
# holds the raw string columns and cannot be fitted. Rebuild it from the
# encoded frame.
pred = df[features]

model = ExtraTreesRegressor()

model.fit(pred,response)

In [None]:
print(model.feature_importances_)

# Label the importances with the columns the model was fitted on (`pred`).
# `predictor` is only created in a later cell and holds a different, smaller
# set of columns.
feat_importances = pd.Series(model.feature_importances_, index=pred.columns)

feat_importances.nlargest(8).plot(kind='barh')

plt.show()

In [None]:
imp_features=['ClaimNumber', 'DateReported', 'Age','MaritalStatus','WeeklyWages','HoursWorkedPerWeek', 
             'ClaimDescription', 'InitialIncurredCalimsCost']
predictor=df[imp_features]
predictor

### Min Max scaling

In [None]:
df_scale=pre.minmax_scale(predictor)
df_scale=pd.DataFrame(df_scale,columns=imp_features)
df_scale

##### Split train and test

In [None]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(predictor, response, test_size=0.3, random_state=0)

## Final Model

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
rfr=RandomForestRegressor()

In [None]:
rfr.fit(x_train,y_train)
y_pred=rfr.predict(x_test)
y_pred

In [None]:
from sklearn.metrics import mean_squared_error
# RMSE on the hold-out split, taken as the root of the MSE because newer
# scikit-learn releases no longer accept squared=False.
mean_squared_error(y_test, y_pred) ** 0.5

In [None]:
# Randomized Search CV
#Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 1200, num = 12)]

#Number of features to consider at every split
# 1.0 (all features) stands in for 'auto', which meant the same thing for
# regressors and is rejected by newer scikit-learn releases.
max_features = [1.0, 'sqrt']

#Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(5, 30, num = 6)]

#Minimum number of samples required to split a node
min_samples_split = [2, 5, 10, 15, 100]

#Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 5, 10]

### Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf}

print(random_grid)

rf = RandomForestRegressor()

from sklearn.model_selection import RandomizedSearchCV

# 5 sampled parameter combinations x 5-fold CV, scored by negative MSE.
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid,scoring='neg_mean_squared_error', n_iter = 5, cv = 5, verbose=2, random_state=42, n_jobs = 1)

### Fitting the model and finding best parameters and score

rf_random.fit(x_train,y_train)

# Only the last expression of a cell is displayed, so print the parameters.
print(rf_random.best_params_)

rf_random.best_score_

The results obtained with the parameters found by the above randomized search CV were not better, hence they are ignored.

# Reading and testing the built model on test data

In [None]:
# usecols only selects columns; it does not guarantee their order. Reindex so
# the test frame has the same column order as the training features.
test_data=pd.read_csv("Test_SJC.csv",usecols=imp_features)[imp_features]
test_data.head()

In [None]:
 test_data.isnull().sum()

In [None]:
test_data.dtypes

In [None]:
df_scale.head()

In [None]:
test_data.columns

In [None]:
 test_data['MaritalStatus'] = test_data['MaritalStatus'].fillna('U')

In [None]:
label_encoder=pre.LabelEncoder()
# Integer-encode the string columns of the test set (ClaimNumber was encoded
# twice before; the second pass was a no-op).
# NOTE(review): the encoder is re-fitted on the test data, so the codes are
# not guaranteed to match the ones assigned to the training data.
for column in ['ClaimNumber', 'DateReported', 'MaritalStatus', 'ClaimDescription']:
    test_data[column]=label_encoder.fit_transform(test_data[column])

In [None]:
def test_pre(data):
   
    import sklearn.preprocessing as pre
    from sklearn.preprocessing import minmax_scale
    label_encoder=pre.LabelEncoder()
    data=data.apply(minmax_scale)
    data['MaritalStatus']=label_encoder.fit_transform(data['MaritalStatus'])
    data['ClaimDescription']=label_encoder.fit_transform(data['ClaimDescription'])
    return data
    

In [None]:
test=test_pre(test_data)

In [None]:
y_pred=rfr.predict(test)

In [None]:
result=y_pred