In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

 # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the Kaggle input directory tree and print the full path of every file found.
for folder, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Import the Libraries

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score,mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor

## Get the data from the file

In [None]:
# Read the insurance CSV from the Kaggle input directory (the path is relative to
# the notebook's working directory) and preview the first five rows.
df = pd.read_csv('../input/insurance-premium-prediction/insurance.csv')
df.head(5)

## Check for Missing or null values

In [None]:
# Number of missing entries in each column (isna is the name isnull aliases).
df.isna().sum()

## Describe the Statistics

In [None]:
# Summary statistics produced by pandas' describe() with its default settings.
df.describe()

## Get the total male and female in the dataset

In [None]:
# Switch seaborn to the white-grid theme, then draw one bar per value of 'sex'.
sns.set_style('whitegrid')
sns.countplot(data=df, x='sex')

## Get the total number of people in each region

In [None]:
# Bar chart of how many people belong to each region.
# The style is selected before the figure is created so that it applies to the
# whole figure, including the first time this cell is run.
plt.style.use('fivethirtyeight')
fig, ax = plt.subplots(figsize=(16, 6))
# The column has to be passed as x=...: current seaborn releases take `data` as
# the only positional parameter, so countplot('region', data=df) raises a TypeError.
sns.countplot(x='region', data=df, palette='dark', ax=ax)
ax.set_xlabel(xlabel='Type', fontsize=18)
ax.set_ylabel(ylabel='Total people in Region', fontsize=18)
ax.set_title(label='Region', fontsize=20)
plt.show()

## The relation between Expenses and BMI which is distinguished by Smoker

In [None]:
# Relational plot with expenses on the x axis and BMI on the y axis,
# with points coloured by the 'smoker' column.
sns.relplot(data=df, x='expenses', y='bmi', hue='smoker')

## The relation between Expenses and BMI which is distinguished by Smoker and categorised by Gender

In [None]:
# Expenses (x) against BMI (y), coloured by 'smoker', drawn as one panel
# (column) per value of 'sex'.
sns.relplot(data=df, x='expenses', y='bmi', hue='smoker', col='sex')

## Catplot of Region Vs Expenses

In [None]:
# Categorical plot (catplot's default kind) of expenses for each region.
g = sns.catplot(data=df, x='region', y='expenses')
# Tilt the region names on the x axis by 45 degrees.
g.set_xticklabels(rotation=45)

## Catplot of Region Vs Expenses categorised by Gender

In [None]:
# Box plot of expenses for each region, with separate boxes per value of 'sex'.
g = sns.catplot(data=df, kind='box', x='region', y='expenses', hue='sex')
# Tilt the region names on the x axis by 45 degrees.
g.set_xticklabels(rotation=45)

## Violin plot of Region vs Expenses

In [None]:
# Violin plot of expenses for each region; inner=None is passed through so that
# nothing is drawn inside the violins.
sns.catplot(data=df, kind='violin', x='region', y='expenses', inner=None)

## Get the count of people in different Age Groups

In [None]:
# Count how many people fall into each ten-year age band.
# Every band is the half-open range (lower, upper]; for example Age_Group_11_20
# counts the rows with 10 < age <= 20.
# The comparisons are vectorised over the whole 'age' column, so the result does
# not rely on the DataFrame having a default 0..n-1 index. Ages outside (0, 70],
# and missing ages, are left out of every band rather than being silently added
# to the 61-70 group.
age_band_edges = [0, 10, 20, 30, 40, 50, 60, 70]
ages = df['age']
age_band_counts = [
    int(((ages > lower) & (ages <= upper)).sum())
    for lower, upper in zip(age_band_edges[:-1], age_band_edges[1:])
]

# Expose the counts under the individual names used by the following cells.
(Age_Group_0_10, Age_Group_11_20, Age_Group_21_30, Age_Group_31_40,
 Age_Group_41_50, Age_Group_51_60, Age_Group_61_70) = age_band_counts

## Create a dictionary with the age groups and their counts as values

In [None]:
# Column-oriented mapping of age-band label -> head count, ready to be turned
# into a DataFrame. The labels and the counts are listed in the same order.
# The fourth label previously read '(Age31-40)'; it now follows the same
# 'Age(lo-hi)' pattern as the others so the chart labels are consistent.
Age_dict = {
    'Age_Group': ['Age(0-10)', 'Age(11-20)', 'Age(21-30)', 'Age(31-40)',
                  'Age(41-50)', 'Age(51-60)', 'Age(61-70)'],
    'Count': [Age_Group_0_10, Age_Group_11_20, Age_Group_21_30,
              Age_Group_31_40, Age_Group_41_50, Age_Group_51_60,
              Age_Group_61_70],
}

## Create a Dataframe of AgeGroup Vs Count

In [None]:
# Tabulate the age bands: one row per band, with columns 'Age_Group' and 'Count'.
df2 = pd.DataFrame(Age_dict)
df2

## Pie chart of the percentage distribution of people across age groups

In [None]:
# Pie chart of the share of people in each age band.
plt.figure(figsize=(20, 20))
plt.title(
    "Pie Chart of Age Group Distribution opting for Insurance Premium",
    fontsize=30,
)
plt.pie(
    df2['Count'],
    labels=df2['Age_Group'],
    wedgeprops={'edgecolor': 'black'},  # outline each wedge in black
    autopct='%1.1f%%',                  # print each share with one decimal place
)
plt.show()

## Determine the Categorical features

In [None]:
# Names of the columns whose dtype compares equal to 'O' (pandas' object dtype).
categorical_features = [
    column_name
    for column_name, column_dtype in df.dtypes.items()
    if column_dtype == 'O'
]
categorical_features

## Determine the Numerical features

In [None]:
# Names of every column whose dtype is not 'O' (pandas' object dtype).
numerical_features = [
    column_name
    for column_name, column_dtype in df.dtypes.items()
    if column_dtype != 'O'
]
numerical_features

## Total Categories in individual Categorical features

In [None]:
# Report how many distinct values each categorical column contains.
for feature in categorical_features:
    n_categories = len(df[feature].unique())
    print(f'The feature is {feature} and number of categories are {n_categories}')

## Transform the Categorical features into Labels

In [None]:
# Replace every categorical column with integer codes, keeping the fitted encoder
# for each column in lbl_encoders (keyed by column name).
# NOTE: df is overwritten in place. Re-running this cell without reloading df
# refits the encoders on the already-encoded integer columns.
lbl_encoders = {}
for feature in categorical_features:
    encoder = LabelEncoder()
    df[feature] = encoder.fit_transform(df[feature])
    lbl_encoders[feature] = encoder

In [None]:
# Display the DataFrame after label encoding.
df

## Split the Dependent and Independent features

In [None]:
# Independent variables: the six predictor columns.
feature_columns = ['age', 'sex', 'bmi', 'children', 'smoker', 'region']
X = df[feature_columns]
# Dependent variable: the insurance expenses to be predicted.
Y = df['expenses']

## Split the dataset into training and Test Data

In [None]:
# Hold out 5% of the rows for testing.
# random_state is fixed so the same rows land in each split on every run; without
# it, the grid-search result and the final score change from run to run.
# NOTE(review): with only 5% held out, the test score rests on few rows and may
# be noisy -- consider a larger test_size.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.05, random_state=42
)

## Perform Standardisation

In [None]:
# Learn the scaling statistics from the training features only, then apply the
# same scaling to both the training and the test features.
# NOTE: X_train and X_test are overwritten, so re-running this cell on its own
# scales the already-scaled data a second time.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

## Find the best parameters using GridSearchCV

In [None]:
# Base estimator for the search. random_state is fixed because scikit-learn
# permutes the features at every split, so equally good splits could otherwise
# be resolved differently between runs.
model = DecisionTreeRegressor(random_state=42)

# Exhaustive search over tree depth (1..10) and the minimum number of samples
# needed to split a node (10, 20, 30, 40, 50), scored by 5-fold cross-validated
# negative mean squared error (higher is better).
gs = GridSearchCV(model,
                  param_grid={'max_depth': range(1, 11),
                              'min_samples_split': range(10, 60, 10)},
                  cv=5,
                  n_jobs=1,
                  scoring='neg_mean_squared_error')

gs.fit(X_train, y_train)

print(gs.best_params_)

## Fit the Decision Tree model with best parameters on training Sample

In [None]:
# Build the final tree from the hyper-parameters the grid search selected,
# instead of hard-coded values that can disagree with the search result
# whenever the data split changes.
regressor = DecisionTreeRegressor(**gs.best_params_, random_state=42)
regressor.fit(X_train, y_train)

## Predict the Test samples

In [None]:
# Predicted expenses for the held-out (already standardised) test features.
y_pred = regressor.predict(X_test)

## Evaluate the predictions using the R² score (coefficient of determination)

In [None]:
# Coefficient of determination (R^2) of the predictions on the test set.
r2_score(y_true=y_test, y_pred=y_pred)