# Loading the data and a few libraries:

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sn
import numpy as np

In [None]:
df = pd.read_csv('../input/insurance/insurance.csv') # loading data into df

# Understanding the data:

## 1. Basic Information about the data:

In [None]:
df.head()

# from the data below we get to know that, there are 7 features, out of which 3 are categorical.


In [None]:
df.info()

# Points to be considered:

# 1. There are no NULL values
# 2. 3 of them are object type

In [None]:
df['region'].value_counts()

# There are 4 types of region, with approx same number of people.

In [None]:
# Summary statistics for age, using the Series' own vectorised reductions.
# The Python builtins min()/max() iterate over the column element by element
# and give order-dependent results if a NaN is present; the pandas methods
# run in native code and skip missing values.
print(f'mean age:  {df.age.mean()}')
print(f'minimum age: {df.age.min()}, maximum age: {df.age.max()}')

In [None]:
# Summary statistics for charges, using the Series' own vectorised reductions.
# The Python builtins min()/max() iterate over the column element by element
# and give order-dependent results if a NaN is present; the pandas methods
# run in native code and skip missing values.
print(f'mean charges: {df.charges.mean()}')
print(f'minimum charges: {df.charges.min()}, maximum charges: {df.charges.max()}')

In [None]:
print(df['smoker'].value_counts()) 

# there are 1064 people who do not smoke, others smoke

In [None]:
print(df['children'].value_counts())

# here is the split between the number of children.. 

In [None]:
print(df['sex'].value_counts()) 

# Approx same number of male and female 

## 2. Encoding data:

Since there are no NULL values, we can move ahead and encode the categorical features, because the libraries used below only work on numerical
features.
This is the first step in this case, before we dive into understanding the given data.



In [None]:
df1 = pd.get_dummies(df)
df1
# scikit-learn's OneHotEncoder could be used here to encode the data as well.
# get_dummies is a pandas function that encodes categorical columns as numeric indicator (dummy) columns.
# It creates one new column per category value, e.g. encoding the 'sex' column, which has the two categories 'male' and 'female',
# adds the two new columns 'sex_male' and 'sex_female' to the dataframe.

In [None]:
# We do not need both 'sex_male' and 'sex_female': a single indicator column already carries the full information.
# drop_first = True drops the first category of every encoded feature and keeps the remaining ones,
# so here 'sex_female' is dropped and 'sex_male' is kept.

df1 = pd.get_dummies(df, drop_first = True)
df1.head()

## 3. Analysing outliers:

In [None]:
df1['charges'].describe()

In [None]:
df1['charges'].quantile(0.95) # maximum charge is around 64k, while 95% of the data is less than 41k...
# so we can assume that there are some outliers, but the majority of the charges lie well below the maximum.

In [None]:
plt.boxplot(df1[ 'charges']) # lots of outliers

In [None]:
df1['age'].describe()

In [None]:
df1['age'].quantile(0.95) # 75% of the people are younger than 51 years, and 95% are younger than 62 years.

# The maximum age in the given data is 64, so there is no need to remove any age value or to treat any age as an outlier.


In [None]:
plt.boxplot(df1['age']) #there are approx no outliers

In [None]:
df1['bmi'].describe()

In [None]:
df1['bmi'].quantile(0.95) # 95% of people are less than 41,, while maximum bmi = 53.13

In [None]:
plt.boxplot(df1['bmi']) # there are few outliers

In [None]:
# Detecting outliers in 'charges' with percentile-based fences.
# NOTE: this cell only counts the outliers; nothing is dropped from df.
from numpy import percentile


def _fence_outliers(values, low_pct, high_pct, k=1.5):
    """Split *values* into outliers and inliers using percentile fences.

    The fences are ``low - k * (high - low)`` and ``high + k * (high - low)``,
    where ``low`` and ``high`` are the ``low_pct``-th and ``high_pct``-th
    percentiles of *values*. With 25 and 75 this is the usual 1.5 * IQR rule.

    Returns ``(low, high, lower_fence, upper_fence, outliers, inliers)``; the
    last two are plain lists. A value lying exactly on a fence counts as an
    inlier, so every non-missing value lands in exactly one of the two lists.
    """
    low, high = percentile(values, [low_pct, high_pct])
    margin = (high - low) * k
    lower_fence, upper_fence = low - margin, high + margin
    flagged = [v for v in values if v < lower_fence or v > upper_fence]
    kept = [v for v in values if lower_fence <= v <= upper_fence]
    return low, high, lower_fence, upper_fence, flagged, kept


# Pass 1: the classic inter-quartile range (25th to 75th percentile).
q25, q75, lower, upper, outliers, outliers_removed = _fence_outliers(df['charges'], 25, 75)
iqr = q75 - q25
cut_off = iqr * 1.5

print(len(outliers))

# For this data the 75th percentile of charges is still a significant value; the major change in charges
# only comes after roughly the 80th-90th percentile.
# So we repeat the check with the 5th to 95th percentile range -->

# Pass 2: the wider 5th to 95th percentile range (the name 'iqr' is kept although this is not an inter-quartile range).
q05, q95, lower, upper, outliers, outliers_removed = _fence_outliers(df['charges'], 5, 95)
iqr = q95 - q05
cut_off = iqr * 1.5

print(len(outliers))

# With the 5-95% range we find no outliers, whereas with the 25-75% range as the base parameter around 10% of the
# values are flagged. That is a fairly small share, so it may not change the significance of the data much.
# Also, the values flagged with the 25-75% range still look like significant charges, so that range is not a good
# choice here; with the wider ranges the number of outliers decreases.

## 3. Standardising the data:

In [None]:
# The charges values are much larger than the other columns, which can skew the results because large values
# can be given higher preference. Scaling the data prevents that.
# NOTE(review): the scaler is fitted on every column of df1 (the 'charges' target and the dummy columns included)
# and on all rows, i.e. before the train/test split made later on - confirm this is intended.

from sklearn.preprocessing import StandardScaler

# StandardScaler is a scikit-learn class; create an instance of it.
scaler = StandardScaler()

# fit_transform() first fits the scaler to df1 and then returns the standardised values as an array,
# so the array is wrapped in pd.DataFrame() (re-using df1's column names) to get a dataframe again.
scaled_values = scaler.fit_transform(df1)
df2 = pd.DataFrame(scaled_values, columns=df1.columns)

df2.head()

## 4. Visualizing the data:

Here we will try to understand the data visually and look for correlations between the features. We will also see whether there are outliers and how these outliers affect the data.


In [None]:
# We know that, Linear Regression performs better if the distribution of the data is gaussian. 
# Therefore, we will be heading to see the distribution of each feature..

import scipy.stats as stat # importing scipy library and accessing stats class
import pylab
# creating a function, to plot a normal distribution 

def check_dist(df, feature):
    """Plot the distribution of one feature next to its normal probability plot.

    Parameters:
        df: dataframe that holds the data.
        feature: name of the column of ``df`` to inspect.

    Draws a figure with two panels side by side: on the left a density
    histogram with a KDE curve, on the right a probability (Q-Q) plot against
    the normal distribution, which shows how close the feature is to
    gaussian. The figure is shown directly; nothing is returned.
    """
    # One row, two columns; explicit Axes avoid relying on pyplot/pylab's implicit "current axes".
    _, (ax_hist, ax_qq) = plt.subplots(1, 2, figsize=(10, 6))
    # histplot(kde=True, stat='density') replaces the deprecated sn.distplot().
    sn.histplot(df[feature], kde=True, stat='density', ax=ax_hist)
    stat.probplot(df[feature], dist='norm', plot=ax_qq)
    plt.show()
    
    

In [None]:
print('Age:')
# check_dist() shows the plots itself and returns nothing, so its result is not printed (that only printed "None").
check_dist(df2, 'age')  # almost gaussian

In [None]:
print('BMI:')
# check_dist() shows the plots itself and returns nothing, so its result is not printed (that only printed "None").
check_dist(df2, 'bmi')  # looks gaussian

In [None]:
print('Charges:')
# check_dist() shows the plots itself and returns nothing, so its result is not printed (that only printed "None").
check_dist(df2, 'charges')  # it is right skewed, so we can transform it towards gaussian form

In [None]:
# Converting/Transforming the 'charges' distribution towards gaussian form with a square-root transform.
# The square root is taken of the original (unscaled) charges in df, not of the scaled column in df2.
# The assignment aligns rows by index; df and df2 both carry the default 0..n-1 index here.

df2['new_charges']= df.charges**(1/2)
df2.head() # the new_charges column has been created; it holds the square-root-transformed charges

In [None]:
check_dist(df2, 'new_charges')

In [None]:
# Remove the scaled 'charges' column in place; 'new_charges' takes its place as the target.
del df2['charges']

In [None]:
df2.head() # charges --> column have been removed

### Now the current data, is scaled + follows gaussian distribution + no categorical features + no missing values + outliers -> analysed



In [None]:
# Two column groups for the correlation heatmaps, each including the transformed target 'new_charges':
# f1 holds the main predictors and is plotted here, f2 holds the region dummies and is plotted in the next cell.
f1 = ['age', 'bmi', 'children', 'sex_male', 'smoker_yes', 'new_charges']
f2 = ['region_northwest', 'region_southwest', 'region_southeast', 'new_charges']
main_corr = df2[f1].corr()
sn.heatmap(main_corr, annot=True)


# Smoking status and charges are the most strongly correlated pair, followed by age and charges.

In [None]:
sn.heatmap(df2[f2].corr(), annot = True)

# Not much relation between the charges and the region.

In [None]:
features = ['age', 'bmi', 'children']
sn.pairplot(df[features])

# Model Creation:

## 1. Splitting the data:

In [None]:
df2.head()

In [None]:
# Select the features and the target by column name rather than by position,
# so the selection stays correct even if the column order of df2 changes.
x = df2.drop(columns='new_charges')  # every column except the target
y = df2['new_charges']               # target: the square-root-transformed charges

In [None]:
x.head() # all columns other than new_charges

In [None]:
y.head() # only new_charges column

In [None]:
# Splitting the data into train and test sets (20% held out for testing, fixed random_state for reproducibility):
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

print(f'Size of Training set: {len(x_train), len(y_train)}')
print(f'Size of Test set {len(x_test), len(y_test)}')


## 2. Applying model:

In [None]:
from sklearn.linear_model import LinearRegression

linear = LinearRegression()

In [None]:
linear.fit(x_train, y_train)

In [None]:
linear.score(x_test, y_test) # R^2 value on the test set, i.e. the share of the variation in 'new_charges' that is explained by the features.

In [None]:
linear.coef_

In [None]:
linear.intercept_

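### Therefore, from the above data we can write the equation of the fitted line as --->

#### y = 104.82 + 19.59(age) + 6.33(bmi) + 3.74(children) + (-0.51)(sex_male) + 36.55(smoker_yes) + (-0.86)(region_northwest) + (-2.42)(region_southeast) + (-1.93)(region_southwest)

Here y is `new_charges` (the square root of `charges`), and every feature enters the equation in its standardised (scaled) form.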

In [None]:
linear.score(x_train, y_train) # R^2 on the training set: about 76% of the variation in new_charges is explained by our parameters there,
                               # while about 81% of it is explained on the test set.
                               # This suggests that our model is not overfitted to the training data.

In [None]:
# Now lets find out the significance of data cleaning:

# Significance of data cleaning:

In [None]:
# In this I'll be using 'df1' dataframe for this analysis.. using df1 and not df because, df1 is encoded, whereas df is not encoded.
# Therefore,

df1.head()

In [None]:
# splitting the data:

features = ['age', 'bmi', 'children', 'sex_male', 'smoker_yes', 'region_northwest', 'region_southeast', 'region_southwest']
x1 = df1[features]
y1 = df1['charges']

In [None]:
x1.head()

In [None]:
y1.head()


In [None]:
x1_train, x1_test, y1_train, y1_test = train_test_split(x1, y1, test_size = 0.2, random_state = 0)

In [None]:
linear2 = LinearRegression()     # fitting data to a linear model
linear2.fit(x1_train, y1_train)

In [None]:
linear2.score(x1_test, y1_test)  # So, 79% of the charges was explained by the parameters in the test set.

In [None]:
linear2.score(x1_train, y1_train) # So, 73% of the variation in charges was explained by the parameters in the training set.

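### The model trained on the cleaned data scored higher than the one trained on the uncleaned data. Note that the two R² values are measured on different targets (`new_charges` vs. `charges`), so the comparison is indicative rather than exact.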