In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score , mean_squared_error

In [None]:
# Read the health-insurance CSV into `df` and show its column labels.
csv_path = "../input/health-insurance-dataset/Health_insurance.csv"
df = pd.read_csv(filepath_or_buffer=csv_path)
df.columns

In [None]:
# Preview the first rows of the dataset.
df.head()

In [None]:
# Preview the last rows of the dataset.
df.tail()

In [None]:
# (rows, columns) of the loaded frame.
df.shape

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Rows whose 'smoker' column equals 'yes'.
df[df['smoker']=='yes']

In [None]:
# Summary statistics of the frame's columns.
df.describe()

## Univariate analysis on 'age'

In [None]:
# Histogram of 'age' on a density scale with a KDE overlay.
# `sns.distplot` is deprecated; `histplot(kde=True, stat="density")` is the
# documented replacement (cut=3 keeps the KDE tails the way distplot drew them).
sns.histplot(df['age'], kde=True, stat="density", kde_kws=dict(cut=3));

# the distribution does not look like a normal/Gaussian distribution

In [None]:
# Largest value in the 'age' column.
df['age'].max()

# max age of the insurer is 64, which is quite common

In [None]:
# Box plot of 'age' to look for outliers.
df.boxplot(column="age")

# looks like there are no outliers
# the 50th percentile value is approx. half of the 100th percentile

## univariate analysis on 'bmi'

In [None]:
# Histogram of 'bmi' on a density scale with a KDE overlay.
# `sns.distplot` is deprecated; `histplot(kde=True, stat="density")` is the
# documented replacement (cut=3 keeps the KDE tails the way distplot drew them).
sns.histplot(df['bmi'], kde=True, stat="density", kde_kws=dict(cut=3));

# distribution looks like Normal/gaussian distribution

In [None]:
# Box plot of 'bmi' to look for outliers.
df.boxplot(column='bmi')

# looks like bmi has some outliers. Let's check the summary statistics for more clarity on the outliers

In [None]:
# Summary statistics of the 'bmi' column.
df['bmi'].describe()

# not sure whether there are outliers or not
# try to compute percentile values

In [None]:
# Print every second percentile of BMI, from the 0th through the 100th,
# so that jumps between neighbouring percentiles become visible.
bmi_series = df['bmi']
for pct in range(0, 101, 2):
    pct_value = np.percentile(bmi_series, pct)
    print(f"BMI of {pct}th percentile is :{pct_value}")
    
# looks like there are outliers, judging by the percentile values
# the difference between consecutive printed percentiles (2 apart) is less than or equal to 1
# but there is a huge difference between the 98th and 100th percentiles
# compute the percentiles from the 90th to the 100th

In [None]:
# Zoom in on the upper tail: print each BMI percentile from the 90th to the 100th.
bmi_series = df['bmi']
for pct in range(90, 101):
    pct_value = np.percentile(bmi_series, pct)
    print(f"BMI of {pct}th percentile is :{pct_value}")
    
# looks like 100th percentile is the outlier

### Calculating the boundary values for BMI

##### Assuming bmi follows a gaussian distribution we will calculate the boundaries which differentiates the outliers

In [None]:
# Three-standard-deviation limits around the BMI mean; values outside
# [lower_boundry, upper_boundry] are the candidates for outliers.
bmi_mean, bmi_std = df['bmi'].mean(), df['bmi'].std()
upper_boundry = bmi_mean + 3 * bmi_std
lower_boundry = bmi_mean - 3 * bmi_std
for message in (
    f"Upper boundry value for BMI is:{round(upper_boundry)}",
    f"Lower boundry value for BMI is: {round(lower_boundry)}",
):
    print(message)

### univariate analysis on smoker

In [None]:
# Charges as plain arrays, one for rows marked smoker == 'yes' and one for 'no'.
is_smoker = df['smoker'] == 'yes'
is_non_smoker = df['smoker'] == 'no'

smokers = df.loc[is_smoker, 'charges'].values
non_smokers = df.loc[is_non_smoker, 'charges'].values
non_smokers

In [None]:
# Overlay the kernel density estimates of charges for smokers and non-smokers.
# `sns.kdeplot` is used because `sns.distplot(..., hist=False)` is deprecated.
plt.figure(figsize=(10,3))
for group_label, group_charges in (("smokers", smokers), ("non-smokers", non_smokers)):
    sns.kdeplot(group_charges, label=group_label)
plt.title('charges for smokers and non-smokers')
plt.xlabel('Comparision of smokers and non-smokers charges')
plt.legend()
plt.show()

# from the graph we can clearly see that smokers paid much higher charges than non-smokers

### univariate analysis on sex

In [None]:
# Charges as plain arrays, split by the value of the 'sex' column.
male_charges, female_charges = (
    df.loc[df['sex'] == sex_label, 'charges'].values
    for sex_label in ('male', 'female')
)

In [None]:
# Overlay the kernel density estimates of charges for male and female rows.
# `sns.kdeplot` is used because `sns.distplot(..., hist=False)` is deprecated.
plt.figure(figsize=(10,3))
for group_label, group_charges in (("male", male_charges), ("female", female_charges)):
    sns.kdeplot(group_charges, label=group_label)
plt.title('charges for male and female')
plt.xlabel('Comparision of male and female charges')
plt.legend()
plt.show()

# charges for male and female are almost equal.

### univariate analysis on children

In [None]:
# Distinct values present in the 'children' column.
df['children'].unique()

In [None]:
# Charges as plain arrays, one per number of children (0 through 5).
(zero_children, one_children, two_children,
 three_children, four_children, five_children) = (
    df.loc[df['children'] == n_children, 'charges'].values
    for n_children in range(6)
)


In [None]:
# Overlay the kernel density estimates of charges for every children count.
# The "two" group was computed earlier but was missing from this plot, so the
# comparison silently skipped it; it is included here.
# `sns.kdeplot` is used because `sns.distplot(..., hist=False)` is deprecated.
plt.figure(figsize=(10,3))
children_groups = (
    ("zero", zero_children),
    ("one", one_children),
    ("two", two_children),
    ("three", three_children),
    ("four", four_children),
    ("five", five_children),
)
for group_label, group_charges in children_groups:
    sns.kdeplot(group_charges, label=group_label)
plt.title('charges based on number of children')
plt.xlabel('Comparision of charges based on insurers number of children')
plt.legend()
plt.show()


# insurers having 5 children paid much lower charges compared to the others.

### univariate analysis on region

In [None]:
# Distinct values present in the 'region' column.
df['region'].unique()

In [None]:
# Charges as plain arrays, one per region label.
southwest, southeast, northwest, northeast = (
    df.loc[df['region'] == region_name, 'charges'].values
    for region_name in ('southwest', 'southeast', 'northwest', 'northeast')
)


In [None]:
# Overlay the kernel density estimates of charges for the four regions.
# `sns.kdeplot` is used because `sns.distplot(..., hist=False)` is deprecated.
# The plotting order is kept so each region keeps the same colour as before.
plt.figure(figsize=(10,3))
region_groups = (
    ("southeast", southeast),
    ("southwest", southwest),
    ("northwest", northwest),
    ("northeast", northeast),
)
for group_label, group_charges in region_groups:
    sns.kdeplot(group_charges, label=group_label)
plt.title('charges based on region of insurer')
plt.xlabel('Comparision of region based on insurers region')
plt.legend()
plt.show()

# looks like charges do not depend on the region

### overall assumption on Univariate analysis

##### Categorical features like region, children, and sex do not influence charges much
##### Categorical features like smoking influence charges more
1. Insurers with a smoking habit paid more charges than non-smokers

#### Looks like in bmi there are outliers

<br><br><br><br><br><br><br><br><br><br><br>

### Multivariate analysis

In [None]:
# Grid of pairwise plots for `df`, with points coloured by the 'smoker' column.
sns.pairplot(df, hue='smoker')

# depending on whether the person smokes or not, we see there is a good relation among the features age, bmi, children, smoker

##### 1. As the age increases, charges increase linearly
##### 2. For charges less than 10000, non-smokers paid less for insurance, whereas smokers of the same age paid more than 30,000
##### 3. In the range of 10000 to 20000, smokers and non-smokers are clubbed together.

##### 4. As the bmi increases, charges increase linearly for smokers and non-smokers

In [None]:
# Grid of pairwise plots for `df`, with points coloured by the 'sex' column.
sns.pairplot(df, hue='sex')



In [None]:
# Charges against age, with a separate regression fit per 'smoker' value.
sns.lmplot(x='age', y='charges', hue='smoker', data=df);

# charges increase linearly for both smokers and non-smokers

In [None]:
# Charges against bmi, with a separate regression fit per 'smoker' value.
sns.lmplot(x='bmi', y='charges', hue='smoker', data=df);
# As BMI increases, charges increase rapidly for smokers

### Cap outliers (BMI values above the upper boundary)

In [None]:
# Work on a copy so `df` keeps the raw values, then cap every BMI above 49 at 49.
data = df.copy()

data['bmi'] = data['bmi'].clip(upper=49)

In [None]:
# One-hot encode the categorical columns and check how many columns result.
# The encoding is applied to `data` (the frame with capped BMI values);
# encoding `df` here would silently drop the outlier handling done above.
# NOTE(review): consider `drop_first=True` so the dummy columns of each
# category are not perfectly collinear.
one_hot_df = pd.get_dummies(data)
one_hot_df.shape

In [None]:
# Target is the 'charges' column; the features are every other encoded column.
y = one_hot_df['charges']
X = one_hot_df.drop(columns='charges')
print(X.shape, y.shape)

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
test_fraction = 0.2
split_seed = 51
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_fraction,
    random_state=split_seed,
)

In [None]:
# Report the shape of each piece of the train/test split.
split_shapes = (
    (" Shape of x_train = ", x_train),
    (" Shape of x_test = ", x_test),
    (" Shape of y_train = ", y_train),
    (" Shape of y_test = ", y_test),
)
for caption, split_part in split_shapes:
    print(caption, split_part.shape)

In [None]:
# Standardise the features: the scaler learns its statistics from the training
# rows only and the same transform is then applied to the test rows.
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

x_train

In [None]:
# Ordinary least-squares model fitted on the scaled training features.
lr = LinearRegression()
lr.fit(X=x_train, y=y_train)

In [None]:
# Show the fitted intercept and the per-feature coefficients.
fitted_terms = (
    ("Intercept : ", lr.intercept_),
    ("Slope : ", lr.coef_),
)
for caption, fitted_value in fitted_terms:
    print(caption, fitted_value)

In [None]:
# Predict on the test features and show real vs. predicted charges side by side.
# `y_test` is converted to a plain array so both frames share a fresh 0..n-1 index.
y_pred = lr.predict(x_test)
y_test_df = pd.DataFrame({"Real Values": np.asarray(y_test)})
y_pred_df = pd.DataFrame({"Predicted Values": y_pred})
pd.concat([y_test_df, y_pred_df], axis="columns")

In [None]:
# Score the predictions on the held-out rows.
# `r2_score` comes from the imports cell at the top of the notebook, so no
# import is needed here.
# NOTE: the value is the coefficient of determination (R^2) of a regression,
# not a classification accuracy; the name `accuracy` is kept only so that any
# later cell referring to it keeps working.
accuracy = r2_score(y_test, y_pred)
accuracy