In [None]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import seaborn as sns
import os
import pandas as pd
import matplotlib.pyplot as plt

from sklearn import preprocessing, metrics, model_selection, linear_model, ensemble, tree, svm

In [None]:
## Load the dataset.
## The Kaggle input location is tried first; when it does not exist (e.g. running
## the notebook locally) a copy of the CSV in the working directory is used instead,
## so this cell no longer has to be edited by hand between environments.
DATA_PATH = '/kaggle/input/insurance/insurance.csv'
if not os.path.exists(DATA_PATH):
    DATA_PATH = 'insurance.csv'

df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
## Number of missing values in each column of the dataset
df.isnull().sum()

### There are no null values in the dataset

In [None]:
## Print pandas' concise summary of the frame (DataFrame.info)
df.info()

In [None]:
## Separate the columns into categorical ones (object dtype) and continuous ones
## (every other dtype)

cat_var = [name for name in df.columns if df[name].dtype == 'object']
con_var = [name for name in df.columns if not (df[name].dtype == 'object')]

print("Categorical Varibales are:- \n",cat_var)
print("\nContinuous Varibales are:- \n",con_var)

        

In [None]:
## For every categorical column: its name, the number of distinct values as a
## percentage of the non-null rows, and the distinct values themselves.
separator = "-"*60
for column in cat_var:
    values = df[column]
    unique_pct = round(values.nunique() * 100 / values.count(), 2)
    print("Column Name:-",column)
    print("unique value(%)", str(unique_pct))
    print("Unique Values are:-",list(values.unique()))
    print(separator)

In [None]:
## Basic EDA Starts here

### Smoking vs Sex

In [None]:
## Number of smokers / non-smokers, split by sex.
## The column is passed explicitly as `x=`: recent seaborn releases take `data` as the
## first positional parameter (everything else is keyword-only), so a bare positional
## Series is no longer interpreted as the x variable. This also matches the keyword
## style used by the other plots in this notebook.
sns.countplot(x = df.smoker, hue = df.sex)
plt.show()

In [None]:
## Proportion of smokers and non-smokers within each sex
smoker_by_sex = df.groupby(['sex'])['smoker']
smoker_by_sex.value_counts(normalize=True)

In [None]:
## Mean charges per region, with one bar per smoking status
fig, ax = plt.subplots(figsize=(8,6))
sns.barplot(data = df, x = 'region', y = 'charges', hue = 'smoker', estimator = np.mean, ax = ax)
plt.show()

#### As the graph above shows, people who smoke have higher medical charges than non-smokers, and this is most pronounced in the southeast region

In [None]:
## Relationship between the number of children and smoking habits

sns.countplot(data = df, x = 'children', hue = 'smoker')
plt.show()

In [None]:
## Proportion of smokers and non-smokers for each number of children
smoker_by_children = df.groupby(['children'])['smoker']
smoker_by_children.value_counts(normalize = True)

### Smoking habits do not depend on the number of children

In [None]:
## Impact of smoking on charges: distribution of charges for smokers.
## `sns.distplot` is deprecated; `sns.histplot` with `kde=True` and a density-scaled
## histogram is its documented replacement for a histogram-plus-KDE view.

smoker_charges = df.loc[df['smoker'] == 'yes', 'charges']
sns.histplot(smoker_charges, kde = True, stat = 'density', color = 'c')
## `plt.show()` renders the figure; the previous bare `plt.plot()` drew nothing and
## only echoed an empty list as the cell output.
plt.show()

In [None]:

## Distribution of charges for non-smokers.
## `sns.distplot` is deprecated; `sns.histplot` with `kde=True` and a density-scaled
## histogram is its documented replacement for a histogram-plus-KDE view.
non_smoker_charges = df.loc[df['smoker'] == 'no', 'charges']
sns.histplot(non_smoker_charges, kde = True, stat = 'density', color = 'c')
## `plt.show()` renders the figure; the previous bare `plt.plot()` drew nothing and
## only echoed an empty list as the cell output.
plt.show()

### As we can see, most smokers incur higher charges than non-smokers

In [None]:
## Charges against age, coloured by smoking status
sns.scatterplot(data = df, x = 'age', y = 'charges', hue = 'smoker')
plt.show()

### As we can see, medical charges increase with age, and they are always higher for smokers than for non-smokers

In [None]:
## Distribution of charges per region; each violin is split by smoking status
fig, ax = plt.subplots(figsize=(6,6))
sns.violinplot(data = df, x = 'region', y = 'charges', hue = 'smoker', split = True, ax = ax)
plt.show()

In [None]:
## Distribution of age.
## `sns.distplot` is deprecated; `sns.histplot` with `kde=True` and a density-scaled
## histogram is its documented replacement for a histogram-plus-KDE view.

plt.figure(figsize=(8,4))
sns.histplot(df.age, kde = True, stat = 'density', color = 'r')
plt.show()

### Most of the people are under the age of 20, and the maximum age is 64

In [None]:
## Distribution of bmi.
## `sns.distplot` is deprecated; `sns.histplot` with `kde=True` and a density-scaled
## histogram is its documented replacement for a histogram-plus-KDE view.

plt.figure(figsize=(8,4))
sns.histplot(df.bmi, kde = True, stat = 'density')
plt.show()

### bmi Distribution 

In [None]:
## bmi against charges, coloured by number of children.
## `jointplot` is a figure-level function that always builds its own figure, so the
## size is set through `height=`. A preceding `plt.figure(figsize=...)` does not size
## the joint plot; it only leaves an additional empty figure in the output.
sns.jointplot(x = df.bmi, y = df.charges, hue = df.children, height = 8)
plt.show()

In [None]:
## bmi against charges, coloured by smoking status.
## `jointplot` is a figure-level function that always builds its own figure, so the
## size is set through `height=`. A preceding `plt.figure(figsize=...)` does not size
## the joint plot; it only leaves an additional empty figure in the output.
sns.jointplot(x = df.bmi, y = df.charges, hue = df.smoker, height = 8)
plt.show()

### As the graph above shows, the majority of people have a BMI between 25 and 35, and people who smoke have higher medical charges

In [None]:
## Charges against bmi, coloured by smoking status
fig, ax = plt.subplots(figsize=(8,6))
sns.scatterplot(data = df, x = 'bmi', y = 'charges', hue = 'smoker', ax = ax)
plt.show()

### As the graph above shows, the majority of people have a BMI between 25 and 35, and people who smoke have higher medical charges

In [None]:
## Convert the categorical columns into numeric columns

## method 1
## LabelEncoder: a fresh encoder is fitted per column and the column is replaced
## by the integer labels it produces
for column in cat_var:
    df[column] = preprocessing.LabelEncoder().fit_transform(df[column])
    

In [None]:
## Descriptive statistics of the frame; this runs after the label-encoding cell,
## so the encoded categorical columns are included as numeric columns
df.describe()

In [None]:
## Correlation matrix of the columns, drawn as an annotated heatmap

corr_matrix = df.corr()

fig, ax = plt.subplots(figsize=(8,6))
sns.heatmap(corr_matrix, annot = True, ax = ax)
plt.show()

### As we can see, charges depend mainly on people's smoking habits

In [None]:
## Model Selection: separate the target (charges) from the features and hold out
## a test split with a fixed seed

y = df['charges']
x = df.drop(columns = 'charges')

x_train, x_test, y_train, y_test = model_selection.train_test_split(x, y, random_state = 7)

In [None]:
## Min-max scale the features; the scaler is fitted on the training split only and
## then applied unchanged to the test split
scaler = preprocessing.MinMaxScaler()

x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

In [None]:
## Linear regression: fit on the training split, evaluate on the test split
reg = linear_model.LinearRegression().fit(x_train, y_train)
y_pred = reg.predict(x_test)

r2 = reg.score(x_test, y_test)
mse = metrics.mean_squared_error(y_test, y_pred)
mae = metrics.mean_absolute_error(y_test, y_pred)

print("Score of the regressor is",r2)
print("Mean Square Error is", mse)
print("Mean Absolute Error is", mae)


In [None]:
## Random forest regressor: fit on the training split, evaluate on the test split.
## The forest is built from random bootstrap samples and feature subsets, so the seed
## is fixed (same value as the train/test split) to make the reported metrics
## reproducible from run to run.
reg = ensemble.RandomForestRegressor(random_state = 7)
reg.fit(x_train, y_train)

y_pred = reg.predict(x_test)

print("Score of the regressor is",reg.score(x_test, y_test))
print("Mean Square Error is", metrics.mean_squared_error(y_test, y_pred))
print("Mean Absolute Error is", metrics.mean_absolute_error(y_test, y_pred))


In [None]:
## Decision tree regressor: fit on the training split, evaluate on the test split.
## Tree construction breaks ties between equally good splits at random, so the seed is
## fixed (same value as the train/test split) to make the reported metrics
## reproducible from run to run.
reg = tree.DecisionTreeRegressor(random_state = 7)
reg.fit(x_train, y_train)

y_pred = reg.predict(x_test)

print("Score of the regressor is",reg.score(x_test, y_test))
print("Mean Square Error is", metrics.mean_squared_error(y_test, y_pred))
print("Mean Absolute Error is", metrics.mean_absolute_error(y_test, y_pred))


In [None]:
## Support vector regression: fit on the training split, evaluate on the test split.
## SVR's default C=1.0 and epsilon=0.1 are expressed in the units of the target, and
## only the features were scaled above. `charges` is a raw monetary amount
## (NOTE(review): presumably orders of magnitude larger than 1 - confirm with
## df.describe()), in which case the defaults leave the model barely fitted.
## The target is therefore standardised for fitting, and predictions are mapped back
## to the original units before any metric is computed.
y_scaler = preprocessing.StandardScaler()
y_train_scaled = y_scaler.fit_transform(np.asarray(y_train).reshape(-1, 1)).ravel()

reg = svm.SVR()
reg.fit(x_train, y_train_scaled)

y_pred = y_scaler.inverse_transform(reg.predict(x_test).reshape(-1, 1)).ravel()

## R^2 is computed from the back-transformed predictions; reg.score(x_test, y_test)
## would compare predictions in scaled units with the unscaled y_test.
print("Score of the regressor is", metrics.r2_score(y_test, y_pred))
print("Mean Square Error is", metrics.mean_squared_error(y_test, y_pred))
print("Mean Absolute Error is", metrics.mean_absolute_error(y_test, y_pred))


Thank you for reading!!