In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings(action='ignore')
%matplotlib inline

In [None]:
df=pd.read_csv('../input/insurance/insurance.csv')
df.head()

In [None]:
df.shape

In [None]:
#checking for any symbols 

In [None]:
# Show rows whose numeric columns contain a value that cannot be parsed as a number
# (e.g. a stray '?' or '$'). An empty result means no such symbols were found.
# The earlier element-wise np.isreal test also ran over the text columns (sex and smoker
# hold strings), where it reports non-real, so its all-columns row filter rejected every
# row; DataFrame.applymap is additionally deprecated in recent pandas releases.
numeric_cols = ['age', 'bmi', 'children', 'charges']
parsed = df[numeric_cols].apply(pd.to_numeric, errors='coerce')
unparseable = parsed.isna() & df[numeric_cols].notna()   # genuine NaNs are handled by the null check, not here
df[unparseable.any(axis=1)]

In [None]:
#checking for null values

In [None]:
# Number of missing entries in each column.
df.isnull().sum(axis=0)

In [None]:
#Data Analysis

In [None]:
df['sex'].value_counts()

In [None]:
df['smoker'].value_counts()

In [None]:
df['children'].value_counts()

In [None]:
df['region'].value_counts()

In [None]:
df.groupby('sex').count()

In [None]:
df.groupby('sex').count()*100/df['sex'].count()   #the percentage of gender is almost same

In [None]:
df.groupby('smoker').count()        #there are more non-smokers in dataset than smokers

In [None]:
df.groupby('region').count()       #region data is almost uniform

In [None]:
df.groupby('children').count()       #More the children more chances r there for being a non-smoker

In [None]:
#visualization

In [None]:
sns.countplot(df['sex'])

In [None]:
sns.countplot(df['sex'],hue=df['smoker'])  #there r more male smokers  than female ones

In [None]:
plt.figure(figsize=(15,7))   #data for age 18-19 is more and for rest ages data is uniform
sns.countplot(df['age']) 

In [None]:
sns.countplot(df['region'])


In [None]:
sns.countplot(df['region'],hue=df['smoker'])    #there are more smokers in southeast regions

In [None]:
sns.countplot(df['region'],hue=df['sex'])     #male,female ratio is same almost in all regions

In [None]:
sns.lineplot(df['age'],df['bmi'])          #its showing a slight positive corr between both of them
plt.show()

In [None]:
sns.lineplot(df['age'],df['charges'])
plt.show()         #its shows they both are positively correlated

In [None]:
sns.lineplot(df['bmi'],df['charges'])
plt.show()         #it doesnot show any relation

In [None]:
sns.violinplot(df['sex'],df['bmi'])  #for bmi sex data is a bit normally distributed
plt.axhline(df[df['sex']=='female']['bmi'].mean())
plt.axhline(df[df['sex']=='male']['bmi'].mean())

In [None]:
sns.violinplot(df['sex'],df['age'])  #for age data is platykurtic
plt.axhline(df[df['sex']=='female']['age'].mean())
plt.axhline(df[df['sex']=='male']['age'].mean())

In [None]:
sns.violinplot(df['smoker'],df['age'])   #here data is platykurtic
plt.axhline(df[df['smoker']=='yes']['age'].mean())
plt.axhline(df[df['smoker']=='no']['age'].mean())

In [None]:
sns.violinplot(df['smoker'],df['bmi'])   #here data is normally distributed
plt.axhline(df[df['smoker']=='yes']['bmi'].mean())
plt.axhline(df[df['smoker']=='no']['bmi'].mean())

In [None]:
sns.violinplot(df['smoker'],df['charges'])   #for smoker charges data is left skewed and non-smoker data is right skewed a bit
plt.axhline(df[df['smoker']=='yes']['charges'].mean())
plt.axhline(df[df['smoker']=='no']['charges'].mean())

In [None]:
sns.violinplot(df['sex'],df['charges'])  #for sex,charges data is rightly skewed
plt.axhline(df[df['sex']=='female']['charges'].mean())
plt.axhline(df[df['sex']=='male']['charges'].mean())

In [None]:
#Inferences from the visualizations:
#The ratio of males to females is almost the same
#The region-wise distribution is also almost the same
#As the number of children increases, the number of non-smokers increases
#As age increases, the chance of a higher bmi also increases
#As the number of children increases, charges decrease
#As age increases, charges also increase
#The 18-19 age group has the most records
#For age and smoker, the charges data is skewed


In [None]:
#checking for outliers

In [None]:
df.plot.box(figsize=(15,10))

In [None]:
df['bmi'].plot.box()

In [None]:
#Flooring and Capping for outlier treatment

In [None]:
# IQR-based outlier treatment for bmi: values beyond 1.5 * IQR from the quartiles are
# pulled back to the nearest fence.
q1=df['bmi'].quantile(0.25)
q3=df['bmi'].quantile(0.75)
iqr=q3-q1
ll=q1-1.5*iqr   # lower fence (floor)
ul=q3+1.5*iqr   # upper fence (cap)
# Apply both fences. Previously only the upper cap was applied and `ll` was computed
# but never used, so low outliers would have passed through untreated.
df['bmi']=df['bmi'].clip(lower=ll,upper=ul)

In [None]:
df['bmi'].plot.box()

In [None]:
df.dtypes

In [None]:
#Doing OnehotEncoding

In [None]:
df=pd.get_dummies(data=df,columns=['sex','smoker','region'],drop_first=True)

In [None]:
df.head()

In [None]:
#checking correlation between independent and dependent variables

In [None]:
df.corr()

In [None]:
sns.pairplot(df)

In [None]:
#starting modelling

In [None]:
X=df.drop('charges',axis=1)
y=df['charges']

In [None]:
xtrain,xtest,ytrain,ytest=train_test_split(X,y,train_size=0.75,random_state=1)

In [None]:
#doing Scaling

In [None]:
from sklearn.preprocessing import StandardScaler
ss=StandardScaler('box-cox')
xtrain[['age','bmi']]=ss.fit_transform(xtrain[['age','bmi']])
xtest[['age','bmi']]=ss.transform(xtest[['age','bmi']])

In [None]:
xtrain.head()

In [None]:
from sklearn.linear_model import LinearRegression
# Ordinary least squares baseline: fit on the training split, report R^2 on the test split.
lr = LinearRegression().fit(xtrain, ytrain)
lr.score(xtest, ytest)

In [None]:
#checking highest score for linear regression

In [None]:
from sklearn.model_selection import cross_val_score
cross=cross_val_score(estimator=lr,X=X,y=y,scoring='r2',cv=10,n_jobs=-1)

In [None]:
cross

In [None]:
cross.mean()

In [None]:
# choosing best estimators for different models

In [None]:
from sklearn.linear_model import Lasso,Ridge,ElasticNet
# Default-parameter instances of the three regularised linear models used below.
lasso, ridge, en = Lasso(), Ridge(), ElasticNet()

In [None]:
from sklearn.model_selection import GridSearchCV
param=[{'alpha':[1,0.8,0.5,0.1,0.01,0.001]}]
grid=GridSearchCV(estimator=lasso,param_grid=param,scoring='r2',n_jobs=-1,cv=10)
grid.fit(xtrain,ytrain) 
grid.score(xtest,ytest)

In [None]:
from sklearn.model_selection import GridSearchCV
param=[{'alpha':[1,0.8,0.5,0.1,0.01,0.001]}]
grid=GridSearchCV(estimator=ridge,param_grid=param,scoring='r2',n_jobs=-1,cv=10)
grid.fit(xtrain,ytrain) 
grid.score(xtest,ytest)

In [None]:
from sklearn.model_selection import GridSearchCV
param=[{'alpha':[1,0.8,0.5,0.1,0.01,0.001],'l1_ratio':[0.5,0.8,1,0.1,0.01,0.6]}]
grid=GridSearchCV(estimator=en,param_grid=param,scoring='r2',n_jobs=-1,cv=10)

In [None]:
grid.fit(xtrain,ytrain)  

In [None]:
grid.best_estimator_

In [None]:
grid.score(xtest,ytest)

In [None]:
#doing feature selection

In [None]:
#forward selection

In [None]:
from mlxtend.feature_selection import SequentialFeatureSelector
sfs=SequentialFeatureSelector(estimator=en,k_features=4,forward=True,cv=3,n_jobs=-1)
sfs.fit(xtrain,ytrain)

In [None]:
sfs.k_score_

In [None]:
#backward selection

In [None]:
from mlxtend.feature_selection import SequentialFeatureSelector
sfs=SequentialFeatureSelector(estimator=en,k_features=4,forward=False,cv=3,n_jobs=-1)
sfs.fit(xtrain,ytrain)

In [None]:
sfs.k_score_

In [None]:
from sklearn.svm import SVR
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report,mean_squared_error,r2_score
# Linear-kernel support vector regressor, fitted on the training split.
svm=SVR(kernel='linear')
svm.fit(xtrain,ytrain)
# Predict once and end the cell on the R^2 so the notebook actually displays it.
# Before, svm.score(...) sat mid-cell (its value was never shown) and ypred went unused.
ypred=svm.predict(xtest)
r2_score(ytest,ypred)

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Random forest of 100 depth-5 trees with a fixed seed for reproducibility.
# `criterion` is left at its default (mean squared error): the explicit 'mse' spelling was
# removed from scikit-learn, while the default selects the same criterion on every release.
rfr=RandomForestRegressor(n_estimators=100,max_depth=5,random_state=7,oob_score=False)
rfr.fit(xtrain,ytrain)
# R^2 on the test split.
rfr.score(xtest,ytest)

In [None]:
cross=cross_val_score(estimator=rfr,X=X,y=y,cv=10,n_jobs=-1,scoring='r2')

In [None]:
cross

In [None]:
cross.mean()