In [None]:


import numpy as np 
import pandas as pd 


import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))



In [None]:
# Load the insurance dataset from the Kaggle input directory into a DataFrame.
df = pd.read_csv('/kaggle/input/insurance/insurance.csv')

In [None]:
# Preview the first rows of the loaded data.
df.head()

# exploratory data analysis :

In [None]:
# Summary statistics of the numeric columns, one row per column.
df.describe().T

In [None]:
import matplotlib.pyplot as plt 
import seaborn as sns
%matplotlib inline

In [None]:
# Histogram of age with a KDE overlay. `sns.distplot` is deprecated, so use
# `histplot`; stat='density' keeps the bars on the same scale as the KDE curve,
# matching what distplot used to draw.
sns.histplot(x=df['age'], bins=30, kde=True, stat='density')

In [None]:
# Joint distribution of age and charges. x/y are passed by keyword because
# current seaborn releases no longer accept them positionally.
sns.jointplot(x=df['age'], y=df['charges'])

In [None]:
# Age vs. charges, coloured by region.
plt.figure(figsize=(14, 8))
sns.scatterplot(data=df, x='age', y='charges', hue='region')

In [None]:
# Age vs. charges, coloured by smoker status.
plt.figure(figsize=(14, 8))
sns.scatterplot(data=df, x='age', y='charges', hue='smoker')

*smokers seem to have higher charges for any age, hence it needs to be converted from a categorical to numerical variable(binary-encoding { 1: yes, 0: no})*

In [None]:
# Number of policy holders per child count. The column is passed as `x=`
# because current seaborn treats the first positional argument as `data`.
sns.countplot(x=df['children'])

In [None]:
# Age vs. charges, coloured by number of children.
plt.figure(figsize=(14, 8))
sns.scatterplot(data=df, x='age', y='charges', hue='children')

*If the family is large, the chance of someone contracting an ailment increases, so the expenditure rises; we therefore have to include the number of children as one of the factors!*

In [None]:
# Kernel density estimate of BMI. `fill=True` replaces the deprecated
# `shade=True`, and the column is passed explicitly as `x=`.
plt.figure(figsize=(14, 8))
sns.kdeplot(x=df['bmi'], fill=True)

*The distribution of BMI in the population seems to be centred around 30 and looks roughly Gaussian (normal); very few people are at the extremes!*

*Looking at the data, it seems clear that age, number of children, BMI and smoker status are closely linked with the charges column, so the open question is whether or not to one-hot-encode the region column. It may be that people in some regions tend to fall ill more often, perhaps due to poorer conditions and facilities.*

*proposed solution, we will try to use random forests also polynomial regression to study the data once with the region column and once without !*

# building up the model and preprocessing of the data !

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Scaler instance with default settings; it is fitted later on the training split.
scaler = MinMaxScaler()

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Features are every column except the last; the target is the last column
# (presumably `charges` - confirm against the CSV column order).
# .copy() gives X its own data, so the column encodings assigned into X later
# modify X only and do not raise SettingWithCopyWarning.
X = df.iloc[:, :-1].copy()
y = df.iloc[:, -1]

In [None]:
def one_hot_encoder_1(x):
    """Binary-encode a yes/no value: 1 for the string 'yes', 0 for anything else."""
    return 1 if x == 'yes' else 0

In [None]:
def one_hot_encoder_2(x):
    """Binary-encode a sex value: 1 for the string 'male', 0 for anything else."""
    return 1 if x == 'male' else 0

In [None]:
# List the distinct values of the region column.
df['region'].unique()

In [None]:
# Encode smoker as 1/0. The values are read from the untouched `df` column
# rather than from X, so re-running this cell gives the same result instead of
# mapping the already-encoded 0/1 values to all zeros.
X['smoker'] = df['smoker'].apply(one_hot_encoder_1)

In [None]:
# Encode sex as 1/0. The values are read from the untouched `df` column
# rather than from X, so re-running this cell gives the same result instead of
# mapping the already-encoded 0/1 values to all zeros.
X['sex'] = df['sex'].apply(one_hot_encoder_2)

In [None]:
# Preview the features after the smoker and sex columns have been encoded.
X.head()

In [None]:
# One-hot encode region (dropping the first level) and append the indicator
# columns to the feature frame.
region_dummies = pd.get_dummies(X['region'], drop_first=True)
X = pd.concat([X, region_dummies], axis=1)

In [None]:
# Remove the original text column now that its indicator columns exist.
X = X.drop(columns='region')

In [None]:
# Preview the final feature frame.
X.head()

In [None]:
# Hold out 25% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=101
)

In [None]:
# Fit the scaler on the training split only, then apply that same scaling to
# both splits so that no information from the test rows enters the fit.
scaler.fit(X_train)
scaled_train = scaler.transform(X_train)
scaled_test = scaler.transform(X_test)

# using linear regression :

In [None]:
from sklearn.linear_model import LinearRegression

# Ordinary least-squares linear regression with default settings.
lr_model = LinearRegression()

**how does linear regression work ?**

* If the data has features $x_{1}, x_{2}, \ldots, x_{n}$, we model them as a vector $\vec{x} = \langle x_{1}, x_{2}, \ldots, x_{n} \rangle$ and fit the data with a hypothesis function $f(\vec{x}) = \theta^{T} \cdot \vec{x} + b$, then tune $\theta$ and $b$ for the best fit!

In [None]:
# Fit the linear model on the scaled training features.
lr_model.fit(scaled_train,y_train)

In [None]:
# Predictions of the linear model on the scaled test features.
preds = lr_model.predict(scaled_test)

In [None]:
# Fitted coefficients, one per (scaled) feature column.
lr_model.coef_

In [None]:
# Fitted intercept term.
lr_model.intercept_

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [None]:
# Root mean squared error of the linear model on the test split.
print(mean_squared_error(y_test,preds)**0.5)

In [None]:
# Mean absolute error of the linear model on the test split.
print(mean_absolute_error(y_test,preds))

In [None]:
# R^2 score of the linear model on the test split.
print(r2_score(y_test,preds))

# Using Polynomial Regression :

In [None]:
from sklearn.preprocessing import PolynomialFeatures

In [None]:
# Expand the features with all polynomial and interaction terms up to degree 3.
poly_transformer = PolynomialFeatures(degree=3)

In [None]:
# Split with the same test size and seed as the linear-regression split, so
# both models are evaluated on the same held-out rows.
X_poly_train, X_poly_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=101
)

In [None]:
# Fit the polynomial expansion on the training split, then expand both splits.
poly_transformer.fit(X_poly_train)
X_poly_train = poly_transformer.transform(X_poly_train)
X_poly_test = poly_transformer.transform(X_poly_test)

In [None]:
# Linear regression that will be fitted on the polynomial-expanded features.
poly_model = LinearRegression()

In [None]:
# Fit on the polynomial-expanded training features.
poly_model.fit(X_poly_train,y_train)

In [None]:
# Predictions of the polynomial model on the expanded test features.
pred = poly_model.predict(X_poly_test)

In [None]:
# Fitted coefficients, one per polynomial feature.
poly_model.coef_

In [None]:
# Fitted intercept term.
poly_model.intercept_

In [None]:
# RMSE of the polynomial model. Uses `pred` (the polynomial predictions), not
# `preds`, which holds the linear model's output.
print(mean_squared_error(y_test,pred)**0.5)

In [None]:
# MAE of the polynomial model. Uses `pred` (the polynomial predictions), not
# `preds`, which holds the linear model's output.
print(mean_absolute_error(y_test,pred))

In [None]:
# R^2 of the polynomial model. Uses `pred` (the polynomial predictions), not
# `preds`, which holds the linear model's output.
print(r2_score(y_test,pred))

**how does polynomial regression work ?**
* *It is very similar to linear regression; we just introduce higher-degree and interaction terms into the data set. Sometimes the presence of two or more features together has a greater effect than each of them alone, so adding these terms (scikit-learn's `PolynomialFeatures` defaults to degree 2; this notebook uses degree 3) helps us fit better!*

# Using Random Forests :

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Sweep the number of trees and record the test RMSE for each forest size.
# A fixed random_state makes every forest, and therefore the whole curve,
# reproducible; without it the results change on each execution.
losses = []
i_vals = []
for i in range(1, 70):
    decision_forest = RandomForestRegressor(n_estimators=i, random_state=101)
    decision_forest.fit(scaled_train, y_train)
    pred = decision_forest.predict(scaled_test)

    i_vals.append(i)
    losses.append(mean_squared_error(y_test, pred) ** 0.5)
    
    

In [None]:
# Test RMSE as a function of forest size; the axes are labelled so the figure
# can be read on its own.
plt.figure(figsize=(10, 8))
plt.plot(i_vals, losses)
plt.xlabel('n_estimators')
plt.ylabel('test RMSE')
plt.title('Random forest: test RMSE vs. number of trees')

In [None]:
# Final forest with 50 trees. The fixed random_state makes the reported
# metrics and feature importances reproducible across runs.
decision_forest = RandomForestRegressor(n_estimators=50, random_state=101)

decision_forest.fit(scaled_train, y_train)

pred = decision_forest.predict(scaled_test)

In [None]:
# Root mean squared error of the random forest on the test split.
print(mean_squared_error(y_test,pred)**0.5)

In [None]:
# Mean absolute error of the random forest on the test split.
print(mean_absolute_error(y_test,pred))

In [None]:
# Pair each importance with its feature name (the forest was trained on the
# scaled columns of X, in the same order) and sort so the most influential
# features come first; the bare array alone does not say which value belongs
# to which column.
pd.Series(decision_forest.feature_importances_, index=X.columns).sort_values(ascending=False)

**Using the above line of code, you can see the relative importance of the features! I will avoid further discussion here; you can explore which features to include and which to leave out.**

**Conclusion: the region column is of importance, hence it cannot be blatantly ignored!**