# Predicting insurance charges

## Import packages

In [None]:
##Importing the packages
#Data processing packages
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

#Machine Learning packages
import matplotlib.pyplot as plt
from sklearn.linear_model import SGDClassifier, LogisticRegression, LinearRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import confusion_matrix
from sklearn.metrics import explained_variance_score
from sklearn.metrics import mean_absolute_error
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation

#Suppress warnings
import warnings
warnings.filterwarnings('ignore')

## Import data

**Import the file 'insurance.csv'**

In [None]:
# Load the insurance dataset from the notebook's input directory into a DataFrame.
data = pd.read_csv('../input/insurance-charges/insurance.csv')
# Preview the first rows to check that the columns were parsed as expected.
data.head()

**Display Information (info) about the 'data'**

In [None]:
# Print each column's name, dtype and non-null count (used to spot missing data).
data.info()

<font color='blue'>**COMMENTS :** There is no missing data (all the columns have 10,000 rows)</font>

**Display basic statistics (describe) of numerical columns in the 'data'**

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
data.describe()

**Display basic statistics (describe) of categorical columns in the 'data'**

In [None]:
# Restrict to the text (object-dtype) columns, then summarise them:
# count, number of unique values, most frequent value and its frequency.
(
    data
    .select_dtypes(include=['object'])
    .describe(include='all')
)

<font color='blue'>**COMMENTS :** The categorical columns shown above need to be converted to numerical columns</font>

## Convert categorical columns to numerical

**Convert all the categorical fields of 'data' to numerical fields**

In [None]:
# One-hot encode the categorical columns. With no `columns` argument pandas
# converts every object/string/category column and leaves numeric columns as-is.
data = pd.get_dummies(data)

In [None]:
# Preview the encoded frame to see the indicator columns added by get_dummies.
data.head()

<font color='blue'>**COMMENTS :** It can be observed that additional columns are formed after converting from categorical to numerical columns</font>

In [None]:
# (rows, columns) of the encoded frame.
data.shape

<font color='blue'>**COMMENTS :** It can be observed that original 19 columns have now become 581 columns after converting from categorical to numerical columns</font>

## Create feature and target set

**Remove 'charges' column from Feature set(X) and create Target set(y) with 'charges' column**

In [None]:
# Target: the 'charges' column that the models will predict.
y = data['charges']
# Features: every remaining column once the target has been removed.
X = data.drop(columns=['charges'])

In [None]:
# Preview the feature set to confirm 'charges' is no longer among the columns.
X.head()

## Scaling the data values to standardize the range of independent variables

In [None]:
# Standardize every feature (zero mean, unit variance) so that columns measured
# on very different ranges contribute comparably to scale-sensitive models.
# NOTE(review): the scaler is fitted on the full feature set, i.e. before the
# train/test split performed later in this notebook, so test rows influence the
# scaling statistics.
from sklearn.preprocessing import StandardScaler

scale = StandardScaler().fit(X)
X = scale.transform(X)

In [None]:
# X now holds the scaler's output; print it to inspect the standardized values.
print(X)

<font color='blue'>**COMMENTS :** It can be observed that range of values is normalized</font>

## Split the data into "train" and "test" set

**Split the Feature set (X) and Target set (y) into training set (X_train, y_train) and testing set (X_test,y_test)**

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Report the feature and target shapes of each split, one line per split.
for split_name, split_X, split_y in (
    ('Training set', X_train, y_train),
    ('Testing set', X_test, y_test),
):
    print(split_name, split_X.shape, split_y.shape)

<font color='blue'>**COMMENTS :** Total data (10,000 rows) is divided into Training set (8,000 rows) and Testing set (2,000 rows)</font>

## Predicting the insurance charges using Linear Regression ML model

**Create the 'LinearRegression' model**

In [None]:
# Ordinary least-squares linear regression with scikit-learn's default settings.
clf = LinearRegression() 

**Train the model using training set (X_train, y_train)**

In [None]:
# Fit the linear model on the training split only.
clf.fit(X_train, y_train)

**Input the Test Feature set (X_test) and predict the Target values (y_pred)**

In [None]:
# Predict charges for the held-out test features.
y_pred = clf.predict(X_test)

**Lineplot : Reference line : Testing (y_test) vs Testing (y_test) <br>
Scatterplot : Prediction points : Testing (y_test) vs Prediction (y_pred)**

In [None]:
# Actual-vs-predicted diagnostic: the line is the y = x reference (a perfect
# prediction), the red points are the model's predictions on the test set.
fig, ax = plt.subplots()
ax.plot(y_test, y_test)
ax.scatter(y_test, y_pred, s=10, c="red")
ax.set_title("Actual vs Predicted values")
ax.set_xlabel("Actual values")
ax.set_ylabel("Predicted values")
plt.show()

<font color='blue'>**COMMENTS :** The model is not working as per requirement.</font>

**Calculate the Model Score**

In [None]:
# Predictions on the held-out set, kept in 'pred_clf'.
pred_clf = clf.predict(X_test)
# For a regressor, score() is the R^2 on the given data; it is reported below
# as a rounded percentage.
clf_score = clf.score(X_test, y_test)
print(
    "Multiple Linear Regression Model Score is : ",
    round(clf_score*100),
)

<font color='blue'>**COMMENTS :** Model score is not meeting the requirement</font>

## Predicting the insurance charges using Decision Tree Regression ML model

**Create the 'DecisionTreeRegressor' model**

In [None]:
# Decision tree regressor capped at a depth of 17; random_state is fixed so
# repeated runs build the same tree.
clf = DecisionTreeRegressor(random_state=0, max_depth=17)

**Train the model using training set (X_train, y_train)**

In [None]:
# Fit the decision tree on the training split only.
clf.fit(X_train, y_train)

**Input the Test Feature set (X_test) and predict the Target values (y_pred)**

In [None]:
# Predict charges for the held-out test features with the fitted tree.
y_pred = clf.predict(X_test)

**Lineplot : Reference line : Testing (y_test) vs Testing (y_test) <br>
Scatterplot : Prediction points : Testing (y_test) vs Prediction (y_pred)**

In [None]:
# Actual-vs-predicted diagnostic for the decision tree: the line is the y = x
# reference, the red points are the tree's predictions on the test set.
fig, ax = plt.subplots()
ax.plot(y_test, y_test)
ax.scatter(y_test, y_pred, s=10, c="red")
ax.set_title("Actual vs Predicted values")
ax.set_xlabel("Actual values")
ax.set_ylabel("Predicted values")
plt.show()

<font color='blue'>**COMMENTS :** Above figure denotes the relationship between Actual values and Predicted values</font>

**Calculate the Model Score**

In [None]:
# Predictions on the held-out set, kept in 'pred_clf'.
pred_clf = clf.predict(X_test)
# For a regressor, score() is the R^2 on the given data; it is reported below
# as a rounded percentage.
clf_score = clf.score(X_test, y_test)
print(
    "Decision tree  Regression Model Score is : ",
    round(clf_score*100),
)

<font color='blue'>**COMMENTS :** Model score is average</font>

## Parameter Tuning : Optimizing the Decision Tree Regressor ML model

In [None]:
# Sweep max_depth from 1 to 18 and record the mean absolute error on the test
# set for each depth, to see where a deeper tree stops helping.
# NOTE(review): the depths are compared on the test set itself; a separate
# validation split (or cross-validation) would keep the final test score unbiased.
depthList = list(range(1, 19))
maeList = []

for depths in depthList:
    dtr_model = DecisionTreeRegressor(random_state=0, max_depth=depths)
    dtr_model.fit(X_train, y_train)
    preds = dtr_model.predict(X_test)
    maeList.append(mean_absolute_error(y_test, preds))

# Plot with the notebook-wide 'plt' alias rather than importing pyplot a second
# time under a different name.
plt.figure()
plt.plot(depthList, maeList)
plt.title("Mean Absolute Error for different max depth values")
plt.xlabel("Depth")
plt.ylabel("MAE")
plt.show()

<font color='blue'>**COMMENTS :** It can be observed in the above MAE vs depth plot that the MAE is lowest at a depth of around 7 to 8 (max_depth only takes integer values)</font>

## Predicting the insurance charges using Random Forest Regression ML model

**Model = RandomForestRegressor <br>
Train the model using : X_train  and y_train <br>
Test the Model (Predict) using : X_test <br>
Plot the graph of Actual vs Predicted results : y_test, y_pred**

In [None]:
# Random forest regression: fit on the training split, predict the test split,
# plot actual vs predicted values and print the model score.

# Create a Random Forest Regression model (fixed seed for reproducible results)
clf = RandomForestRegressor(random_state=0)

# Train the model using the training sets
clf.fit(X_train, y_train)

# Predict the response for the test dataset
y_pred = clf.predict(X_test)

# Line: y = x reference; red points: predictions against the actual values
plt.figure()
plt.plot(y_test, y_test)
plt.scatter(y_test, y_pred, s=10, c="red")
plt.title("Actual vs Predicted values")
plt.xlabel("Actual values")
plt.ylabel("Predicted values")
plt.show()

# Calculate the model score (R^2 on the test set for a regressor)
clf_score = clf.score(X_test, y_test)
# Reuse the predictions computed above instead of running the whole forest
# over X_test a second time; the fitted model returns the same values.
pred_clf = y_pred
print("Random Forest Regression Model Score is : ", round(clf_score*100))

In [None]:
# Support vector regression: fit on the training split, predict the test split,
# plot actual vs predicted values and print the model score.
from sklearn.svm import SVR

# Create an SVR model with a radial basis function (RBF) kernel.
# kernel='linear' with the same epsilon is the alternative noted by the author.
clf = SVR(kernel='rbf', epsilon=1.0)

# Train the model using the training sets
clf.fit(X_train, y_train)

# Predict the response for the test dataset
y_pred = clf.predict(X_test)

# Line: y = x reference; red points: predictions against the actual values
plt.figure()
plt.plot(y_test, y_test)
plt.scatter(y_test, y_pred, s=10, c="red")
plt.title("Actual vs Predicted values")
plt.xlabel("Actual values")
plt.ylabel("Predicted values")
plt.show()

# Calculate the model score (R^2 on the test set for a regressor)
clf_score = clf.score(X_test, y_test)
# Reuse the predictions computed above instead of evaluating the kernel model
# on X_test a second time; the fitted model returns the same values.
pred_clf = y_pred
print("SVR Model Score is : ", round(clf_score*100))