## Importing the libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [None]:
# Load the insurance data: every column except the last is a feature and
# the last column is the regression target.
# NOTE: X and y are plain arrays taken at this point -- rows removed from
# `dataset` later on are not removed from them.
dataset = pd.read_csv('../input/insurance/insurance.csv')
X, y = dataset.iloc[:, :-1].values, dataset.iloc[:, -1].values

In [None]:
# Quick look at the data: draw a histogram of the dataset's columns.
dataset.hist()

## Checking Duplicates

In [None]:
# Count the rows that are exact duplicates of an earlier row.
dataset.duplicated().sum()

In [None]:
# Drop exact duplicate rows, then rebuild X and y from the cleaned frame.
# The arrays extracted when the CSV was loaded were taken before this step
# and would otherwise still contain the duplicated rows, so the models
# further down would be trained and scored on them.
dataset.drop_duplicates(inplace=True)
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values
# Sanity check: the duplicate count displayed here should now be 0.
dataset.duplicated().sum()

In [None]:
# Scatter plot of BMI against insurance charges.
# The title and axis labels describe BMI, so the x data is taken from the
# 'bmi' column (the original plotted 'age' under BMI labels).
# NOTE(review): assumes the CSV has a column named 'bmi' -- confirm against
# the file header.
plt.scatter(dataset['bmi'], dataset['charges'])
plt.title('Bmi vs Charges')
plt.xlabel('BMI')
plt.ylabel('Charges')
plt.show()

## Encoding Data

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# One-hot encode feature columns 1, 4 and 5 and pass every other column
# through unchanged.
# NOTE(review): presumably these are the categorical columns of the
# insurance data -- confirm the indices against the CSV header.
ct = ColumnTransformer(
    transformers=[('encoder', OneHotEncoder(), [1, 4, 5])],
    remainder='passthrough',
)
X = np.array(ct.fit_transform(X))

In [None]:
# Display the encoded feature matrix to check the result of the transform.
X

## Splitting the dataset into the Training set and Test set

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

## Multiple Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

# Fit an ordinary least-squares model on the training split.
regressor = LinearRegression()
regressor.fit(X=X_train, y=y_train)

In [None]:
# Predict on the held-out set and print the predictions next to the actual
# values (left column: predicted, right column: actual).
y_pred = regressor.predict(X_test)
np.set_printoptions(precision=2)
print(np.column_stack((y_pred, y_test)))

In [None]:
from sklearn.metrics import r2_score

# R-squared of the linear model on the test set; kept in `mlr` and shown as
# the cell output.
mlr = r2_score(y_true=y_test, y_pred=y_pred)
mlr

In [None]:
# Actual test values (x axis) against the linear model's predictions (y axis).
plt.scatter(y_test, y_pred)

## Polynomial regression

In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression

# Expand the training features into polynomial terms up to degree 4, then
# fit a linear model on the expanded matrix.
# NOTE(review): degree 4 is applied to every column of X_train, including
# the one-hot columns -- worth checking for overfitting.
poly_reg = PolynomialFeatures(degree=4)
X_poly = poly_reg.fit_transform(X_train)
regressor_p = LinearRegression()
regressor_p.fit(X=X_poly, y=y_train)

In [None]:
# Predict with the polynomial model (the test features go through the same
# fitted polynomial expansion) and print predicted vs. actual values side
# by side (left column: predicted, right column: actual).
y_pred_p = regressor_p.predict(poly_reg.transform(X_test))
np.set_printoptions(precision=2)
# The columns are built from the arrays themselves, so this cell does not
# depend on `y_pred` from the linear-regression cell for its length.
print(np.column_stack((y_pred_p, y_test)))

In [None]:
# R-squared of the polynomial model on the test set; kept in `pr` and shown
# as the cell output.
pr = r2_score(y_true=y_test, y_pred=y_pred_p)
pr

In [None]:
# Actual test values (x axis) against the polynomial model's predictions (y axis).
plt.scatter(y_test, y_pred_p)

## Decision tree

In [None]:
from sklearn.tree import DecisionTreeRegressor

# Fit a decision-tree regressor on the training split; the fixed seed keeps
# the fitted tree reproducible.
regressor_dt = DecisionTreeRegressor(random_state=0)
regressor_dt.fit(X=X_train, y=y_train)

In [None]:
# Predict with the decision tree and print predicted vs. actual values side
# by side (left column: predicted, right column: actual).
y_pred_dt = regressor_dt.predict(X_test)
np.set_printoptions(precision=2)
# The columns are built from the arrays themselves, so this cell does not
# depend on `y_pred` from the linear-regression cell for its length.
print(np.column_stack((y_pred_dt, y_test)))

In [None]:
# R-squared of the decision tree on the test set; kept in `dt` and shown as
# the cell output.
dt = r2_score(y_true=y_test, y_pred=y_pred_dt)
dt

In [None]:
# Actual test values (x axis) against the decision tree's predictions (y axis).
plt.scatter(y_test, y_pred_dt)

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Fit a random forest of 10 trees on the training split; the fixed seed
# keeps the fitted forest reproducible.
regressor_rf = RandomForestRegressor(n_estimators=10, random_state=0)
regressor_rf.fit(X=X_train, y=y_train)

In [None]:
# Predict with the random forest and print predicted vs. actual values side
# by side (left column: predicted, right column: actual).
y_pred_rf = regressor_rf.predict(X_test)
np.set_printoptions(precision=2)
print(np.column_stack((y_pred_rf, y_test)))

In [None]:
# R-squared of the random forest on the test set; kept in `rf` and shown as
# the cell output.
rf = r2_score(y_true=y_test, y_pred=y_pred_rf)
rf

In [None]:
# Actual test values (x axis) against the random forest's predictions (y axis).
plt.scatter(y_test, y_pred_rf)

## Comparing R-squared values

In [None]:
# Bar chart comparing the test-set R-squared of the four models, one bar
# per model in the order they were fitted.
x_a = [
    'Multiple linear regression',
    'Polynomial regression',
    'Descision tree',
    'Random forest',
]
y_a = [mlr, pr, dt, rf]
plt.bar(x_a, y_a, width=0.5, color="#ff6666")
plt.xlabel('Models')
plt.ylabel('R squared value')
plt.title('R squared of the taken models')
plt.show()