### Import dataset and libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [None]:
# Read the insurance dataset from the Kaggle input directory and preview the first rows.
csv_path = '/kaggle/input/insurance-prediction/insurance.csv'
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Number of missing entries in each column.
df.isna().sum()

In [None]:
# Descriptive summary statistics of the DataFrame's columns.
df.describe()

### Feature selection

**The dataset doesn't have NaN values. All columns are numerical, so the correlation between them can be checked. The correlation matrix shows that age, bmi and smoker are related to charges, which is our dependent (target) variable.**

In [None]:
import seaborn as sns

# Pairwise correlation between all columns, rendered as an annotated heatmap.
corr = df.corr()

f, ax = plt.subplots(figsize=(10, 7))
sns.heatmap(corr, annot=True, cmap="Blues", ax=ax)

**Exploratory analysis of the dataset**

In [None]:
# Bar plot of charges per smoker group, drawn without error bars.
fig, ax = plt.subplots(figsize=(10, 5))

bar_colors = ["cornflowerblue", "khaki"]
tick_font = {'fontsize': 10, 'fontweight': 'bold'}

sns.barplot(x="smoker", y="charges", data=df, ci=None, palette=bar_colors, ax=ax)
# NOTE(review): the labels assume the first category on the x-axis is the non-smoker group - confirm.
ax.set_xticklabels(['Non smoker', 'Smoker'], fontdict=tick_font)
# Customize the axes and title


In [None]:
# Shared text styling and colours for the three figures in this cell.
fontdict_labels = {'fontsize': 10, 'fontweight': 'bold'}
fontdict_title = {'fontsize': 20, 'fontweight': 'bold'}
bin_palette = ["cornflowerblue", "khaki", "black"]  # one colour per hue bin (3 bins)

# --- Figure 1: BMI vs. charges -------------------------------------------
# Plot the two columns row by row. They must NOT be sorted independently:
# that would pair the i-th smallest BMI with the i-th smallest charge and
# draw an artificial monotone curve instead of the real relationship.
fig, axs = plt.subplots(figsize=(10, 5))
axs.plot(df['bmi'], df['charges'], 'o')
axs.set_title('Simple scatter plot of BMI and insurance cost', fontdict=fontdict_title)
axs.set_xlabel('BMI', fontdict=fontdict_labels)
axs.set_ylabel('Charges', fontdict=fontdict_labels)

# --- Figure 2: age bins split by BMI bins --------------------------------
age_bmi = pd.cut(df['age'], 5)  # 5 equal-width age intervals
bmi = pd.cut(df['bmi'], 3)      # 3 equal-width BMI intervals
fig, axs = plt.subplots(figsize=(10, 5))
sns.countplot(x=age_bmi, hue=bmi, data=df, palette=bin_palette, ax=axs)
axs.set_title("Age distribution and BMI", fontdict=fontdict_title)

# --- Figure 3: age bins split by charge bins -----------------------------
age_charge = pd.cut(df['age'], 4)    # 4 equal-width age intervals
charges = pd.cut(df['charges'], 3)   # 3 equal-width charge intervals
fig, axs = plt.subplots(figsize=(10, 5))
sns.countplot(x=age_charge, hue=charges, data=df, palette=bin_palette, ax=axs)
axs.set_title("Age distribution and insurance charge", fontdict=fontdict_title)
# Remove the top/right spines of the current (last) figure.
sns.despine()


### Model selection

**Split the data into training and testing sets**

In [None]:
from sklearn.model_selection import train_test_split

# Features chosen from the correlation analysis; the target stays a single-column frame.
feature_columns = ['age', 'bmi', 'smoker']
X = df[feature_columns]
y = df[['charges']]

# Hold out 20% of the rows for testing; fixed seed for a reproducible split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

**Scaling the data**

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training features only, then apply that same
# transformation to both splits so the test set does not influence the scaling.
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

**Random forest with k-fold cross-validation**

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score

# Flatten the target to 1-D before fitting.
y_train = np.ravel(y_train)

# Fit a 100-tree random forest and score it on the held-out test set.
regressor = RandomForestRegressor(n_estimators=100, random_state=0)
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)
print("R2: ", round(r2_score(y_test, y_pred), 2))

# 10-fold cross-validation on the training set. With no `scoring` argument,
# cross_val_score uses the estimator's own score method, which for regressors
# is R^2 - not classification accuracy - so it is reported as R^2.
# (The variable keeps its historical name `accuracies`.)
accuracies = cross_val_score(estimator=regressor, X=X_train, y=y_train, cv=10)
print("\nCross-validated R2 (mean): {:.2f}".format(accuracies.mean()))
print("Cross-validated R2 (std): {:.2f}".format(accuracies.std()))

**Decision tree**

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor

# Fit a single decision tree and score it on the held-out test set.
regressor = DecisionTreeRegressor(random_state=0)
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)
np.set_printoptions(precision=2)  # global NumPy print setting; affects later array output
print("R2: ", round(r2_score(y_test, y_pred), 2))

# 10-fold cross-validation on the training set. With no `scoring` argument,
# cross_val_score uses the estimator's own score method, which for regressors
# is R^2 - not classification accuracy - so it is reported as R^2.
# (The variable keeps its historical name `accuracies`.)
accuracies = cross_val_score(estimator=regressor, X=X_train, y=y_train, cv=10)
print("\nCross-validated R2 (mean): {:.2f}".format(accuracies.mean()))
print("Cross-validated R2 (std): {:.2f}".format(accuracies.std()))

**Polynomial regression**

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import PolynomialFeatures

# Expand the features to degree-2 polynomial terms and fit a linear model on them.
poly_reg = PolynomialFeatures(degree=2)
X_poly = poly_reg.fit_transform(X_train)
regressor = LinearRegression()
regressor.fit(X_poly, y_train)

# The test features must go through the same polynomial expansion before predicting.
y_pred = regressor.predict(poly_reg.transform(X_test))
print("R2: ", round(r2_score(y_test, y_pred), 2))

# Cross-validate on the polynomial features (X_poly). Passing the raw X_train
# here would evaluate a plain linear regression instead of the polynomial model.
# With no `scoring` argument the score is the regressor's R^2, not accuracy.
# (The variable keeps its historical name `accuracies`.)
accuracies = cross_val_score(estimator=regressor, X=X_poly, y=y_train, cv=10)
print("\nCross-validated R2 (mean): {:.2f}".format(accuracies.mean()))
print("Cross-validated R2 (std): {:.2f}".format(accuracies.std()))

**Random forest and polynomial regression achieve better scores under k-fold cross-validation. The decision tree also scores well, but its standard deviation across folds is high.**