In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(root, file_name))

In [None]:
# Load the insurance CSV from the Kaggle input directory into a DataFrame.
df = pd.read_csv("/kaggle/input/insurance/insurance.csv")
# Show the first rows as the cell output.
df.head()

## DATA EXPLORATION

In [None]:
# Count the missing values in each column.
df.isna().sum()

There are no missing (NaN) values, which is good.

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

There are 1338 records, each described by seven columns. 

In this project we are trying to predict the charges value. Since charges is a continuous variable, this is a **regression** problem. First we need to prepare the data; after that we will apply some regression techniques.

In [None]:
# Summary statistics for the frame's columns.
df.describe()

A few things stand out:
1. The average age is 39.
2. The average BMI is about 30, which is the threshold for obesity. [Source](https://www.cdc.gov/obesity/adult/defining.html#:~:text=If%20your%20BMI%20is%20less,falls%20within%20the%20obese%20range.)
3. The average number of children is about 1. 

Let's see some statistics about the categorical variables such as region and gender

In [None]:
# Minimum, maximum and mean charges for each region.
charges_by_region = df.groupby('region')['charges']
charges_by_region.agg(['min', 'max', 'mean'])

From the numbers, *southeast* has the highest average, but there is not much difference between the regions. Let's see a boxplot of it.

In [None]:
# Distribution of charges per region, drawn on an explicitly created axes.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x="region", y="charges", data=df, ax=ax)
ax.set_title("Box plot of the Regions")
plt.show()

Apparently there are some higher charges in the southeast, but the median (the line inside each box) looks almost equal across the 4 regions.

Now let's see for the gender.

In [None]:
# Minimum, maximum, mean and standard deviation of charges for each gender.
charges_by_sex = df.groupby('sex')['charges']
charges_by_sex.agg(['min', 'max', 'mean', 'std'])

The mean value is higher for males. It could be a useful feature for us. Let's see a boxplot.

In [None]:
# Distribution of charges per gender, drawn on an explicitly created axes.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x="sex", y="charges", data=df, ax=ax)
ax.set_title("Box plot of the Genders")
plt.show()

We can combine region and sex and see the values in each region for the 2 genders.

In [None]:
# Charge statistics for every (region, gender) combination.
charges_by_region_sex = df.groupby(['region', 'sex'])['charges']
charges_by_region_sex.agg(['min', 'max', 'mean', 'std', 'count'])

The number of people of each gender in each region is almost the same. However, males in the southeast region have the highest mean value of all the groups. There is at least a 3k difference between southeast males and the other regions. 

Other than that, in the northwest region the male mean is lower than the female mean, unlike in the other regions. 

In [None]:
# Charges per region, with one box per gender inside each region.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x="region", y="charges", hue="sex", data=df, ax=ax)
ax.set_title("Box plot of the Regions with each gender")
plt.show()

Let's examine the smoking scores.

In [None]:
# Charge statistics for smokers versus non-smokers.
charges_by_smoker = df.groupby('smoker')['charges']
charges_by_smoker.agg(['min', 'max', 'mean', 'std', 'count'])

Apparently, smokers' charges are higher than non-smokers' charges. Let's see a plot of it.

In [None]:
# Number of rows for each value of the smoker column.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(x="smoker", data=df, ax=ax)
ax.set_title("Smoker Numbers")
plt.show()

In [None]:
# Distribution of charges for each value of the smoker column.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x="smoker", y="charges", data=df, ax=ax)
ax.set_title("Box plot of the Smokers")
plt.show()

Smoking will be a crucial variable in the regression part.

Let's see the count of the smokers in each region

In [None]:
# Charge statistics (including group sizes) for every (region, smoker) pair.
charges_by_region_smoker = df.groupby(['region', 'smoker'])['charges']
charges_by_region_smoker.agg(['min', 'max', 'mean', 'std', 'count'])

In [None]:
# Charges per region, with one box per smoker value inside each region.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x="region", y="charges", hue="smoker", data=df, ax=ax)
ax.set_title("Box plot of the Regions")
plt.show()

The southeast has more smokers than the other regions. The number of smokers might be the reason why the southeast has the highest average charges.

Now let's examine the children factor

In [None]:
# Summary statistics for the children column.
df['children'].describe()

In [None]:
# How many rows there are for each distinct number of children.
df['children'].value_counts()

Nearly half of the families have no children, and the average number of children is about 1.1 (the 1.20 in the summary above is the standard deviation, not the mean). 

In [None]:
# sns.distplot is deprecated in seaborn. A density-normalised histogram with a
# KDE overlay (histplot) is the documented replacement that keeps the same look.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['children'], stat="density", kde=True, kde_kws={"cut": 3}, ax=ax)
ax.set_title("Distribution of Children")
plt.show()

In [None]:
# Charge statistics grouped by the number of children.
charges_by_children = df.groupby('children')['charges']
charges_by_children.agg(['min', 'max', 'mean', 'std', 'count'])

Apparently the highest average charges are in families with 2 or 3 children.

In [None]:
# Distribution of charges for each number of children.
# The title previously said "Regions" (copied from an earlier cell) although
# the x-axis is the children column.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x="children", y="charges", data=df, ax=ax)
ax.set_title("Box plot of the Children")
plt.show()

Now let's see the Age values.

In [None]:
# sns.distplot is deprecated in seaborn. A density-normalised histogram with a
# KDE overlay (histplot) is the documented replacement that keeps the same look.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['age'], stat="density", kde=True, kde_kws={"cut": 3}, ax=ax)
ax.set_title("Distribution of Age")
plt.show()

In [None]:
# Summary statistics for the age column.
df['age'].describe()

In [None]:
# Scatter of charges against age.
# The title previously described a gender box plot, which did not match what
# this cell draws.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x="age", y="charges", data=df, ax=ax)
ax.set_title("Scatter plot of Age and Charges")
plt.show()

It looks like there is a correlation between age and charges.

Let's see the gender age values.

In [None]:
# Age statistics (and group sizes) for each gender.
age_by_sex = df.groupby(['sex'])['age']
age_by_sex.agg(['min', 'max', 'mean', 'count'])

The mean values are basically the same. Let's see a boxplot of age for each gender.

In [None]:
# Distribution of age for each gender.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x="sex", y="age", data=df, ax=ax)
ax.set_title("Box plot of the Genders based on Age")
plt.show()

Let's check the bmi

In [None]:
# Summary statistics for the bmi column.
df['bmi'].describe()

In [None]:
# Scatter of charges against BMI.
# The title previously said "Box plot of the Regions", which did not match
# what this cell draws.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x="bmi", y="charges", data=df, ax=ax)
ax.set_title("Scatter plot of BMI and Charges")
plt.show()

Let's try something else. Let's split people into obese and non-obese groups based on their BMI.

In [None]:
# Flag obese people. The CDC definition linked earlier in this notebook treats
# a BMI of 30.0 or higher as obese, so the threshold itself is included
# (the previous strict `>` left out people with a BMI of exactly 30).
df['obesite'] = df['bmi'] >= 30
df['obesite'].value_counts()

Roughly half of the people are obese.

In [None]:
# Distribution of charges for each value of the obesite flag.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x="obesite", y="charges", data=df, ax=ax)
ax.set_title("Box plot of the Obesite based on charges")
plt.show()

Charges for obese people are higher than for non-obese people.

In [None]:
# Charge statistics (and group sizes) for each value of the obesite flag.
charges_by_obesite = df.groupby('obesite')['charges']
charges_by_obesite.agg(['min', 'max', 'mean', 'count'])

Let's see obesite values in each gender

In [None]:
# Charge statistics for every (obesite, gender) combination.
charges_by_obesite_sex = df.groupby(['obesite', 'sex'])['charges']
charges_by_obesite_sex.agg(['min', 'max', 'mean', 'count'])

Male and female obesity counts are nearly the same. Obesity could be an important factor. 

What we have learned so far:
* Smoking is a crucial factor in insurance charges.
* There is not much difference between regions.
* Generally, males' charges are higher than females' charges.
* Families with 2 or 3 children have the highest average charges.
* Obesity is a crucial factor.

In [None]:
# Correlation matrix of the numeric (and boolean) columns only.
# At this point sex, smoker and region still hold strings; recent pandas
# versions raise on them in DataFrame.corr() instead of silently dropping
# them, so the non-numeric columns are excluded explicitly.
corr = df.select_dtypes(include=["number", "bool"]).corr()
ax = sns.heatmap(corr, annot=True)

## DATA PREPROCESSING

In [None]:
from sklearn.preprocessing import LabelEncoder

# Columns whose values are replaced by integer codes.
columns = ['sex', 'region', 'smoker', 'obesite']

# Each column gets its own freshly fitted encoder; the encoded values
# overwrite the original column in df.
for name in columns:
    df[name] = LabelEncoder().fit_transform(df[name])

df.head()

Let's divide our features and labels

In [None]:
# Target: the charges column. Features: everything except charges and bmi.
y = df['charges']
X = df.drop(['charges', 'bmi'], axis=1)

print("X's shape", X.shape)
print("y's shape", y.shape)

Let's standardize our data.

In [None]:
from sklearn.preprocessing import scale

# NOTE(review): scaling here uses all rows of X, and the train/test split
# happens in a later cell, so the test rows contribute to the scaling too.
columns = X.columns

# Standardize the features one column at a time, printing each name as we go.
for feature in columns:
    print(feature)
    X[feature] = scale(X[feature])

X.head()
    

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print(f"There are {len(X_train)} training examples")
print(f"There are {len(X_test)} test examples")

### MODEL TRAINING

In [None]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression on the training rows (fit returns the estimator),
# then predict charges for the held-out test rows.
model = LinearRegression().fit(X_train, y_train)
predictions = model.predict(X_test)

Let's see some stats

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

# Score the test-set predictions against the true charges.
r2 = r2_score(y_test, predictions)
mse = mean_squared_error(y_test, predictions)

print(f"R2 score is {r2}")
print(f"Mean Squared Error score is {mse}")

The R2 score is about 0.78, which means the model explains roughly 78% of the variance in the actual charges on the test set. Let's see the plot.

In [None]:
# Predicted charges (x) against actual test charges (y).
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x=predictions, y=y_test, ax=ax)
ax.set_title("Scatter plot of Predictions and Actual Values")
plt.show()

If you find this kernel useful, please upvote. I will try to improve the model performance in the coming days. 

Stay safe.