In [1]:
import pandas as pd
import numpy as np

In [2]:
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
df = pd.read_csv("insurance.csv")

In [4]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [5]:
df.shape

(1338, 7)

In [6]:
df.isna().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [7]:
df.duplicated().sum()

1

In [8]:
df.drop_duplicates(inplace=True)

In [9]:
df.duplicated().sum()

0

In [10]:
df['age'][df['age'] == 0].count()

0

In [11]:
df['sex'].unique()

array(['female', 'male'], dtype=object)

In [12]:
df['bmi'][df['bmi'] == 0].count()

0

In [13]:
df['children'].unique()

array([0, 1, 3, 2, 5, 4], dtype=int64)

In [14]:
df['smoker'].unique()

array(['yes', 'no'], dtype=object)

In [15]:
df['region'].unique()

array(['southwest', 'southeast', 'northwest', 'northeast'], dtype=object)

In [16]:
df['charges'][df['charges'] == 0].count()

0

In [17]:
gender = pd.get_dummies(df['sex'], prefix=['gender'], drop_first=True)

In [18]:
gender

Unnamed: 0,['gender']_male
0,0
1,1
2,1
3,1
4,1
...,...
1333,1
1334,0
1335,0
1336,0


In [19]:
smoker = pd.get_dummies(df['smoker'], prefix='smoker', drop_first=True)

In [20]:
smoker

Unnamed: 0,smoker_yes
0,1
1,0
2,0
3,0
4,0
...,...
1333,0
1334,0
1335,0
1336,0


In [21]:
region = pd.get_dummies(df.region, prefix='region', drop_first=False)

In [22]:
region

Unnamed: 0,region_northeast,region_northwest,region_southeast,region_southwest
0,0,0,0,1
1,0,0,1,0
2,0,0,1,0
3,0,1,0,0
4,0,1,0,0
...,...,...,...,...
1333,0,1,0,0
1334,1,0,0,0
1335,0,0,1,0
1336,0,0,0,1


In [23]:
df.columns

Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges'], dtype='object')

In [24]:
df = df[['age', 'bmi', 'children', 'charges']]

In [None]:
df.head()

In [None]:
df = pd.concat((df, gender), axis=1)

In [None]:
df.head()

In [None]:
df = pd.concat((df, region), axis=1)

In [None]:
df.head()

In [None]:
df = pd.concat((df, smoker), axis=1)

In [None]:
df.head()

In [None]:
df.dtypes

In [None]:
df.describe()

In [None]:
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(df.corr(), annot=True, fmt='.2g', cmap='viridis', cbar=True)

In [None]:
plt.style.use('seaborn')
fig, ax = plt.subplots(figsize=(5, 5))
sns.histplot(x = df.charges, color='purple', kde=True)

In [None]:
plt.style.use('seaborn')
fig, ax = plt.subplots(figsize=(5, 5))
sns.histplot(x = df.age, color='purple', kde=True)

In [None]:
plt.style.use('seaborn')
fig, ax = plt.subplots(figsize=(5, 5))
sns.histplot(x = df.bmi, color='purple', kde=True)

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [None]:
X = df.drop('charges', axis=1)
y = df['charges']

In [None]:
X.head()

In [None]:
y.head()

In [None]:
sc = StandardScaler()

In [None]:
x_scale = sc.fit_transform(X)
x_scale

In [None]:
x_scale[:]

In [None]:
plt.style.use('seaborn')
fig, ax = plt.subplots(figsize=(5, 5))
sns.histplot(x = x_scale[1], color='purple', kde=True)

In [None]:
x_train, x_test, y_train, y_test = train_test_split(x_scale, y, test_size=0.25, random_state=322)

In [None]:
liner_model = LinearRegression()

In [None]:
liner_model.fit(x_train,y_train)

In [None]:
liner_model.score(x_test, y_test)

In [None]:
liner_model.score(x_train,y_train)

In [None]:
prediction = liner_model.predict(x_test)

In [None]:
sns.distplot(y_test-prediction)

In [None]:
plt.scatter(y_test, prediction)

In [None]:
from sklearn import metrics

In [None]:
"MAE: {}".format(metrics.mean_absolute_error(y_test, prediction))

In [None]:
"MAE: {}".format(metrics.mean_squared_error(y_test, prediction))

In [None]:
"MAE: {}".format(np.sqrt(metrics.mean_squared_error(y_test, prediction)))

In [None]:
import pickle

In [None]:
file = open('liner_model.pkl', 'wb')
pickle.dump(liner_model, file)