In [38]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [39]:
insurance = pd.read_csv('insurance.csv')

In [40]:
insurance.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [41]:
insurance.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
age         1338 non-null int64
sex         1338 non-null object
bmi         1338 non-null float64
children    1338 non-null int64
smoker      1338 non-null object
region      1338 non-null object
charges     1338 non-null float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.2+ KB


In [42]:
insurance.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [43]:
gender = np.unique(insurance['sex'])
gender

array(['female', 'male'], dtype=object)

In [44]:
from sklearn.preprocessing import LabelEncoder
genl = LabelEncoder()
sex_labels = genl.fit_transform(insurance['sex'])
sex_mappings = {index: label for index, label in 
                  enumerate(genl.classes_)}
sex_mappings

{0: 'female', 1: 'male'}

In [45]:
insurance['sex'] = sex_labels

In [46]:
smoke = np.unique(insurance['smoker'])
smoke

array(['no', 'yes'], dtype=object)

In [47]:
skl = LabelEncoder()
smoke_labels = skl.fit_transform(insurance['smoker'])
smoke_mappings = {index: label for index, label in enumerate (skl.classes_)}
smoke_mappings

{0: 'no', 1: 'yes'}

In [48]:
insurance['smoker'] = smoke_labels

In [49]:
insurance['smoker'].head()

0    1
1    0
2    0
3    0
4    0
Name: smoker, dtype: int32

In [50]:
insurance.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,southwest,16884.924
1,18,1,33.77,1,0,southeast,1725.5523
2,28,1,33.0,3,0,southeast,4449.462
3,33,1,22.705,0,0,northwest,21984.47061
4,32,1,28.88,0,0,northwest,3866.8552


In [51]:
place = np.unique(insurance['region'])
place

array(['northeast', 'northwest', 'southeast', 'southwest'], dtype=object)

In [52]:
pl = LabelEncoder()
place_labels = pl.fit_transform(insurance['region'])
place_mappings = {index: label for index, label in enumerate (pl.classes_)}
place_mappings

{0: 'northeast', 1: 'northwest', 2: 'southeast', 3: 'southwest'}

In [53]:
insurance['region'] = place_labels

In [54]:
insurance.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,3,16884.924
1,18,1,33.77,1,0,2,1725.5523
2,28,1,33.0,3,0,2,4449.462
3,33,1,22.705,0,0,1,21984.47061
4,32,1,28.88,0,0,1,3866.8552


In [55]:
insurance.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
age         1338 non-null int64
sex         1338 non-null int32
bmi         1338 non-null float64
children    1338 non-null int64
smoker      1338 non-null int32
region      1338 non-null int32
charges     1338 non-null float64
dtypes: float64(2), int32(3), int64(2)
memory usage: 57.6 KB


In [56]:
insurance.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,3,16884.924
1,18,1,33.77,1,0,2,1725.5523
2,28,1,33.0,3,0,2,4449.462
3,33,1,22.705,0,0,1,21984.47061
4,32,1,28.88,0,0,1,3866.8552


In [58]:
copy = insurance.copy()

In [61]:
X = copy.drop(['charges'],axis = 1)

In [64]:
X.head()

Unnamed: 0,age,sex,bmi,children,smoker,region
0,19,0,27.9,0,1,3
1,18,1,33.77,1,0,2
2,28,1,33.0,3,0,2
3,33,1,22.705,0,0,1
4,32,1,28.88,0,0,1


In [65]:
y = copy['charges']

In [66]:
y.head()

0    16884.92400
1     1725.55230
2     4449.46200
3    21984.47061
4     3866.85520
Name: charges, dtype: float64

In [69]:
from sklearn.model_selection import train_test_split
train_X, test_X, train_y, test_y = train_test_split(X,y, test_size = 0.33, random_state = 101)

In [70]:
from sklearn import linear_model
reg = linear_model.LinearRegression()
reg.fit(train_X, train_y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [71]:
from sklearn.metrics import r2_score
score = r2_score(reg.predict(test_X), test_y)
score

0.6615145452227753

In [76]:
import pickle

In [77]:
filename = 'model.pickle'
pickle.dump(reg, open(filename, 'wb'))

In [80]:
loaded_model = pickle.load(open(filename, 'rb'))
prediction = loaded_model.predict(([[30,0,27,2,1,3]]))
print(prediction)

[28161.26233543]
