In [46]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [47]:
# Read the insurance dataset from the working directory into a DataFrame.
df = pd.read_csv('insurance.csv')
# Bare last expression so the notebook renders the frame.
df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


In [48]:
# List the column names of the loaded frame.
df.columns

Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges'], dtype='object')

Checking the number of unique labels for each variable (column)

In [49]:
# Number of distinct values per column (a missing value, if any, counts as
# one label, matching len(unique())).
for col, n_labels in df.nunique(dropna=False).items():
  print(col, ': ', n_labels, 'labels')

age :  47 labels
sex :  2 labels
bmi :  548 labels
children :  6 labels
smoker :  2 labels
region :  4 labels
charges :  1337 labels


Checking the frequency of each label (limited to the 10 most frequent labels) for a particular column (in this case, `sex`)

In [50]:
# Row count per `sex` label, most frequent first, capped at the top 10.
(
    df['sex']
    .value_counts()
    .sort_values(ascending=False)
    .head(10)
)

male      676
female    662
Name: sex, dtype: int64

Collecting the labels into a list, ordered from most to least frequent

In [51]:
# Labels of `sex`, ordered from most to least frequent (at most 10 kept).
sex_cat = (
    df['sex']
    .value_counts()
    .sort_values(ascending=False)
    .head(10)
    .index
    .tolist()
)
sex_cat

['male', 'female']

One-hot encoding the labels (one 0/1 indicator column per label)

In [52]:
# One 0/1 column per label, named after the label itself (no prefix).
for sex_label in sex_cat:
  df[sex_label] = np.where(df['sex'].eq(sex_label), 1, 0)

# Original column next to its indicator columns, first two rows only.
df[['sex', *sex_cat]].head(2)

Unnamed: 0,sex,male,female
0,female,0,1
1,male,1,0


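Wrapping the encoding in a reusable function and applying it to a freshly loaded df, so the indicator columns are added as `<column>_<label>`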

In [53]:
def one_hot_top_x(df, variable, top_x_labels):
  """Add one 0/1 indicator column per requested label, in place.

  For every label in ``top_x_labels`` a column named
  ``<variable>_<label>`` is written to ``df``. It holds 1 on rows where
  ``df[variable]`` equals the label and 0 elsewhere, so a row whose value
  is not among the labels gets 0 in every indicator column.

  Args:
    df: DataFrame to modify. It is mutated; nothing is returned.
    variable: Name of the column to encode.
    top_x_labels: Iterable of labels to create indicator columns for.
      Labels do not have to be strings (e.g. integer categories work).
  """
  for label in top_x_labels:
    # Build the name with an f-string rather than `+`, so a non-string
    # label (such as an integer category) does not raise a TypeError.
    df[f'{variable}_{label}'] = np.where(df[variable] == label, 1, 0)

# Re-reading rebinds `df` to a fresh frame, so the un-prefixed `male` /
# `female` columns added by the earlier encoding cell are not carried over.
df = pd.read_csv(
    'insurance.csv',
    usecols=['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges'],
)

# Adds the sex_<label> indicator columns to df in place.
one_hot_top_x(df, variable='sex', top_x_labels=sex_cat)
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,sex_male,sex_female
0,19,female,27.9,0,yes,southwest,16884.924,0,1
1,18,male,33.77,1,no,southeast,1725.5523,1,0
2,28,male,33.0,3,no,southeast,4449.462,1,0
3,33,male,22.705,0,no,northwest,21984.47061,1,0
4,32,male,28.88,0,no,northwest,3866.8552,1,0


In [54]:
# Labels of `smoker`, ordered from most to least frequent (at most 10 kept).
smoker_cat = (
    df['smoker']
    .value_counts()
    .sort_values(ascending=False)
    .head(10)
    .index
    .tolist()
)
smoker_cat

['no', 'yes']

In [55]:
# Adds the smoker_<label> indicator columns to df in place, then previews.
one_hot_top_x(df, variable='smoker', top_x_labels=smoker_cat)
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,sex_male,sex_female,smoker_no,smoker_yes
0,19,female,27.9,0,yes,southwest,16884.924,0,1,0,1
1,18,male,33.77,1,no,southeast,1725.5523,1,0,1,0
2,28,male,33.0,3,no,southeast,4449.462,1,0,1,0
3,33,male,22.705,0,no,northwest,21984.47061,1,0,1,0
4,32,male,28.88,0,no,northwest,3866.8552,1,0,1,0


In [56]:
# Labels of `region`, ordered from most to least frequent (at most 10 kept).
region_cat = (
    df['region']
    .value_counts()
    .sort_values(ascending=False)
    .head(10)
    .index
    .tolist()
)
region_cat

['southeast', 'southwest', 'northwest', 'northeast']

In [57]:
# Adds the region_<label> indicator columns to df in place, then previews.
one_hot_top_x(df, variable='region', top_x_labels=region_cat)
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,sex_male,sex_female,smoker_no,smoker_yes,region_southeast,region_southwest,region_northwest,region_northeast
0,19,female,27.9,0,yes,southwest,16884.924,0,1,0,1,0,1,0,0
1,18,male,33.77,1,no,southeast,1725.5523,1,0,1,0,1,0,0,0
2,28,male,33.0,3,no,southeast,4449.462,1,0,1,0,1,0,0,0
3,33,male,22.705,0,no,northwest,21984.47061,1,0,1,0,0,0,1,0
4,32,male,28.88,0,no,northwest,3866.8552,1,0,1,0,0,0,1,0


Dropping the original, unencoded categorical columns

In [58]:
# The raw categorical columns are replaced by their indicator columns.
df = df.drop(columns=['sex', 'smoker', 'region'])
df

Unnamed: 0,age,bmi,children,charges,sex_male,sex_female,smoker_no,smoker_yes,region_southeast,region_southwest,region_northwest,region_northeast
0,19,27.900,0,16884.92400,0,1,0,1,0,1,0,0
1,18,33.770,1,1725.55230,1,0,1,0,1,0,0,0
2,28,33.000,3,4449.46200,1,0,1,0,1,0,0,0
3,33,22.705,0,21984.47061,1,0,1,0,0,0,1,0
4,32,28.880,0,3866.85520,1,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...
1333,50,30.970,3,10600.54830,1,0,1,0,0,0,1,0
1334,18,31.920,0,2205.98080,0,1,1,0,0,0,0,1
1335,18,36.850,0,1629.83350,0,1,1,0,1,0,0,0
1336,21,25.800,0,2007.94500,0,1,1,0,0,1,0,0


Rearranging the df so that the dependent variable (`charges`) is the last column

In [59]:
# Features first, target last, so positional slicing can split them later.
df = df[[
    'age', 'bmi', 'children',
    'sex_male', 'sex_female',
    'smoker_no', 'smoker_yes',
    'region_southeast', 'region_southwest', 'region_northwest', 'region_northeast',
    'charges',
]]

In [60]:
# Display the frame to check the new column order.
df

Unnamed: 0,age,bmi,children,sex_male,sex_female,smoker_no,smoker_yes,region_southeast,region_southwest,region_northwest,region_northeast,charges
0,19,27.900,0,0,1,0,1,0,1,0,0,16884.92400
1,18,33.770,1,1,0,1,0,1,0,0,0,1725.55230
2,28,33.000,3,1,0,1,0,1,0,0,0,4449.46200
3,33,22.705,0,1,0,1,0,0,0,1,0,21984.47061
4,32,28.880,0,1,0,1,0,0,0,1,0,3866.85520
...,...,...,...,...,...,...,...,...,...,...,...,...
1333,50,30.970,3,1,0,1,0,0,0,1,0,10600.54830
1334,18,31.920,0,0,1,1,0,0,0,0,1,2205.98080
1335,18,36.850,0,0,1,1,0,1,0,0,0,1629.83350
1336,21,25.800,0,0,1,1,0,0,1,0,0,2007.94500


In [61]:
# Features: every column except the last. Target: the last column.
x = df.iloc[:, :-1].to_numpy()
y = df.iloc[:, -1].to_numpy()

In [62]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=0
)

In [63]:
from sklearn.linear_model import LinearRegression

# NOTE(review): x keeps an indicator column for every level of each
# categorical (e.g. both sex_male and sex_female). Together with the
# intercept these columns are collinear, so the individual dummy
# coefficients are not uniquely determined; consider dropping one level
# per variable.
regressor = LinearRegression().fit(x_train, y_train)
regressor

LinearRegression()

In [64]:
y_pred = regressor.predict(x_test)
np.set_printoptions(precision=2)
# Two columns side by side: actual charges (left), predicted charges (right).
print(np.column_stack((y_test, y_pred)))

[[9.72e+03 1.12e+04]
 [8.55e+03 9.49e+03]
 [4.57e+04 3.82e+04]
 [1.30e+04 1.63e+04]
 [9.64e+03 6.91e+03]
 [4.50e+03 3.96e+03]
 [2.20e+03 1.58e+03]
 [1.14e+04 1.44e+04]
 [7.54e+03 9.01e+03]
 [5.43e+03 7.51e+03]
 [6.75e+03 4.49e+03]
 [1.05e+04 1.03e+04]
 [7.34e+03 8.80e+03]
 [4.19e+03 3.80e+03]
 [1.83e+04 2.79e+04]
 [1.07e+04 1.07e+04]
 [1.25e+04 1.13e+04]
 [3.49e+03 6.11e+03]
 [6.46e+03 8.24e+03]
 [3.35e+04 2.71e+04]
 [2.40e+04 3.36e+04]
 [1.26e+04 1.44e+04]
 [2.30e+04 1.17e+04]
 [2.31e+04 3.21e+04]
 [1.67e+03 4.17e+03]
 [4.67e+03 9.25e+03]
 [3.73e+03 1.08e+03]
 [7.68e+03 9.80e+03]
 [3.76e+03 3.77e+03]
 [8.41e+03 1.04e+04]
 [8.06e+03 9.01e+03]
 [4.90e+04 4.01e+04]
 [1.30e+04 1.57e+04]
 [2.06e+04 1.39e+04]
 [1.46e+04 2.48e+04]
 [4.14e+03 5.17e+03]
 [8.35e+03 1.26e+04]
 [5.12e+04 3.08e+04]
 [4.00e+04 3.35e+04]
 [1.88e+03 3.67e+03]
 [5.46e+03 3.98e+03]
 [2.87e+03 3.99e+03]
 [2.01e+04 3.05e+04]
 [4.75e+04 3.95e+04]
 [3.61e+04 2.78e+04]
 [2.60e+04 5.09e+03]
 [1.97e+04 1.06e+04]
 [6.94e+03 7.

In [65]:
# Single-row prediction; values follow the feature column order of x.
print(regressor.predict([[
    21, 25.800, 0,  # age, bmi, children
    0, 1,           # sex_male, sex_female
    1, 0,           # smoker_no, smoker_yes
    0, 1, 0, 0,     # region_southeast, region_southwest, region_northwest, region_northeast
]]))

[1405.53]


In [66]:
# Single-row prediction; values follow the feature column order of x.
print(regressor.predict([[
    61, 29.070, 0,  # age, bmi, children
    0, 1,           # sex_male, sex_female
    0, 1,           # smoker_no, smoker_yes
    0, 0, 1, 0,     # region_southeast, region_southwest, region_northwest, region_northeast
]]))

[36758.98]


In [67]:
# Intercept term of the fitted linear model.
regressor.intercept_

-517.1368358425534

In [68]:
# Fitted coefficients, one per feature column of x (same order).
regressor.coef_

array([ 2.54e+02,  3.36e+02,  4.37e+02, -7.73e+00,  7.73e+00, -1.18e+04,
        1.18e+04, -4.29e+02, -2.78e+02,  2.24e+02,  4.84e+02])

In [69]:
from sklearn.metrics import mean_squared_error

# Mean squared error of the predictions on the held-out test rows.
mean_squared_error(y_true=y_test, y_pred=y_pred)

31827950.229523823