# Regression

In [55]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [56]:
#read data
df = pd.read_csv("insurance_data.csv")

In [57]:
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [58]:
#Check datatype
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [59]:
df.shape

(1338, 7)

In [60]:
#Checking duplicates

print(df.shape)
print(df.duplicated().any())
#df[df.duplicated(keep=False)==True]
#df[df.duplicated()==True]
df.drop_duplicates(keep=False,inplace=True)
print(df.duplicated().any())
print(df.shape)

(1338, 7)
True
False
(1336, 7)


In [61]:
#Checking missing values
df.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [62]:
#Encoding
print('Shape before encoding: ',df.shape)
df = pd.get_dummies(df,drop_first=True)
print('Shape after encoding: ',df.shape)
df.head()

Shape before encoding:  (1336, 7)
Shape after encoding:  (1336, 9)


Unnamed: 0,age,bmi,children,charges,sex_male,smoker_yes,region_northwest,region_southeast,region_southwest
0,19,27.9,0,16884.924,0,1,0,0,1
1,18,33.77,1,1725.5523,1,0,0,1,0
2,28,33.0,3,4449.462,1,0,0,1,0
3,33,22.705,0,21984.47061,1,0,1,0,0
4,32,28.88,0,3866.8552,1,0,1,0,0


In [63]:
#Split
from sklearn.model_selection import train_test_split

df_train, df_test = train_test_split(df, train_size = 0.7, random_state = 100)

In [64]:
#Scaling
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

num_vars=['age','bmi','children']

df_train[num_vars] = scaler.fit_transform(df_train[num_vars])
#show describe function after scaling

In [65]:
#X and Y split
X_train = df_train.drop(['charges'],axis=1)
y_train = df_train['charges']
print('Shape of X: ',X_train.shape)
print('Shape of y: ',y_train.shape)

Shape of X:  (935, 8)
Shape of y:  (935,)


In [66]:
#Modelling
from sklearn.linear_model import LinearRegression

lm = LinearRegression()
lm.fit(X_train, y_train)

r2 = lm.score(X_train,y_train)
print('R^2: {0}'.format(r2))

R^2: 0.7595751781166424


In [67]:
#Test Data

#Scaling
df_test[num_vars] = scaler.transform(df_test[num_vars])

#X and Y split
X_test = df_test.drop(['charges'],axis=1)
y_test = df_test['charges']
print('Shape of X: ',X_test.shape)
print('Shape of y: ',y_test.shape)

Shape of X:  (401, 8)
Shape of y:  (401,)


In [74]:
from sklearn.metrics import mean_squared_error,mean_absolute_error

y_pred=lm.predict(X_test)
mean_squared_error(y_test, y_pred)

37665145.246237

In [69]:
y_pred_train=lm.predict(X_train)
mean_squared_error(y_train, y_pred_train)

36197460.17511876

In [75]:
mean_absolute_error(y_test, y_pred)

4236.1361908202125