# Data science process

#### Steps
- Problem definition
- Data acquisition
- Data preparation and EDA (Exploratory Data Analysis)
- Feature Engineering and Feature extraction
- Model planning
- Model building
- Model Evaluation
- Model Deployment

# (1) Problem definition

In [1]:
# Predict Medical Cost based on various personal and lifestyle factors
# Construct regression model that predicts the medical cost using the feature provided

In [2]:
# Explain Linear Regression Model 

# (2) Data acquisition

In [3]:
# Pre Acquired dataset
# Describe all the features (columns) in the dataset
# dataset consists of the following features which are used to predict the "charges"
'age', 'sex', 'bmi', 'children', 'smoker', 'region'
# Dataset consists of 1338 records 

('age', 'sex', 'bmi', 'children', 'smoker', 'region')

In [4]:
# import the required packages

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt 
import seaborn as sns
import plotly.express as px

from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder

import warnings
warnings.filterwarnings("ignore")

In [5]:
# Load the dataset

data=pd.read_csv("insurance.csv")

# Data Exploration

In [6]:
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [7]:
data.columns

Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges'], dtype='object')

In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [9]:
data.shape

(1338, 7)

In [10]:
data.dtypes

age           int64
sex          object
bmi         float64
children      int64
smoker       object
region       object
charges     float64
dtype: object

In [11]:
print("Object type data columns",data.select_dtypes(include='object').shape[1])

Object type data columns 3


In [12]:
# Summary of numeric (int and float) data
data.describe()

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


In [13]:
# Summary of object data
data.describe(include='object')

Unnamed: 0,sex,smoker,region
count,1338,1338,1338
unique,2,2,4
top,male,no,southeast
freq,676,1064,364


In [14]:
# Frequency
data['sex'].value_counts()

male      676
female    662
Name: sex, dtype: int64

In [15]:
# Frequency
data['smoker'].value_counts()

no     1064
yes     274
Name: smoker, dtype: int64

In [16]:
# Duplicate data
# data[data.duplicated()==True]
data[data.duplicated()].sum()

age                19
sex              male
bmi             30.59
children            0
smoker             no
region      northwest
charges     1639.5631
dtype: object

# Day-9 Regression Case study-contd

In [17]:
data['children'].value_counts()

0    574
1    324
2    240
3    157
4     25
5     18
Name: children, dtype: int64

In [18]:
data[data['age']==18]

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
1,18,male,33.770,1,no,southeast,1725.55230
22,18,male,34.100,0,no,southeast,1137.01100
31,18,female,26.315,0,no,northeast,2198.18985
46,18,female,38.665,2,no,northeast,3393.35635
50,18,female,35.625,0,no,northeast,2211.13075
...,...,...,...,...,...,...,...
1296,18,male,26.125,0,no,northeast,1708.92575
1315,18,male,28.310,1,no,northeast,11272.33139
1317,18,male,53.130,0,no,southeast,1163.46270
1334,18,female,31.920,0,no,northeast,2205.98080


In [19]:
data['age'].value_counts()

18    69
19    68
50    29
51    29
47    29
46    29
45    29
20    29
48    29
52    29
22    28
49    28
54    28
53    28
21    28
26    28
24    28
25    28
28    28
27    28
23    28
43    27
29    27
30    27
41    27
42    27
44    27
31    27
40    27
32    26
33    26
56    26
34    26
55    26
57    26
37    25
59    25
58    25
36    25
38    25
35    25
39    25
61    23
60    23
63    23
62    23
64    22
Name: age, dtype: int64

In [20]:
# rename the column
data.rename(columns={'sex':'gender'},inplace=True)

In [21]:
data.head()

Unnamed: 0,age,gender,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [22]:
# rename the column
data.rename(columns={'children':'dependents'},inplace=True)

In [23]:
data.head()

Unnamed: 0,age,gender,bmi,dependents,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [24]:
#Identify the unique values
data['region'].unique()

array(['southwest', 'southeast', 'northwest', 'northeast'], dtype=object)

In [25]:
# Group by gender
data.groupby('gender').agg(['min','max'])['charges']

Unnamed: 0_level_0,min,max
gender,Unnamed: 1_level_1,Unnamed: 2_level_1
female,1607.5101,63770.42801
male,1121.8739,62592.87309


In [26]:
# pivot dependents and gender
data.pivot_table(index='dependents',columns='gender',values='charges')

gender,female,male
dependents,Unnamed: 1_level_1,Unnamed: 2_level_1
0,11905.714276,12832.696736
1,12161.360414,13273.522458
2,13941.317326,16187.095325
3,13865.605066,16789.167419
4,13937.674562,13782.284829
5,9854.006419,7931.65831


# (3) Data Preparation

# Clean the data


In [27]:
# Identify the Missing data under each column
data.isnull()

Unnamed: 0,age,gender,bmi,dependents,smoker,region,charges
0,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...
1333,False,False,False,False,False,False,False
1334,False,False,False,False,False,False,False
1335,False,False,False,False,False,False,False
1336,False,False,False,False,False,False,False


In [28]:
# count the Missing data under each column
data.isnull().sum()

age           0
gender        0
bmi           0
dependents    0
smoker        0
region        0
charges       0
dtype: int64

In [29]:
# Total Missing data 
data.isnull().sum().sum()

0

In [30]:
#data1=pd.read_csv('titanic.csv')

In [31]:
#data1.isnull().sum()

In [32]:
#data1.head(10)

In [33]:
#  total missing vvalues in the dataset
#data1.isnull().sum().sum()

In [34]:
# Correlation
data.head()

Unnamed: 0,age,gender,bmi,dependents,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [35]:
# Correlation
# data_copy=data

In [36]:
# data_copy.drop(['gender','dependents','smoker','region'],inplace=True, axis=1)

In [37]:
# data_copy.corr()

In [38]:
data.head()

Unnamed: 0,age,gender,bmi,dependents,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


# Feature Engineering

In [39]:
# Create a new feature / attribute / column
# Transform the existing feature
# Feature can be numeric, categorical

In [40]:
# Variable Transformation
# Categorical Variable Transformations -Encoding Techniques
# Numeric Variable Transformation - Standardization (mean 0 and standard eviation 1) and Normalization (fix the scale)

In [41]:
# Encoding Techniques: 
# label encoding, one hot encoding, binary encoding, 
# ordinal encoding, hash encoding, Regression encoding, frequency encoding, mean encoding, .....
# for converting data values from categorical to numeric
# Gender Male and Female can be taken as  0 and 1

# data is ordinal mean we use label encoding

In [1]:
from sklearn.preprocessing import LabelEncoder

In [43]:
data.head()

Unnamed: 0,age,gender,bmi,dependents,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [44]:
gender_enc=LabelEncoder()
data['gender']=gender_enc.fit_transform(data['gender'])

In [45]:
data.head()

Unnamed: 0,age,gender,bmi,dependents,smoker,region,charges
0,19,0,27.9,0,yes,southwest,16884.924
1,18,1,33.77,1,no,southeast,1725.5523
2,28,1,33.0,3,no,southeast,4449.462
3,33,1,22.705,0,no,northwest,21984.47061
4,32,1,28.88,0,no,northwest,3866.8552


In [46]:
data.dtypes

age             int64
gender          int32
bmi           float64
dependents      int64
smoker         object
region         object
charges       float64
dtype: object

In [47]:
smoker_enc=LabelEncoder()
data['smoker']=smoker_enc.fit_transform(data['smoker'])

In [48]:
data.dtypes

age             int64
gender          int32
bmi           float64
dependents      int64
smoker          int32
region         object
charges       float64
dtype: object

In [49]:
data.head()

Unnamed: 0,age,gender,bmi,dependents,smoker,region,charges
0,19,0,27.9,0,1,southwest,16884.924
1,18,1,33.77,1,0,southeast,1725.5523
2,28,1,33.0,3,0,southeast,4449.462
3,33,1,22.705,0,0,northwest,21984.47061
4,32,1,28.88,0,0,northwest,3866.8552


In [50]:
region_enc=LabelEncoder()
data['region']=region_enc.fit_transform(data['region'])

In [51]:
data.head()

Unnamed: 0,age,gender,bmi,dependents,smoker,region,charges
0,19,0,27.9,0,1,3,16884.924
1,18,1,33.77,1,0,2,1725.5523
2,28,1,33.0,3,0,2,4449.462
3,33,1,22.705,0,0,1,21984.47061
4,32,1,28.88,0,0,1,3866.8552


In [52]:
data['region'].value_counts()

2    364
3    325
1    325
0    324
Name: region, dtype: int64

In [61]:
# Charges Transformation using log Transformation to bring to same scale
data['charges']=np.log(data['charges'])

In [62]:
data.head()

Unnamed: 0,age,gender,bmi,dependents,smoker,region,charges
0,19,0,27.9,0,1,3,9.734176
1,18,1,33.77,1,0,2,7.453302
2,28,1,33.0,3,0,2,8.400538
3,33,1,22.705,0,0,1,9.998092
4,32,1,28.88,0,0,1,8.260197


In [63]:
#data['charges']=np.exp(data['charges'])

In [64]:
#data.head()

# (4) Model : Linear Regression  (Multiple)

In [65]:
from sklearn.linear_model import LinearRegression

In [66]:
# Independent variables
X=data.drop('charges',axis=1)
X.head()


Unnamed: 0,age,gender,bmi,dependents,smoker,region
0,19,0,27.9,0,1,3
1,18,1,33.77,1,0,2
2,28,1,33.0,3,0,2
3,33,1,22.705,0,0,1
4,32,1,28.88,0,0,1


In [67]:
# dependent variable 
Y=data['charges']
Y.head()

0    9.734176
1    7.453302
2    8.400538
3    9.998092
4    8.260197
Name: charges, dtype: float64

In [68]:
model=LinearRegression()
model.fit(X,Y)

In [72]:
# Regression equation parameters
# age	gender	bmi	dependents	smoker	region (b1,b2,b3,b4,b5,b6)
# intercept b0
# y=b0+b1x1+b2x2+.....b6x6
list(model.coef_)

[0.034655621670689536,
 -0.07534135906195492,
 0.012282576672003276,
 0.10239499732855606,
 1.5498383883584277,
 -0.04760667131383818]

In [70]:
model.intercept_

7.044016346538668

In [None]:
# Construct the Regression equation (home work)

In [71]:
# Prediction

Y_pred=model.predict(X)

In [73]:
Y_pred

array([ 9.45217542,  8.01444045,  8.55632907, ...,  8.02521714,
        7.94585487, 11.01729549])