# Data science process

#### Steps
- Problem definition
- Data acquisition
- Data preparation and EDA (Exploratory Data Analysis)
- Feature Engineering and Feature extraction
- Model planning
- Model building
- Model Evaluation
- Model Deployment

# (1) Problem definition

In [None]:
# Predict Medical Cost based on various personal and lifestyle factors
# Construct regression model that predicts the medical cost using the feature provided

In [None]:
# Explain Linear Regression Model 

# (2) Data acquisition

In [None]:
# Pre Acquired dataset
# Describe all the features (columns) in the dataset
# dataset consists of the following features which are used to predict the "charges"
'age', 'sex', 'bmi', 'children', 'smoker', 'region'
# Dataset consists of 1338 records 

In [1]:
# import the required packages

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt 
import seaborn as sns
import plotly.express as px

from sklearn.linear_model import LinearRegression


import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the dataset

data=pd.read_csv("insurance.csv")

# Data Exploration

In [4]:
data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [5]:
data.columns

Index(['age', 'sex', 'bmi', 'children', 'smoker', 'region', 'charges'], dtype='object')

In [6]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [7]:
data.shape

(1338, 7)

In [10]:
data.dtypes

age           int64
sex          object
bmi         float64
children      int64
smoker       object
region       object
charges     float64
dtype: object

In [13]:
print("Object type data columns",data.select_dtypes(include='object').shape[1])

Object type data columns 3


In [15]:
# Summary of numeric (int and float) data
data.describe()

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


In [16]:
# Summary of object data
data.describe(include='object')

Unnamed: 0,sex,smoker,region
count,1338,1338,1338
unique,2,2,4
top,male,no,southeast
freq,676,1064,364


In [18]:
# Frequency
data['sex'].value_counts()

male      676
female    662
Name: sex, dtype: int64

In [22]:
# Frequency
data['smoker'].value_counts()

no     1064
yes     274
Name: smoker, dtype: int64

In [23]:
# Duplicate data
# data[data.duplicated()==True]
data[data.duplicated()].sum()

age                19
sex              male
bmi             30.59
children            0
smoker             no
region      northwest
charges     1639.5631
dtype: object

# Day-9 Regression Case study-contd

In [8]:
data['children'].value_counts()

0    574
1    324
2    240
3    157
4     25
5     18
Name: children, dtype: int64

In [12]:
data[data['age']==18]

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
1,18,male,33.770,1,no,southeast,1725.55230
22,18,male,34.100,0,no,southeast,1137.01100
31,18,female,26.315,0,no,northeast,2198.18985
46,18,female,38.665,2,no,northeast,3393.35635
50,18,female,35.625,0,no,northeast,2211.13075
...,...,...,...,...,...,...,...
1296,18,male,26.125,0,no,northeast,1708.92575
1315,18,male,28.310,1,no,northeast,11272.33139
1317,18,male,53.130,0,no,southeast,1163.46270
1334,18,female,31.920,0,no,northeast,2205.98080


In [9]:
data['age'].value_counts()

18    69
19    68
50    29
51    29
47    29
46    29
45    29
20    29
48    29
52    29
22    28
49    28
54    28
53    28
21    28
26    28
24    28
25    28
28    28
27    28
23    28
43    27
29    27
30    27
41    27
42    27
44    27
31    27
40    27
32    26
33    26
56    26
34    26
55    26
57    26
37    25
59    25
58    25
36    25
38    25
35    25
39    25
61    23
60    23
63    23
62    23
64    22
Name: age, dtype: int64

In [13]:
# rename the column
data.rename(columns={'sex':'gender'},inplace=True)

In [14]:
data.head()

Unnamed: 0,age,gender,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [15]:
# rename the column
data.rename(columns={'children':'dependents'},inplace=True)

In [16]:
data.head()

Unnamed: 0,age,gender,bmi,dependents,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [17]:
#Identify the unique values
data['region'].unique()

array(['southwest', 'southeast', 'northwest', 'northeast'], dtype=object)

# (3) Data Preparation

# Clean the data


In [None]:
# Identify the Missing data