In [3]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split

# Loading Heart Dataset

In [2]:
# Read the heart-disease table; the relative path is resolved against the
# current working directory.
df = pd.read_csv('heart_disease.csv')
# Preview five random rows. random_state pins the draw so that a
# Restart & Run All shows the same rows every time.
df.sample(5, random_state=0)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
162,41,1,1,120,157,0,1,182,0,0.0,2,0,2,1
54,63,0,2,135,252,0,0,172,0,0.0,2,0,2,1
159,56,1,1,130,221,0,0,163,0,0.0,2,0,3,1
118,46,0,1,105,204,0,1,172,0,0.0,2,0,2,1
82,60,0,2,102,318,0,1,160,0,0.0,2,1,2,1


# Feature descriptions
### Below are the features present in the dataset, grouped by type (binary, categorical, ordinal, numeric)

# Binary
### sex (0 = female; 1 = male)
### fbs: Fasting blood sugar > 120 mg/dl (1 = true; 0 = false)
### exang: Exercise induced angina (0 = no; 1 = yes)
# Categorical
### cp: Chest pain type (0 = Asymptomatic angina; 1 = Atypical angina; 2 = Non-angina; 3 = Typical angina)
### restecg: Resting ECG (0 = Left ventricular hypertrophy; 1 = Normal; 2 = ST-T wave abnormality)
### slope: Slope of the peak exercise ST segment (0 = downsloping; 1 = upsloping; 2 = flat)
### thal: Thallium stress test result (0 = NA; 1 = Fixed defect; 2 = Normal; 3 = Reversible defect)
# Ordinal
### ca: number of major vessels (0-3) colored by fluoroscopy (note: this file also contains the value 4; see the `max` row of `df.describe()` below)
# Numeric
### age
### oldpeak: ST depression induced by exercise relative to rest
### trestbps: Resting blood pressure
### chol: Serum cholesterol in mg/dl
### thalach: Maximum heart rate achieved during thallium stress test
# Target
### target: 1 = heart disease; 0 = no heart disease

In [3]:
# Column names grouped by measurement type, following the feature
# descriptions in the markdown above.

# Binary (0/1) flags.
bins = [
    "sex",
    "fbs",
    "exang",
]

# Unordered categorical codes.
cats = [
    "cp",
    "restecg",
    "slope",
    "thal",
]

# Ordinal counts.
ords = [
    "ca",
]

# Numeric measurements.
nums = [
    "age",
    "oldpeak",
    "trestbps",
    "chol",
    "thalach",
]

# Label column, kept as a one-element list like the groups above.
target = [
    "target",
]

In [4]:
# Replace the integer codes of the categorical columns with readable labels
# to make data exploration easier. This is a relabelling, not one-hot
# encoding: each column stays a single column and only its values change.
# Map the labels back to numeric codes before model training and testing.
code_to_label = {
    'cp': {0: 'Asympt.', 1: 'Atypical', 2: 'Non', 3: 'Typical'},
    'restecg': {0: 'LV hyper', 1: 'Normal', 2: 'ST-T wave'},
    'slope': {0: 'down', 1: 'up', 2: 'flat'},
    'thal': {0: 'NA', 1: 'Fixed', 2: 'Normal', 3: 'Revers.'},
}

# Bracket assignment is used rather than attribute assignment (df.cp = ...),
# which only works for columns that already exist and have identifier-like
# names. Re-running the cell is harmless: once the codes are replaced, no
# integer key matches any of the string values.
for column_name, label_map in code_to_label.items():
    df[column_name] = df[column_name].replace(label_map)

In [5]:
# Summary statistics for the numeric columns. cp, restecg, slope and thal
# hold string labels after the relabelling cell, so describe() leaves them
# out by default.
df.describe()

Unnamed: 0,age,sex,trestbps,chol,fbs,thalach,exang,oldpeak,ca,target
count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0
mean,54.366337,0.683168,131.623762,246.264026,0.148515,149.646865,0.326733,1.039604,0.729373,0.544554
std,9.082101,0.466011,17.538143,51.830751,0.356198,22.905161,0.469794,1.161075,1.022606,0.498835
min,29.0,0.0,94.0,126.0,0.0,71.0,0.0,0.0,0.0,0.0
25%,47.5,0.0,120.0,211.0,0.0,133.5,0.0,0.0,0.0,0.0
50%,55.0,1.0,130.0,240.0,0.0,153.0,0.0,0.8,0.0,1.0
75%,61.0,1.0,140.0,274.5,0.0,166.0,1.0,1.6,1.0,1.0
max,77.0,1.0,200.0,564.0,1.0,202.0,1.0,6.2,4.0,1.0


# Load Iris Dataset

# Feature descriptions
### The columns in this dataset are:

### (no Id column in this file; rows are identified by the default integer index)
### sepal_length
### sepal_width
### petal_length
### petal_width
### species

In [4]:
# Read the Iris table; the relative path is resolved against the current
# working directory.
df_iris = pd.read_csv('iris_dataset.csv')

In [5]:
# Summary statistics for the four numeric measurement columns; the
# string-valued species column is excluded by describe()'s defaults.
df_iris.describe()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,105.0,105.0,105.0,105.0
mean,5.873333,3.050476,3.785714,1.204762
std,0.862941,0.454068,1.782793,0.778853
min,4.3,2.0,1.1,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.2,1.3
75%,6.4,3.3,5.1,1.9
max,7.9,4.4,6.9,2.5


In [6]:
# Show the first five rows (head()'s default) to check how the values look.
df_iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,2.5,3.0,1.1,Iris-versicolor
1,6.2,2.2,4.5,1.5,Iris-versicolor
2,5.1,3.8,1.5,0.3,Iris-setosa
3,6.8,3.2,5.9,2.3,Iris-virginica
4,5.7,2.8,4.1,1.3,Iris-versicolor


In [7]:
# List the column labels of the Iris frame.
df_iris.columns

Index(['sepal_length', 'sepal_width', 'petal_length', 'petal_width',
       'species'],
      dtype='object')