In [1]:
import pandas as pd 
df = pd.read_csv('./data/heart.csv')

# Preprocessing

In [2]:
# Show data info
print(df.head())
print(df.info())

# Show null values each column
df.isnull().sum()

   age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  slope  \
0   52    1   0       125   212    0        1      168      0      1.0      2   
1   53    1   0       140   203    1        0      155      1      3.1      0   
2   70    1   0       145   174    0        1      125      1      2.6      0   
3   61    1   0       148   203    0        1      161      0      0.0      2   
4   62    0   0       138   294    1        1      106      0      1.9      1   

   ca  thal  target  
0   2     3       0  
1   0     3       0  
2   0     3       0  
3   1     3       0  
4   3     2       0  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1025 entries, 0 to 1024
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1025 non-null   int64  
 1   sex       1025 non-null   int64  
 2   cp        1025 non-null   int64  
 3   trestbps  1025 non-null   int64  
 4   chol      1025 non-null   int64  
 

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

## Category columns 
As in the cell above, we see that all the columns in df are all numerical values. But when we look closer, columns like ```['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']```
are categorical so we must apply strategies to handle it.


**Why?** Because the values ​​in these column have no ordered division, for example, gender cannot be `1` for male and `0` for female

**Strategy**: flatten it out using ```sklearn.preprocessing.OneHotEncoder``` or ```pandas.get_dummies```

In [3]:
category_columns = ['sex', 'cp', 'fbs', 'restecg', 'exang', 'slope', 'ca', 'thal']
df = pd.get_dummies(df, columns=category_columns)

df.head()

Unnamed: 0,age,trestbps,chol,thalach,oldpeak,target,sex_0,sex_1,cp_0,cp_1,...,slope_2,ca_0,ca_1,ca_2,ca_3,ca_4,thal_0,thal_1,thal_2,thal_3
0,52,125,212,168,1.0,0,False,True,True,False,...,True,False,False,True,False,False,False,False,False,True
1,53,140,203,155,3.1,0,False,True,True,False,...,False,True,False,False,False,False,False,False,False,True
2,70,145,174,125,2.6,0,False,True,True,False,...,False,True,False,False,False,False,False,False,False,True
3,61,148,203,161,0.0,0,False,True,True,False,...,True,False,True,False,False,False,False,False,False,True
4,62,138,294,106,1.9,0,True,False,True,False,...,False,False,False,False,True,False,False,False,True,False


## Numerical columns
Some columns ```['age', 'trestbps', 'chol', 'thalach', 'oldpeak']``` is numerical columns need to be scaled.

**Why?**For treating all features equally and is not unduly influenced by features with large values ​​or different scales.

**Strategy**: Scaling using ```sklearn.preprocessing.StandardScaler```

In [4]:
from sklearn.preprocessing import StandardScaler

standardScaler         = StandardScaler()
columns_to_scale       = ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']
df[columns_to_scale] = standardScaler.fit_transform(df[columns_to_scale])

df.head()

Unnamed: 0,age,trestbps,chol,thalach,oldpeak,target,sex_0,sex_1,cp_0,cp_1,...,slope_2,ca_0,ca_1,ca_2,ca_3,ca_4,thal_0,thal_1,thal_2,thal_3
0,-0.268437,-0.377636,-0.659332,0.821321,-0.060888,0,False,True,True,False,...,True,False,False,True,False,False,False,False,False,True
1,-0.158157,0.479107,-0.833861,0.255968,1.727137,0,False,True,True,False,...,False,True,False,False,False,False,False,False,False,True
2,1.716595,0.764688,-1.396233,-1.048692,1.301417,0,False,True,True,False,...,False,True,False,False,False,False,False,False,False,True
3,0.724079,0.936037,-0.833861,0.5169,-0.912329,0,False,True,True,False,...,True,False,True,False,False,False,False,False,False,True
4,0.834359,0.364875,0.930822,-1.874977,0.705408,0,True,False,True,False,...,False,False,False,False,True,False,False,False,True,False


## Spliting
Using ```train_test_split``` with rate 0.8/0.2.

In [5]:
from sklearn.model_selection import train_test_split

y = df['target']
X = df.drop(['target'], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    train_size = 0.8,
                                                    test_size=0.2,
                                                    random_state=0)


# Predicting

## 1.KNN 


In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


model = RandomForestClassifier(n_estimators=100)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("accuracy_score" , accuracy_score(y_test, y_pred)*100, "%")

accuracy_score 100.0 %


## 2.SVM

In [7]:
#SVM
from sklearn.svm import SVC
model = SVC()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("accuracy_score" , accuracy_score(y_test, y_pred)*100, "%")

accuracy_score 96.58536585365853 %


## 3.Descision Tree Regressor

In [8]:
#Decision Tree
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("accuracy_score" , accuracy_score(y_test, y_pred)*100, "%")


accuracy_score 100.0 %


# Random Forest

In [9]:
# Random forest 
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=100)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("accuracy_score" , accuracy_score(y_test, y_pred)*100, "%")

accuracy_score 100.0 %
