# **Import Libraries**

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
kyphosis_df = pd.read_csv('kyphosis.csv')

In [3]:
kyphosis_df.head()

Unnamed: 0,Kyphosis,Age,Number,Start
0,absent,71,3,5
1,absent,158,3,14
2,present,128,4,5
3,absent,2,5,1
4,absent,1,4,15


In [4]:
kyphosis_df.tail()

Unnamed: 0,Kyphosis,Age,Number,Start
76,present,157,3,13
77,absent,26,7,13
78,absent,120,2,13
79,present,42,7,6
80,absent,36,4,13


In [5]:
kyphosis_df.describe()

Unnamed: 0,Age,Number,Start
count,81.0,81.0,81.0
mean,83.654321,4.049383,11.493827
std,58.104251,1.619423,4.883962
min,1.0,2.0,1.0
25%,26.0,3.0,9.0
50%,87.0,4.0,13.0
75%,130.0,5.0,16.0
max,206.0,10.0,18.0


1달~206/12=17.2살까지

In [6]:
kyphosis_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81 entries, 0 to 80
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Kyphosis  81 non-null     object
 1   Age       81 non-null     int64 
 2   Number    81 non-null     int64 
 3   Start     81 non-null     int64 
dtypes: int64(3), object(1)
memory usage: 2.7+ KB


# **Visualize Dataset**

In [7]:
#kyphosis_df['Kyphosis']의 내용으로 그래프를 그리고, y축 기준은 count로 지정
sns.countplot(kyphosis_df['Kyphosis'], label = 'Count');

In [8]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
#LabelEncoder는 문자열을 숫자로 바꾸어서 저장해주는 Encoder의 역할을 함
LabelEncoder_y = LabelEncoder()

kyphosis_df['Kyphosis'] = LabelEncoder_y.fit_transform(kyphosis_df['Kyphosis'])

In [9]:
kyphosis_df.head()

Unnamed: 0,Kyphosis,Age,Number,Start
0,0,71,3,5
1,0,158,3,14
2,1,128,4,5
3,0,2,5,1
4,0,1,4,15


In [10]:
kyphosis_df.tail()

Unnamed: 0,Kyphosis,Age,Number,Start
76,1,157,3,13
77,0,26,7,13
78,0,120,2,13
79,1,42,7,6
80,0,36,4,13


In [11]:
#수술 후에 척추 후만증에 걸리지 않은 사람
kyphosis_False = kyphosis_df[kyphosis_df['Kyphosis']==0]

In [12]:
kyphosis_False.head()

Unnamed: 0,Kyphosis,Age,Number,Start
0,0,71,3,5
1,0,158,3,14
3,0,2,5,1
4,0,1,4,15
5,0,1,2,16


In [13]:
print("Disease absent after operation percentage = ", (len(kyphosis_False)/len(kyphosis_df))*100, '%')

Disease absent after operation percentage =  79.01234567901234 %


In [14]:
kyphosis_True = kyphosis_df[kyphosis_df['Kyphosis']==1]

In [15]:
print("Disease present after operation percentage = ", (len(kyphosis_True)/len(kyphosis_df))*100, '%')

Disease present after operation percentage =  20.98765432098765 %


In [16]:
 #Correlation
 #annot는 각 상자 안에 값을 표기
 sns.heatmap(kyphosis_df.corr(), annot = True)

<AxesSubplot:>

1열을 보면, 각 변수간의 상관관계를 볼 수 있다

In [17]:
#Visualize 
#hue는 색깔 구분, vars는 나타낼 변수
sns.pairplot(kyphosis_df, hue = 'Kyphosis', vars = ['Age','Number','Start'])

<seaborn.axisgrid.PairGrid at 0x28b81796d60>

# **Testing and Training dataset**

In [18]:
x = kyphosis_df.drop(['Kyphosis'], axis = 1)
x

Unnamed: 0,Age,Number,Start
0,71,3,5
1,158,3,14
2,128,4,5
3,2,5,1
4,1,4,15
...,...,...,...
76,157,3,13
77,26,7,13
78,120,2,13
79,42,7,6


In [19]:
y = kyphosis_df['Kyphosis']
y

0     0
1     0
2     1
3     0
4     0
     ..
76    1
77    0
78    0
79    1
80    0
Name: Kyphosis, Length: 81, dtype: int32

In [20]:
from sklearn.model_selection import train_test_split

In [21]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size = 0.3)

# **Training Model**

In [22]:
x_train.shape

(56, 3)

In [23]:
#56행, 1열
y_train.shape

(56,)

In [24]:
x_test.shape

(25, 3)

In [25]:
y_test.shape

(25,)

In [26]:
#DecisionTreeClassifier사용, DecisionTree를 활용한 분류기
from sklearn.tree import DecisionTreeClassifier
decision_tree = DecisionTreeClassifier()
decision_tree.fit(x_train, y_train)

DecisionTreeClassifier()

# **Evaluating the model**

In [27]:
from sklearn.metrics import classification_report, confusion_matrix

In [28]:
y_predict_test = decision_tree.predict(x_test)

In [29]:
y_predict_test

array([0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0])

In [30]:
y_test

76    1
22    1
58    0
1     0
52    1
46    0
36    0
54    0
65    0
51    0
48    1
41    0
70    0
10    1
67    0
14    0
6     0
8     0
23    0
15    0
7     0
4     0
40    1
5     0
56    0
Name: Kyphosis, dtype: int32

In [31]:
cm = confusion_matrix(y_test, y_predict_test)

In [32]:
sns.heatmap(cm, annot = True)

<AxesSubplot:>

x축은 actual, y축은 prediction

True는 예측이 맞다, False는 틀리다
N은 0, P는 1

TN    FN

FP    TP

In [33]:
print(classification_report(y_test,y_predict_test))

              precision    recall  f1-score   support

           0       0.77      0.89      0.83        19
           1       0.33      0.17      0.22         6

    accuracy                           0.72        25
   macro avg       0.55      0.53      0.53        25
weighted avg       0.67      0.72      0.68        25



Precision = =true positive/true positive + negative positive

Recall = true positive/ true positive + negative false

1일때 1/1일때 1 + 0일때 1 (전체 예측률)

F1 = 2*(Recall * Precision)/Recall + Precision

# **Training model2**

In [34]:
#Random Forest Classifier 
from sklearn.ensemble import RandomForestClassifier
RandomForest = RandomForestClassifier(n_estimators = 150)
RandomForest.fit(x_train, y_train)

RandomForestClassifier(n_estimators=150)

In [35]:
y_predict_test = RandomForest.predict(x_test)

In [36]:
cm = confusion_matrix(y_test, y_predict_test)
sns.heatmap(cm,annot = True)

<AxesSubplot:>

In [37]:
print(classification_report(y_test,y_predict_test))

              precision    recall  f1-score   support

           0       0.78      0.95      0.86        19
           1       0.50      0.17      0.25         6

    accuracy                           0.76        25
   macro avg       0.64      0.56      0.55        25
weighted avg       0.71      0.76      0.71        25



In [38]:
#DecisionTreeClassifier
#precision    recall  f1-score   support

#           0       0.93      0.65      0.76        20
#           1       0.36      0.80      0.50         5

#    accuracy                           0.68        25
#   macro avg       0.65      0.73      0.63        25
#weighted avg       0.82      0.68      0.71        25