<a href="https://colab.research.google.com/github/sjungmin98/study_AIs/blob/main/docs/quests/MLs/SpineSurgeryList_FeatureEngine.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 데이터
- 목표변수(target) : '재발여부'
- 설명변수(features) :
- 수치형 : '체중', 'PI', '지방축적도', '과거수술횟수', '연령'
- 범주형 : '흡연여부', '당뇨여부'

In [1]:
import pandas as pd
df_SSL = pd.read_csv('SpineSurgeryList.csv')
df_SSL[:2]

Unnamed: 0.1,Unnamed: 0,환자ID,Large Lymphocyte,Location of herniation,ODI,가족력,간질성폐질환,고혈압여부,과거수술횟수,당뇨여부,...,Modic change,PI,PT,Seg Angle(raw),Vaccum disc,골밀도,디스크단면적,디스크위치,척추이동척도,척추전방위증
0,0,1PT,22.8,3,51.0,0.0,0,0,0,0,...,3,51.6,36.6,14.4,0,-1.01,2048.5,4,Down,0
1,1,2PT,44.9,4,26.0,0.0,0,0,0,0,...,0,40.8,7.2,17.8,0,-1.14,1753.1,4,Up,0


In [2]:
df_SSL.columns

Index(['Unnamed: 0', '환자ID', 'Large Lymphocyte', 'Location of herniation',
       'ODI', '가족력', '간질성폐질환', '고혈압여부', '과거수술횟수', '당뇨여부', '말초동맥질환여부', '빈혈여부',
       '성별', '스테로이드치료', '신부전여부', '신장', '심혈관질환', '암발병여부', '연령', '우울증여부', '입원기간',
       '입원일자', '종양진행여부', '직업', '체중', '퇴원일자', '헤모글로빈수치', '혈전합병증여부', '환자통증정도',
       '흡연여부', '통증기간(월)', '수술기법', '수술시간', '수술실패여부', '수술일자', '재발여부', '혈액형',
       '전방디스크높이(mm)', '후방디스크높이(mm)', '지방축적도', 'Instability', 'MF + ES',
       'Modic change', 'PI', 'PT', 'Seg Angle(raw)', 'Vaccum disc', '골밀도',
       '디스크단면적', '디스크위치', '척추이동척도', '척추전방위증'],
      dtype='object')

### 데이터 전처리(Pre-Processing)

In [3]:
df_SSL_extract = df_SSL.loc[:,['재발여부', '체중', 'PI', '지방축적도', '과거수술횟수', '연령', '흡연여부', '당뇨여부']].dropna()
df_SSL_extract.isnull().sum()

재발여부      0
체중        0
PI        0
지방축적도     0
과거수술횟수    0
연령        0
흡연여부      0
당뇨여부      0
dtype: int64

## 특성공학(Featuree Engineering)
- 수치 평준화 : 수치형과 범주형 각각 적용
- 갯수 균형화 : target 수량 균형화

### Scaling 수치형에 적용
- Standard Scaling : 평균 0, 표준편차 1
- Min-Max Scaling : 0-1 사이
- Robust Scaling : 이상치가 많은 데이터 셋(중앙값 기준)

In [4]:
df_SSL_extract['체중'].unique()[:3]

array([60.3, 71.7, 77.1])

In [5]:
df_SSL_extract['PI'].unique()[:3]

array([51.6, 40.8, 67.5])

In [6]:
df_SSL_extract['지방축적도'].unique()[:3]

array([282.3, 177.3, 256.8])

In [7]:
df_SSL_extract['과거수술횟수'].unique()[:3]

array([0, 1, 2])

In [8]:
df_SSL_extract['연령'].unique()[:3]

array([66, 47, 39])

In [9]:
# 수치형 min-max scaling
from sklearn.preprocessing import MinMaxScaler
minMaxScaler = MinMaxScaler()
features = ['체중', 'PI', '지방축적도', '과거수술횟수', '연령']
for feature in features:
    df_SSL_extract[feature] = minMaxScaler.fit_transform(df_SSL_extract[feature].values.reshape(-1,1))
df_SSL_extract[feature]

0       0.718310
1       0.450704
2       0.338028
3       0.352113
4       0.380282
          ...   
1889    0.619718
1890    0.380282
1891    0.647887
1892    0.197183
1893    0.211268
Name: 연령, Length: 1887, dtype: float64

### Onehot encoding 범주형에 적용

In [10]:
df_SSL_extract['흡연여부'].unique()

array([0, 1])

In [11]:
df_SSL_extract['당뇨여부'].unique()

array([0, 1])

In [18]:
from sklearn.preprocessing import OneHotEncoder
oneHotEncoder = OneHotEncoder()
x = df_SSL_extract[['흡연여부', '당뇨여부']]
oneHotEncoder.fit(x)

In [29]:
df_encoder = pd.DataFrame(data= oneHotEncoder.transform(x).toarray(), columns=oneHotEncoder.get_feature_names_out())
result = pd.concat([df_SSL_extract.reset_index(drop=True), df_encoder.reset_index(drop=True)], axis=1)
result.drop(['흡연여부', '당뇨여부'], axis=1, inplace=True)
result

Unnamed: 0,재발여부,체중,PI,지방축적도,과거수술횟수,연령,흡연여부_0,흡연여부_1,당뇨여부_0,당뇨여부_1
0,0,0.250247,0.072564,0.021206,0.000000,0.718310,1.0,0.0,1.0,0.0
1,0,0.363007,0.052824,0.012520,0.000000,0.450704,1.0,0.0,1.0,0.0
2,0,0.416419,0.101627,0.019097,0.333333,0.338028,1.0,0.0,1.0,0.0
3,0,0.387735,0.068178,0.018542,0.000000,0.352113,1.0,0.0,1.0,0.0
4,0,0.452028,0.085725,0.017061,0.000000,0.380282,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...
1882,0,0.286845,0.087004,0.017500,0.000000,0.619718,1.0,0.0,1.0,0.0
1883,0,0.237389,0.065436,0.021678,0.000000,0.380282,1.0,0.0,1.0,0.0
1884,0,0.346192,0.059770,0.010137,0.000000,0.647887,1.0,0.0,1.0,0.0
1885,0,0.415430,0.037105,0.005214,0.000000,0.197183,1.0,0.0,1.0,0.0


## 정형화 단계

In [33]:
from sklearn.model_selection import train_test_split
target = result['재발여부']
features = result.drop('재발여부', axis=1)
train_features, test_features, train_target, test_target = train_test_split(features, target)
train_features.shape, test_features.shape, train_target.shape, test_target.shape

((1415, 9), (472, 9), (1415,), (472,))

## 모델

In [34]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model

In [35]:
model.fit(train_features, train_target)

In [36]:
train_target_predict=model.predict(train_features)
train_target.shape , train_target_predict.shape

((1415,), (1415,))

In [37]:
test_target_predict=model.predict(test_features)
test_target.shape , test_target_predict.shape

((472,), (472,))

## 평가 수치

### accuracy_score: 정확도

In [38]:
from sklearn.metrics import accuracy_score
accuracy_score(train_target, train_target_predict) # train - 정확도 높음

0.8812720848056537

In [39]:
accuracy_score(test_target, test_target_predict) # test - 정확도 높음

0.9088983050847458

### F1 score: 정밀도/재현율

In [40]:
from sklearn.metrics import classification_report
print(classification_report(train_target, train_target_predict)) # macro avg / weighted avg의 간극이 큼

              precision    recall  f1-score   support

           0       0.88      1.00      0.94      1237
           1       1.00      0.06      0.11       178

    accuracy                           0.88      1415
   macro avg       0.94      0.53      0.52      1415
weighted avg       0.90      0.88      0.83      1415



In [41]:
print(classification_report(test_target, test_target_predict)) # macro avg / weighted avg의 간극이 큼

              precision    recall  f1-score   support

           0       0.91      1.00      0.95       423
           1       1.00      0.12      0.22        49

    accuracy                           0.91       472
   macro avg       0.95      0.56      0.58       472
weighted avg       0.92      0.91      0.88       472

