In [1]:
import pandas as pd
import numpy as np
import joblib

In [2]:
# Load the dataset from the CSV file located next to this notebook.
csv_path = './AIDS_Classification.csv'
df = pd.read_csv(csv_path)

In [3]:
# Preview the first five rows of the loaded frame.
df.head(5)

Unnamed: 0,time,trt,age,wtkg,hemo,homo,drugs,karnof,oprior,z30,...,str2,strat,symptom,treat,offtrt,cd40,cd420,cd80,cd820,infected
0,948,2,48,89.8128,0,0,0,100,0,0,...,0,1,0,1,0,422,477,566,324,0
1,1002,3,61,49.4424,0,0,0,90,0,1,...,1,3,0,1,0,162,218,392,564,1
2,961,3,45,88.452,0,1,1,90,0,1,...,1,3,0,1,1,326,274,2063,1893,0
3,1166,3,47,85.2768,0,1,0,100,0,1,...,1,3,0,1,0,287,394,1590,966,0
4,1090,0,43,66.6792,0,1,0,100,0,1,...,1,3,0,0,0,504,353,870,782,0


In [4]:
# Column glossary (translated from the original Korean notes).
# NOTE(review): "175" below presumably refers to the study number; the original
# notes read "before year 175" (oprior) and "before day 175" (preanti), which
# look like translation slips — confirm against the dataset's documentation.
# time: time to failure or censoring
# trt: treatment indicator (0 = ZDV only; 1 = ZDV + ddI; 2 = ZDV + Zal; 3 = ddI only)
# age: age at baseline (years)
# wtkg: weight at baseline (kg)
# hemo: hemophilia (0=no, 1=yes)
# homo: homosexual activity (0=no, 1=yes)
# drugs: history of IV drug use (0=no, 1=yes)
# karnof: Karnofsky score (scale of 0-100)
# oprior: non-ZDV antiretroviral therapy before 175 (0=no, 1=yes)
# z30: ZDV in the 30 days before 175 (0=no, 1=yes)
# preanti: antiretroviral therapy before 175 (presumably a number of days — confirm)
# race: race (0=white, 1=non-white)
# gender: gender (0=F, 1=M)
# str2: antiretroviral history (0=naive, 1=experienced)
# strat: antiretroviral history stratification (1='antiretroviral naive', 2='> 1 but <= 52 weeks of prior antiretroviral therapy', 3='> 52 weeks')
# symptom: symptom indicator (0=asymptomatic, 1=symptomatic)
# treat: treatment indicator (0=ZDV only, 1=others)
# offtrt: indicator of off-treatment before 96+/-5 weeks (0=no, 1=yes)
# cd40: CD4 at baseline
# cd420: CD4 at 20+/-5 weeks
# cd80: CD8 at baseline
# cd820: CD8 at 20+/-5 weeks
# infected: infected with AIDS (0=no, 1=yes) — used below as the prediction target

In [5]:
# Summarise the frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2139 entries, 0 to 2138
Data columns (total 23 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   time      2139 non-null   int64  
 1   trt       2139 non-null   int64  
 2   age       2139 non-null   int64  
 3   wtkg      2139 non-null   float64
 4   hemo      2139 non-null   int64  
 5   homo      2139 non-null   int64  
 6   drugs     2139 non-null   int64  
 7   karnof    2139 non-null   int64  
 8   oprior    2139 non-null   int64  
 9   z30       2139 non-null   int64  
 10  preanti   2139 non-null   int64  
 11  race      2139 non-null   int64  
 12  gender    2139 non-null   int64  
 13  str2      2139 non-null   int64  
 14  strat     2139 non-null   int64  
 15  symptom   2139 non-null   int64  
 16  treat     2139 non-null   int64  
 17  offtrt    2139 non-null   int64  
 18  cd40      2139 non-null   int64  
 19  cd420     2139 non-null   int64  
 20  cd80      2139 non-null   int6

In [6]:
# Handle missing values.
# The earlier placeholder fill, `fillna(value={'col1': 'A'})`, targeted a column
# named 'col1', which this dataset does not contain, so it never filled
# anything. It is removed so the cell no longer suggests that imputation
# takes place.
# Rows that contain any missing value are dropped. `df` is reassigned because
# the following cells read the cleaned frame under this name.
df = df.dropna()

In [7]:
# List the column labels of the frame.
df.columns

Index(['time', 'trt', 'age', 'wtkg', 'hemo', 'homo', 'drugs', 'karnof',
       'oprior', 'z30', 'preanti', 'race', 'gender', 'str2', 'strat',
       'symptom', 'treat', 'offtrt', 'cd40', 'cd420', 'cd80', 'cd820',
       'infected'],
      dtype='object')

In [8]:
# Separate the label from the predictors: `y` is the 'infected' column,
# `x` is every other column of the frame.
target_column = 'infected'
y = df[target_column]
x = df.drop(columns=[target_column])

In [9]:
# Display the feature frame (every column except 'infected').
x

Unnamed: 0,time,trt,age,wtkg,hemo,homo,drugs,karnof,oprior,z30,...,gender,str2,strat,symptom,treat,offtrt,cd40,cd420,cd80,cd820
0,948,2,48,89.8128,0,0,0,100,0,0,...,0,0,1,0,1,0,422,477,566,324
1,1002,3,61,49.4424,0,0,0,90,0,1,...,0,1,3,0,1,0,162,218,392,564
2,961,3,45,88.4520,0,1,1,90,0,1,...,1,1,3,0,1,1,326,274,2063,1893
3,1166,3,47,85.2768,0,1,0,100,0,1,...,1,1,3,0,1,0,287,394,1590,966
4,1090,0,43,66.6792,0,1,0,100,0,1,...,1,1,3,0,0,0,504,353,870,782
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2134,1091,3,21,53.2980,1,0,0,100,0,1,...,1,1,3,0,1,1,152,109,561,720
2135,395,0,17,102.9672,1,0,0,100,0,1,...,1,1,3,0,0,1,373,218,1759,1030
2136,1104,2,53,69.8544,1,1,0,90,0,1,...,1,1,3,0,1,0,419,364,1391,1041
2137,465,0,14,60.0000,1,0,0,100,0,0,...,1,0,1,0,0,0,166,169,999,1838


In [10]:
# Display the target series (the 'infected' column).
y

0       0
1       1
2       0
3       0
4       0
       ..
2134    0
2135    0
2136    0
2137    1
2138    0
Name: infected, Length: 2139, dtype: int64

In [11]:
from sklearn.model_selection import train_test_split

# Split features and labels into training and test partitions.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,    # 30% of the rows go to the test partition
    random_state=41,  # fixed seed so the split is the same on every run
)

In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
import xgboost as xgb

In [13]:
# Train a random forest on the training split and predict the test split.
# The seed is fixed (same value as the train/test split) so that the fitted
# forest, and the accuracy computed from its predictions, is identical after
# every "Restart & Run All". Without a seed the forest's random sampling
# differs from run to run and the reported score drifts.
model_rf = RandomForestClassifier(random_state=41)
model_rf.fit(x_train, y_train)
y_predict_rf = model_rf.predict(x_test)

In [14]:
# Accuracy of the random forest on the test split: the number of test rows
# whose predicted label equals the true label, divided by the number of
# predictions.
# A single element-wise comparison replaces the former index loop. It assumes
# the predictions are an array aligned position-by-position with
# `y_test.values`, which the loop relied on as well.
# NOTE(review): the printed label is misspelled; it is left untouched so the
# emitted text stays the same as before.
num = int((y_predict_rf == y_test.values).sum())
print("accuarcy : " + str(num / len(y_predict_rf)))

accuarcy : 0.8925233644859814


In [15]:
# XGBoost classifier with default hyper-parameters: fit on the training split
# (fit hands back the fitted estimator, so it can be chained), then predict a
# label for every row of the test split.
model_xgb = xgb.XGBClassifier().fit(x_train, y_train)
y_predict_xgb = model_xgb.predict(x_test)

In [16]:
# Accuracy of the XGBoost classifier on the test split: the number of test
# rows whose predicted label equals the true label, divided by the number of
# predictions.
# A single element-wise comparison replaces the former index loop. It assumes
# the predictions are an array aligned position-by-position with
# `y_test.values`, which the loop relied on as well.
# NOTE(review): the printed label is misspelled; it is left untouched so the
# emitted text stays the same as before.
num = int((y_predict_xgb == y_test.values).sum())
print("accuarcy : " + str(num / len(y_predict_xgb)))

accuarcy : 0.8909657320872274


In [17]:
# Persist the trained XGBoost classifier to disk with joblib.
# NOTE(review): the '.h5' suffix suggests HDF5, but the file is whatever
# joblib.dump writes — confirm that consumers read it back with joblib.load.
joblib.dump(value=model_xgb, filename='./ml_model_xgb.h5')

['./ml_model_xgb.h5']