## 모델 학습

In [1]:
# Load (or reload) the watermark extension, then print the Python/IPython
# versions together with the versions of the packages listed after -p.
%reload_ext watermark
%watermark -v -p pandas,numpy,sklearn,lightgbm,joblib,tqdm

Python implementation: CPython
Python version       : 3.8.10
IPython version      : 7.34.0

pandas  : 1.4.1
numpy   : 1.22.3
sklearn : 1.0.2
lightgbm: 3.3.2
joblib  : 1.1.0
tqdm    : 4.63.1



In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
import lightgbm as lgb
import glob,os,joblib,tqdm,datetime
# Print the current wall-clock time once the imports have run.
print(datetime.datetime.today())

2022-12-18 09:28:37.989957


In [3]:
def fun(path):
    """Return the per-direction mean ``Gain`` found in one directory.

    The first ``*.tsv`` file that ``glob`` returns for ``path`` is read as a
    tab-separated table, ``Gain`` is averaged per ``Direction`` and the result
    is laid out as a single row.

    Parameters
    ----------
    path : str
        Directory that is searched for ``*.tsv`` files.

    Returns
    -------
    pandas.DataFrame
        One row with the columns ``path``, ``LA``, ``LP``, ``Left``, ``RA``,
        ``RP`` and ``Right``. Directions that do not occur in the file are
        NaN. When no file is found, or the file cannot be read or aggregated,
        every column (``path`` included) is NaN.
    """
    select_cols = ['path', 'LA', 'LP', 'Left', 'RA', 'RP', 'Right']

    def _missing_row():
        # Single all-NaN row, built identically for every failure mode so the
        # caller always receives a frame with the same columns.
        row = pd.DataFrame([], columns=select_cols)
        row.loc[0, :] = np.nan
        return row

    matches = glob.glob(f"{path}/*.tsv")
    if not matches:
        return _missing_row()

    file = matches[0]
    try:
        feature_df = pd.read_csv(file, sep='\t')
        feature = feature_df[['Direction', 'Gain']].groupby(
            ['Direction']).mean().T
        # Keep exactly the expected directions; absent ones become NaN.
        feature = feature.reindex(columns=select_cols[1:])
        feature['path'] = file
        return feature[select_cols]
    except Exception:
        # Best effort: an unreadable or malformed file is treated as "no
        # measurement". Catching Exception instead of using a bare except
        # lets KeyboardInterrupt / SystemExit propagate, so a long extraction
        # run can still be cancelled.
        return _missing_row()
    
def get_feature(file,model_type):
    """Build the model input table for one label file.

    Parameters
    ----------
    file : str
        Path of a tab-separated label file. The part of its base name before
        the first ``.`` becomes the ``diagnosis_group`` of every row. The
        columns ``patient_id``, ``age`` and ``sex`` are read from it.
    model_type : str
        Name of the data split; it is inserted into the raw-data path that is
        searched for each patient's directory.

    Returns
    -------
    pandas.DataFrame
        One row per row of the label file with the columns ``age``, ``F``,
        ``M``, ``diagnosis_group``, ``LA``, ``LP``, ``Left``, ``RA``, ``RP``
        and ``Right``. The gain columns are NaN wherever ``fun`` returned its
        all-NaN row.

    Raises
    ------
    IndexError
        If no raw-data directory matches one of the ``patient_id`` values.
    """
    gain_cols = ['path', 'LA', 'LP', 'Left', 'RA', 'RP', 'Right']

    df = pd.read_csv(file, sep='\t')
    df['diagnosis_group'] = file.split('/')[-1].split('.')[0]
    # glob is called without recursive=True, so the ``**`` segment matches a
    # single directory level; the first match is used as the patient folder.
    df['patient_paths'] = [
        glob.glob(f"/home/data/{model_type}/1.원천데이터/**/{i}/")[0]
        for i in df.patient_id.values]

    # Extract the gain features of every patient in parallel. The stacked
    # result gets a fresh 0..n-1 index (ignore_index=True), which lines up
    # with the default index produced by read_csv above.
    with joblib.Parallel(n_jobs=-2) as parallel:
        df.loc[:, gain_cols] = pd.concat(
            parallel(joblib.delayed(fun)(i)
                     for i in tqdm.tqdm(df.patient_paths)),
            axis=0, ignore_index=True)

    # get_dummies only creates columns for values that actually occur, so a
    # label file containing a single sex would lack 'F' or 'M' and the column
    # selection below would raise KeyError. Reindexing guarantees both
    # indicator columns (the missing one is filled with 0).
    gender = pd.get_dummies(df.sex).reindex(columns=['F', 'M'], fill_value=0)
    df[['F', 'M']] = gender

    return df.loc[:, ['age', 'F', 'M', 'diagnosis_group', 'LA',
                      'LP', 'Left', 'RA', 'RP', 'Right']]
# Print the current wall-clock time after the helper functions are defined.
print(datetime.datetime.today())

2022-12-18 09:28:38.002965


In [4]:
# Build the training and validation tables: get_feature() is called once per
# label file and the results are stacked row-wise.
# The label files are visited in sorted order because glob returns matches in
# arbitrary order; the resulting row order feeds into the cv=3 split of the
# grid search, so it has to be reproducible.
print(datetime.datetime.today())
train=pd.concat([get_feature(file,'train')
    for file in sorted(glob.glob("/home/data/train/2.라벨링데이터/*tsv"))],
    axis=0,ignore_index=True)

valid=pd.concat([get_feature(file,'valid')
    for file in sorted(glob.glob("/home/data/valid/2.라벨링데이터/*tsv"))],
    axis=0,ignore_index=True)
print(datetime.datetime.today())

2022-12-18 09:28:38.010077


100%|██████████| 6275/6275 [00:16<00:00, 385.79it/s]
100%|██████████| 2495/2495 [00:03<00:00, 804.81it/s]
100%|██████████| 6202/6202 [00:08<00:00, 757.45it/s]
100%|██████████| 1094/1094 [00:00<00:00, 1485.11it/s]
100%|██████████| 451/451 [00:00<00:00, 1109.00it/s]
100%|██████████| 1048/1048 [00:01<00:00, 814.55it/s]


2022-12-18 09:29:12.744791


In [5]:
# Mean values based on the training data, one entry per gain column.
mean_df = pd.Series(
    [0.895173, 0.842480, 0.885241, 0.909961, 0.862435, 0.942012],
    index=["LA", "LP", "Left", "RA", "RP", "Right"],
    name=0,
)

In [6]:
# Replace missing gain values in both tables with the per-column means
# stored in mean_df (matched by column name).
gain_cols = mean_df.index
for frame in (train, valid):
    frame.loc[:, gain_cols] = frame.loc[:, gain_cols].fillna(mean_df)

In [7]:
# Features: every column except the diagnosis label.
# Target: 1 for the 'Normal' group, 0 for every other group (stored as uint8).
tr_x = train.drop(columns='diagnosis_group')
tr_y = train['diagnosis_group'].eq('Normal').astype('u1')
val_x = valid.drop(columns='diagnosis_group')
val_y = valid['diagnosis_group'].eq('Normal').astype('u1')

In [8]:
print(datetime.datetime.today())
# Hyper-parameter grid; every combination is evaluated (5*3*4*3*3 = 540).
params={"num_leaves":[20,40,60,80,100],
    "min_child_samples":[5,10,15],
    "max_depth":[-1,5,10,2],
    "learning_rate":[0.05,0.1,0.2],
    "reg_alpha":[0,0.01,0.03],
   }
# Class 1 (the 'Normal' group) is weighted above class 0.
class_weight={0:.4,1:.6}

lgbm=lgb.LGBMClassifier(
    n_estimators=400,random_state=42,class_weight=class_weight)
# NOTE(review): candidates are ranked by accuracy, while the evaluation
# section reports ROC AUC -- confirm that this mismatch is intended.
clf=GridSearchCV(lgbm,params,scoring="accuracy",
                 verbose=1,n_jobs=-1,cv=3)
clf.fit(X=tr_x,y=tr_y)

# Apply the best parameters to the base estimator and fit it on the full
# training table while monitoring the validation table.
# Early stopping and log silencing are passed as callbacks: the
# `early_stopping_rounds=` / `verbose=` arguments of fit() are deprecated in
# LightGBM 3.3 and no longer accepted by LightGBM 4.
lgbm.set_params(**clf.best_params_)
lgbm.fit(tr_x, tr_y, eval_set=[(val_x,val_y)],
         callbacks=[lgb.early_stopping(stopping_rounds=300,verbose=False),
                    lgb.log_evaluation(period=0)])
joblib.dump(lgbm,'전정질환군분류.pkl')
print(datetime.datetime.today())

2022-12-18 09:29:12.803437
Fitting 3 folds for each of 540 candidates, totalling 1620 fits




2022-12-18 09:29:59.380871


## 모델 검증

In [9]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
import lightgbm as lgb
import glob,os,joblib,tqdm,datetime
# Print the current wall-clock time once the imports have run.
print(datetime.datetime.today())

2022-12-18 09:29:59.386613


In [10]:
# Print the current wall-clock time before the helper functions are defined.
print(datetime.datetime.today())
def fun(path):
    """Return the per-direction mean ``Gain`` found in one directory.

    The first ``*.tsv`` file that ``glob`` returns for ``path`` is read as a
    tab-separated table, ``Gain`` is averaged per ``Direction`` and the result
    is laid out as a single row.

    Parameters
    ----------
    path : str
        Directory that is searched for ``*.tsv`` files.

    Returns
    -------
    pandas.DataFrame
        One row with the columns ``path``, ``LA``, ``LP``, ``Left``, ``RA``,
        ``RP`` and ``Right``. Directions that do not occur in the file are
        NaN. When no file is found, or the file cannot be read or aggregated,
        every column (``path`` included) is NaN.
    """
    select_cols = ['path', 'LA', 'LP', 'Left', 'RA', 'RP', 'Right']

    def _missing_row():
        # Single all-NaN row, built identically for every failure mode so the
        # caller always receives a frame with the same columns.
        row = pd.DataFrame([], columns=select_cols)
        row.loc[0, :] = np.nan
        return row

    matches = glob.glob(f"{path}/*.tsv")
    if not matches:
        return _missing_row()

    file = matches[0]
    try:
        feature_df = pd.read_csv(file, sep='\t')
        feature = feature_df[['Direction', 'Gain']].groupby(
            ['Direction']).mean().T
        # Keep exactly the expected directions; absent ones become NaN.
        feature = feature.reindex(columns=select_cols[1:])
        feature['path'] = file
        return feature[select_cols]
    except Exception:
        # Best effort: an unreadable or malformed file is treated as "no
        # measurement". Catching Exception instead of using a bare except
        # lets KeyboardInterrupt / SystemExit propagate, so a long extraction
        # run can still be cancelled.
        return _missing_row()
    
def get_feature(file,model_type):
    """Build the evaluation input table for one label file.

    Parameters
    ----------
    file : str
        Path of a tab-separated label file. The part of its base name before
        the first ``.`` becomes the ``diagnosis_group`` of every row. The
        columns ``patient_id``, ``age`` and ``sex`` are read from it.
    model_type : str
        Name of the data split; it is inserted into the raw-data path that is
        searched for each patient's directory.

    Returns
    -------
    pandas.DataFrame
        One row per row of the label file with the columns ``patient_id``,
        ``age``, ``F``, ``M``, ``diagnosis_group``, ``LA``, ``LP``, ``Left``,
        ``RA``, ``RP`` and ``Right``. The gain columns are NaN wherever
        ``fun`` returned its all-NaN row.

    Raises
    ------
    IndexError
        If no raw-data directory matches one of the ``patient_id`` values.
    """
    gain_cols = ['path', 'LA', 'LP', 'Left', 'RA', 'RP', 'Right']

    df = pd.read_csv(file, sep='\t')
    df['diagnosis_group'] = file.split('/')[-1].split('.')[0]
    # glob is called without recursive=True, so the ``**`` segment matches a
    # single directory level; the first match is used as the patient folder.
    df['patient_paths'] = [
        glob.glob(f"/home/data/{model_type}/1.원천데이터/**/{i}/")[0]
        for i in df.patient_id.values]

    # Extract the gain features of every patient in parallel. The stacked
    # result gets a fresh 0..n-1 index (ignore_index=True), which lines up
    # with the default index produced by read_csv above.
    with joblib.Parallel(n_jobs=-2) as parallel:
        df.loc[:, gain_cols] = pd.concat(
            parallel(joblib.delayed(fun)(i)
                     for i in tqdm.tqdm(df.patient_paths)),
            axis=0, ignore_index=True)

    # get_dummies only creates columns for values that actually occur, so a
    # label file containing a single sex would lack 'F' or 'M' and the column
    # selection below would raise KeyError. Reindexing guarantees both
    # indicator columns (the missing one is filled with 0).
    gender = pd.get_dummies(df.sex).reindex(columns=['F', 'M'], fill_value=0)
    df[['F', 'M']] = gender

    return df.loc[:, ['patient_id', 'age', 'F', 'M', 'diagnosis_group', 'LA',
                      'LP', 'Left', 'RA', 'RP', 'Right']]
# Print the current wall-clock time after the helper functions are defined.
print(datetime.datetime.today())

2022-12-18 09:29:59.396293
2022-12-18 09:29:59.396873


In [11]:
# Per-column means used to fill missing gain values; same constants as in
# the training section.
mean_df = pd.Series(
    [0.895173, 0.842480, 0.885241, 0.909961, 0.862435, 0.942012],
    index=["LA", "LP", "Left", "RA", "RP", "Right"],
    name=0,
)

In [12]:
# Build the test table: get_feature() is called once per label file and the
# results are stacked row-wise. The files are visited in sorted order because
# glob returns matches in arbitrary order and the row order carries over into
# the exported result files.
test=pd.concat([get_feature(file,'test')
    for file in sorted(glob.glob("/home/data/test/2.라벨링데이터/*tsv"))],
    axis=0,ignore_index=True)
# Fill missing gain values with the stored per-column means.
test.loc[:,mean_df.index]=test.loc[:,mean_df.index].fillna(mean_df)
# Features exclude the identifier and the label; the target is 1 for the
# 'Normal' group and 0 otherwise.
te_x=test.drop(['patient_id','diagnosis_group'],axis=1)
te_y=(test.diagnosis_group=='Normal').astype('u1')

100%|██████████| 1102/1102 [00:01<00:00, 765.36it/s]
100%|██████████| 458/458 [00:00<00:00, 929.01it/s]
100%|██████████| 1053/1053 [00:01<00:00, 688.62it/s]


In [13]:
# Restore the saved classifier and compute class probabilities for the test
# features.
lgbm = joblib.load('전정질환군분류.pkl')
pred_values = lgbm.predict_proba(te_x)

# Probability column 0 is labelled 'abnormal' and column 1 'normal'; the
# column with the larger probability gives the predicted label.
class_names = np.array(['abnormal', 'normal'])
best_class = pred_values.argmax(axis=1)
test['predict_values'] = class_names[best_class]

# ROC AUC of the column-1 probability against the binary target.
test_auc = roc_auc_score(te_y, pred_values[:, 1])
print(test_auc)
print(datetime.datetime.today())

0.6477889789855602
2022-12-18 09:30:04.859414


In [14]:
# Write each patient's diagnosis group and predicted label to a TSV file.
result_cols = ['patient_id', 'diagnosis_group', 'predict_values']
test.loc[:, result_cols].to_csv(
    "전정질환군분류_결과값.tsv", sep='\t', index=False)

In [15]:
# Attach both class probabilities and the binary ground truth
# (1 = 'Normal'), then write them to a TSV file.
test["abnormal_prob"] = pred_values[:, 0]
test["normal_prob"] = pred_values[:, 1]
test['label'] = test['diagnosis_group'].eq("Normal").astype('u1')

eval_cols = ['patient_id', 'label', 'abnormal_prob', 'normal_prob']
test[eval_cols].to_csv(
    "전정질환군분류_성능평가계산사용값.tsv", sep='\t', index=False)

In [16]:
# Store the test-set ROC AUC as a one-row, one-column TSV file.
auc_value = roc_auc_score(te_y, pred_values[:, 1])
auc_table = pd.DataFrame({"AUC": [auc_value]})
auc_table.to_csv("전정질환군분류_AUC.tsv", sep='\t', index=False)

In [17]:
# Print the current wall-clock time at the end of the notebook.
print(datetime.datetime.today())

2022-12-18 09:30:04.904818
