In [3]:
#모듈 설치
!pip install pandas
!pip install numpy
!pip install scikit-learn==0.24.1
!pip install xgboost==1.4.2
!pip install lightgbm==3.2.1

In [6]:
import pandas as pd
import numpy as np
np.random.seed(1000)

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from scipy import stats

#데이터 불러오기
train_data=pd.read_csv('train.csv',encoding='UTF-8')
test_data=pd.read_csv('test.csv',encoding='UTF-8')

In [2]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89619 entries, 0 to 89618
Data columns (total 23 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   신고번호      89619 non-null  int64  
 1   신고일자      89619 non-null  object 
 2   통관지세관부호   89619 non-null  int64  
 3   신고인부호     89619 non-null  object 
 4   수입자부호     89619 non-null  object 
 5   해외거래처부호   62966 non-null  object 
 6   특송업체부호    29787 non-null  object 
 7   수입통관계획코드  89619 non-null  object 
 8   수입신고구분코드  89619 non-null  object 
 9   수입거래구분코드  89619 non-null  int64  
 10  수입종류코드    89619 non-null  int64  
 11  징수형태코드    89619 non-null  int64  
 12  신고중량(KG)  89619 non-null  float64
 13  과세가격원화금액  89619 non-null  float64
 14  운송수단유형코드  89619 non-null  int64  
 15  반입보세구역부호  89619 non-null  int64  
 16  HS10단위부호  89619 non-null  int64  
 17  적출국가코드    89619 non-null  object 
 18  원산지국가코드   89619 non-null  object 
 19  관세율구분코드   89619 non-null  object 
 20  관세율       89619 non-null  fl

In [3]:
#데이터 확인
train_data.head()

Unnamed: 0,신고번호,신고일자,통관지세관부호,신고인부호,수입자부호,해외거래처부호,특송업체부호,수입통관계획코드,수입신고구분코드,수입거래구분코드,...,과세가격원화금액,운송수단유형코드,반입보세구역부호,HS10단위부호,적출국가코드,원산지국가코드,관세율구분코드,관세율,우범여부,핵심적발
0,37453,2020-01-01,40,10UUA,435E04J,CFLCEFM,,C,A,15,...,12078.12,40,4077180,4202999000,CN,CN,A,8.0,0,0
1,150339,2020-01-01,20,7E3BD,1NTJ7F6,,,C,B,15,...,593868.8,10,4002001,9503003919,CN,CN,A,8.0,0,0
2,55710,2020-01-01,10,QHZ00,DGHPNRA,ZEXKR7K,8W5SEL,F,B,94,...,62384.9,40,15002001,8708290000,CN,CN,FCN1,3.2,1,2
3,413154,2020-01-01,20,SZIVC,G85A4OI,WD62ULK,0X6YQV,C,B,15,...,4321373.0,10,4002001,2933699099,CN,CN,FCN1,0.0,0,0
4,223511,2020-01-01,40,BU7II,9I00BFP,Q22MMTW,,F,B,15,...,1212105.0,10,2002079,6205200000,CN,CN,A,13.0,0,0


# 데이터 전처리

In [4]:
##### 변주형변수, 연속형변수 분리 #####
범주형 = ['통관지세관부호','수입통관계획코드','해외거래처부호','특송업체부호','수입신고구분코드', '수입거래구분코드', '수입종류코드',
                    '징수형태코드', '운송수단유형코드','반입보세구역부호', 'HS10단위부호', '적출국가코드', '원산지국가코드',
                    '관세율구분코드', '관세율']
연속형 = ['신고중량(KG)', '과세가격원화금액']

#레이블인코딩 함수 정의
enc_classes = {}
def encoding_label(x): 
    le = LabelEncoder()
    le.fit(x)
    label = le.transform(x)
    enc_classes[x.name] = le.classes_ # x.name: 컬럼명
    return label

##### 트레인셋, 테스트셋 합침(우범,핵심여부 제외) #####
train_df = train_data.loc[:, '통관지세관부호':'관세율']
test_df = test_data.loc[:, '통관지세관부호':'관세율']
df = pd.concat([train_df,test_df])

#확인
df

Unnamed: 0,통관지세관부호,신고인부호,수입자부호,해외거래처부호,특송업체부호,수입통관계획코드,수입신고구분코드,수입거래구분코드,수입종류코드,징수형태코드,신고중량(KG),과세가격원화금액,운송수단유형코드,반입보세구역부호,HS10단위부호,적출국가코드,원산지국가코드,관세율구분코드,관세율
0,40,10UUA,435E04J,CFLCEFM,,C,A,15,21,11,4181.6,1.207812e+04,40,4077180,4202999000,CN,CN,A,8.0
1,20,7E3BD,1NTJ7F6,,,C,B,15,21,11,7977.0,5.938688e+05,10,4002001,9503003919,CN,CN,A,8.0
2,10,QHZ00,DGHPNRA,ZEXKR7K,8W5SEL,F,B,94,21,11,246.0,6.238490e+04,40,15002001,8708290000,CN,CN,FCN1,3.2
3,20,SZIVC,G85A4OI,WD62ULK,0X6YQV,C,B,15,21,11,4435.3,4.321373e+06,10,4002001,2933699099,CN,CN,FCN1,0.0
4,40,BU7II,9I00BFP,Q22MMTW,,F,B,15,21,11,4564.4,1.212105e+06,10,2002079,6205200000,CN,CN,A,13.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10268,53,10UUA,6HP77AR,KH8FYHX,YNRX1F,E,D,29,26,21,3063.2,2.843916e+04,40,3012312,9503003411,CN,CN,FCN1,3.2
10269,20,0384L,2XRUBK3,F48KA2D,WVTM1M,D,B,86,28,11,1595.7,3.654290e+06,10,2077024,2933199099,CN,CN,FCN1,2.6
10270,40,UV0TR,HMP5A9V,I1A4TDY,,Z,B,11,21,14,2432.3,2.926935e+06,40,3902044,7222200000,US,US,C,0.0
10271,30,FO4OQ,D1NL5YO,0JAQPEX,,C,B,11,21,43,8484.3,1.534544e+05,10,4077116,9401799000,CN,CN,C,0.0


In [5]:
##### 데이터 전처리 #####

# 해외거래처부호,특송업체부호 전처리 #
df.해외거래처부호=df.해외거래처부호.fillna('NO')
df.특송업체부호=df.특송업체부호.fillna('NO')

###########주의사항 이 코드 똑같이 2번 돌리면 에러뜸 ##############
# 반입보세구역부호 전처리 #
area_code=df['반입보세구역부호']
Area_code=[]
for i in area_code:
    if(int(i/1000000)<10):
        string_number='0'+str(i)
        string_number=string_number[0:3]
        Area_code.append(string_number)
    elif(int(i/10000000)<10):
        co=str(i)
        a=co[0:3]
        Area_code.append(a)
df.drop('반입보세구역부호',axis=1,inplace=True)
df['반입보세구역부호'] = pd.Series(Area_code, index=df.index)

# HS10단위부호 전처리 #
HS_code=df['HS10단위부호']
HS_code=list(HS_code)
Hs_code=[]
for i in HS_code:
    Hs_code.append(int(i/1000000))

df.drop('HS10단위부호',axis=1,inplace=True)
df['HS10단위부호'] = pd.Series(Hs_code, index=df.index)


##### 범주형 변수에 레이블 인코딩 적용 ######
category_df = df[범주형].apply(encoding_label)

category_df

Unnamed: 0,통관지세관부호,수입통관계획코드,해외거래처부호,특송업체부호,수입신고구분코드,수입거래구분코드,수입종류코드,징수형태코드,운송수단유형코드,반입보세구역부호,HS10단위부호,적출국가코드,원산지국가코드,관세율구분코드,관세율
0,13,1,1591,63,0,3,4,2,3,14,287,15,19,0,37
1,6,1,3033,63,1,3,4,2,0,14,737,15,19,0,37
2,0,4,4589,28,1,23,4,2,3,35,674,15,19,13,15
3,6,1,4211,2,1,3,4,2,0,14,180,15,19,13,0
4,13,4,3386,63,1,3,4,2,0,6,402,15,19,0,46
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10268,16,3,2603,85,2,6,6,7,3,9,737,15,19,13,15
10269,6,2,1920,80,1,16,7,2,0,6,180,15,19,13,13
10270,13,6,2290,63,1,0,4,5,3,13,492,81,95,1,0
10271,9,1,68,63,1,0,4,10,0,14,731,15,19,1,0


In [6]:
### 최종적으로 사용할 데이터(데이터 분리) ###

category_df = category_df.reset_index(drop=False)

df_train = category_df.loc[:89618,]
df_train = df_train.loc[:,'통관지세관부호':]

df_test = category_df.loc[89619:,]
df_test = df_test.loc[:,'통관지세관부호':]

df_test

Unnamed: 0,통관지세관부호,수입통관계획코드,해외거래처부호,특송업체부호,수입신고구분코드,수입거래구분코드,수입종류코드,징수형태코드,운송수단유형코드,반입보세구역부호,HS10단위부호,적출국가코드,원산지국가코드,관세율구분코드,관세율
89619,6,4,100,63,1,3,4,2,3,9,34,81,95,0,64
89620,13,3,3033,63,1,23,4,2,3,14,760,81,95,0,0
89621,13,1,3033,63,1,0,4,2,3,14,99,81,95,0,37
89622,4,1,3033,63,1,0,4,2,3,6,58,81,95,0,14
89623,0,1,3033,63,1,0,4,2,3,0,676,83,97,29,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99887,16,3,2603,85,2,6,6,7,3,9,737,15,19,13,15
99888,6,2,1920,80,1,16,7,2,0,6,180,15,19,13,13
99889,13,6,2290,63,1,0,4,5,3,13,492,81,95,1,0
99890,9,1,68,63,1,0,4,10,0,14,731,15,19,1,0


In [7]:
### 인덱스 재정렬 ###
df_test = df_test.reset_index(drop=False).loc[:,'통관지세관부호':]
df_test

Unnamed: 0,통관지세관부호,수입통관계획코드,해외거래처부호,특송업체부호,수입신고구분코드,수입거래구분코드,수입종류코드,징수형태코드,운송수단유형코드,반입보세구역부호,HS10단위부호,적출국가코드,원산지국가코드,관세율구분코드,관세율
0,6,4,100,63,1,3,4,2,3,9,34,81,95,0,64
1,13,3,3033,63,1,23,4,2,3,14,760,81,95,0,0
2,13,1,3033,63,1,0,4,2,3,14,99,81,95,0,37
3,4,1,3033,63,1,0,4,2,3,6,58,81,95,0,14
4,0,1,3033,63,1,0,4,2,3,0,676,83,97,29,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10268,16,3,2603,85,2,6,6,7,3,9,737,15,19,13,15
10269,6,2,1920,80,1,16,7,2,0,6,180,15,19,13,13
10270,13,6,2290,63,1,0,4,5,3,13,492,81,95,1,0
10271,9,1,68,63,1,0,4,10,0,14,731,15,19,1,0


In [8]:
df_train

Unnamed: 0,통관지세관부호,수입통관계획코드,해외거래처부호,특송업체부호,수입신고구분코드,수입거래구분코드,수입종류코드,징수형태코드,운송수단유형코드,반입보세구역부호,HS10단위부호,적출국가코드,원산지국가코드,관세율구분코드,관세율
0,13,1,1591,63,0,3,4,2,3,14,287,15,19,0,37
1,6,1,3033,63,1,3,4,2,0,14,737,15,19,0,37
2,0,4,4589,28,1,23,4,2,3,35,674,15,19,13,15
3,6,1,4211,2,1,3,4,2,0,14,180,15,19,13,0
4,13,4,3386,63,1,3,4,2,0,6,402,15,19,0,46
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89614,6,3,680,16,1,0,4,5,0,6,216,81,95,1,31
89615,13,1,127,16,1,3,4,2,0,6,556,15,19,13,0
89616,6,2,2270,76,1,3,4,2,3,15,211,81,95,1,31
89617,4,2,4158,76,1,0,4,2,0,14,270,31,19,1,31


In [9]:
### 연속형 변수 붙이기 ###
train_cont_df = train_df[연속형]
test_cont_df = test_df[연속형]

############ 최종적으로 사용할 변수임!!!! ################
###### data를 이용해서 트레인셋, 테스트셋 분리하고 #######
###### 제일 마지막에 data_test로 예측하는 것임 ###########
data = df_train.join(train_cont_df)
data_test = df_test.join(test_cont_df)

### 마지막 전처리 이상치제거 - 학습할 데이터만 !! ###
data['z값']=stats.zscore(data['신고중량(KG)'])
data=data[data['z값'].between(-2,2)]
data=data.drop(['z값'],axis=1)

train_data['z값']=stats.zscore(train_data['신고중량(KG)'])
train_data=train_data[train_data['z값'].between(-2,2)]
train_data=train_data.drop(['z값'],axis=1)

In [10]:
### 확인용 ###
data

Unnamed: 0,통관지세관부호,수입통관계획코드,해외거래처부호,특송업체부호,수입신고구분코드,수입거래구분코드,수입종류코드,징수형태코드,운송수단유형코드,반입보세구역부호,HS10단위부호,적출국가코드,원산지국가코드,관세율구분코드,관세율,신고중량(KG),과세가격원화금액
0,13,1,1591,63,0,3,4,2,3,14,287,15,19,0,37,4181.6,1.207812e+04
1,6,1,3033,63,1,3,4,2,0,14,737,15,19,0,37,7977.0,5.938688e+05
2,0,4,4589,28,1,23,4,2,3,35,674,15,19,13,15,246.0,6.238490e+04
3,6,1,4211,2,1,3,4,2,0,14,180,15,19,13,0,4435.3,4.321373e+06
4,13,4,3386,63,1,3,4,2,0,6,402,15,19,0,46,4564.4,1.212105e+06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89614,6,3,680,16,1,0,4,5,0,6,216,81,95,1,31,4092.0,6.750686e+05
89615,13,1,127,16,1,3,4,2,0,6,556,15,19,13,0,5098.0,7.772372e+05
89616,6,2,2270,76,1,3,4,2,3,15,211,81,95,1,31,4728.2,7.519390e+05
89617,4,2,4158,76,1,0,4,2,0,14,270,31,19,1,31,5236.2,9.952920e+03


In [11]:
### 확인용 ###
data_test

Unnamed: 0,통관지세관부호,수입통관계획코드,해외거래처부호,특송업체부호,수입신고구분코드,수입거래구분코드,수입종류코드,징수형태코드,운송수단유형코드,반입보세구역부호,HS10단위부호,적출국가코드,원산지국가코드,관세율구분코드,관세율,신고중량(KG),과세가격원화금액
0,6,4,100,63,1,3,4,2,3,9,34,81,95,0,64,3731.9,2.755287e+06
1,13,3,3033,63,1,23,4,2,3,14,760,81,95,0,0,6406.5,8.517864e+05
2,13,1,3033,63,1,0,4,2,3,14,99,81,95,0,37,5824.9,6.532926e+04
3,4,1,3033,63,1,0,4,2,3,6,58,81,95,0,14,3798.3,1.028587e+06
4,0,1,3033,63,1,0,4,2,3,0,676,83,97,29,0,3795.7,1.677178e+06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10268,16,3,2603,85,2,6,6,7,3,9,737,15,19,13,15,3063.2,2.843916e+04
10269,6,2,1920,80,1,16,7,2,0,6,180,15,19,13,13,1595.7,3.654290e+06
10270,13,6,2290,63,1,0,4,5,3,13,492,81,95,1,0,2432.3,2.926935e+06
10271,9,1,68,63,1,0,4,10,0,14,731,15,19,1,0,8484.3,1.534544e+05


# 전처리 완료 모델 학습 진행

In [12]:
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
import xgboost as xgb
from lightgbm import LGBMClassifier

from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix

# 우범화물(데이터 학습)

In [13]:
X=data
y=train_data['우범여부']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                     random_state=1000)

In [14]:
lgbm_wrapper = LGBMClassifier(random_state=1000,n_estimators=800,num_leaves=10, learning_rate=0.05, max_depth=5, min_child_samples=40,boost_from_average=False)

evals = [(X_test, y_test)]
lgbm_wrapper.fit(X_train, y_train, early_stopping_rounds=100, eval_metric="logloss", eval_set=evals, verbose=True)

preds2 = lgbm_wrapper.predict(X_test)
lgbm_pred_proba = lgbm_wrapper.predict_proba(X_test)[:, 1]

[1]	valid_0's binary_logloss: 0.673209
Training until validation scores don't improve for 100 rounds
[2]	valid_0's binary_logloss: 0.655169
[3]	valid_0's binary_logloss: 0.638786
[4]	valid_0's binary_logloss: 0.623866
[5]	valid_0's binary_logloss: 0.610269
[6]	valid_0's binary_logloss: 0.597663
[7]	valid_0's binary_logloss: 0.586265
[8]	valid_0's binary_logloss: 0.575792
[9]	valid_0's binary_logloss: 0.565885
[10]	valid_0's binary_logloss: 0.556868
[11]	valid_0's binary_logloss: 0.548684
[12]	valid_0's binary_logloss: 0.540886
[13]	valid_0's binary_logloss: 0.533888
[14]	valid_0's binary_logloss: 0.527221
[15]	valid_0's binary_logloss: 0.521244
[16]	valid_0's binary_logloss: 0.515521
[17]	valid_0's binary_logloss: 0.510221
[18]	valid_0's binary_logloss: 0.505255
[19]	valid_0's binary_logloss: 0.500808
[20]	valid_0's binary_logloss: 0.496462
[21]	valid_0's binary_logloss: 0.492469
[22]	valid_0's binary_logloss: 0.488862
[23]	valid_0's binary_logloss: 0.484939
[24]	valid_0's binary_loglo

[205]	valid_0's binary_logloss: 0.411194
[206]	valid_0's binary_logloss: 0.411161
[207]	valid_0's binary_logloss: 0.411114
[208]	valid_0's binary_logloss: 0.411076
[209]	valid_0's binary_logloss: 0.411063
[210]	valid_0's binary_logloss: 0.411061
[211]	valid_0's binary_logloss: 0.411004
[212]	valid_0's binary_logloss: 0.411008
[213]	valid_0's binary_logloss: 0.41101
[214]	valid_0's binary_logloss: 0.411
[215]	valid_0's binary_logloss: 0.410974
[216]	valid_0's binary_logloss: 0.410945
[217]	valid_0's binary_logloss: 0.410949
[218]	valid_0's binary_logloss: 0.410927
[219]	valid_0's binary_logloss: 0.410899
[220]	valid_0's binary_logloss: 0.410931
[221]	valid_0's binary_logloss: 0.4109
[222]	valid_0's binary_logloss: 0.410898
[223]	valid_0's binary_logloss: 0.410891
[224]	valid_0's binary_logloss: 0.41084
[225]	valid_0's binary_logloss: 0.410835
[226]	valid_0's binary_logloss: 0.410833
[227]	valid_0's binary_logloss: 0.410829
[228]	valid_0's binary_logloss: 0.410817
[229]	valid_0's binary_

[427]	valid_0's binary_logloss: 0.409385
[428]	valid_0's binary_logloss: 0.409381
[429]	valid_0's binary_logloss: 0.409373
[430]	valid_0's binary_logloss: 0.409363
[431]	valid_0's binary_logloss: 0.409364
[432]	valid_0's binary_logloss: 0.409362
[433]	valid_0's binary_logloss: 0.409355
[434]	valid_0's binary_logloss: 0.409345
[435]	valid_0's binary_logloss: 0.409338
[436]	valid_0's binary_logloss: 0.409328
[437]	valid_0's binary_logloss: 0.409331
[438]	valid_0's binary_logloss: 0.409336
[439]	valid_0's binary_logloss: 0.409325
[440]	valid_0's binary_logloss: 0.409319
[441]	valid_0's binary_logloss: 0.409324
[442]	valid_0's binary_logloss: 0.409322
[443]	valid_0's binary_logloss: 0.40932
[444]	valid_0's binary_logloss: 0.409332
[445]	valid_0's binary_logloss: 0.409324
[446]	valid_0's binary_logloss: 0.409314
[447]	valid_0's binary_logloss: 0.409326
[448]	valid_0's binary_logloss: 0.409341
[449]	valid_0's binary_logloss: 0.409305
[450]	valid_0's binary_logloss: 0.409289
[451]	valid_0's b

[634]	valid_0's binary_logloss: 0.409011
[635]	valid_0's binary_logloss: 0.409025
[636]	valid_0's binary_logloss: 0.409034
[637]	valid_0's binary_logloss: 0.409032
[638]	valid_0's binary_logloss: 0.409023
[639]	valid_0's binary_logloss: 0.409021
[640]	valid_0's binary_logloss: 0.40902
[641]	valid_0's binary_logloss: 0.409016
[642]	valid_0's binary_logloss: 0.409033
[643]	valid_0's binary_logloss: 0.409029
[644]	valid_0's binary_logloss: 0.409037
[645]	valid_0's binary_logloss: 0.409043
[646]	valid_0's binary_logloss: 0.409034
[647]	valid_0's binary_logloss: 0.409041
[648]	valid_0's binary_logloss: 0.409043
[649]	valid_0's binary_logloss: 0.409047
[650]	valid_0's binary_logloss: 0.409025
[651]	valid_0's binary_logloss: 0.409021
[652]	valid_0's binary_logloss: 0.409015
[653]	valid_0's binary_logloss: 0.409003
[654]	valid_0's binary_logloss: 0.409013
[655]	valid_0's binary_logloss: 0.409006
[656]	valid_0's binary_logloss: 0.409008
[657]	valid_0's binary_logloss: 0.408997
[658]	valid_0's b

In [15]:
print(lgbm_pred_proba[:30])
print(preds2[:30])

[0.70554831 0.14017388 0.0446848  0.23544447 0.15238139 0.48769656
 0.03255247 0.21805158 0.34835772 0.46469868 0.03356692 0.08579282
 0.3088973  0.12708128 0.24105806 0.0825426  0.13099319 0.33181245
 0.20571435 0.07031026 0.01871669 0.01838624 0.05264    0.01215355
 0.33156505 0.19537804 0.16506069 0.04931082 0.45927366 0.06045014]
[1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [16]:
preds = [ 1 if x > 0.26 else 0 for x in lgbm_pred_proba]
print(preds[:30])

[1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0]


In [17]:
confusion = confusion_matrix(y_test, preds)
accuracy = accuracy_score(y_test, preds)
precision = precision_score(y_test, preds)
recall = recall_score(y_test, preds)
f1 = f1_score(y_test, preds)
#roc_auc = roc_auc_score(y_test, pred_proba)

print(confusion)
print('정확도: {0:.4f}, 정밀도: {1:.4f}, 재현율: {2:.4f}, F1: {3:.4f}'.format(accuracy, precision, recall, f1))

[[10255  3424]
 [ 1002  2751]]
정확도: 0.7461, 정밀도: 0.4455, 재현율: 0.7330, F1: 0.5542


# 우범화물 테스트데이터 예측(제출용)

In [18]:
# 테스트데이터 예측값 추출 #
pred_data_test = lgbm_wrapper.predict_proba(data_test)[:, 1]
pred_data_test

array([0.27280346, 0.08043896, 0.010364  , ..., 0.47899908, 0.19287385,
       0.46891582])

In [19]:
# 분류 기준 정하기 #
pred_data_test = [ 1 if x > 0.26 else 0 for x in pred_data_test]
pred_data_test[:20]

[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0]

# 핵심적발(데이터 학습-시간 좀 걸림 이거 기다렸다가 아래 돌려)

In [20]:
### XGBoost 하는중.......####
### 파라미터 ###

y=train_data['핵심적발']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, 
                                                     random_state=1000)


dtrain = xgb.DMatrix(data=X_train, label=y_train)
dtest = xgb.DMatrix(data=X_test, label=y_test)

params = {
    'max_depth' : 3,
    'eta' : 0.1,
    'objective' : 'multi:softprob',
    'num_class':3,
    'eval_metric' : 'mlogloss'
}
num_rounds=400

wlist = [(dtrain, 'train'), (dtest, 'eval')]
xgb_model = xgb.train(params=params, dtrain=dtrain, num_boost_round=num_rounds,
                     early_stopping_rounds=100, evals=wlist)

[0]	train-mlogloss:1.02640	eval-mlogloss:1.02640
[1]	train-mlogloss:0.96627	eval-mlogloss:0.96630
[2]	train-mlogloss:0.91556	eval-mlogloss:0.91564
[3]	train-mlogloss:0.87264	eval-mlogloss:0.87275
[4]	train-mlogloss:0.83585	eval-mlogloss:0.83592
[5]	train-mlogloss:0.80442	eval-mlogloss:0.80439
[6]	train-mlogloss:0.77732	eval-mlogloss:0.77726
[7]	train-mlogloss:0.75385	eval-mlogloss:0.75371
[8]	train-mlogloss:0.73364	eval-mlogloss:0.73344
[9]	train-mlogloss:0.71554	eval-mlogloss:0.71528
[10]	train-mlogloss:0.70018	eval-mlogloss:0.69979
[11]	train-mlogloss:0.68628	eval-mlogloss:0.68579
[12]	train-mlogloss:0.67456	eval-mlogloss:0.67393
[13]	train-mlogloss:0.66442	eval-mlogloss:0.66371
[14]	train-mlogloss:0.65484	eval-mlogloss:0.65405
[15]	train-mlogloss:0.64703	eval-mlogloss:0.64610
[16]	train-mlogloss:0.64021	eval-mlogloss:0.63913
[17]	train-mlogloss:0.63375	eval-mlogloss:0.63272
[18]	train-mlogloss:0.62788	eval-mlogloss:0.62688
[19]	train-mlogloss:0.62305	eval-mlogloss:0.62192
[20]	train

[163]	train-mlogloss:0.55519	eval-mlogloss:0.56166
[164]	train-mlogloss:0.55510	eval-mlogloss:0.56165
[165]	train-mlogloss:0.55500	eval-mlogloss:0.56158
[166]	train-mlogloss:0.55491	eval-mlogloss:0.56152
[167]	train-mlogloss:0.55483	eval-mlogloss:0.56149
[168]	train-mlogloss:0.55475	eval-mlogloss:0.56150
[169]	train-mlogloss:0.55470	eval-mlogloss:0.56149
[170]	train-mlogloss:0.55462	eval-mlogloss:0.56147
[171]	train-mlogloss:0.55456	eval-mlogloss:0.56145
[172]	train-mlogloss:0.55448	eval-mlogloss:0.56142
[173]	train-mlogloss:0.55443	eval-mlogloss:0.56144
[174]	train-mlogloss:0.55436	eval-mlogloss:0.56142
[175]	train-mlogloss:0.55428	eval-mlogloss:0.56137
[176]	train-mlogloss:0.55422	eval-mlogloss:0.56136
[177]	train-mlogloss:0.55416	eval-mlogloss:0.56133
[178]	train-mlogloss:0.55409	eval-mlogloss:0.56132
[179]	train-mlogloss:0.55402	eval-mlogloss:0.56132
[180]	train-mlogloss:0.55396	eval-mlogloss:0.56133
[181]	train-mlogloss:0.55389	eval-mlogloss:0.56131
[182]	train-mlogloss:0.55384	ev

[324]	train-mlogloss:0.54505	eval-mlogloss:0.56007
[325]	train-mlogloss:0.54499	eval-mlogloss:0.56005
[326]	train-mlogloss:0.54492	eval-mlogloss:0.56004
[327]	train-mlogloss:0.54486	eval-mlogloss:0.56002
[328]	train-mlogloss:0.54482	eval-mlogloss:0.56004
[329]	train-mlogloss:0.54476	eval-mlogloss:0.56005
[330]	train-mlogloss:0.54471	eval-mlogloss:0.56005
[331]	train-mlogloss:0.54467	eval-mlogloss:0.56006
[332]	train-mlogloss:0.54462	eval-mlogloss:0.56008
[333]	train-mlogloss:0.54456	eval-mlogloss:0.56010
[334]	train-mlogloss:0.54450	eval-mlogloss:0.56009
[335]	train-mlogloss:0.54445	eval-mlogloss:0.56009
[336]	train-mlogloss:0.54442	eval-mlogloss:0.56009
[337]	train-mlogloss:0.54436	eval-mlogloss:0.56009
[338]	train-mlogloss:0.54431	eval-mlogloss:0.56010
[339]	train-mlogloss:0.54426	eval-mlogloss:0.56009
[340]	train-mlogloss:0.54422	eval-mlogloss:0.56009
[341]	train-mlogloss:0.54416	eval-mlogloss:0.56010
[342]	train-mlogloss:0.54407	eval-mlogloss:0.56007
[343]	train-mlogloss:0.54400	ev

In [21]:
pred_probs = xgb_model.predict(dtest)
print('predict() 수행 결과값 10개 표시, 예측 확률값로 표시된다.')
print(np.round(pred_probs[:10], 3))

predict() 수행 결과값 10개 표시, 예측 확률값로 표시된다.
[[0.176 0.289 0.535]
 [0.874 0.068 0.059]
 [0.958 0.026 0.015]
 [0.769 0.118 0.113]
 [0.852 0.074 0.074]
 [0.506 0.24  0.255]
 [0.96  0.024 0.016]
 [0.771 0.112 0.117]
 [0.63  0.203 0.167]
 [0.553 0.205 0.242]]


In [22]:
pred=[]
for x in pred_probs:
    if (x[1]+x[2]<0.28):
        pred.append(0)
    elif (x[2]>0.25):
        pred.append(2)
    else:
        pred.append(1)
        
pred
pred=pd.DataFrame(pred)

In [23]:
print("F1",f1_score(y_test,pred,average='macro'))

F1 0.4733843923004462


# 핵심적발 테스트데이터 예측(제출용)

In [24]:
data_test_핵심_DM = xgb.DMatrix(data=data_test)

In [25]:
pred_data_test_핵심 = xgb_model.predict(data_test_핵심_DM)
pred_data_test_핵심

array([[0.75022   , 0.14153579, 0.10824423],
       [0.9382437 , 0.04130615, 0.02045017],
       [0.9863592 , 0.00741659, 0.00622422],
       ...,
       [0.55988306, 0.2197143 , 0.22040267],
       [0.8314261 , 0.06627096, 0.10230298],
       [0.48605558, 0.262658  , 0.25128645]], dtype=float32)

In [26]:
pred_data_test_핵심적발 = []
for x in pred_data_test_핵심:
    if (x[1]+x[2]<0.28):
        pred_data_test_핵심적발.append(0)
    elif (x[2]>0.25):
        pred_data_test_핵심적발.append(2)
    else:
        pred_data_test_핵심적발.append(1)
pred_data_test_핵심적발[:20]

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 1, 0, 0, 1, 0, 2, 0]

# 제출 파일 만들기

In [27]:
a = np.array([pred_data_test, pred_data_test_핵심적발])

In [28]:
pred_df = pd.DataFrame({'우범여부':a[0], '핵심적발':a[1]})
pred_df


Unnamed: 0,우범여부,핵심적발
0,1,0
1,0,0
2,0,0
3,0,0
4,0,0
...,...,...
10268,1,2
10269,1,2
10270,1,1
10271,0,0


In [29]:
dfa = pd.DataFrame(test_data['신고번호'])
final_df = dfa.join(pred_df)
final_df

Unnamed: 0,신고번호,우범여부,핵심적발
0,982834,1,0
1,828961,0,0
2,522066,0,0
3,999547,0,0
4,919320,0,0
...,...,...,...
10268,857321,1,2
10269,621501,1,2
10270,998367,1,1
10271,697376,0,0


In [30]:
# 파일 저장 #
final_df.to_csv('숭파고.csv', index=None, encoding='utf-8')