In [None]:
# Mount Google Drive at /content/drive; the CSV paths used later in this notebook live under it.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
from glob import glob
from tqdm import tqdm
import warnings

# Silence every library warning for the remainder of the session.
warnings.filterwarnings(action='ignore')

# Load one DataFrame per training CSV; sorting the paths gives a stable, filename-based order.
train_paths = sorted(glob('/content/drive/MyDrive/시큐레이어/KNOW job_train/*.csv'))
know_train = [pd.read_csv(csv_path) for csv_path in train_paths]

In [None]:
# Replace cells that hold exactly one space with the string '0' in every training frame.
# The replacement is done on the frame itself rather than on `df[col]`: calling an
# in-place method on a column selection is a chained operation that does not write
# back to `df` when pandas Copy-on-Write is active.
for df in know_train:
    df.replace(' ', '0', inplace=True)

## 라벨 인코딩

In [None]:
from sklearn.preprocessing import LabelEncoder
years = ['2017', '2018', '2019', '2020']

# year -> {column name -> fitted LabelEncoder}; kept so the test set can reuse the same codes.
year_encoder = {}

for year, df in zip(years, know_train):
    print(year)
    encoders = {}

    for col in df.columns:
        if col == 'ID':
            continue

        try:
            # Columns whose every value converts with int() are stored as plain integers.
            df[col] = df[col].map(int)
        except (ValueError, TypeError, OverflowError):
            # Only conversion failures select the label-encoding path; anything else
            # (e.g. KeyboardInterrupt or a genuine bug) propagates instead of being hidden.
            encoder = LabelEncoder()
            df[col] = encoder.fit_transform(df[col].map(str))
            encoders[col] = encoder

    year_encoder[year] = encoders

2017
2018
2019
2020


In [None]:
# Per-year features and target: 'X' is every column between the first (ID) and the last,
# 'y' is the last column.
train_data = {
    year: {'X': df.iloc[:, 1:-1], 'y': df.iloc[:, -1]}
    for year, df in zip(years, know_train)
}

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier

# Template base learners for the hard-voting ensemble. VotingClassifier fits clones of
# the estimators it is given, so the same three objects can be shared across years.
clf1 = LogisticRegression()
clf2 = RandomForestClassifier(n_estimators=100, random_state=1)
# Seeded so that repeated runs of the notebook train the same tree.
clf3 = DecisionTreeClassifier(random_state=1)

# Fit one independent ensemble per survey year and keep it under that year's key.
models = {}
for year in tqdm(years):
    model = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('dt', clf3)], voting='hard')
    model.fit(train_data[year]['X'], train_data[year]['y'])
    models[year] = model

100%|██████████| 4/4 [04:50<00:00, 72.60s/it]


## Testset


In [None]:
# Load one DataFrame per test CSV, in sorted path order.
test_paths = sorted(glob('/content/drive/MyDrive/시큐레이어/KNOW job_test/*.csv'))
know_test = [pd.read_csv(csv_path) for csv_path in test_paths]
know_test[0].head()  # 2017 test sample

Unnamed: 0,idx,aq1_1,aq1_2,aq2_1,aq2_2,aq3_1,aq3_2,aq4_1,aq4_2,aq5_1,aq5_2,aq6_1,aq6_2,aq7_1,aq7_2,aq8_1,aq8_2,aq9_1,aq9_2,aq10_1,aq10_2,aq11_1,aq11_2,aq12_1,aq12_2,aq13_1,aq13_2,aq14_1,aq14_2,aq15_1,aq15_2,aq16_1,aq16_2,aq17_1,aq17_2,aq18_1,aq18_2,aq19_1,aq19_2,aq20_1,...,bq18_3,bq18_4,bq18_5,bq18_6,bq18_7,bq19,bq19_1,bq20,bq21,bq22,bq23,bq24_1,bq24_2,bq24_3,bq24_4,bq24_5,bq24_6,bq24_7,bq24_8,bq25,bq26,bq27,bq28,bq29,bq30,bq31,bq32,bq33,bq34,bq35,bq36,bq37,bq38,bq38_1,bq39_1,bq39_2,bq40,bq41_1,bq41_2,bq41_3
0,0,3,4,2,2,3,3,1,,3,5.0,1,,3,4,3,4,3,3,4,5,2,2.0,2,3.0,2,3,4,6.0,4,6,1,,1,,1,,4,5,1,...,3,4,3,2,3,3,다른것으로 대체할 수 없는 업무,2,3,1,0.0,2,2,2,2,2,2,2,2,3,4,3,2,1,없다,컴퓨터,없다,없다,없다,2,2,26,3,비서학,1,1,1,3000,,2300
1,1,5,5,3,5,5,5,5,5.0,4,5.0,4,5.0,5,5,5,5,4,6,4,5,4,5.0,4,4.0,4,5,4,5.0,4,5,4,6.0,5,6.0,4,5.0,5,4,5,...,2,4,3,4,4,2,제조업이 줄어들 것으로 생각되기 때문,2,2,4,30.0,2,2,2,2,2,2,2,2,4,5,4,3,1,없다,"제품검사시스템,PC,엑셀",없다,없다,,3,1,57,4,농화학,1,1,1,5500,,2500
2,2,5,5,5,4,5,4,1,,1,,3,4.0,3,4,3,4,4,4,3,4,5,4.0,5,3.0,4,4,5,6.0,4,5,1,,1,,1,,3,4,1,...,3,3,2,2,3,2,1인 미디어 증가,2,3,1,10.0,1,1,1,1,2,2,2,2,4,5,4,2,2,없다,"오디션(편집프로그램), 나홀로(방송진행장비)",없다,"광선, 홍보 담당자","1인 미디어, 팟캐스트 제작/진행자",3,1,31,4,신문방송,1,1,1,4300,,4000
3,3,4,5,5,6,4,6,3,4.0,4,5.0,4,6.0,4,5,4,6,4,5,4,5,1,,3,4.0,4,5,1,,4,5,4,5.0,4,5.0,2,3.0,4,5,1,...,2,2,2,3,3,3,선호직업 아님,2,3,1,2.0,1,1,1,2,2,2,2,2,4,4,3,4,3,없다,컴퓨터,없다,없다,없다,2,1,35,6,화학,1,1,1,4100,,3000
4,4,5,6,4,5,4,5,1,,1,,1,,3,3,5,6,5,7,3,3,1,,1,,3,3,1,,3,3,1,,1,,1,,4,4,1,...,2,3,3,2,3,3,잡지를 위한 정보나 뉴스는 고갈되지 않기 때문에 꾸준히 유지될 것이다,2,3,1,15.0,2,1,2,2,2,2,2,2,4,3,3,3,3,편집기자,"뉴스, 원고, PC",사진작가,리포터,없다,3,1,36,4,광고홍보,1,1,1,2800,,2000


In [None]:
# Replace cells that hold exactly one space with the string '0' in every test frame.
# The replacement is done on the frame itself rather than on `df[col]`: calling an
# in-place method on a column selection is a chained operation that does not write
# back to `df` when pandas Copy-on-Write is active.
for df in know_test:
    df.replace(' ', '0', inplace=True)

In [None]:
years = ['2017', '2018', '2019', '2020']

for year, df in zip(years, know_test):
    print(year)
    # Encoders fitted on the same year's training frame (column name -> LabelEncoder).
    fitted_encoders = year_encoder[year]

    for col in df.columns:
        encoder = fitted_encoders.get(col)

        if encoder is None:
            # No encoder was fitted for this column, so it is treated as integer-valued
            # here as well. A value that is not int-convertible raises instead of being
            # encoded differently from the training data.
            df[col] = df[col].map(int)
            continue

        # The column was label-encoded on train: apply the same category -> code mapping,
        # even if the test values would happen to convert with int().
        category_map = {category: idx for idx, category in enumerate(encoder.classes_)}
        # Categories never seen in the training set become -1 (UNK).
        df[col] = df[col].map(str).map(category_map).fillna(-1).astype(int)

2017
2018
2019
2020


### 데이터 처리 및 라벨인코딩 후 

In [None]:
# Frequency of each value in column 'aq1_1' of the first training frame.
first_train = know_train[0]
first_train['aq1_1'].value_counts()

3    3116
4    3007
2    1706
5    1072
1     585
Name: aq1_1, dtype: int64

In [None]:
# 2017 test sample, displayed again after the preprocessing cells above have run.
first_test = know_test[0]
first_test.head()

Unnamed: 0,idx,aq1_1,aq1_2,aq2_1,aq2_2,aq3_1,aq3_2,aq4_1,aq4_2,aq5_1,aq5_2,aq6_1,aq6_2,aq7_1,aq7_2,aq8_1,aq8_2,aq9_1,aq9_2,aq10_1,aq10_2,aq11_1,aq11_2,aq12_1,aq12_2,aq13_1,aq13_2,aq14_1,aq14_2,aq15_1,aq15_2,aq16_1,aq16_2,aq17_1,aq17_2,aq18_1,aq18_2,aq19_1,aq19_2,aq20_1,...,bq18_3,bq18_4,bq18_5,bq18_6,bq18_7,bq19,bq19_1,bq20,bq21,bq22,bq23,bq24_1,bq24_2,bq24_3,bq24_4,bq24_5,bq24_6,bq24_7,bq24_8,bq25,bq26,bq27,bq28,bq29,bq30,bq31,bq32,bq33,bq34,bq35,bq36,bq37,bq38,bq38_1,bq39_1,bq39_2,bq40,bq41_1,bq41_2,bq41_3
0,0,3,4,2,2,3,3,1,0,3,5,1,0,3,4,3,4,3,3,4,5,2,2,2,3,2,3,4,6,4,6,1,0,1,0,1,0,4,5,1,...,3,4,3,2,3,3,-1,2,3,1,0,2,2,2,2,2,2,2,2,3,4,3,2,1,717,5356,836,788,186,2,2,26,3,497,1,1,1,3000,0,2300
1,1,5,5,3,5,5,5,5,5,4,5,4,5,5,5,5,5,4,6,4,5,4,5,4,4,4,5,4,5,4,5,4,6,5,6,4,5,5,4,5,...,2,4,3,4,4,2,-1,2,2,4,30,2,2,2,2,2,2,2,2,4,5,4,3,1,717,-1,836,788,0,3,1,57,4,287,1,1,1,5500,0,2500
2,2,5,5,5,4,5,4,1,0,1,0,3,4,3,4,3,4,4,4,3,4,5,4,5,3,4,4,5,6,4,5,1,0,1,0,1,0,3,4,1,...,3,3,2,2,3,2,-1,2,3,1,10,1,1,1,1,2,2,2,2,4,5,4,2,2,717,-1,836,-1,-1,3,1,31,4,705,1,1,1,4300,0,4000
3,3,4,5,5,6,4,6,3,4,4,5,4,6,4,5,4,6,4,5,4,5,1,0,3,4,4,5,1,0,4,5,4,5,4,5,2,3,4,5,1,...,2,2,2,3,3,3,-1,2,3,1,2,1,1,1,2,2,2,2,2,4,4,3,4,3,717,5356,836,788,186,2,1,35,6,1423,1,1,1,4100,0,3000
4,4,5,6,4,5,4,5,1,0,1,0,1,0,3,3,5,6,5,7,3,3,1,0,1,0,3,3,1,0,3,3,1,0,1,0,1,0,4,4,1,...,2,3,3,2,3,3,-1,2,3,1,15,2,1,2,2,2,2,2,2,4,3,3,3,3,-1,-1,-1,402,186,3,1,36,4,141,1,1,1,2800,0,2000


## 테스트셋 추출 및 예측

In [None]:
# Per-year test features: every column except the first one.
test_data = {
    year: {'X': df.iloc[:, 1:]}
    for year, df in zip(years, know_test)
}

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier

# Predict each year's test features with the ensemble stored for that year in `models`,
# and concatenate the results in `years` order. No estimator is created here, so the
# fitted objects from the training cell are left untouched.
predicts = []
for year in tqdm(years):
    pred = models[year].predict(test_data[year]['X'])
    predicts.extend(pred)

100%|██████████| 4/4 [00:12<00:00,  3.20s/it]


# 제출

In [None]:
# Load the sample submission file.
sample_submission_path = 'sample_submission.csv'
submission = pd.read_csv(sample_submission_path)

In [None]:
# Store the predictions in the 'knowcode' column and write the result without the index.
submission = submission.assign(knowcode=predicts)
submission.to_csv('know.csv', index=False)