# 필요 패키지 import

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
from sklearn.model_selection import train_test_split, GridSearchCV

from xgboost import plot_importance, XGBClassifier


import warnings
# Silence every warning for the rest of the session.
warnings.filterwarnings('ignore')

# Remove pandas' column/row display limits.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

In [2]:
import os

# Show the absolute path of the current working directory.
os.path.abspath(os.curdir)

'C:\\Users\\user\\projects\\ml\\dacon\\dacon\\job_care_recommendation'

# 데이터 불러오기

In [3]:
# Load the train and test CSV files from ./data into DataFrames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('./data/train.csv', './data/test.csv')
)

# 날짜 데이터 추가 (월, 일)

In [4]:
# Add a 'month' column to both frames, taken from the parsed contents_open_dt.
for frame in (train, test):
    frame.loc[:, 'month'] = pd.to_datetime(frame['contents_open_dt']).dt.month

In [5]:
# Add a 'day' (day-of-month) column to both frames from the parsed contents_open_dt.
for frame in (train, test):
    frame.loc[:, 'day'] = pd.to_datetime(frame['contents_open_dt']).dt.day

# 필요 데이터 추출

모델 학습시 id는 제외해야하고

베이스라인에선 contents_open_dt를 제거하고 진행하였습니다

In [6]:
# Drop the 'id' and 'contents_open_dt' columns from both frames.
# drop() returns a new frame, so both names are rebound to the results.
train, test = (
    frame.drop(columns=['id', 'contents_open_dt'])
    for frame in (train, test)
)

# 데이터 전처리

## 값이 1개인 변수(column) drop
- class가 하나만 있는 변수 drop

19. feature person_prefer_h_3 (0.009)  
20. feature person_attribute_a_1 (0.009)  
21. feature contents_attribute_a (0.005)  
22. feature h_m_match_yn (0.005)  
23. feature person_prefer_c (0.004)  
24. feature person_attribute_b (0.004)  
25. feature h_s_match_yn (0.003)  
26. feature contents_attribute_i (0.002)  
27. feature contents_attribute_k (0.001)  
28. feature person_attribute_a (0.001)  

- person_rn: 사용자 번호  
- contents_rn: 컨텐츠 번호

In [6]:
# Features to drop because they hold a single distinct value or too many.
drop_list = [
    'person_prefer_f',
    'person_prefer_g',
    'person_rn',
    'contents_rn',
]

In [7]:
# Features to drop because of their low importance scores.
low_importances = [
    'person_attribute_a_1',
    'contents_attribute_a',
    'h_m_match_yn',
    'person_prefer_c',
    'person_attribute_b',
    'h_s_match_yn',
    'contents_attribute_i',
    'contents_attribute_k',
    'person_attribute_a',
]

In [8]:
# (rows, columns) of train before the feature drops below.
train.shape

(501951, 37)

In [9]:
# (rows, columns) of test before the feature drops below.
test.shape

(46404, 36)

In [10]:
# Remove both column groups from each frame in place.
for frame in (train, test):
    for unwanted_columns in (drop_list, low_importances):
        frame.drop(columns=unwanted_columns, inplace=True)

In [11]:
# (rows, columns) of train after drop_list and low_importances were removed.
train.shape

(501951, 24)

In [None]:
# (rows, columns) of test after drop_list and low_importances were removed.
test.shape

# 데이터 분리

In [None]:
# Separate the predictors from the label column.
X = train.drop('target', axis=1)
y = train.loc[:, 'target']

# test.csv carries no 'target' column (it has one column fewer than train),
# so it cannot be used as a labelled evaluation set. Hold out a stratified
# slice of the training data instead; X_test / y_test are that hold-out.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 모델 학습

RandomForest를 이용한 학습

## RandomForest 

In [None]:
# Random forest with 300 trees, each capped at depth 6.
forest_settings = dict(n_estimators=300, max_depth=6)
model = RandomForestClassifier(**forest_settings)

# Fit on the training features/labels.
model.fit(X_train, y_train)

## xgboost

In [None]:
# XGBoost classifier: 500 boosting rounds, learning rate 0.1, tree depth up to 8.
xgb = XGBClassifier(
    max_depth=8,
    learning_rate=0.1,
    n_estimators=500,
)

In [None]:
# Fit the XGBoost classifier on the training features/labels.
xgb.fit(X=X_train, y=y_train)

### 하이퍼파라미터 튜닝

In [None]:
# Use a dedicated, unfitted estimator as the grid-search template. Rebinding
# `xgb` here would replace the classifier fitted earlier with an unfitted one
# (GridSearchCV fits clones, not the estimator it is handed), which would
# break the later `xgb.predict(...)` calls.
xgb_base = XGBClassifier()
xgb_param_grid = {
    'n_estimators': [100, 200, 400, 600],
    'learning_rate': [0.01, 0.05, 0.1, 0.15],
    'max_depth': [4, 6, 8, 10, 12],
}

# Exhaustive search over the grid, scored by accuracy, on all CPU cores.
# With GridSearchCV's default refit=True the best configuration is refit on
# the data passed to fit(), so `xgb_grid.predict` is usable afterwards.
xgb_grid = GridSearchCV(
    xgb_base,
    param_grid=xgb_param_grid,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1,
)
xgb_grid.fit(X_train, y_train)

## 모델 검증

In [None]:
def get_metrics(y_test, pred):
    """Print accuracy, precision, recall and F1 for a set of predictions.

    Args:
        y_test: Ground-truth labels.
        pred: Predicted labels to compare against ``y_test``.

    Returns:
        None. Each score is printed on its own line, in the order
        accuracy, precision, recall, F1.
    """
    # (printed label, scoring function) pairs, evaluated in this order.
    labelled_scorers = (
        ('정확도: ', accuracy_score),    # accuracy
        ('정확성: ', precision_score),   # precision
        ('재현율: ', recall_score),      # recall
        ('f1 score: ', f1_score),       # F1
    )
    for label, scorer in labelled_scorers:
        print(label, scorer(y_test, pred))

### RandomForest

In [None]:
# Predict with the random forest on the evaluation features and report metrics.
pred = model.predict(X=X_test)

get_metrics(y_test=y_test, pred=pred)

In [None]:
# Prints the metrics again for the current `pred` (same call as the previous cell).
get_metrics(y_test, pred)

### xgboost

In [None]:
# Predict with the XGBoost classifier on the evaluation features and report metrics.
pred = xgb.predict(X=X_test)

get_metrics(y_test=y_test, pred=pred)

# feature importance

In [None]:
# Per-feature importance scores of the fitted random forest (`model`).
importances = model.feature_importances_

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Spread of each feature's importance across the individual trees (error bars).
std = np.std([tree.feature_importances_ for tree in model.estimators_], axis=0)
# Feature positions ordered from most to least important.
indices = np.argsort(importances)[::-1]

# `model` was fitted on X_train, so X_train supplies the feature names/count.
feature_names = X_train.columns[indices]
n_features = X_train.shape[1]

print('Feature Ranking: ')

for rank, (name, score) in enumerate(zip(feature_names, importances[indices]), start=1):
    print("{}. feature {} ({:.3f})".format(rank, name, score))


# Bar chart of the importances in ranked order, with per-tree std as error bars.
plt.figure(figsize=(20, 10))
plt.title('Feature importances')
plt.bar(range(n_features), importances[indices],
        color='r', yerr=std[indices], align='center')
plt.xticks(range(n_features), feature_names, rotation=45, fontsize=20)
plt.xlim([-1, n_features])
plt.show()

# Test set 예측

In [None]:
# Random-forest predictions for the test frame.
preds = model.predict(X=test)

In [None]:
# `test` has already had the unused features dropped in place, and `xgb` was
# fitted on X_train, so select exactly X_train's columns (same set and order)
# rather than dropping anything further.
test2 = test[X_train.columns]
preds2 = xgb.predict(test2)

In [None]:
# Predictions for the test frame from the fitted grid search object.
preds3 = xgb_grid.predict(X=test)

# 제출파일 생성

In [None]:
# Baseline submission built from the random-forest predictions (`preds`).
# The sample file is read from ./data, where the other input CSVs live.
submission = pd.read_csv('./data/sample_submission.csv')
submission['target'] = preds

submission.to_csv('baseline.csv', index=False)

In [None]:
# Submission from the XGBoost predictions: set the sample file's 'target' to preds2.
submission = pd.read_csv('./data/sample_submission.csv').assign(target=preds2)
submission.to_csv('result1.csv', index=False)

In [None]:
# Submission from the grid-search predictions: set the sample file's 'target' to preds3.
submission = pd.read_csv('./data/sample_submission.csv').assign(target=preds3)
submission.to_csv('result2.csv', index=False)