In [None]:
# 해당 파일은 python 3.10.9에서 작성 되었습니다.

### 데이터 전처리

In [None]:
import pandas as pd

from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read both membership CSV exports; the first CSV column becomes the row index.
# class0_data <- expired/withdrawn members file, class1_data <- regular members file.
class0_data, class1_data = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ('./datasets/만료및탈퇴회원.csv', './datasets/정회원.csv')
)
# Preview the first three rows of each frame side by side.
display(class0_data.head(3), class1_data.head(3))

In [None]:
## October records of members who withdrew in November -> churned-member samples
# User ids that appear in the expired/withdrawn table for month 11.
exit_user = class0_data[class0_data['mm'] == 11]['userid'].unique()
# Their month-10 rows from the regular-member table. .copy() makes this an
# independent frame: without it, the column assignment below writes to a slice of
# class1_data (chained assignment), which pandas flags with SettingWithCopyWarning
# -- a warning this notebook silences via warnings.filterwarnings('ignore').
exit_data = class1_data[(class1_data['userid'].isin(exit_user)) & (class1_data['mm'] == 10)].copy()

# Label every row of this subset as a member who is about to leave.
exit_data['target'] = 'WILL EXIT'

exit_data.head(3)

In [None]:
## October records of members who are still regular members in November -> retained samples
# User ids that appear in the regular-member table for month 11.
regular_user = class1_data[class1_data['mm'] == 11]['userid'].unique()
# Their month-10 rows from the same table. .copy() makes this an independent frame:
# without it, the column assignment below writes to a slice of class1_data (chained
# assignment), which pandas flags with SettingWithCopyWarning -- a warning this
# notebook silences via warnings.filterwarnings('ignore').
regular_data = class1_data[(class1_data['userid'].isin(regular_user)) & (class1_data['mm'] == 10)].copy()

# Label every row of this subset as a member who stays.
regular_data['target'] = 'WILL STAY'

regular_data.head(3)

In [None]:
# Stack the churned and retained October samples into one labelled dataset.
# The original row indices are kept (no ignore_index).
# NOTE(review): both subsets are drawn from class1_data, so a userid present in both
# the month-11 withdrawn list and the month-11 regular list would contribute the same
# October row twice with conflicting labels -- confirm the two user sets are disjoint.
dataset = pd.concat([exit_data, regular_data])

In [None]:
# "WILL STAY"는 1, "WILL EXIT"는 0으로 변환
# Explicit label -> integer mapping. The 1 -> 1 and 0 -> 0 entries make the cell
# idempotent: with the previous `1 if x == "WILL STAY" else 0` rule, running the cell
# a second time compared the already-encoded integers to the string and silently
# turned every label into 0.
target_encoding = {'WILL STAY': 1, 'WILL EXIT': 0, 1: 1, 0: 0}
# An unexpected label maps to NaN, so the int64 cast fails loudly instead of
# silently counting it as class 0.
dataset['target'] = dataset['target'].map(target_encoding).astype('int64')

In [None]:
# Inspect the column dtypes (and non-null counts) of the combined table
dataset.info()

In [None]:
# Missing-value count and percentage per column.
null_check_df = dataset
# One row per column: 'index' holds the column name, 'null_count' its number of NaNs.
null_cnt_df = null_check_df.isnull().sum().to_frame('null_count').reset_index()
# Share of missing rows, in percent, rounded to two decimals.
null_cnt_df['null_ratio'] = (null_cnt_df['null_count'] / len(null_check_df) * 100).round(2)
null_cnt_df

In [None]:

# For each wrong-answer counter, print the shape (row count, column count) of the
# subset of rows where that counter is exactly 0.
for zero_col in ('wrong_count', 'wrong_item_count', 'wrong_correct_count'):
    print('(rows, columns) :', dataset[dataset[zero_col] == 0].shape)

In [None]:
# List the distinct values observed in wrong_correct_count (including NaN, if any)
dataset['wrong_correct_count'].unique()

In [None]:
# Replace missing values in the three wrong-answer counters with the sentinel -1.
wrong_cols = ['wrong_count', 'wrong_item_count', 'wrong_correct_count']
dataset[wrong_cols] = dataset[wrong_cols].fillna(-1)

In [None]:
# Row count per class of the label column (class balance check)
dataset['target'].value_counts()

In [None]:
# Columns that must all be present for a row to be kept.
# (The three wrong_* counters were already filled with -1 above, so they no longer
# cause any row to be dropped here.)
required_cols = [
    'point_gain_activeday_count', 'point_gain_count', 'point_gain',
    'point_loss_activeday_count', 'point_loss_count', 'point_loss',
    'tablet_activeday_count', 'tablet_moved_menu_count', 'tablet_leave_count', 'tablet_resume_count',
    'tablet_login_count', 'tablet_logout_count', 'study_activeday_count', 'study_count',
    'study_notcompleted_count', 'study_completed_count',
    'study_restart_count', 'total_system_learning_time', 'total_caliper_learning_time',
    'media_activeday_count', 'media_count',
    'video_action_count', 'video_start_count', 'video_restart_count',
    'video_pause_count', 'video_jump_count', 'video_resume_count',
    'video_speed_count', 'video_volume_count', 'video_end_count',
    'test_activeday_count', 'test_count', 'test_average_score',
    'test_item_count', 'test_correct_count', 'wrong_count',
    'wrong_item_count', 'wrong_correct_count',
]
# Drop every row that has a missing value in any of the required columns.
dataset = dataset.dropna(subset = required_cols)

In [None]:
# Feature matrix: the activity counters used as model inputs, in a fixed column order.
feature_cols = [
    'point_gain_activeday_count', 'point_gain_count', 'point_gain',
    'point_loss_activeday_count', 'point_loss_count', 'point_loss',
    'tablet_activeday_count', 'tablet_moved_menu_count', 'tablet_leave_count', 'tablet_resume_count',
    'tablet_login_count', 'tablet_logout_count', 'study_activeday_count', 'study_count',
    'study_notcompleted_count', 'study_completed_count',
    'study_restart_count', 'total_system_learning_time', 'total_caliper_learning_time',
    'media_activeday_count', 'media_count',
    'video_action_count', 'video_start_count', 'video_restart_count',
    'video_pause_count', 'video_jump_count', 'video_resume_count',
    'video_speed_count', 'video_volume_count', 'video_end_count',
    'test_activeday_count', 'test_count', 'test_average_score',
    'test_item_count', 'test_correct_count', 'wrong_count',
    'wrong_item_count', 'wrong_correct_count',
]
X = dataset.loc[:, feature_cols]

In [None]:
# Label vector: the encoded target column (1 = WILL STAY, 0 = WILL EXIT).
Y = dataset.loc[:, 'target']

### 모델 적용

결정트리, 보팅, 배깅, 랜덤포레스트, 그래디언트 부스팅 모델을 사용해보고,

각 모델의 **정확도**, **재현율**, **정밀도**, **AUC**, **Confusion matrix**를 출력해 보세요.

#### Decision Tree(결정 트리)

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Stand-alone decision tree with default hyper-parameters; random_state is fixed
# so repeated runs build the same tree.
dt_clf = DecisionTreeClassifier(random_state = 156)
dt_clf

In [None]:
# Hold out 20% of the rows for evaluation; random_state fixes the shuffle.
# stratify=Y keeps the WILL STAY / WILL EXIT proportion identical in the train and
# test parts, so recall/precision on the test set are not skewed by an unlucky
# draw of the minority class.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size = 0.2, random_state = 11, stratify = Y
)

In [None]:
# Train the decision tree on the training split
dt_clf.fit(X_train , y_train)

In [None]:
import sklearn.metrics as mt

In [None]:
# Evaluate the decision tree on the hold-out split.
y_pred = dt_clf.predict(X_test)
# Predicted probability of the positive class (label 1 = WILL STAY). ROC AUC is a
# ranking metric and needs a continuous score, not the hard 0/1 predictions.
y_score = dt_clf.predict_proba(X_test)[:, 1]

accuracy = mt.accuracy_score(y_test, y_pred)
recall = mt.recall_score(y_test, y_pred)
precision = mt.precision_score(y_test, y_pred)
auc = mt.roc_auc_score(y_test, y_score)
matrix = mt.confusion_matrix(y_test, y_pred)

# Report the full metric set requested for every model in this notebook:
# accuracy, recall, precision, AUC and the confusion matrix.
print("Decision Tree 정확도 : {:.4f}".format(accuracy))
print("Decision Tree Recall : {:.4f}".format(recall))
print("Decision Tree Precision : {:.4f}".format(precision))
print("Decision Tree AUC : {:.4f}".format(auc))
print('Decision Tree Confusion Matrix :', '\n', matrix)

#### Voting(보팅)

In [None]:
import pandas as pd

from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Base learners of the ensemble: a k-nearest-neighbours model and a decision tree.
# Note: this rebinds dt_clf, replacing the random_state=156 tree fitted earlier.
knn_clf = KNeighborsClassifier(n_neighbors = 8)
dt_clf = DecisionTreeClassifier(random_state = 42)

# Soft-voting ensemble built from the named base learners.
base_estimators = [('KNN', knn_clf), ('DT', dt_clf)]
vo_clf = VotingClassifier(estimators = base_estimators, voting = 'soft')

In [None]:
# Train the VotingClassifier on the training split (prediction and scoring follow below).
vo_clf.fit(X_train, y_train)

In [None]:
# Evaluate the soft-voting ensemble on the hold-out split.
pred = vo_clf.predict(X_test)
# Predicted probability of the positive class (label 1 = WILL STAY); available
# because the ensemble uses voting='soft'. ROC AUC needs this continuous score
# rather than the hard 0/1 predictions.
vo_score = vo_clf.predict_proba(X_test)[:, 1]

# Report the full metric set requested for every model in this notebook:
# accuracy, recall, precision, AUC and the confusion matrix.
print('Voting 분류기 정확도 : {0:.4f}'.format(accuracy_score(y_test, pred)))
print('Voting 분류기 Recall : {0:.4f}'.format(mt.recall_score(y_test, pred)))
print('Voting 분류기 Precision : {0:.4f}'.format(mt.precision_score(y_test, pred)))
print('Voting 분류기 AUC : {0:.4f}'.format(mt.roc_auc_score(y_test, vo_score)))
print('Voting 분류기 Confusion Matrix :', '\n', mt.confusion_matrix(y_test, pred))

In [None]:
# Fit and score each base learner on its own, for comparison with the ensemble.
classifiers = [knn_clf, dt_clf]
for classifier in classifiers:
    # fit() returns the estimator itself, so training and prediction can be chained.
    pred = classifier.fit(X_train, y_train).predict(X_test)
    class_name = type(classifier).__name__
    print('{0} 정확도 : {1:.4f}'.format(class_name, accuracy_score(y_test, pred)))

#### Bagging(배깅)

In [None]:
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import accuracy_score, recall_score, precision_score, roc_auc_score, confusion_matrix
# Base learner for bagging: logistic regression with the liblinear solver.
lr_clf = LogisticRegression(solver = 'liblinear')

# The base learner is passed positionally on purpose: the keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was removed
# in 1.4, but it is the first positional parameter under both names, so this call
# works on old and new versions alike.
# random_state pins the bootstrap sampling so the reported scores are reproducible,
# in line with the other seeded models in this notebook.
bagging_clf = BaggingClassifier(lr_clf, random_state = 0)

# Train the BaggingClassifier and predict on the hold-out split.
bagging_clf.fit(X_train, y_train)
pred = bagging_clf.predict(X_test)

In [None]:
# Show which label values occur in the test split
y_test.unique()

In [None]:
# Evaluation of the bagging ensemble on the hold-out split.
# Predicted probability of the positive class (label 1 = WILL STAY). ROC AUC is a
# ranking metric: feeding it the hard 0/1 predictions collapses the ROC curve to a
# single operating point, so the continuous score is used instead.
bagging_score = bagging_clf.predict_proba(X_test)[:, 1]

accuracy = accuracy_score(y_test, pred)
recall = recall_score(y_test, pred)
precision = precision_score(y_test, pred)
auc = roc_auc_score(y_test, bagging_score)
matrix = confusion_matrix(y_test, pred)

print('Bagging 분류기 정확도 : {0:.4f}'.format(accuracy))
print('Bagging 분류기 Recall : {0:.4f}'.format(recall))
print('Bagging 분류기 Precision : {0:.4f}'.format(precision))
print('Bagging 분류기 AUC : {0:.4f}'.format(auc))
print('Bagging 분류기 Confusion Matrix :', '\n', matrix)

#### 랜덤 포레스트

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest: 100 trees, each limited to depth 8; seeded for reproducibility.
rf_clf = RandomForestClassifier(n_estimators = 100, random_state = 0, max_depth = 8)
rf_clf.fit(X_train , y_train)
pred = rf_clf.predict(X_test)
# Predicted probability of the positive class (label 1 = WILL STAY). ROC AUC is a
# ranking metric: feeding it the hard 0/1 predictions collapses the ROC curve to a
# single operating point, so the continuous score is used instead.
rf_score = rf_clf.predict_proba(X_test)[:, 1]

accuracy = accuracy_score(y_test , pred)
recall = recall_score(y_test, pred)
precision = precision_score(y_test, pred)
auc = roc_auc_score(y_test, rf_score)
matrix = confusion_matrix(y_test, pred)

print('랜덤 포레스트 정확도 : {0:.4f}'.format(accuracy))
print('랜덤 포레스트 Recall : {0:.4f}'.format(recall))
print('랜덤 포레스트 Precision : {0:.4f}'.format(precision))
print('랜덤 포레스트 AUC : {0:.4f}'.format(auc))
print('랜덤 포레스트 Confusion Matrix :', '\n', matrix)

In [None]:
# Feature importances of the fitted random forest, labelled with the training
# column names; show the 20 largest.
ftr_importances_values = rf_clf.feature_importances_
ftr_importances = pd.Series(data = ftr_importances_values, index = X_train.columns)
ftr_importances.sort_values(ascending = False).head(20)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Recompute the importances here so this cell does not depend on the previous one.
ftr_importances_values = rf_clf.feature_importances_
ftr_importances = pd.Series(data = ftr_importances_values, index = X_train.columns)
ftr_top20 = ftr_importances.sort_values(ascending = False).head(20)

# Horizontal bar chart: importance on the x-axis, feature name on the y-axis.
fig, ax = plt.subplots(figsize = (8, 6))
ax.set_title('Feature importances Top 20')
sns.barplot(x = ftr_top20, y = ftr_top20.index, ax = ax)
plt.show()

#### 그래디언트 부스팅

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
import time
import warnings
warnings.filterwarnings('ignore')

# Record the start time so the GBM run time can be reported at the end of the cell.
start_time = time.time()

# Gradient boosting with default hyper-parameters; seeded for reproducibility.
gb_clf = GradientBoostingClassifier(random_state = 0)
gb_clf.fit(X_train, y_train)
gb_pred = gb_clf.predict(X_test)
# Predicted probability of the positive class (label 1 = WILL STAY). ROC AUC is a
# ranking metric: feeding it the hard 0/1 predictions collapses the ROC curve to a
# single operating point, so the continuous score is used instead.
gb_score = gb_clf.predict_proba(X_test)[:, 1]

accuracy = accuracy_score(y_test, gb_pred)
recall = recall_score(y_test, gb_pred)
precision = precision_score(y_test, gb_pred)
auc = roc_auc_score(y_test, gb_score)
matrix = confusion_matrix(y_test, gb_pred)

print('GBM 정확도 : {0:.4f}'.format(accuracy))
print('GBM Recall : {0:.4f}'.format(recall))
print('GBM Precision : {0:.4f}'.format(precision))
print('GBM AUC : {0:.4f}'.format(auc))
print('GBM Confusion Matrix :', '\n', matrix)

# Elapsed wall-clock time covers fitting, prediction and metric computation.
print("GBM 수행 시간 : {0:.1f} 초 ".format(time.time() - start_time))