1. 연속형 vs 연속형 일떄는 피어슨 상관계수를 쓴다. label이 명목형일때는 연관성을 어떻게 측정하는지 고찰 하고,  스피어만 상관계수, 피어슨 상관계수가 뭔지 알아보자
2. k-means를 통한 클러스터 파생변수 추가
3. 파이캐럿 AutoML을 돌려서 상위 3개 모델을 선정
4. catBoost를 블렌더 모델로 선정하여 전방 모델은 2번 상위모델 3개로 배치 후 스태킹 
5. 학습하여 test.csv를 찍어서 submission.csv를 제출

In [2]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import RobustScaler, LabelEncoder
from sklearn.metrics import f1_score
import catboost as ctb
import lightgbm as lgb
import os
from scipy.stats import spearmanr # 통계 라이브러리
import datetime
import warnings
warnings.filterwarnings('ignore')

In [3]:
# 데이터 불러오기
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
# 상관 관계 확인 ID & 나머지 columns / y = support_needs

In [4]:
train_df.columns
# ['ID', 'age', 'gender', 'tenure', 'frequent', 'payment_interval', 'subscription_type', 'contract_length', 'after_interaction']
# ['support_needs']

Index(['ID', 'age', 'gender', 'tenure', 'frequent', 'payment_interval',
       'subscription_type', 'contract_length', 'after_interaction',
       'support_needs'],
      dtype='object')

In [5]:
test_df.columns.tolist()

['ID',
 'age',
 'gender',
 'tenure',
 'frequent',
 'payment_interval',
 'subscription_type',
 'contract_length',
 'after_interaction']

In [6]:
X = train_df.drop(columns=['ID', 'support_needs'])
y = train_df['support_needs']

In [7]:
from sklearn.model_selection import train_test_split
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y) 
# test_df는 따로 존재

In [8]:
train_df['subscription_type'].value_counts(), train_df['gender'].value_counts()
# 방법 1. 명목변수로 모두 바꾼다. -> [M=0, F=1], [vip =0,  plus =1, member =2], 
# 방법 2. 문자열로 유지한다. 

(plus      10481
 vip       10405
 member     9972
 Name: subscription_type, dtype: int64,
 M    17432
 F    13426
 Name: gender, dtype: int64)

In [9]:
# 방법 1. 명목변수로 모두 변경
train_df['gender'] = train_df['gender'].map({'M': 0, 'F': 1})
train_df['subscription_type'] = train_df['subscription_type'].map({'member': 0, 'plus': 1, 'vip': 2})

test_df['gender'] = test_df['gender'].map({'M': 0, 'F': 1})
test_df['subscription_type'] = test_df['subscription_type'].map({'member': 0, 'plus': 1, 'vip': 2})

In [None]:
# -------------------------------------------------구분선 | 방법 1 수행시 방법 2는 수행하면 안됨 !! ----------------------------------------------------------------------------

In [None]:
# 방법2. pycaret 사용시 문자열로 유지
from pycaret.classification import *

train_df['gender'] = train_df['gender'].astype(str)
train_df['subscription_type'] = train_df['subscription_type'].astype(str)
test_df['gender'] = test_df['gender'].astype(str)
test_df['subscription_type'] = test_df['subscription_type'].astype(str)

In [10]:
train_df.head(), test_df.head()

(            ID   age  gender  tenure  frequent  payment_interval  \
 0  TRAIN_00000  54.0       1    47.0      22.0               8.0   
 1  TRAIN_00001  30.0       0    16.0      15.0               5.0   
 2  TRAIN_00002  29.0       0     8.0      30.0              21.0   
 3  TRAIN_00003  38.0       1    38.0      23.0              10.0   
 4  TRAIN_00004  25.0       1    52.0       3.0              17.0   
 
    subscription_type  contract_length  after_interaction  support_needs  
 0                  0               90               25.0              0  
 1                  2              360               23.0              0  
 2                  1               30               21.0              0  
 3                  2               90                6.0              0  
 4                  0               30                1.0              2  ,
            ID   age  gender  tenure  frequent  payment_interval  \
 0  TEST_00000  18.0       0    40.0       6.0              15.0 

In [11]:
train_df.columns.tolist(), test_df.columns.tolist()

(['ID',
  'age',
  'gender',
  'tenure',
  'frequent',
  'payment_interval',
  'subscription_type',
  'contract_length',
  'after_interaction',
  'support_needs'],
 ['ID',
  'age',
  'gender',
  'tenure',
  'frequent',
  'payment_interval',
  'subscription_type',
  'contract_length',
  'after_interaction'])

In [12]:
train_df.isnull().sum()

ID                   0
age                  0
gender               0
tenure               0
frequent             0
payment_interval     0
subscription_type    0
contract_length      0
after_interaction    0
support_needs        0
dtype: int64

In [13]:
test_df.isnull().sum()

ID                   0
age                  0
gender               0
tenure               0
frequent             0
payment_interval     0
subscription_type    0
contract_length      0
after_interaction    0
dtype: int64

In [17]:
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=3, random_state=42)

train_df['cluster'] = kmeans.fit_predict(train_df[['age', 'gender', 'subscription_type', 'tenure',  'frequent',  'payment_interval', 'contract_length', 'after_interaction']])

test_df['cluster'] = kmeans.predict(test_df[['age', 'gender', 'subscription_type', 'tenure',  'frequent', 'payment_interval', 'contract_length',  'after_interaction']])

In [None]:
from pycaret.classification import setup, compare_models, blend_models, finalize_model, predict_model, create_model, tune_model

clf_setup = setup( data=train_df,
    target='support_needs',
    session_id=42,
    normalize=True,
    fix_imbalance=True,  ) # “샘플링(Sampling)” 또는 “가중치 조정”을 통해 데이터 불균형을 교정
#클래스 불균형(Class Imbalance) 문제를 자동으로 교정해주는 설정

Unnamed: 0,Description,Value
0,Session id,42
1,Target,support_needs
2,Target type,Multiclass
3,Original data shape,"(30858, 11)"
4,Transformed data shape,"(39279, 11)"
5,Transformed train set shape,"(30021, 11)"
6,Transformed test set shape,"(9258, 11)"
7,Numeric features,9
8,Categorical features,1
9,Preprocess,True


In [19]:
top3 = compare_models(n_select=3, sort='AUC')

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.2689,0.6328,0.2689,0.0723,0.114,0.0,0.0,0.272
xgboost,Extreme Gradient Boosting,0.2689,0.5495,0.2689,0.0723,0.114,0.0,0.0,0.412
et,Extra Trees Classifier,0.2689,0.5382,0.2689,0.0723,0.114,0.0,0.0,0.248
lightgbm,Light Gradient Boosting Machine,0.3661,0.5193,0.3661,0.1434,0.2036,0.0,0.0,0.206
catboost,CatBoost Classifier,0.2689,0.5191,0.2689,0.0723,0.114,0.0,0.0,9.228
knn,K Neighbors Classifier,0.2874,0.5134,0.2874,0.4076,0.165,0.0119,0.0346,0.274
gbc,Gradient Boosting Classifier,0.2689,0.5001,0.2689,0.0723,0.114,0.0,0.0,1.452
nb,Naive Bayes,0.2689,0.5,0.2689,0.0723,0.114,0.0,0.0,0.085
dt,Decision Tree Classifier,0.2689,0.5,0.2689,0.0723,0.114,0.0,0.0,0.087
ada,Ada Boost Classifier,0.2689,0.5,0.2689,0.0723,0.114,0.0,0.0,0.3


In [20]:
tuned_top3 = [tune_model(m) for m in top3]

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.2685,0.624,0.2685,0.0721,0.1137,0.0,0.0
1,0.2685,0.657,0.2685,0.0721,0.1137,0.0,0.0
2,0.269,0.6472,0.269,0.0724,0.114,0.0,0.0
3,0.269,0.6266,0.269,0.0724,0.114,0.0,0.0
4,0.269,0.6181,0.269,0.0724,0.114,0.0,0.0
5,0.269,0.6383,0.269,0.0724,0.114,0.0,0.0
6,0.269,0.6374,0.269,0.0724,0.114,0.0,0.0
7,0.269,0.6428,0.269,0.0724,0.114,0.0,0.0
8,0.269,0.6239,0.269,0.0724,0.114,0.0,0.0
9,0.269,0.6195,0.269,0.0724,0.114,0.0,0.0


Fitting 10 folds for each of 10 candidates, totalling 100 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.2685,0.6359,0.2685,0.0721,0.1137,0.0,0.0
1,0.2685,0.661,0.2685,0.0721,0.1137,0.0,0.0
2,0.269,0.6595,0.269,0.0724,0.114,0.0,0.0
3,0.269,0.6323,0.269,0.0724,0.114,0.0,0.0
4,0.269,0.6427,0.269,0.0724,0.114,0.0,0.0
5,0.269,0.6541,0.269,0.0724,0.114,0.0,0.0
6,0.269,0.6536,0.269,0.0724,0.114,0.0,0.0
7,0.269,0.6585,0.269,0.0724,0.114,0.0,0.0
8,0.269,0.6412,0.269,0.0724,0.114,0.0,0.0
9,0.269,0.6489,0.269,0.0724,0.114,0.0,0.0


Fitting 10 folds for each of 10 candidates, totalling 100 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.2685,0.5,0.2685,0.0721,0.1137,0.0,0.0
1,0.2685,0.5,0.2685,0.0721,0.1137,0.0,0.0
2,0.269,0.5,0.269,0.0724,0.114,0.0,0.0
3,0.269,0.5,0.269,0.0724,0.114,0.0,0.0
4,0.269,0.5,0.269,0.0724,0.114,0.0,0.0
5,0.269,0.5,0.269,0.0724,0.114,0.0,0.0
6,0.269,0.5,0.269,0.0724,0.114,0.0,0.0
7,0.269,0.5,0.269,0.0724,0.114,0.0,0.0
8,0.269,0.5,0.269,0.0724,0.114,0.0,0.0
9,0.269,0.5,0.269,0.0724,0.114,0.0,0.0


Fitting 10 folds for each of 10 candidates, totalling 100 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


In [21]:
blender = blend_models(estimator_list=tuned_top3, fold=5)
catboost_final = finalize_model(blender)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.2688,0.5548,0.2688,0.0722,0.1139,0.0,0.0
1,0.2688,0.5855,0.2688,0.0722,0.1139,0.0,0.0
2,0.269,0.546,0.269,0.0724,0.114,0.0,0.0
3,0.269,0.5677,0.269,0.0724,0.114,0.0,0.0
4,0.269,0.5349,0.269,0.0724,0.114,0.0,0.0
Mean,0.2689,0.5578,0.2689,0.0723,0.114,0.0,0.0
Std,0.0001,0.0175,0.0001,0.0001,0.0001,0.0,0.0


In [22]:
preds = predict_model(catboost_final, data=test_df)

In [23]:
preds[['ID', 'prediction_label']].rename(columns={'prediction_label': 'predicted_support_needs'}
                                        ).to_csv('submission.csv', index=False)
print(" submission saved")

 submission saved
