In [None]:
!pip install bayesian-optimization
!pip install keras-tuner
!pip install catboost
!pip install vecstack
!pip install pycaret

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from tqdm import tqdm
import warnings
warnings.filterwarnings("ignore")
from os import path
import pickle

from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler, PowerTransformer
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, GridSearchCV
from bayes_opt import BayesianOptimization

from sklearn.metrics import make_scorer
from sklearn.metrics import log_loss

from sklearn.feature_selection import SelectPercentile

from pycaret.classification import *
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.pipeline import make_pipeline
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

In [None]:
# Numeric feature tables for train and test; the first CSV column is used as the row index.
data_num = pd.read_csv('/content/drive/MyDrive/D&A_ML_Competition/data1.csv', index_col=0)
data_te_num = pd.read_csv('/content/drive/MyDrive/D&A_ML_Competition/data_te1.csv', index_col=0)
# The prop / pca tables are not needed for scaling; they are read in the "Merging Data" section.
#data_prop = pd.read_csv('/content/drive/MyDrive/D&A_ML_Competition/data_prop.csv', index_col=0)
#data_te_prop = pd.read_csv('/content/drive/MyDrive/D&A_ML_Competition/data_te_prop.csv', index_col=0)
#data_pca = pd.read_csv('/content/drive/MyDrive/D&A_ML_Competition/data_pca.csv', index_col=0)
#data_te_pca = pd.read_csv('/content/drive/MyDrive/D&A_ML_Competition/data_te_pca.csv', index_col=0)
# Training targets; the LABEL column is used as y in the cells below.
y_target = pd.read_csv('/content/drive/MyDrive/D&A_ML_Competition/y_train.csv')

### Scaling Numeric Features

In [None]:
# Compare candidate scalers by the cross-validated log loss of a logistic regression.
std_sc = StandardScaler()
mm_sc = MinMaxScaler()
ma_sc = MaxAbsScaler()
rob_sc = RobustScaler()
pt = PowerTransformer()
scalers = [std_sc, mm_sc, ma_sc, rob_sc, pt]

# Built-in negative-log-loss scorer (higher is better). It computes the same metric as
# make_scorer(log_loss, greater_is_better=False, needs_proba=True) without depending on
# the `needs_proba` argument, which newer scikit-learn releases no longer accept.
myscore = 'neg_log_loss'

# Skip the leading CLNT_ID column so the identifier is never scaled or used as a
# feature, matching how the final scaling cell slices the table.
num_features = data_num.iloc[:, 1:]

# Fixed seed: every scaler is evaluated on exactly the same folds.
skf = StratifiedKFold(random_state=0, shuffle=True)

for scaler in scalers:
    # Putting the scaler inside the pipeline refits it on each training fold only,
    # so no statistics from the held-out fold leak into the score.
    model = make_pipeline(scaler, LogisticRegression())
    score = cross_val_score(model, num_features, y_target.LABEL, scoring=myscore, cv=skf, n_jobs=-1)
    # Label each line with the scaler it belongs to.
    print(type(scaler).__name__, score.mean(), score)

In [None]:
# Scale with MaxAbsScaler: fit on the training columns only, then apply the
# same fitted scaling to the test columns.
scaler = MaxAbsScaler()
# iloc[:, 1:] skips the first column (CLNT_ID, which the next cell re-attaches) so it is not scaled.
data = scaler.fit_transform(data_num.iloc[:,1:])
# Presumably the test table has the same column layout as the training table - TODO confirm.
data_te = scaler.transform(data_te_num.iloc[:,1:])

In [None]:
# Re-attach CLNT_ID in front of the scaled feature arrays.
# pd.concat(axis=1) aligns rows by index label, so the scaled frames are built on the
# source frame's own index; otherwise any non-default index would silently produce
# NaN-padded, misaligned rows.
scaled_columns = data_num.columns[1:]
data = pd.concat(
    [data_num[['CLNT_ID']], pd.DataFrame(data, columns=scaled_columns, index=data_num.index)],
    axis=1,
)
data_te = pd.concat(
    [data_te_num[['CLNT_ID']], pd.DataFrame(data_te, columns=scaled_columns, index=data_te_num.index)],
    axis=1,
)

In [None]:
# Persist the scaled train/test tables. to_csv writes the row index as the first
# column by default; the merge section reads these files back with index_col=0.
data.to_csv('/content/drive/MyDrive/D&A_ML_Competition/data_num_scaled.csv')
data_te.to_csv('/content/drive/MyDrive/D&A_ML_Competition/data_te_num_scaled.csv')

### Merging Data

In [None]:
# Read the four training feature tables (w2v is read without an index column).
data_num = pd.read_csv('/content/drive/MyDrive/D&A_ML_Competition/data_num_scaled.csv', index_col=0)
data_prop = pd.read_csv('/content/drive/MyDrive/D&A_ML_Competition/data_prop.csv', index_col=0)
data_pca = pd.read_csv('/content/drive/MyDrive/D&A_ML_Competition/data_pca.csv', index_col=0)
data_w2v = pd.read_csv('/content/drive/MyDrive/D&A_ML_Competition/data_w2v.csv')

# Assemble the training matrix in one chain:
#   1. inner-join numeric and prop features on CLNT_ID,
#   2. append the w2v columns side by side (row alignment is by index label),
#   3. inner-join the pca features on CLNT_ID,
#   4. move CLNT_ID into the index.
train_f = (
    pd.concat(
        [data_num.merge(data_prop, on='CLNT_ID', how='inner'), data_w2v],
        axis=1,
    )
    .merge(data_pca, on='CLNT_ID', how='inner')
    .set_index('CLNT_ID')
)

# index=False drops the CLNT_ID index, so the saved file holds feature columns only.
train_f.to_csv('/content/drive/MyDrive/D&A_ML_Competition/data_final.csv', index=False)

In [None]:
# Read the four test feature tables (w2v is read without an index column).
data_te_num = pd.read_csv('/content/drive/MyDrive/D&A_ML_Competition/data_te_num_scaled.csv', index_col=0)
data_te_prop = pd.read_csv('/content/drive/MyDrive/D&A_ML_Competition/data_te_prop.csv', index_col=0)
data_te_pca = pd.read_csv('/content/drive/MyDrive/D&A_ML_Competition/data_te_pca.csv', index_col=0)
data_te_w2v = pd.read_csv('/content/drive/MyDrive/D&A_ML_Competition/data_te_w2v.csv')

# Assemble the test matrix with exactly the same steps, in the same order, as the
# training matrix: num + prop -> append w2v -> join pca -> CLNT_ID to index.
# The w2v columns must be appended while the frame still has its default 0..n-1 index:
# pd.concat(axis=1) aligns by index label, so concatenating after set_index('CLNT_ID')
# would pair CLNT_ID labels with w2v's row numbers and produce misaligned/NaN rows.
# Keeping the column order identical to train is also required because the fitted
# feature selector is later applied to test_f positionally.
test_f = pd.merge(data_te_num,data_te_prop, on='CLNT_ID', how='inner')
test_f = pd.concat([test_f,data_te_w2v], axis=1)
test_f = pd.merge(test_f,data_te_pca, on='CLNT_ID', how='inner').set_index('CLNT_ID')

# index=False drops the CLNT_ID index, so the saved file holds feature columns only.
test_f.to_csv('/content/drive/MyDrive/D&A_ML_Competition/data_te_final.csv', index=False)

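### Select Percentile

Previously recorded `(percentile, mean CV score)` pairs from the search below. The score is the negative log loss, so values closer to zero are better: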

- (5, -1.2697781457090822)
- (6, -1.2618158715545253)
- (7, -1.2588411271810709)
- (8, -1.2554304963670613)
- (50, -1.2208101692869557)
- (60, -1.2195901393961885)
- (65, -1.219445516774713)
- (67, -1.2199717428311054)
- (68, -1.2193700738508904)
- (70, -1.219957720318273)
- (80, -1.2227003309437798)

In [None]:
# Reload the merged feature tables and the targets from disk so this section can be
# run on its own. test_f is loaded here as well because the selector cells below
# transform it; previously it was only available as leftover state from the merge cell.
train_f = pd.read_csv('/content/drive/MyDrive/D&A_ML_Competition/data_final.csv')
test_f = pd.read_csv('/content/drive/MyDrive/D&A_ML_Competition/data_te_final.csv')
y_target = pd.read_csv('/content/drive/MyDrive/D&A_ML_Competition/y_train.csv')

In [None]:
# Built-in negative-log-loss scorer (higher is better). It computes the same metric as
# make_scorer(log_loss, greater_is_better=False, needs_proba=True) without depending on
# the `needs_proba` argument, which newer scikit-learn releases no longer accept.
myscore = 'neg_log_loss'
model = LogisticRegression(random_state=0)

# Evaluate each candidate percentile with 5-fold CV.
cv_scores = []
for p in tqdm(range(67,68,1)):
    # The selector looks at the labels, so it must be refit inside every training fold.
    # Selecting on the full table first would leak held-out labels into the score.
    candidate = make_pipeline(SelectPercentile(percentile=p), model)
    cv_score = cross_val_score(candidate, train_f, y_target.LABEL, scoring=myscore, cv=5).mean()
    cv_scores.append((p,cv_score))
    print((p,cv_score))

# Print the best percentile (highest score == lowest log loss)
best_score = max(cv_scores, key=lambda pair: pair[1])
print(best_score)

# Plot the performance change with p
plt.plot([k for k, _ in cv_scores], [score for _, score in cv_scores])
plt.xlabel('Percent of features')
plt.ylabel('Mean CV score (negative log loss)')
plt.grid()

In [None]:
# Fit a percentile-based feature selector (keep the top 68% of features, using the
# selector's default scoring function) on the full training table and its labels.
fs = SelectPercentile(percentile=68).fit(train_f, y_target.LABEL)

In [None]:
# List the names of the training columns the fitted selector keeps
# (get_support() is a boolean mask over train_f's columns).
print(train_f.columns[fs.get_support()].tolist())

In [None]:
# Fit the 68th-percentile selector on the training table and reduce it in one step,
# then apply the already-fitted selector to the test table.
fs = SelectPercentile(percentile=68)
X_train = fs.fit_transform(train_f, y_target.LABEL)
X_test = fs.transform(test_f)

# Report the reduced training shape and the names of the columns that were kept.
print(X_train.shape)
print(train_f.columns[fs.get_support()].tolist())

In [None]:
# Label the selected training array with the kept column names, then save it.
train_selected_names = train_f.columns[fs.get_support()].tolist()
X_train = pd.DataFrame(X_train, columns=train_selected_names)
X_train.to_csv('/content/drive/MyDrive/D&A_ML_Competition/X_train_selectp.csv', index=False)

In [None]:
# Label the selected test array with the same kept column names (taken from the
# training table's columns), then save it.
test_selected_names = train_f.columns[fs.get_support()].tolist()
X_test = pd.DataFrame(X_test, columns=test_selected_names)
X_test.to_csv('/content/drive/MyDrive/D&A_ML_Competition/X_test_selectp.csv', index=False)

In [None]:
# Train the final classifier: a logistic regression with default hyperparameters,
# fitted on the selected training features against the LABEL column.
model = LogisticRegression()
model.fit(X_train, y_target.LABEL)

In [None]:
# Load the raw test file; its CLNT_ID column is used below to label the predictions.
test = pd.read_csv('/content/drive/MyDrive/D&A_ML_Competition/L.POINT_test.csv', encoding='UTF-8')

In [None]:
# Predict class probabilities for the test set (one column per class).
pred = pd.DataFrame(model.predict_proba(X_test))

# Build the submission: one row per unique CLNT_ID, in order of first appearance in
# the raw test file, next to that row's predicted probabilities.
# NOTE(review): this pairing is purely positional - it assumes X_test rows are in the
# same order as the de-duplicated CLNT_IDs; confirm against how the features were built.
client_ids = test.CLNT_ID.drop_duplicates().reset_index(drop=True)

# pd.concat(axis=1) silently pads with NaN when the two sides differ in length,
# which would write a corrupt submission; fail loudly instead.
if len(client_ids) != len(pred):
    raise ValueError(
        f"Row count mismatch: {len(client_ids)} unique CLNT_ID values vs {len(pred)} prediction rows"
    )

result = pd.concat([client_ids, pred], axis=1)
# NOTE(review): the probability columns follow model.classes_ order; these names assume
# that order is F20, F30, F40, M20, M30, M40 - confirm against the label encoding.
result.columns = ['CLNT_ID','F20','F30','F40','M20','M30','M40']

result.to_csv('/content/drive/MyDrive/D&A_ML_Competition/submission_selectp.csv',index=False)

# Show a preview as the cell output (a bare expression only displays when it is last).
result.head()