In [69]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import itertools
from sklearn.preprocessing import StandardScaler, FunctionTransformer, PolynomialFeatures
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import roc_auc_score
from lightgbm import LGBMClassifier

# 데이터 로드

### colab/local 인지에 따라 path 지정

In [70]:
import os

# The presence of ./sample_data in the working directory is treated as the
# marker for a Colab session; /content/drive existing means Drive is mounted.
colab = os.path.isdir('./sample_data')
mount = os.path.isdir('/content/drive')

if not colab:
    # Local run: outputs are written to the working directory and the input
    # CSVs are read from the sibling data folder.
    base_path = ''
    data_path = '../data/'
else:
    # Colab run: mount Google Drive first if it is not already available.
    if not mount:
        from google.colab import drive
        drive.mount('/content/drive')
    data_path = '/content/drive/MyDrive/Colab Notebooks/5_ML_Project/data/'
    base_path = '/content/drive/MyDrive/Colab Notebooks/5_ML_Project/dulee/'

In [71]:
# Read the three competition CSVs from data_path in one pass; every file is
# indexed by its 'id' column.
train_df, test_df, submission_df = (
    pd.read_csv(data_path + csv_name, index_col='id')
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

## Feature Engineering

##### 1. Original

In [72]:
# Baseline feature set: work directly on the loaded frames (aliases, not copies).
train, test = train_df, test_df

# Split the 'defects' target column off from the predictor columns.
y = train['defects']
X = train.drop(columns=['defects'])

##### 2. drop column (derived measure, useless measure)

In [73]:
# drop_columns = ['n', 'v', 'l', 'd', 'i', 'e', 'b', 't', 'lOCode', 'locCodeAndComment']
# drop_columns = ['n', 'v', 'l', 'd', 'i', 'e', 'b', 't', 'lOCode', 'locCodeAndComment', 'total_Op', 'total_Opnd', 'branchCount']
# drop_columns = ['locCodeAndComment']

# picked.keys()
# drop_columns = ['v(g)', 'iv(g)', 'v', 'l', 'd', 'i', 'e', 'b', 't', 'lOCode', 'total_Opnd', 'branchCount']

# train = train_df.drop(columns=drop_columns)
# test = test_df.drop(columns=drop_columns)
# X = train.drop(columns=['defects'])
# y = train['defects']

# X2 = train.drop(columns=['defects'])
# y2 = train['defects']


##### 3. McCabe's

In [74]:
# columns = ['loc', 'v(g)', 'ev(g)', 'iv(g)']
# train = train_df[columns + ['defects']]
# test = test_df[columns]

##### PCA 적용

In [75]:
# Toggle: set to False to skip PCA and keep the feature set selected above.
feature_reducing = True

# Apply PCA: fit on the training predictors X, then project the test rows with
# the same fitted components.
if feature_reducing:
    from sklearn.decomposition import PCA

    n_components = 18
    pca = PCA(n_components=n_components,
              whiten=True,  # rescales the projected components to unit variance; the inputs are NOT standardized
              random_state=61)
    # NOTE(review): X reaches PCA without any scaling in this notebook, so the
    # leading components are presumably dominated by the largest-magnitude
    # columns. StandardScaler is imported above but never used - confirm
    # whether scaling before PCA was intended.

    columns = [f'PC{i}' for i in range(n_components)]

    # Keep the original 'id' index on both projected frames. Without it the
    # frames get a fresh 0..n-1 RangeIndex, so the concat with y below aligns
    # on unrelated labels and the saved test frame loses its real ids.
    X_pca = pd.DataFrame(pca.fit_transform(X), index=X.index, columns=columns)

    # Project from the raw test frame (restricted to the columns PCA was fitted
    # on) instead of overwriting `test` from itself; this keeps the cell safe to
    # re-run, since `test` would otherwise already hold PC columns.
    test = pd.DataFrame(pca.transform(test_df[X.columns]),
                        index=test_df.index,
                        columns=columns)
    display(X_pca)
    display(test)

    # Re-attach the target; X_pca and y now share the same 'id' index.
    train = pd.concat([X_pca, y], axis=1)

Unnamed: 0,PC0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,PC11,PC12,PC13,PC14,PC15,PC16,PC17
0,-0.080883,-0.035461,-0.191837,0.087115,-0.164247,0.068386,0.463299,0.107671,0.695480,0.182275,0.876277,-0.316784,-0.289953,0.659809,0.169979,-0.411128,-0.249313,-0.151568
1,-0.104576,-0.047122,-0.301289,0.307408,-0.237905,-0.017678,-0.111971,-0.065919,0.109313,0.219692,-0.130064,0.277045,-0.062815,0.275391,0.249933,-0.265278,-0.070473,-0.080560
2,-0.100283,-0.044817,-0.257411,0.196582,-0.416844,0.051281,0.085892,-0.022209,-0.175509,0.119523,-0.002917,0.256940,-0.102181,0.371445,0.422736,-0.305637,-0.074462,-0.089533
3,-0.107008,-0.048675,-0.357195,0.361040,-0.363023,0.008502,0.255656,0.022585,-0.217884,0.439569,-0.215421,0.147959,0.109576,-0.179642,0.080975,0.110724,0.141325,0.136539
4,-0.107576,-0.049193,-0.389907,0.379323,-0.200785,-0.006920,0.151998,0.372454,0.250420,0.399016,-0.343158,0.315190,0.181468,-0.256071,-0.041495,0.394411,0.030299,0.035634
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
101758,-0.105233,-0.047645,-0.329379,0.333097,-0.325322,-0.024639,0.063313,0.121252,0.018345,0.218122,-0.134272,0.134348,0.011187,0.164467,0.360363,-0.460309,-0.189668,0.040013
101759,-0.050322,-0.019501,0.065853,-0.137550,-0.362805,0.172067,-0.009222,-0.418313,-0.043818,-0.406282,0.470010,-0.119167,-0.531525,0.349938,0.113467,-1.414834,0.175262,0.115060
101760,-0.078545,-0.033744,-0.110084,-0.109370,-0.209296,0.039613,0.697188,0.240948,0.691643,0.810831,1.014043,-0.297734,-0.461069,-0.382392,-1.022663,0.778104,0.489586,0.217472
101761,-0.101438,-0.045386,-0.262526,0.217192,-0.452358,0.020985,-0.323957,-0.098577,-0.432724,-0.056858,-0.118315,0.297137,-0.100530,0.361185,0.225037,-0.155251,-0.063234,-0.015300


Unnamed: 0,PC0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10,PC11,PC12,PC13,PC14,PC15,PC16,PC17
0,0.009372,0.009380,0.280029,-0.190196,-0.444819,0.132227,0.173571,-0.046532,0.471011,-0.346345,1.257988,0.002587,-0.833277,0.689129,0.428152,-0.423891,-0.357097,-0.419372
1,-0.033445,-0.011239,0.138335,-0.204636,-0.419812,-0.052879,0.219896,0.287907,0.371753,-0.946717,0.469440,-0.233378,-0.454990,0.210280,-0.202030,-1.433511,0.963291,0.322542
2,0.500468,0.253547,2.872776,-0.780208,-0.584673,0.434172,0.196000,1.756597,-0.208741,0.596923,1.358720,0.326920,-0.244058,0.359177,0.714157,-0.199681,1.250497,-0.368906
3,-0.024761,-0.005659,0.343088,-0.406894,0.390217,-0.087801,-0.786955,-0.996104,-0.062362,0.066447,-0.019014,-0.543279,0.804216,0.049994,0.475910,0.968804,-0.649062,-0.345745
4,-0.095712,-0.042436,-0.219543,0.119331,-0.137737,0.059759,-0.333455,-0.128994,0.080331,0.066500,0.011804,0.220291,-0.085388,-0.204291,0.460226,0.136392,-0.073965,-0.087773
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67837,-0.064265,-0.024824,0.176262,-0.271467,-0.166419,0.107717,-0.992454,-0.592247,-0.396336,0.497513,0.484481,0.261523,-0.509702,-0.503360,0.024272,-0.291742,0.367026,0.236249
67838,-0.103923,-0.046738,-0.290512,0.204637,-0.085545,0.019154,-0.105470,-0.227017,0.100289,0.276423,-0.454427,0.027834,-0.040911,-0.176045,0.153831,0.366670,-0.121147,-0.075323
67839,-0.098151,-0.043514,-0.215645,0.099508,-0.094871,0.168099,-0.104409,-0.647669,0.055035,0.263585,0.247004,0.356083,-0.111871,-0.075334,1.107136,-0.836783,-0.042001,-0.180920
67840,-0.098370,-0.043804,-0.239381,0.145097,-0.243819,0.057120,-0.131114,0.075000,-0.080978,0.303435,-0.055697,0.200093,-0.135861,-0.145750,-0.600393,0.217410,-0.011354,0.173546


### Log Transform (모델에 따라서 선택)

In [77]:
# X = train.drop(columns=['defects'])
# y = train['defects']

In [78]:
# X = FunctionTransformer(np.log1p).fit_transform(X)
# test = FunctionTransformer(np.log1p).fit_transform(test)

# train = pd.concat([X, y], axis=1)

##### validation 점수

In [79]:
# def cross_val(model, col='None', X=X, y=y):
#     folds = StratifiedKFold(shuffle=True, random_state=61)
#     aucs = []
#     for train_idx, val_idx in folds.split(X, y):
#         model.fit(X.iloc[train_idx], y.iloc[train_idx])
#         y_proba = model.predict_proba(X.iloc[val_idx])[:, 1]
#         y_true = y.iloc[val_idx]
#         aucs.append(roc_auc_score(y_true, y_proba))
#     return np.mean(aucs)

# model = LGBMClassifier(random_state=61, n_jobs=-1, force_row_wise=True)
# cross_val(model)

In [80]:
# scores = {col:[] for col in ['None'] + list(X.columns)}

In [81]:
# def cross_val(model, col='None', X=X, y=y):
#     folds = StratifiedKFold(shuffle=True, random_state=61)
#     aucs = []
#     for train_idx, val_idx in folds.split(X, y):
#         model.fit(X.iloc[train_idx], y.iloc[train_idx])
#         y_proba = model.predict_proba(X.iloc[val_idx])[:, 1]
#         y_true = y.iloc[val_idx]
#         aucs.append(roc_auc_score(y_true, y_proba))
#     scores[col].append(np.mean(aucs))

# model = LGBMClassifier(random_state=61, n_jobs=-1, force_row_wise=True)
# # model = ExtraTreesClassifier(
# #     n_estimators=700,
# #     min_samples_leaf=2,
# #     max_depth=16,
# #     max_features=0.793614074795712,
# #     min_samples_split=48,
# #     random_state=61,
# #     n_jobs=-1,
# # )

# cross_val(model)
# for i, col in enumerate(X.columns):
#     print(f'\t\t{i+1}번째 column / {len(X.columns)}개')
#     X2 = X.drop(columns=[col])
#     cross_val(model, col, X2)

# # if False: # log변환후 재실행
# #     X = np.log1p(X)
# #     cross_val(model, X=X)
# #     for i, col in enumerate(X.columns):
# #         print(f'\t\t{i+1}번째 column / {len(X.columns)}개')
# #         X2 = X.drop(columns=[col])
# #         cross_val(model, col, X2)


In [82]:
# scores

In [83]:
# picked = {col:scores[col] for col in scores if scores[col] > scores['None']}
# print(len(picked) / len(scores))
# print(np.mean(list(picked.values())))

In [84]:
# scores2 = itertools.permutations(picked.keys(), 2)
# scores2 = {cols:[] for cols in scores2}

# def cross_val(model, cols='None', X=X, y=y):
#     folds = StratifiedKFold(shuffle=True, random_state=61)
#     aucs = []
#     for train_idx, val_idx in folds.split(X, y):
#         model.fit(X.iloc[train_idx], y.iloc[train_idx])
#         y_proba = model.predict_proba(X.iloc[val_idx])[:, 1]
#         y_true = y.iloc[val_idx]
#         aucs.append(roc_auc_score(y_true, y_proba))
#     scores2[cols].append(np.mean(aucs))

# model = LGBMClassifier(random_state=61, n_jobs=-1, force_row_wise=True)
# for i, cols in enumerate(scores2):
#     print(f'\t\t{i+1}번째 column / {len(scores2)}개')
#     X2 = X.drop(columns=list(cols))
#     cross_val(model, cols, X2)

In [85]:
# picked2 = {cols:scores2[cols] for cols in scores2 if scores2[cols] > scores['None']}
# print("향상된 점수 비율:", len(picked2) / len(scores2))
# print("평균 점수:", np.mean(list(picked2.values())))
# print("최댓값:", np.max(list(picked2.values())))

### data 저장

In [86]:
# Persist the engineered train/test frames under base_path (index included).
for frame, csv_name in ((train, 'train.csv'), (test, 'test.csv')):
    frame.to_csv(base_path + csv_name)