In [108]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import itertools
from sklearn.preprocessing import StandardScaler, FunctionTransformer, PolynomialFeatures
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import roc_auc_score
from lightgbm import LGBMClassifier

# 데이터 로드

### colab/local 인지에 따라 path 지정

In [109]:
import os

# The presence of ./sample_data is used as the marker for a Colab runtime;
# /content/drive tells whether Google Drive is already mounted.
colab = os.path.isdir('./sample_data')
mount = os.path.isdir('/content/drive')

if not colab:
    # Local run: outputs go to the working directory, inputs come from ../data/.
    base_path = ''
    data_path = '../data/'
else:
    if not mount:
        # Imported here because google.colab is only needed on this branch.
        from google.colab import drive
        drive.mount('/content/drive')
    base_path = '/content/drive/MyDrive/Colab Notebooks/5_ML_Project/dulee/'
    data_path = '/content/drive/MyDrive/Colab Notebooks/5_ML_Project/data/'

In [110]:
# Load the three competition tables from data_path, each indexed by its `id` column.
train_df, test_df, submission_df = (
    pd.read_csv(data_path + filename, index_col='id')
    for filename in ('train.csv', 'test.csv', 'sample_submission.csv')
)

## Feature Engineering

##### 1. Original

In [111]:
# Variant 1 (original): use the loaded frames as they are.
# Note: these are the same objects as train_df / test_df, not copies.
train, test = train_df, test_df

# Split the training frame into the `defects` target and the feature matrix.
y = train['defects']
X = train.drop(columns=['defects'])

##### 2. drop column (derived measure, useless measure)

In [112]:
# drop_columns = ['n', 'v', 'l', 'd', 'i', 'e', 'b', 't', 'lOCode', 'locCodeAndComment']
# drop_columns = ['n', 'v', 'l', 'd', 'i', 'e', 'b', 't', 'lOCode', 'locCodeAndComment', 'total_Op', 'total_Opnd', 'branchCount']
# drop_columns = ['locCodeAndComment']

# picked.keys()
# drop_columns = ['v(g)', 'iv(g)', 'v', 'l', 'd', 'i', 'e', 'b', 't', 'lOCode', 'total_Opnd', 'branchCount']

# train = train_df.drop(columns=drop_columns)
# test = test_df.drop(columns=drop_columns)
# X = train.drop(columns=['defects'])
# y = train['defects']

# X2 = train.drop(columns=['defects'])
# y2 = train['defects']


##### 3. McCabe's

In [113]:
# columns = ['loc', 'v(g)', 'ev(g)', 'iv(g)']
# train = train_df[columns + ['defects']]
# test = test_df[columns]

### Upsampling (SMOTE)

In [114]:
from imblearn.over_sampling import SMOTE
def upsampling_smote(df, indexname=None):
    """Balance the `defects` classes of `df` with SMOTE oversampling.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the feature columns plus a `defects` target column.
    indexname : str or None, optional
        Name assigned to the index of the resampled frame.

    Returns
    -------
    pandas.DataFrame
        Resampled features and `defects` target joined column-wise. The class
        counts of the result are printed as a side effect.
    """
    features = df.drop(columns=['defects'])
    target = df['defects']

    # Fixed seed so the resampled rows are reproducible between runs.
    sampler = SMOTE(random_state=61)
    features_res, target_res = sampler.fit_resample(features, target)

    balanced = pd.concat([features_res, target_res], axis=1)
    balanced.index.name = indexname
    print(balanced.defects.value_counts())
    return balanced

# Build the SMOTE-balanced training set, keeping the index name of `train`.
train_smote = upsampling_smote(train_df, indexname=train.index.name)

False    78699
True     78699
Name: defects, dtype: int64


### Upsampling (복원추출)

In [152]:
# Upsampling by sampling the minority class with replacement.
def upsampling(df, random_state=61):
    """Balance `df` by resampling minority rows with replacement.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a `defects` column. Rows where it equals 0/False form the
        majority class, rows where it equals 1/True the minority class.
    random_state : int or None, default 61
        Seed for the private random generator. The default makes the result
        reproducible across kernel restarts; pass None for a fresh random draw.

    Returns
    -------
    pandas.DataFrame
        All majority rows followed by as many resampled minority rows, so both
        classes have len(majority) rows. Index labels of minority rows repeat.

    Raises
    ------
    ValueError
        If `df` has no minority rows to sample from.
    """
    df_0 = df[df.defects == 0] # majority class
    df_1 = df[df.defects == 1] # minority class
    if df_1.empty:
        raise ValueError("cannot upsample: no rows with defects == 1")

    # A private generator keeps the draw reproducible without touching (or
    # depending on) the global NumPy random state.
    rng = np.random.RandomState(random_state)
    idx = rng.randint(len(df_1), size=len(df_0)) # positions drawn with replacement
    return pd.concat([df_0, df_1.iloc[idx]])

# Build the replacement-sampled training set and show its class balance.
train_up = upsampling(train_df)
train_up['defects'].value_counts()

False    78699
True     78699
Name: defects, dtype: int64

### Upsampling (배수 scaling)

In [153]:
def upsampling_scaling(df, factor=3):
    """Upsample the minority class by repeating it a whole number of times.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a `defects` column. Rows where it equals 0/False form the
        majority class, rows where it equals 1/True the minority class.
    factor : int, default 3
        How many copies of the minority rows the result contains. The default
        reproduces the original fixed three-fold replication.

    Returns
    -------
    pandas.DataFrame
        The majority rows followed by `factor` copies of the minority rows.

    Raises
    ------
    ValueError
        If `factor` is smaller than 1.
    """
    if factor < 1:
        raise ValueError("factor must be >= 1")

    df_0 = df[df.defects == 0] # majority class
    df_1 = df[df.defects == 1] # minority class

    return pd.concat([df_0] + [df_1] * factor)

# Build the replicated-minority training set and show its class balance.
train_scaling = upsampling_scaling(train_df)
train_scaling['defects'].value_counts()

False    78699
True     69192
Name: defects, dtype: int64

### DownSampling

In [137]:
# Downsample the majority class to match the number of minority-class rows.
def downsampling(df, random_state=61):
    """Balance `df` by keeping a random subset of the majority rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a `defects` column. Rows where it equals 0/False form the
        majority class, rows where it equals 1/True the minority class.
    random_state : int or None, default 61
        Seed for the private random generator used to pick the majority rows.

    Returns
    -------
    pandas.DataFrame
        The sampled majority rows (at most len(minority) of them) followed by
        all minority rows.
    """
    df_0 = df[df.defects == 0] # majority class
    df_1 = df[df.defects == 1] # minority class

    # A private RandomState produces the same permutation as the former
    # np.random.seed(61) + np.random.shuffle pair, but leaves the global NumPy
    # generator untouched for the rest of the notebook.
    rng = np.random.RandomState(random_state)
    idx = np.arange(len(df_0))
    rng.shuffle(idx)
    # The number of rows to keep is the minority row count; len() states that
    # directly instead of summing the label column.
    idx_down = idx[:len(df_1)]
    return pd.concat([df_0.iloc[idx_down], df_1])

# Build the downsampled training set and print its class balance.
train_down = downsampling(train_df)
print(train_down['defects'].value_counts())

False    23064
True     23064
Name: defects, dtype: int64

### PCA 적용

In [117]:
from sklearn.decomposition import PCA

# Project the features onto the first `n_components` principal components.
# The PCA is fitted on the training features only and then applied to test.
n_components = 18
pca = PCA(n_components=n_components,
            whiten=True,  # rescale the projected components to unit variance
            random_state=61)

columns = [f'PC{i}' for i in range(n_components)]
# Carry the original `id` index over to the projected frames. Without it they
# get a fresh 0..n-1 RangeIndex: the concat with `y` below would then align by
# label against the wrong index, and test_pca would be saved with made-up ids.
X_pca = pd.DataFrame(pca.fit_transform(X), columns=columns, index=X.index)
test_pca = pd.DataFrame(pca.transform(test), columns=columns, index=test.index)

train_pca = pd.concat([X_pca, y], axis=1)
train_pca.index.name = 'id'
test_pca.index.name = 'id'
# display(train_pca)
# display(test_pca)

### Log Transform (모델에 따라서 선택)

In [118]:
# X = train.drop(columns=['defects'])
# y = train['defects']

In [119]:
# X = FunctionTransformer(np.log1p).fit_transform(X)
# test = FunctionTransformer(np.log1p).fit_transform(test)

# train = pd.concat([X, y], axis=1)

### feature selection

In [120]:
# def cross_val(model, col='None', X=X, y=y):
#     folds = StratifiedKFold(shuffle=True, random_state=61)
#     aucs = []
#     for train_idx, val_idx in folds.split(X, y):
#         model.fit(X.iloc[train_idx], y.iloc[train_idx])
#         y_proba = model.predict_proba(X.iloc[val_idx])[:, 1]
#         y_true = y.iloc[val_idx]
#         aucs.append(roc_auc_score(y_true, y_proba))
#     return np.mean(aucs)

# model = LGBMClassifier(random_state=61, n_jobs=-1, force_row_wise=True)
# cross_val(model)

In [121]:
# scores = {col:[] for col in ['None'] + list(X.columns)}

In [122]:
# def cross_val(model, col='None', X=X, y=y):
#     folds = StratifiedKFold(shuffle=True, random_state=61)
#     aucs = []
#     for train_idx, val_idx in folds.split(X, y):
#         model.fit(X.iloc[train_idx], y.iloc[train_idx])
#         y_proba = model.predict_proba(X.iloc[val_idx])[:, 1]
#         y_true = y.iloc[val_idx]
#         aucs.append(roc_auc_score(y_true, y_proba))
#     scores[col].append(np.mean(aucs))

# model = LGBMClassifier(random_state=61, n_jobs=-1, force_row_wise=True)
# # model = ExtraTreesClassifier(
# #     n_estimators=700,
# #     min_samples_leaf=2,
# #     max_depth=16,
# #     max_features=0.793614074795712,
# #     min_samples_split=48,
# #     random_state=61,
# #     n_jobs=-1,
# # )

# cross_val(model)
# for i, col in enumerate(X.columns):
#     print(f'\t\t{i+1}번째 column / {len(X.columns)}개')
#     X2 = X.drop(columns=[col])
#     cross_val(model, col, X2)

# # if False: # log변환후 재실행
# #     X = np.log1p(X)
# #     cross_val(model, X=X)
# #     for i, col in enumerate(X.columns):
# #         print(f'\t\t{i+1}번째 column / {len(X.columns)}개')
# #         X2 = X.drop(columns=[col])
# #         cross_val(model, col, X2)


In [123]:
# scores

In [124]:
# picked = {col:scores[col] for col in scores if scores[col] > scores['None']}
# print(len(picked) / len(scores))
# print(np.mean(list(picked.values())))

In [125]:
# scores2 = itertools.permutations(picked.keys(), 2)
# scores2 = {cols:[] for cols in scores2}

# def cross_val(model, cols='None', X=X, y=y):
#     folds = StratifiedKFold(shuffle=True, random_state=61)
#     aucs = []
#     for train_idx, val_idx in folds.split(X, y):
#         model.fit(X.iloc[train_idx], y.iloc[train_idx])
#         y_proba = model.predict_proba(X.iloc[val_idx])[:, 1]
#         y_true = y.iloc[val_idx]
#         aucs.append(roc_auc_score(y_true, y_proba))
#     scores2[cols].append(np.mean(aucs))

# model = LGBMClassifier(random_state=61, n_jobs=-1, force_row_wise=True)
# for i, cols in enumerate(scores2):
#     print(f'\t\t{i+1}번째 column / {len(scores2)}개')
#     X2 = X.drop(columns=list(cols))
#     cross_val(model, cols, X2)

In [126]:
# picked2 = {cols:scores2[cols] for cols in scores2 if scores2[cols] > scores['None']}
# print("향상된 점수 비율:", len(picked2) / len(scores2))
# print("평균 점수:", np.mean(list(picked2.values())))
# print("최댓값:", np.max(list(picked2.values())))

### data 저장

In [127]:
# Write the working train/test frames under base_path.
train.to_csv(f'{base_path}train.csv')
test.to_csv(f'{base_path}test.csv')

##### pca data

In [128]:
# Write the PCA-projected train/test frames under base_path.
train_pca.to_csv(f'{base_path}train_pca.csv')
test_pca.to_csv(f'{base_path}test_pca.csv')

##### up/down(over/under) sampling data

In [154]:
# Write the SMOTE-balanced training set under base_path.
train_smote.to_csv(f'{base_path}train_smote.csv')

In [None]:
# Write the replacement-sampled training set under base_path.
train_up.to_csv(f'{base_path}train_up.csv')

In [None]:
# Write the replicated-minority training set under base_path.
train_scaling.to_csv(f'{base_path}train_scaling.csv')

In [None]:
# Write the downsampled training set under base_path.
train_down.to_csv(f'{base_path}train_down.csv')