In [30]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

In [31]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from lightgbm import LGBMClassifier
import pickle

In [32]:
df = pd.read_csv("./data/train.csv")
# 특징들 목록 index 로 저장
features = df.columns
features_handle = features[3:]
print(features_handle)
# 전처리 다룰 데이터 모음
data_handle = df.iloc[:,3:]
data_handle.head(3)

Index(['psfMag_u', 'psfMag_g', 'psfMag_r', 'psfMag_i', 'psfMag_z',
       'fiberMag_u', 'fiberMag_g', 'fiberMag_r', 'fiberMag_i', 'fiberMag_z',
       'petroMag_u', 'petroMag_g', 'petroMag_r', 'petroMag_i', 'petroMag_z',
       'modelMag_u', 'modelMag_g', 'modelMag_r', 'modelMag_i', 'modelMag_z'],
      dtype='object')


Unnamed: 0,psfMag_u,psfMag_g,psfMag_r,psfMag_i,psfMag_z,fiberMag_u,fiberMag_g,fiberMag_r,fiberMag_i,fiberMag_z,petroMag_u,petroMag_g,petroMag_r,petroMag_i,petroMag_z,modelMag_u,modelMag_g,modelMag_r,modelMag_i,modelMag_z
0,23.198224,21.431953,21.314148,21.176553,21.171444,22.581309,21.644453,21.657571,21.387653,21.572827,22.504317,21.431636,21.478312,21.145409,20.422446,22.749241,21.465534,21.364187,21.020605,21.14734
1,21.431355,20.708104,20.67885,20.70342,20.473229,21.868797,21.029773,20.967054,20.937731,21.063646,21.360701,20.778968,20.889705,20.639812,20.64666,21.492955,20.758527,20.753925,20.693389,20.512314
2,17.851451,16.727898,16.679677,16.69464,16.641788,18.17189,17.033098,16.999682,17.095999,17.076449,17.867253,16.738784,16.688874,16.74421,16.808006,17.818063,16.697434,16.641249,16.660177,16.688928


In [33]:
# 평균 표준편차 중앙값 특징이름별로 저장
# 0 : 평균 / 1 : 표준편차 / 2 : 중앙값임을 확인
features_info = dict()
for col in features_handle :
    temp = []
    temp.append(np.mean(df[col]))
    temp.append(np.std(df[col]))
    temp.append(np.median(df[col]))
    features_info[col] = temp
# print(features_info)

### 상관관계가 높은 자외선 변수들 처리

In [34]:
ultra = df.loc[:,['psfMag_u','fiberMag_u','petroMag_u']]
df['average_ultra'] = ultra.apply(np.mean,axis=1)
df = df.drop(['psfMag_u','fiberMag_u','petroMag_u'],axis=1)
df.head()

Unnamed: 0,id,type,fiberID,psfMag_g,psfMag_r,psfMag_i,psfMag_z,fiberMag_g,fiberMag_r,fiberMag_i,...,petroMag_g,petroMag_r,petroMag_i,petroMag_z,modelMag_u,modelMag_g,modelMag_r,modelMag_i,modelMag_z,average_ultra
0,0,QSO,601,21.431953,21.314148,21.176553,21.171444,21.644453,21.657571,21.387653,...,21.431636,21.478312,21.145409,20.422446,22.749241,21.465534,21.364187,21.020605,21.14734,22.761283
1,1,QSO,788,20.708104,20.67885,20.70342,20.473229,21.029773,20.967054,20.937731,...,20.778968,20.889705,20.639812,20.64666,21.492955,20.758527,20.753925,20.693389,20.512314,21.553618
2,2,QSO,427,16.727898,16.679677,16.69464,16.641788,17.033098,16.999682,17.095999,...,16.738784,16.688874,16.74421,16.808006,17.818063,16.697434,16.641249,16.660177,16.688928,17.963531
3,3,QSO,864,20.040371,19.926909,19.84384,19.46327,20.317165,20.217898,20.073852,...,19.993727,19.985531,19.750917,19.455117,20.770711,20.001699,19.889798,19.758113,19.552855,20.754279
4,4,STAR_RED_DWARF,612,23.058767,21.471406,19.504961,18.389096,23.629122,21.74275,19.861718,...,22.426929,21.673551,19.610012,18.376141,24.877052,23.147993,21.475342,19.48733,18.375655,26.004943


### std를 이용한 outlier 선택 및 oulier 의 유의미성 확인
1. 아웃라이어를 선정할때 사분위 수가 아닌 표준편차를 이용하여 유의미한 차이를 나타내는 부분에서 부터 아웃라이어를 정한다
2. 아웃라이어가 정보를 가지고 있다는 것은 다음을 의미한다.
  * 특정 type과 아웃라이어의 상관관계가 깊다.
  * 특정 tpye은 아웃라이어에 전혀 영향이 없다.
  

평범한 정규 분포에서 

1. one Standard Deviation from the Mean: 68%
2. two Standard Deviations from the Mean: 95%
3. three Standard Deviations from the Mean: 99.7%

변수 별로 n standard deviations 을 아웃라이어 기준을 잡고
각 타입별로 아웃라이어 해당하는 비율이 어느정도 인지 파악해보자

In [35]:
# 표준 편차를 이용한 tpye별 outlier 현황 


def std_outlier(n, df) :
    '''
    param : # of std, df 
    return : dataframe of correltion of type&outlier 
    '''
    result = df.groupby("type").size().reset_index(name = "count")
    cols = df.columns[3:]
    for col in cols :
        mean = np.mean(df[col])
        std = np.std(df[col])
        temp = df[['type', col]]
        temp = temp[(temp[col] < (mean - n*std)) | (temp[col] > (mean + n*std))]
        new_col_name = "{}_Orate".format(col)
        temp = temp.groupby('type').size().reset_index(name=new_col_name)
        result = result.merge(temp, how="left", on="type")
        result = result.fillna(0)
        result[new_col_name] = result[new_col_name] / result["count"]
    return result

In [36]:
std_outlier(0.5,df)

Unnamed: 0,type,count,psfMag_g_Orate,psfMag_r_Orate,psfMag_i_Orate,psfMag_z_Orate,fiberMag_g_Orate,fiberMag_r_Orate,fiberMag_i_Orate,fiberMag_z_Orate,petroMag_g_Orate,petroMag_r_Orate,petroMag_i_Orate,petroMag_z_Orate,modelMag_u_Orate,modelMag_g_Orate,modelMag_r_Orate,modelMag_i_Orate,modelMag_z_Orate,average_ultra_Orate
0,GALAXY,37347,0.000187,0.000214,0.000134,0.000161,0.000107,5.4e-05,5.4e-05,0.000107,0.000268,0.000321,0.000134,0.000428,0.000107,0.000107,5.4e-05,5.4e-05,0.000107,0.000134
1,QSO,49680,0.0,0.0,0.0,4e-05,4e-05,0.0,0.0,2e-05,6e-05,0.000121,4e-05,0.000121,0.0,0.0,0.0,0.0,0.0,2e-05
2,REDDEN_STD,14618,0.000137,6.8e-05,0.000137,0.000137,0.000137,6.8e-05,0.000137,0.000137,0.000137,6.8e-05,0.000137,0.000205,0.000137,0.000137,6.8e-05,0.000137,0.000137,0.000137
3,ROSAT_D,6580,0.000304,0.000304,0.000304,0.000304,0.000304,0.000304,0.000304,0.000456,0.001672,0.001672,0.000912,0.00152,0.000304,0.000304,0.000304,0.000304,0.000304,0.000304
4,SERENDIPITY_BLUE,21760,0.000184,0.000184,0.000184,0.000184,0.000184,0.000184,0.000184,0.000184,0.000276,0.000184,0.00023,0.00023,0.000184,0.000184,0.000184,0.000184,0.000184,0.000184
5,SERENDIPITY_DISTANT,4654,0.000215,0.000215,0.000215,0.000215,0.000215,0.000215,0.000215,0.000215,0.000215,0.000215,0.000215,0.000215,0.000215,0.000215,0.000215,0.000215,0.000215,0.000215
6,SERENDIPITY_FIRST,7132,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00014,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,SERENDIPITY_MANUAL,61,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,SERENDIPITY_RED,2562,0.00039,0.00039,0.00039,0.00039,0.00039,0.00039,0.00039,0.00039,0.00039,0.000781,0.00039,0.00039,0.00039,0.00039,0.00039,0.00039,0.00039,0.00039
9,SKY,127,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.007874,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### 결과 : 

outlier 에 사용하는 표준편차의 수준을 매우 낮게 줄여야지만 결과가 확인 가능하다.
사실상 oulier에 의해 표준편차도 매우 큰 영향을 받기 때문에 std 를 이용한 oulier 확인은 그닥 의미가 없어보인다
데이터 값의 개수와 그 값의 차이를 생각하여 outlier를 확인해야 할 듯 하다.

### 데이터의 비율을 이용한 outlier 선택 및 oulier 의 유의미성 확인

In [37]:
# 비율을 이용한 tpye별 outlier 현황 


def rate_outlier(n, df) :
    '''
    param : n -> degree of outrate, df -> orb dataframe 
    return : dataframe of correltion of type&outlier 
    '''
    result = df.groupby("type").size().reset_index(name = "count")
    cols = df.columns[3:]
    for col in cols :
        Ql = df[col].quantile(n)
        Qr= df[col].quantile(1-n)
        IQR = Qr - Ql
        temp = df[['type', col]]
        temp = temp[ (temp[col] < (Ql - 1.5 * IQR)) | (temp[col] > (Qr + 1.5 * IQR))]
        new_col_name = "{}_Orate".format(col)
        temp = temp.groupby('type').size().reset_index(name=new_col_name)
        result = result.merge(temp, how="left", on="type")
        result = result.fillna(0)
        result[new_col_name] = result[new_col_name] / result["count"]
    return result

In [38]:
rate_outlier(0.25, df)

Unnamed: 0,type,count,psfMag_g_Orate,psfMag_r_Orate,psfMag_i_Orate,psfMag_z_Orate,fiberMag_g_Orate,fiberMag_r_Orate,fiberMag_i_Orate,fiberMag_z_Orate,petroMag_g_Orate,petroMag_r_Orate,petroMag_i_Orate,petroMag_z_Orate,modelMag_u_Orate,modelMag_g_Orate,modelMag_r_Orate,modelMag_i_Orate,modelMag_z_Orate,average_ultra_Orate
0,GALAXY,37347,0.00257,0.00174,0.001714,0.001392,0.001419,0.000455,0.000536,0.000562,0.005891,0.002945,0.002678,0.004177,0.013388,0.001714,0.001285,0.001312,0.001017,0.010925
1,QSO,49680,0.001188,0.001671,0.000523,0.003885,0.000926,0.000624,0.000403,0.002818,0.002415,0.002033,0.006683,0.026389,0.012943,0.000604,0.000765,0.000141,0.000483,0.009783
2,REDDEN_STD,14618,0.000205,0.000205,0.000137,0.000137,0.000137,0.000137,0.000137,0.000137,0.000205,0.000205,0.000205,0.000274,0.000137,0.000137,0.000137,0.000137,0.000137,0.000137
3,ROSAT_D,6580,0.014134,0.008055,0.006535,0.006383,0.002736,0.001824,0.00076,0.00228,0.019909,0.010486,0.008511,0.014894,0.027964,0.007903,0.003343,0.003647,0.00304,0.023404
4,SERENDIPITY_BLUE,21760,0.000551,0.000414,0.000781,0.000551,0.00023,0.00023,0.000184,0.000368,0.000506,0.000689,0.001425,0.005515,0.000414,0.00023,0.000368,0.000827,0.000276,0.000414
5,SERENDIPITY_DISTANT,4654,0.005157,0.00043,0.000859,0.00043,0.002364,0.000215,0.000215,0.00043,0.000645,0.00043,0.002149,0.007306,0.002578,0.000215,0.000215,0.000645,0.000215,0.006876
6,SERENDIPITY_FIRST,7132,0.00028,0.0,0.00028,0.00014,0.000421,0.0,0.0,0.0,0.012619,0.00014,0.000841,0.001683,0.057347,0.00014,0.0,0.00014,0.0,0.036175
7,SERENDIPITY_MANUAL,61,0.04918,0.0,0.0,0.0,0.032787,0.016393,0.0,0.0,0.032787,0.0,0.0,0.0,0.032787,0.032787,0.0,0.0,0.0,0.032787
8,SERENDIPITY_RED,2562,0.229508,0.026151,0.001171,0.000781,0.215457,0.043716,0.001171,0.00039,0.258392,0.094067,0.006245,0.000781,0.118267,0.12256,0.005855,0.001171,0.00039,0.087041
9,SKY,127,0.283465,0.370079,0.338583,0.141732,0.283465,0.330709,0.291339,0.102362,0.314961,0.377953,0.346457,0.188976,0.062992,0.220472,0.212598,0.204724,0.062992,0.062992


#### 결과 :
 위 테이블을 확인해 보면 아웃라이어의 분포가 고르다고 할 수 없다. 몇몇 타입의 천체들은 다른 타입의 천체들에 비해
아웃라이어의 비율이 상당히 높은 것을 나타난다.
이는 특히 type의 개수가 작은 천체들에서 나타나기 때문에 아웃라이어의 제거가 특정 type에 대한 데이터의 삭제라는 결과로 이어질 수 있다.


#### 의미 :  
 아웃라이어를 제거하는 것은 모델의 학습에 악영향을 미칠것이다. 하지만 너무 극단적인 아웃라이어나, 너무 정규화되지 않은 데이터들도 학습에 악영향을 미친다. 아웃라이어를 제거하면서, 데이터들을 정규화 하는 방법을 찾아야한다.

### 가설1

#### 내용 :

특정한 특징의 아웃라이어를 제거한다면 그 특징의 아웃라이에 많은 분포를 이루고 있던 타입에 대한 예측력이 떨어지게된다.
하지만 아웃라이어를 제거하면서 원래 아웃라이어를 가지고 있었다는 정보를 만들어 전달해주면 학습하는 과정에서 아웃라이어의 존재를 인식하지 않을까 싶다.

#### 과제 :
"특정한 특징의 아웃라이어를 제거하면서" 와 "아웃라이어의 존재를 인식" 이라는 두가지 과제를 달성해야 긍정적인 영향을 가져올 있다. 

#### 진행방향 :
특정한 특징의 아웃라이어를 중심으로한 분포를 새로 작성하여 하나의 특징을 두개이상의 분포로 나누어 학습에 이용하자!


In [78]:
temp1 = df[(df["type"]=="SKY")|(df["type"]=="SERENDIPITY_RED")|(df["type"]=="STAR_PN")]

In [79]:
temp2 = df[(df["type"]!="SKY")&(df["type"]!="SERENDIPITY_RED")&(df["type"]!="STAR_PN")]

In [95]:
len(df)

199991

In [105]:
train = df.iloc[:180000, :]
test = df.iloc[180001:,:]

In [108]:
QSO = train[train['type']=='QSO'] #가장 갯수가 많은 QSO부터 시작
# QSO.describe() fiberMag만 이상한 것을 확인할 수 있다. 이친구의 outlier 먼저 제거해준다.

In [109]:
q1,q3 = np.percentile(QSO['fiberMag_g'],[25,75])
iqr = q3-q1
lower_bound = q1 - (iqr*1.5)
upper_bound = q1 + (iqr*1.5)
QSO = QSO[(QSO['fiberMag_g']<upper_bound)&(QSO['fiberMag_g']>lower_bound)]
len(QSO)

42483

In [111]:
train_no_qso = train[train['type']!='QSO']
train_yes_qso = pd.concat([train_no_qso, QSO])

X_train = train_yes_qso.drop('type', axis = 1)
y_train = train_yes_qso['type']
X_test = test.drop("type", axis = 1)
y_test = test["type"]

In [112]:
lgbm_model = LGBMClassifier(boosting_type='gbdt', objective='binary', num_leaves=10,
                                learning_rate=0.1, n_estimators=2000, max_depth=15,
                                bagging_fraction=0.9, feature_fraction=0.9, reg_lambda=0.2)
lgbm_model.fit(X_train,y_train)

LGBMClassifier(bagging_fraction=0.9, boosting_type='gbdt', class_weight=None,
        colsample_bytree=1.0, feature_fraction=0.9,
        importance_type='split', learning_rate=0.1, max_depth=15,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=2000, n_jobs=-1, num_leaves=10, objective='binary',
        random_state=None, reg_alpha=0.0, reg_lambda=0.2, silent=True,
        subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [113]:
y_pred_lgbm_prob = lgbm_model.predict_proba(X_test)
log_loss(y_true=y_test, y_pred=y_pred_lgbm_prob)

0.46518798725091387

In [None]:
QSO = df[['type']=='QSO'] #가장 갯수가 많은 QSO부터 시작
QSO.describe() #fiberMag만 이상한 것을 확인할 수 있다. 이친구의 outlier 먼저 제거해준다.
len(QSO)

In [81]:
q1,q3 = np.percentile(QSO['fiberMag_g'],[25,75])
iqr = q3-q1
lower_bound = q1 - (iqr*1.5)
upper_bound = q1 + (iqr*1.5)
QSO = QSO[(QSO['fiberMag_g']<upper_bound)&(QSO['fiberMag_g']>lower_bound)]
len(QSO)
QSO.describe()

Unnamed: 0,id,fiberID,psfMag_g,psfMag_r,psfMag_i,psfMag_z,fiberMag_g,fiberMag_r,fiberMag_i,fiberMag_z,petroMag_g,petroMag_r,petroMag_i,petroMag_z,modelMag_u,modelMag_g,modelMag_r,modelMag_i,modelMag_z,average_ultra
count,47286.0,47286.0,47286.0,47286.0,47286.0,47286.0,47286.0,47286.0,47286.0,47286.0,47286.0,47286.0,47286.0,47286.0,47286.0,47286.0,47286.0,47286.0,47286.0,47286.0
mean,99993.526245,478.965296,20.865241,20.599525,20.43286,20.295372,21.139228,20.869461,20.705671,20.537495,20.886505,20.615675,20.513924,20.430968,21.722546,20.818313,20.540412,20.378269,20.253305,-26.67231
std,57880.184193,284.432137,0.991491,1.046942,1.08448,1.146424,0.96766,1.027858,1.073969,1.130565,1.117024,1.471667,1.550934,2.41223,1.522757,1.009358,1.045582,1.07669,1.153766,10541.51
min,0.0,1.0,11.287406,0.81435,-30.71189,7.074244,18.355912,15.371597,10.646672,3.203617,-29.679836,-130.554426,-23.732019,-167.577528,14.857191,14.817844,15.261008,14.511502,12.235952,-2292266.0
25%,49722.75,236.0,20.270334,19.9493,19.770406,19.612727,20.539596,20.217643,20.017915,19.844226,20.276049,19.958889,19.777236,19.513181,20.833752,20.243712,19.930313,19.747622,19.592882,20.94852
50%,100367.0,468.0,21.060135,20.76822,20.612329,20.406412,21.340313,21.041096,20.878141,20.628,21.082347,20.776788,20.628045,20.316759,21.666681,21.033363,20.736684,20.576644,20.389728,21.77314
75%,150108.5,718.0,21.611292,21.368975,21.22166,21.013908,21.898589,21.643083,21.496374,21.248565,21.64422,21.399372,21.278272,21.131502,22.435647,21.572006,21.318407,21.175064,20.985115,22.52433
max,199990.0,1000.0,50.646416,50.849271,61.241476,81.34568,22.650947,31.928244,35.074377,40.860707,66.058291,111.756603,122.271336,290.639685,28.911952,32.272511,33.666494,29.28047,48.276514,173.9849


In [82]:
temp_no_qso = df[df['type']!='QSO']
temp_yes_qso = pd.concat([temp_no_qso, QSO])

X = temp_yes_qso.drop('type', axis = 1)
y = temp_yes_qso['type']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.2, random_state=42)

In [66]:
lgbm_model = LGBMClassifier(boosting_type='gbdt', objective='binary', num_leaves=10,
                                learning_rate=0.1, n_estimators=2000, max_depth=15,
                                bagging_fraction=0.9, feature_fraction=0.9, reg_lambda=0.2)
lgbm_model.fit(X_train,y_train)

LGBMClassifier(bagging_fraction=0.9, boosting_type='gbdt', class_weight=None,
        colsample_bytree=1.0, feature_fraction=0.9,
        importance_type='split', learning_rate=0.1, max_depth=15,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=2000, n_jobs=-1, num_leaves=10, objective='binary',
        random_state=None, reg_alpha=0.0, reg_lambda=0.2, silent=True,
        subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [83]:
y_pred_lgbm_prob = lgbm_model.predict_proba(X_test)
log_loss(y_true=y_test, y_pred=y_pred_lgbm_prob)

0.273768004369504

In [87]:
galaxy = temp_yes_qso[temp_yes_qso['type']=='GALAXY'] #그 다음 갯수가 많은 GALAXY로 시작

In [88]:
q1,q3 = np.percentile(galaxy['average_ultra'],[25,75])
iqr = q3-q1
lower_bound = q1 - (iqr*1.5)
upper_bound = q1 + (iqr*1.5)
galaxy = galaxy[(galaxy['average_ultra']<upper_bound)&(galaxy['average_ultra']>lower_bound)]


31209

In [89]:
temp_no_gl = temp_yes_qso[temp_yes_qso['type']!='GALAXY']
temp_yes_gl = pd.concat([temp_no_gl, galaxy])

31209

In [91]:
X = temp_yes_gl.drop('type', axis = 1)
y = temp_yes_gl['type']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.2, random_state=42)

In [92]:
lgbm_model = LGBMClassifier(boosting_type='gbdt', objective='binary', num_leaves=10,
                                learning_rate=0.1, n_estimators=2000, max_depth=15,
                                bagging_fraction=0.9, feature_fraction=0.9, reg_lambda=0.2)
lgbm_model.fit(X_train,y_train)

LGBMClassifier(bagging_fraction=0.9, boosting_type='gbdt', class_weight=None,
        colsample_bytree=1.0, feature_fraction=0.9,
        importance_type='split', learning_rate=0.1, max_depth=15,
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
        n_estimators=2000, n_jobs=-1, num_leaves=10, objective='binary',
        random_state=None, reg_alpha=0.0, reg_lambda=0.2, silent=True,
        subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [93]:
y_pred_lgbm_prob = lgbm_model.predict_proba(X_test)
log_loss(y_true=y_test, y_pred=y_pred_lgbm_prob)

0.36808338732731755

### 정열이형이 한 방식 시용!

In [39]:
# from sklearn.preprocessing import RobustScaler
# rb_scaler = RobustScaler()
# # type과 id를 제외하고 학습
# data_for_scaling = df.iloc[:,3:]
# rb_scaler.fit(data_for_scaling)
# # 학습후 변환
# train_rbscaled = rb_scaler.transform(data_for_scaling)
# train_rbscaled
# # 학습후 변환한 데이터를 다시 원래 데이터로 만들기
# data_rbscaled = pd.DataFrame(train_rbscaled, columns = data_for_scaling.columns)
# data_rbscaled = pd.concat([df[['id','type', 'fiberID']], data_rbscaled], axis=1)
# data_rbscaled.head()


In [40]:
# features = data_rbscaled.columns[3:]

# for col in features :
#     plt.figure(figsize=(12,4))
#     sns.distplot(data_rbscaled[col])
#     plt.title('Distribution of %s\n'%col)

In [41]:
# X = data_rbscaled.drop('type', axis = 1)
# y = data_rbscaled['type']
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.2, random_state=42)

In [42]:
# lgbm_model = LGBMClassifier(boosting_type='gbdt', objective='binary', num_leaves=10,
#                                 learning_rate=0.1, n_estimators=2000, max_depth=15,
#                                 bagging_fraction=0.9, feature_fraction=0.9, reg_lambda=0.2)
# lgbm_model.fit(X_train,y_train)
# pickle.dump(lgbm_model, open('./models/nm2.sav', 'wb'))

In [29]:
# y_pred_lgbm_prob = lgbm_model.predict_proba(X_test)
# log_loss(y_true=y_test, y_pred=y_pred_lgbm_prob)

0.3912909609266363

In [134]:
QSO = df[df['type']=='QSO'] #가장 갯수가 많은 QSO부터 시작
QSO.describe()

Unnamed: 0,id,fiberID,psfMag_g,psfMag_r,psfMag_i,psfMag_z,fiberMag_g,fiberMag_r,fiberMag_i,fiberMag_z,petroMag_g,petroMag_r,petroMag_i,petroMag_z,modelMag_u,modelMag_g,modelMag_r,modelMag_i,modelMag_z,average_ultra
count,49680.0,49680.0,49680.0,49680.0,49680.0,49680.0,49680.0,49680.0,49680.0,49680.0,49680.0,49680.0,49680.0,49680.0,49680.0,49680.0,49680.0,49680.0,49680.0,49680.0
mean,99990.617673,475.96254,20.844459,20.562275,20.389592,20.245893,21.771637,20.831491,20.663179,20.487626,20.875612,20.582023,20.470759,20.381006,21.711196,20.796655,20.501448,20.331913,20.2052,-24.34714
std,57856.805308,283.314172,1.144866,1.16729,1.188138,1.37071,1475.434584,1.158578,1.187628,1.426622,2.104052,1.756144,1.646889,2.695318,1.634135,1.160741,1.169228,1.189599,1.259779,10284.38
min,0.0,1.0,8.420418,0.81435,-30.71189,-105.351185,-215882.917191,-14.662435,10.646672,-139.831165,-198.87644,-130.554426,-23.732019,-192.700406,14.379808,13.881702,13.832868,13.662449,12.235952,-2292266.0
25%,49800.0,234.0,20.232979,19.914293,19.725654,19.557763,20.503032,20.177854,19.970901,19.794878,20.240878,19.923213,19.727318,19.461315,20.797072,20.209107,19.894231,19.70075,19.539938,20.91746
50%,100307.5,464.0,21.071196,20.773945,20.604364,20.398573,21.351912,21.046853,20.870056,20.618591,21.093457,20.782893,20.620931,20.306151,21.680015,21.044267,20.743892,20.568844,20.382068,21.78486
75%,150066.0,711.0,21.646985,21.394157,21.236891,21.024761,21.934979,21.667805,21.512357,21.257899,21.684064,21.423231,21.29447,21.138621,22.477501,21.609808,21.343598,21.191318,20.998438,22.56705
max,199990.0,1000.0,50.646416,50.849271,61.241476,81.34568,248077.51338,32.574603,42.002047,40.860707,270.432463,181.955644,122.271336,290.639685,39.238248,32.272511,33.666494,29.28047,48.276514,173.9849


In [135]:
q1,q3 = np.percentile(df['fiberMag_g'],[25,75])
iqr = q3-q1
lower_bound = q1 - (iqr*1.5)
upper_bound = q1 + (iqr*1.5)
df = df[(df['fiberMag_g']<upper_bound)&(df['fiberMag_g']>lower_bound)]
df.describe()

Unnamed: 0,id,fiberID,psfMag_g,psfMag_r,psfMag_i,psfMag_z,fiberMag_g,fiberMag_r,fiberMag_i,fiberMag_z,petroMag_g,petroMag_r,petroMag_i,petroMag_z,modelMag_u,modelMag_g,modelMag_r,modelMag_i,modelMag_z,average_ultra
count,184210.0,184210.0,184210.0,184210.0,184210.0,184210.0,184210.0,184210.0,184210.0,184210.0,184210.0,184210.0,184210.0,184210.0,184210.0,184210.0,184210.0,184210.0,184210.0,184210.0
mean,99947.112638,363.198382,19.585049,19.074175,18.804311,18.609619,19.804639,19.298945,18.898961,18.819345,19.277933,18.733533,18.508583,18.446053,20.423867,19.23466,18.72457,18.430314,18.25741,8.558393
std,57734.761114,226.82147,2.143509,4.03049,2.154353,1.796107,1.592327,1.587735,46.022662,1.752703,5.217607,14.934792,2.823565,21.391982,1.95168,1.718471,1.752816,1.822879,1.91225,5342.609
min,0.0,1.0,-335.608609,-1276.171892,-30.71189,-106.927107,15.179296,10.227049,-19721.172166,3.203617,-1581.627834,-4370.038636,-654.195115,-789.601838,3.444448,8.68669,5.781584,5.004407,4.460086,-2292266.0
25%,49956.25,176.0,18.543451,17.947889,17.625236,17.340455,18.786891,18.169328,17.796609,17.516874,18.009494,17.386833,16.960881,16.718953,19.161753,17.971087,17.332162,16.888818,16.625877,19.55761
50%,99922.5,351.0,19.751179,19.286351,18.913761,18.687358,19.906232,19.408728,18.981486,18.753122,19.370772,18.849383,18.37468,18.109284,20.250031,19.329865,18.808439,18.319782,18.036781,20.59465
75%,149913.75,529.0,20.742438,20.268157,20.044743,19.925819,20.963056,20.528167,20.308054,20.187732,20.58868,20.211664,20.014095,19.86743,21.592982,20.542856,20.164154,19.968114,19.882427,21.77006
max,199990.0,1000.0,478.440761,171.321103,585.370349,115.830481,22.627179,31.928244,35.074377,173.583776,773.346459,809.62822,219.701423,8647.360709,49.487277,66.589958,35.102312,45.78355,48.276514,58435.13


#### 중간점검

In [137]:
X = df.drop('type', axis = 1)
y = df['type']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size= 0.2, random_state=42)

In [139]:
lgbm_model = LGBMClassifier(boosting_type='gbdt', objective='binary', num_leaves=10,
                                learning_rate=0.1, n_estimators=2000, max_depth=15,
                                bagging_fraction=0.9, feature_fraction=0.9, reg_lambda=0.2)
lgbm_model.fit(X_train,y_train)
pickle.dump(lgbm_model, open('./models/nm1.sav', 'wb'))

In [140]:
y_pred_lgbm_prob = lgbm_model.predict_proba(X_test)
log_loss(y_true=y_test, y_pred=y_pred_lgbm_prob)

0.39463007591794785