In [1]:
import pandas as pd
import numpy as np

In [2]:
train = pd.read_csv('data/ybigta_sdss_train.csv', index_col=0)
test = pd.read_csv('data/ybigta_sdss_test.csv', index_col=0)
sample_submission = pd.read_csv('data/ybigta_sdss_sample_submission.csv', index_col=0)

In [3]:
column_number = {}
for i, column in enumerate(sample_submission.columns):
    column_number[column] = i
    
def to_number(x, dic):
    return dic[x]

train['type_num'] = train['type'].apply(lambda x: to_number(x, column_number))

In [4]:
#Robust scaling

from sklearn.preprocessing import RobustScaler

columns = train.columns[2:22]
rb_scaler = RobustScaler()
train_test = pd.concat((train.iloc[:, 2:22], test.iloc[:, 1:]), axis=0)
train_test = pd.DataFrame(rb_scaler.fit_transform(train_test), columns=columns, 
                          index=(list(train.index) + list(test.index)))

train_scaled = train_test.iloc[:len(train.index), :]
test_scaled = train_test.iloc[len(train.index):, :]
train = pd.concat((train.iloc[:, 0:2], train_scaled, train.iloc[:, 22]), axis=1)
test = pd.concat((test.iloc[:, 0], test_scaled, test.iloc[:, 20]), axis=1)

train.head()

Unnamed: 0_level_0,type,fiberID,psfMag_u,psfMag_g,psfMag_r,psfMag_i,psfMag_z,fiberMag_u,fiberMag_g,fiberMag_r,...,petroMag_g,petroMag_r,petroMag_i,petroMag_z,modelMag_u,modelMag_g,modelMag_r,modelMag_i,modelMag_z,type_num
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
415567,QSO,106,-0.435873,-0.293006,-0.061893,-0.112365,0.045592,-0.526241,-0.470618,-0.318998,...,-0.809223,-0.702594,-0.717329,-0.654126,-1.075748,-0.952455,-0.802477,-0.819242,-0.726988,8
733874,QSO,492,-0.58026,-0.415342,-0.221442,-0.113627,0.093325,-0.599696,-0.370296,-0.123794,...,-0.126247,0.071913,0.148203,0.294512,-0.464211,-0.12742,0.074672,0.154372,0.284371,8
1009150,QSO,388,-0.185381,-0.378678,-0.230361,-0.128357,0.039933,-0.152541,-0.316298,-0.108718,...,-0.09651,0.082277,0.146719,0.250467,0.010117,-0.096402,0.073088,0.146803,0.246988,8
803041,QSO,531,0.616548,0.437376,0.608051,0.845455,0.852428,0.74232,0.545376,0.766557,...,0.613879,0.794042,0.92319,0.969802,0.966397,0.614955,0.784281,0.917421,0.897889,8
432241,QSO,180,0.929133,0.371992,0.167015,0.201739,0.223452,1.036929,0.461316,0.286541,...,0.573924,0.408006,0.412831,0.362005,1.36743,0.561257,0.403609,0.396945,0.384822,8


In [5]:
train_x = train.drop(columns=['type', 'type_num'], axis=1)
train_y = train['type_num']
test_x = test

In [None]:
#Find the optimal LGBM model with RandomCV

from sklearn.model_selection import RandomizedSearchCV
from lightgbm import LGBMClassifier

lgbm = LGBMClassifier(n_jobs=-1, device='gpu')
boosting_type = ['gbdt']
num_leaves = [31,63,127]
n_estimators = [1000, 1250, 1500]
learning_rate = [0.1, 0.05, 0.001]

parameter = {'num_leaves':num_leaves,
            'boosting_type':boosting_type,
            'n_estimators':n_estimators,
             'learning_rate':learning_rate}

rs_lgbm = RandomizedSearchCV(estimator=lgbm, 
                             param_distributions=parameter, 
                             scoring='neg_log_loss',
                             n_jobs=-1)

rs_lgbm.fit(train_x, train_y)

In [None]:
#Optimal LGBM model

print(rs_lgbm.best_estimator_)
print(rs_lgbm.best_score_)

In [None]:
#Find the optimal Randomforest model with RandomCV

from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_jobs=-1)
criterion = ['gene', 'entropy']
num_leaves = [31,63,127]
n_estimators = [1000, 1500, 2000]
learning_rate = [0.1, 0.05, 0.001]
parameter = {'criterion':criterion,
            'num_leaves':num_leaves
            'n_estimators':n_estimators,
             'learning_rate':learning_rate}

rs_rf = RandomizedSearchCV(estimator=lgbm, 
                             param_distributions=parameter, 
                             scoring='neg_log_loss',
                             n_jobs=-1)

rs_lgbm.fit(train_x, train_y)

In [None]:
#The optimal Randomforest model

print(rs_rf.best_estimator_)
print(rs_rf.best_score_)

In [None]:
#Find the optimal SVM model with RandomCV

from sklearn.model_selection import RandomizedSearchCV
from sklearn.svm import SVC

svm = SVC(probability=True, n_jobs=-1)
kernel = ['linear', 'poly', 'rbf']
C = [0.1, 1, 10, 100]
gamma = [0.1, 1, 10, 100]

parameter = {'kernel':kernel,
            'C':C,
            'gamma':gamma}

rs_svm = RandomizedSearchCV(estimator=svm, 
                             param_distributions=parameter, 
                             scoring='neg_log_loss',
                             n_jobs=-1)


rs_svm.fit(train_x, train_y)

In [None]:
#The optimal SVM model

print(rs_svm.best_estimator_)
print(rs_svm.best_score_)

In [22]:
#Stacking the data based on stratified cross-validation

from sklearn.model_selection import StratifiedKFold

def get_stacking_data(model, X_train, y_train, X_test, n_folds=5):
    stkf = StratifiedKFold(n_splits=n_folds)
    
    train_fold_pred = np.zeros((X_train.shape[0],1))
    test_pred = np.zeros((X_test.shape[0], n_folds))
    print("model :", model.__class__.__name__)
    
    for i, (train_index, valid_index) in enumerate(stkf.split(X_train, y_train)):
        X_train_fold = X_train.loc[train_index]
        y_train_fold = y_train.loc[train_index]
        X_val = X_train.loc[valid_index]
        
        model.fit(X_train_fold, y_train_fold)
        train_fold_pred[valid_index, :] = model.predict(X_val).reshape(-1, 1)
        test_pred[:, i] = model.predict(X_test)
        
    test_pred_mean = np.mean(test_predict, axis=1).reshape(-1,1)
    
    return train_fold_pred, test_pred_mean

In [23]:
#Stack the data with the optimal models

lgbm_opt = LGBMClassifier(n_estimator=200,
                          boosting_type='dart',
                          num_leaves=127)
rf_opt = RandomForestClassifier()
svm_opt = SVC()

lgbm_train, lgbm_test = get_stacking_data(lgbm_opt, train_x, train_y, test_x, n_folds=5)
rf_train, rf_test = get_stacking_data(rf_opt, train_x, train_y, test_x, n_folds=5)
svm_train, svm_test = get_stacking_data(svm_opt, train_x, train_y, test_x, n_folds=5)

stacked_X_train = np.concat((lgbm_train, rf_train, svm_train), axis=1)
stacked_X_test = np.concat((lgbm_test, rf_test, svm_test), axis=1)

model : LGBMClassifier


  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:
  if diff:


NameError: name 'test_predict' is not defined

In [None]:
lgbm.fit(stacked_X_train, y_train)

In [None]:
y_pred = lgbm.predict_proba(test_x)

In [18]:
submission = pd.DataFrame(data=y_pred, columns=sample_submission.columns, index=sample_submission.index)
submission.to_csv('submission.csv', index=True)