In [None]:
from sklearn.linear_model import LogisticRegression
import optuna
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score, average_precision_score, log_loss
from sklearn.model_selection import train_test_split, KFold , StratifiedKFold, cross_val_score
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
# Print the full path of every file found below the Kaggle input directory.
for input_path in (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
):
    print(input_path)


In [None]:
def rm(df, use_float16=True):
    """Downcast the columns of ``df`` in place to reduce its memory footprint.

    * ``object`` columns are converted to ``category``.
    * Signed-integer columns are cast to the narrowest of int8/16/32/64 whose
      range contains the column's min and max (bounds inclusive).
    * Float columns are cast to float16 or float32 when their min and max fit,
      otherwise to float64.
    * Every other dtype (bool, unsigned int, datetime, category, pandas
      extension dtypes, ...) is left untouched, so the function can safely be
      called again on its own output.

    Memory usage before and after is printed, as is the relative decrease.

    Args:
        df: DataFrame to shrink. It is modified in place.
        use_float16: when True (default) float columns may become float16,
            which is compact but only carries about three significant decimal
            digits; pass False to make float32 the narrowest float type.

    Returns:
        The same ``df`` object, after downcasting.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    int_types = (np.int8, np.int16, np.int32, np.int64)
    float_types = (np.float16, np.float32) if use_float16 else (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Only plain numpy signed-int / float columns are downcast; calling
        # min()/max() on e.g. an unordered categorical would raise.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'if':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == 'i':
            for candidate in int_types:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # NaN min/max (empty or all-NaN column) fails every comparison
            # and therefore falls back to float64.
            target = np.float64
            for candidate in float_types:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    target = candidate
                    break
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard the ratio so a frame that reports zero bytes does not divide by zero.
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(decrease))

    return df

In [None]:
# Training data; rm() downcasts the frame right after it is read.
train = pd.read_csv("../input/tabular-playground-series-may-2021/train.csv")
train = rm(train)

# Names of the 50 feature columns: 'feature_0' ... 'feature_49'.
ncol = ['feature_{}'.format(idx) for idx in range(50)]

# Submission template and test data, both downcast the same way.
sample = pd.read_csv("../input/tabular-playground-series-may-2021/sample_submission.csv")
sample = rm(sample)
test = pd.read_csv("../input/tabular-playground-series-may-2021/test.csv")
test = rm(test)

# Feature columns of the test set, used for the final predictions.
to_test = test[ncol]

In [None]:
# Re-indexed copy of the training frame without the 'id' column.
df = train.reset_index(drop=True).drop(columns='id')

# One-hot encode the target, then keep only the features plus the indicators.
onehot_cols = ['target']
onehot_encoded_df = pd.get_dummies(df[onehot_cols])
df = pd.concat([df[ncol], onehot_encoded_df], axis=1)

X = df[ncol]

# One indicator column per class; column k of y corresponds to y_cols[k].
y_cols = ['target_Class_1', 'target_Class_2', 'target_Class_3', 'target_Class_4']
y = pd.DataFrame(np.stack([df[name] for name in y_cols], axis=1))

# The raw frames are no longer referenced below; drop the names.
del train, test

In [None]:
# class 1 
def objective(trial , X = X , y = y[0]):
    """Optuna objective for the class-1 indicator (column 0 of ``y``).

    Makes a stratified 80/20 hold-out split (fixed ``random_state=42``), fits
    two ``LogisticRegression`` models that differ only in ``random_state``
    (2020 and 2021) using the sampled ``tol`` and ``C``, averages their
    predicted probabilities for the second class and scores the average with
    log loss on the hold-out part.

    Args:
        trial: Optuna trial; samples ``tol`` uniformly from [1e-6, 1e-3] and
            ``C`` log-uniformly from [1e-2, 1].
        X: feature frame; defaults to the notebook-level ``X`` at definition time.
        y: target column; defaults to ``y[0]`` at definition time.

    Returns:
        Hold-out log loss of the averaged predictions (lower is better).
    """
    train_x, test_x, train_y, test_y = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y)

    # suggest_float replaces the deprecated suggest_uniform / suggest_loguniform.
    params = {
        'tol': trial.suggest_float('tol', 1e-6, 1e-3),
        'C': trial.suggest_float('C', 1e-2, 1, log=True),
        'n_jobs': -1,
    }

    # NOTE(review): scikit-learn documents random_state as only used by the
    # 'sag', 'saga' and 'liblinear' solvers; with the default solver the two
    # fits are presumably identical -- confirm whether the second is needed.
    seed_preds = []
    for seed in (2020, 2021):
        model = LogisticRegression(**params, random_state=seed)
        model.fit(train_x, train_y)
        seed_preds.append(model.predict_proba(test_x)[:, 1])

    y_predlr = (seed_preds[0] + seed_preds[1]) / 2
    return log_loss(test_y, y_predlr)
# Only show warnings: hides the per-trial log lines.
optuna.logging.set_verbosity(optuna.logging.WARNING)

# The sampler is seeded so that re-running this cell repeats the same search.
# NOTE(review): a pruner only acts on values sent through trial.report(); the
# objective in this cell reports none, so the pruner looks inert -- confirm.
study = optuna.create_study(
    direction='minimize',
    study_name='lr',
    sampler=optuna.samplers.TPESampler(seed=42),
    pruner=optuna.pruners.HyperbandPruner(),
)
study.optimize(objective, n_trials=500)

print('numbers of the finished trials:', len(study.trials))
print('the best params:', study.best_trial.params)
print('the best value:', study.best_value)

In [None]:
# Settings used for the final class-1 models.
# Best value noted for these settings: 0.29062582857124236
p_class1 = {
    'tol': 8.48578916594731e-05,
    'C': 0.010068250605161195,
    "n_jobs" : -1,
}

# Fit on the full training data with two different random_state values and
# predict the probability of the second class for the test features.
lr_class11 = LogisticRegression(random_state=2020, **p_class1)
pred_class11 = lr_class11.fit(X, y[0]).predict_proba(to_test)[:, 1]

lr_class12 = LogisticRegression(random_state=2021, **p_class1)
pred_class12 = lr_class12.fit(X, y[0]).predict_proba(to_test)[:, 1]

# Final class-1 prediction: mean of the two models.
pred_class1 = (pred_class11 + pred_class12) / 2

In [None]:
# class 2
def objective(trial , X = X , y = y[1]):
    """Optuna objective for the class-2 indicator (column 1 of ``y``).

    Makes a stratified 80/20 hold-out split (fixed ``random_state=42``), fits
    two ``LogisticRegression`` models that differ only in ``random_state``
    (2020 and 2021) using the sampled ``tol`` and ``C``, averages their
    predicted probabilities for the second class and scores the average with
    log loss on the hold-out part.

    Args:
        trial: Optuna trial; samples ``tol`` uniformly from [1e-6, 1e-3] and
            ``C`` log-uniformly from [1e-2, 1].
        X: feature frame; defaults to the notebook-level ``X`` at definition time.
        y: target column; defaults to ``y[1]`` at definition time.

    Returns:
        Hold-out log loss of the averaged predictions (lower is better).
    """
    train_x, test_x, train_y, test_y = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y)

    # suggest_float replaces the deprecated suggest_uniform / suggest_loguniform.
    params = {
        'tol': trial.suggest_float('tol', 1e-6, 1e-3),
        'C': trial.suggest_float('C', 1e-2, 1, log=True),
        'n_jobs': -1,
    }

    # NOTE(review): scikit-learn documents random_state as only used by the
    # 'sag', 'saga' and 'liblinear' solvers; with the default solver the two
    # fits are presumably identical -- confirm whether the second is needed.
    seed_preds = []
    for seed in (2020, 2021):
        model = LogisticRegression(**params, random_state=seed)
        model.fit(train_x, train_y)
        seed_preds.append(model.predict_proba(test_x)[:, 1])

    y_predlr = (seed_preds[0] + seed_preds[1]) / 2
    return log_loss(test_y, y_predlr)
# Only show warnings: hides the per-trial log lines.
optuna.logging.set_verbosity(optuna.logging.WARNING)

# The sampler is seeded so that re-running this cell repeats the same search.
# NOTE(review): a pruner only acts on values sent through trial.report(); the
# objective in this cell reports none, so the pruner looks inert -- confirm.
study = optuna.create_study(
    direction='minimize',
    study_name='lr',
    sampler=optuna.samplers.TPESampler(seed=42),
    pruner=optuna.pruners.HyperbandPruner(),
)
study.optimize(objective, n_trials=500)

print('numbers of the finished trials:', len(study.trials))
print('the best params:', study.best_trial.params)
print('the best value:', study.best_value)


In [None]:
# Settings used for the final class-2 models.
# Best value noted for these settings: 0.6775994526616997
p_class2 = {
    'tol': 0.0005028269911647464,
    'C': 0.010143399742532085,
    "n_jobs" : -1,
}

# Fit on the full training data with two different random_state values and
# predict the probability of the second class for the test features.
lr_class21 = LogisticRegression(random_state=2020, **p_class2)
pred_class21 = lr_class21.fit(X, y[1]).predict_proba(to_test)[:, 1]

lr_class22 = LogisticRegression(random_state=2021, **p_class2)
pred_class22 = lr_class22.fit(X, y[1]).predict_proba(to_test)[:, 1]

# Final class-2 prediction: mean of the two models.
pred_class2 = (pred_class21 + pred_class22) / 2

In [None]:
# class 3
def objective(trial , X = X , y = y[2]):
    """Optuna objective for the class-3 indicator (column 2 of ``y``).

    Makes a stratified 80/20 hold-out split (fixed ``random_state=42``), fits
    two ``LogisticRegression`` models that differ only in ``random_state``
    (2020 and 2021) using the sampled ``tol`` and ``C``, averages their
    predicted probabilities for the second class and scores the average with
    log loss on the hold-out part.

    Args:
        trial: Optuna trial; samples ``tol`` uniformly from [1e-6, 1e-3] and
            ``C`` log-uniformly from [1e-2, 1].
        X: feature frame; defaults to the notebook-level ``X`` at definition time.
        y: target column; defaults to ``y[2]`` at definition time.

    Returns:
        Hold-out log loss of the averaged predictions (lower is better).
    """
    train_x, test_x, train_y, test_y = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y)

    # suggest_float replaces the deprecated suggest_uniform / suggest_loguniform.
    params = {
        'tol': trial.suggest_float('tol', 1e-6, 1e-3),
        'C': trial.suggest_float('C', 1e-2, 1, log=True),
        'n_jobs': -1,
    }

    # NOTE(review): scikit-learn documents random_state as only used by the
    # 'sag', 'saga' and 'liblinear' solvers; with the default solver the two
    # fits are presumably identical -- confirm whether the second is needed.
    seed_preds = []
    for seed in (2020, 2021):
        model = LogisticRegression(**params, random_state=seed)
        model.fit(train_x, train_y)
        seed_preds.append(model.predict_proba(test_x)[:, 1])

    y_predlr = (seed_preds[0] + seed_preds[1]) / 2
    return log_loss(test_y, y_predlr)
# Only show warnings: hides the per-trial log lines.
optuna.logging.set_verbosity(optuna.logging.WARNING)

# The sampler is seeded so that re-running this cell repeats the same search.
# NOTE(review): a pruner only acts on values sent through trial.report(); the
# objective in this cell reports none, so the pruner looks inert -- confirm.
study = optuna.create_study(
    direction='minimize',
    study_name='lr',
    sampler=optuna.samplers.TPESampler(seed=42),
    pruner=optuna.pruners.HyperbandPruner(),
)
study.optimize(objective, n_trials=500)

print('numbers of the finished trials:', len(study.trials))
print('the best params:', study.best_trial.params)
print('the best value:', study.best_value)

In [None]:
# Settings used for the final class-3 models.
# Best value noted for these settings: 0.5131335576493116
p_class3 = {
    'tol': 0.0005335002526146307,
    'C': 0.5980679171662785,
    "n_jobs" : -1,
}

# Fit on the full training data with two different random_state values and
# predict the probability of the second class for the test features.
lr_class31 = LogisticRegression(random_state=2020, **p_class3)
pred_class31 = lr_class31.fit(X, y[2]).predict_proba(to_test)[:, 1]

lr_class32 = LogisticRegression(random_state=2021, **p_class3)
pred_class32 = lr_class32.fit(X, y[2]).predict_proba(to_test)[:, 1]

# Final class-3 prediction: mean of the two models.
pred_class3 = (pred_class31 + pred_class32) / 2

In [None]:
# class 4
def objective(trial , X = X , y = y[3]):
    """Optuna objective for the class-4 indicator (column 3 of ``y``).

    Makes a stratified 80/20 hold-out split (fixed ``random_state=42``), fits
    two ``LogisticRegression`` models that differ only in ``random_state``
    (2020 and 2021) using the sampled ``tol`` and ``C``, averages their
    predicted probabilities for the second class and scores the average with
    log loss on the hold-out part.

    Args:
        trial: Optuna trial; samples ``tol`` uniformly from [1e-6, 1e-3] and
            ``C`` log-uniformly from [1e-2, 1].
        X: feature frame; defaults to the notebook-level ``X`` at definition time.
        y: target column; defaults to ``y[3]`` at definition time.

    Returns:
        Hold-out log loss of the averaged predictions (lower is better).
    """
    train_x, test_x, train_y, test_y = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y)

    # suggest_float replaces the deprecated suggest_uniform / suggest_loguniform.
    params = {
        'tol': trial.suggest_float('tol', 1e-6, 1e-3),
        'C': trial.suggest_float('C', 1e-2, 1, log=True),
        'n_jobs': -1,
    }

    # NOTE(review): scikit-learn documents random_state as only used by the
    # 'sag', 'saga' and 'liblinear' solvers; with the default solver the two
    # fits are presumably identical -- confirm whether the second is needed.
    seed_preds = []
    for seed in (2020, 2021):
        model = LogisticRegression(**params, random_state=seed)
        model.fit(train_x, train_y)
        seed_preds.append(model.predict_proba(test_x)[:, 1])

    y_predlr = (seed_preds[0] + seed_preds[1]) / 2
    return log_loss(test_y, y_predlr)
# Only show warnings: hides the per-trial log lines.
optuna.logging.set_verbosity(optuna.logging.WARNING)

# The sampler is seeded so that re-running this cell repeats the same search.
# NOTE(review): a pruner only acts on values sent through trial.report(); the
# objective in this cell reports none, so the pruner looks inert -- confirm.
study = optuna.create_study(
    direction='minimize',
    study_name='lr',
    sampler=optuna.samplers.TPESampler(seed=42),
    pruner=optuna.pruners.HyperbandPruner(),
)
study.optimize(objective, n_trials=500)

print('numbers of the finished trials:', len(study.trials))
print('the best params:', study.best_trial.params)
print('the best value:', study.best_value)

In [None]:
# Settings used for the final class-4 models.
# Best value noted for these settings: 0.374101486607313
p_class4 = {
    'tol': 0.00012534268581491389,
    'C': 0.010057756278091795,
    "n_jobs" : -1,
}

# Fit on the full training data with two different random_state values and
# predict the probability of the second class for the test features.
lr_class41 = LogisticRegression(random_state=2020, **p_class4)
pred_class41 = lr_class41.fit(X, y[3]).predict_proba(to_test)[:, 1]

lr_class42 = LogisticRegression(random_state=2021, **p_class4)
pred_class42 = lr_class42.fit(X, y[3]).predict_proba(to_test)[:, 1]

# Final class-4 prediction: mean of the two models.
pred_class4 = (pred_class41 + pred_class42) / 2

In [None]:
# Put the four per-class prediction vectors side by side, one column each.
pred_all = pd.concat(
    [
        pd.DataFrame(class_pred)
        for class_pred in (pred_class1, pred_class2, pred_class3, pred_class4)
    ],
    axis=1,
)

# Overwrite the class columns of the submission template and write it out.
# NOTE(review): `sample` went through rm() when it was loaded, so its class
# columns may have been downcast -- check the dtype of the written values.
sample[['Class_1','Class_2', 'Class_3', 'Class_4']] = pred_all.to_numpy()
sample.to_csv('output.csv',index=False)
sample