<div>
    <h1 align="center">"Optimizing Results"</h1>
    <h1 align="center">Tabular Playground Series - Jan 2021</h1>
</div>

## Import & Data Set

In [None]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt

%matplotlib inline

# _______________________________________

# Kernels Data (Public Score & File Path)

# One row per public kernel: (kernel ID, public score, path to its submission
# file). Rows are listed in descending score order.
dfk = pd.DataFrame(
    [
        ('A', 0.69864, '../input/tps-jan-2021-gbdts-baseline/submission.csv'),
        ('B', 0.69846, '../input/pseudo-labelling/submission.csv'),
        ('C', 0.69836, '../input/v4-baseline-lgb-no-tune/sub_0.6971.csv'),
        ('D', 0.69824, '../input/tps21-optuna-lgb-fast-hyper-parameter-tunning/submission.csv'),
        ('E', 0.69813, '../input/gbdts-baseline-prevision-io-for-free/submission.csv'),
        ('F', 0.69795, '../input/v41-eda-gbdts/res41.csv'),
        ('G', 0.69782, '../input/v3-ensemble-lgb-xgb-cat/submission.csv'),
        ('H', 0.69756, '../input/tabular-playground/sub_gbm.csv'),
        ('I', 0.69747, '../input/v48tabular-playground-series-xgboost-lightgbm/V48-0.69747.csv'),
        ('J', 0.69735, '../input/xgboost-hyperparameter-tuning-using-optuna/submission.csv'),
        ('k', 0.69731, '../input/tabular-playground-some-slightly-useful-features/sub_gbm.csv'),
    ],
    columns=['Kernel ID', 'Score', 'File Path'],
)

# Display the table as the cell output.
dfk

## Functions:

In [None]:
def generate(main, support, coeff):
    """Blend two prediction frames column by column.

    Every column of ``main`` except the first is replaced by
    ``main * coeff + support * (1 - coeff)``. Rows are paired by position,
    not by index label and not by the values of the first column.

    Parameters
    ----------
    main : pandas.DataFrame
        Frame whose first column is carried over unchanged and whose
        remaining columns are weighted by ``coeff``.
    support : pandas.DataFrame
        Frame supplying the same-named columns, weighted by ``1 - coeff``.
        It needs at least ``len(main)`` rows; extra trailing rows are ignored.
    coeff : float
        Weight given to ``main``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``main`` holding the blended columns. Neither input frame
        is modified.

    Raises
    ------
    KeyError
        If ``support`` lacks one of the columns being blended.
    IndexError
        If there is a column to blend and ``support`` has fewer rows than
        ``main``.
    """
    g = main.copy()

    n_rows = len(main)
    blend_columns = main.columns[1:]

    if len(blend_columns) and len(support) < n_rows:
        raise IndexError(
            f'support has {len(support)} rows but main has {n_rows}'
        )

    for col in blend_columns:
        # Work in float64 so the result matches plain Python float arithmetic,
        # and pair rows strictly by position.
        main_values = main[col].to_numpy(dtype=float)
        support_values = support[col].to_numpy(dtype=float)[:n_rows]
        g[col] = (main_values * coeff) + (support_values * (1. - coeff))

    return g

In [None]:
def drawing(main, support, generated):
    """Show two separate scatter plots against the second column of ``main``.

    The first figure plots the second column of ``support`` against the second
    column of ``main``; the second figure does the same for ``generated``.
    Each figure is displayed immediately with ``plt.show()``.

    Parameters
    ----------
    main, support, generated : pandas.DataFrame
        Frames with at least two columns; only column position 1 is read.

    Returns
    -------
    None
    """
    X = main.iloc[:, 1]

    # matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*'
    # and deprecated the old names, which newer releases no longer accept.
    # Prefer the new name when this matplotlib provides it; otherwise keep
    # the legacy one so older installations behave as before.
    style = 'seaborn-whitegrid'
    if 'seaborn-v0_8-whitegrid' in plt.style.available:
        style = 'seaborn-v0_8-whitegrid'

    panels = (
        (support.iloc[:, 1], '\nOn the X axis >>> main\nOn the Y axis >>> support\n'),
        (generated.iloc[:, 1], '\nOn the X axis >>> main\nOn the Y axis >>> generated\n'),
    )

    # One figure per comparison, drawn and shown in order.
    for Y, title in panels:
        plt.style.use(style)
        plt.figure(figsize=(8, 8), facecolor='lightgray')
        plt.title(title)
        plt.scatter(X, Y, s=0.1)
        plt.show()
    

In [None]:
def drawing1(main, support, generated):
    """Overlay two scatter series in one figure, both against ``main``.

    The second column of ``support`` and the second column of ``generated``
    are each plotted against the second column of ``main`` on the same axes,
    in that order, and the figure is displayed with ``plt.show()``.

    Parameters
    ----------
    main, support, generated : pandas.DataFrame
        Frames with at least two columns; only column position 1 is read.

    Returns
    -------
    None
    """
    X = main.iloc[:, 1]
    Y1 = support.iloc[:, 1]
    Y2 = generated.iloc[:, 1]

    # matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*'
    # and deprecated the old names, which newer releases no longer accept.
    # Prefer the new name when this matplotlib provides it; otherwise keep
    # the legacy one so older installations behave as before.
    style = 'seaborn-whitegrid'
    if 'seaborn-v0_8-whitegrid' in plt.style.available:
        style = 'seaborn-v0_8-whitegrid'

    plt.style.use(style)
    plt.figure(figsize=(8, 8), facecolor='lightgray')
    plt.title('\nBlue | X axis >> main | Y axis >> support\n\nOrange | X axis >> main | Y axis >> generated\n')

    # Drawn in this order so the default colour cycle matches the title:
    # the first series is support, the second is generated.
    plt.scatter(X, Y1, s=0.1)
    plt.scatter(X, Y2, s=0.1)

    plt.show()
    

## Step #1
A -> G 까지 kernel 순차적으로 돌며 앙상블

In [None]:
# Start the running blend from kernel A (tps-jan-2021-gbdts-baseline).
support = pd.read_csv(dfk['File Path'].iloc[0])

# Fold kernels B..G (rows 1-6 of dfk) into the blend one at a time. At every
# step the newly loaded kernel is weighted 0.7 and the running blend 0.3.
for k in range(1, 7):
    main = pd.read_csv(dfk['File Path'].iloc[k])
    support = generate(main=main, support=support, coeff=0.7)

# The blend left after kernel G is the Step 1 result.
sub1 = support

## Result: 
[ A: (Score: 0.69864), B: (Score: 0.69846), ... , F: (Score: 0.69795), G: (Score: 0.69782) ] >>> sub1: (Score: **0.69781**)


In [None]:
# Print a hard-coded score label for sub1 (the value is typed in, not computed
# here), then display summary statistics of the Step 1 blend.
print('sub1(Last Support)    | Score: 0.69781')
sub1.describe()

## Step #2
H kernel 과 Step 1 결과 앙상블

In [None]:
# Kernel H (tabular-playground) is the main input for this step.
main = pd.read_csv(dfk['File Path'].iloc[7])

# Blend: kernel H weighted 0.8, Step 1 result (sub1) weighted 0.2.
sub2 = generate(main=main, support=sub1, coeff=0.8)

# Overlay support-vs-main and generated-vs-main in a single scatter figure.
drawing1(main=main, support=sub1, generated=sub2)

## Result:

[ H: (Score: 0.69756) , sub1: (Score: 0.69781) ] >>> sub2: (Score: **0.69755**)

In [None]:
# Print a hard-coded score label for sub2 (the value is typed in, not computed
# here), then display summary statistics of the Step 2 blend.
print('sub2    | Score: 0.69755')
sub2.describe()

## Step #3
I kernel 과 step 2 결과 앙상블

In [None]:
# Kernel I (v48tabular-playground-series-xgboost-lightgbm) is the main input.
main = pd.read_csv(dfk['File Path'].iloc[8])

# Blend: kernel I weighted 0.55, Step 2 result (sub2) weighted 0.45.
sub3 = generate(main=main, support=sub2, coeff=0.55)

# Overlay support-vs-main and generated-vs-main in a single scatter figure.
drawing1(main=main, support=sub2, generated=sub3)

## Result:

[ I: (Score: 0.69747) , sub2: (Score: 0.69755) ] >>> sub3: (Score: **0.69741**)

In [None]:
# Print a hard-coded score label for sub3 (the value is typed in, not computed
# here), then display summary statistics of the Step 3 blend.
print('sub3    | Score: 0.69741')
sub3.describe()

## Step #4
J kernel 과 Step 3 결과 앙상블

In [None]:
# Kernel J (xgboost-hyperparameter-tuning-using-optuna) is the main input.
main = pd.read_csv(dfk['File Path'].iloc[9])

# Blend: kernel J weighted 0.55, Step 3 result (sub3) weighted 0.45.
sub4 = generate(main=main, support=sub3, coeff=0.55)

# Overlay support-vs-main and generated-vs-main in a single scatter figure.
drawing1(main=main, support=sub3, generated=sub4)

## Result:

[ J: (Score: 0.69735) , sub3: (Score: 0.69741) ] >>> sub4: (Score: **0.69726**)


In [None]:
# Print a hard-coded score label for sub4 (the value is typed in, not computed
# here), then display summary statistics of the Step 4 blend.
print('sub4    | Score: 0.69726')
sub4.describe()

## Step #5
K kernel 과 Step 4 결과 앙상블

In [None]:
# tabular-playground-some-slightly-useful-features (K kernel)
main = pd.read_csv(dfk.iloc[10, 2])

# (target_main * 0.44) + (target_support * 0.56)
# NOTE(review): this comment previously read 0.55 / 0.45, but the call below
# passes 0.44. The comment now matches the code; confirm which weight is intended.
sub5 = generate(main, sub4, 0.44)

# Overlay support-vs-main and generated-vs-main in a single scatter figure.
drawing1(main, sub4, sub5)

## Result:

[ k: (Score: 0.69731) , sub4: (Score: 0.69726) ] >>> sub5: (Score: **0.69688**)

The difference in solutions between the two notebooks is the reason for the good progress at this stage. When solutions are different, they compensate for each other's weaknesses.

In [None]:
# Print a hard-coded score label for sub5 (the value is typed in, not computed
# here), then display summary statistics of the Step 5 blend.
print('sub5    | Score: 0.69688')
sub5.describe()

## Submission

In [None]:
# Final submission: the Step 5 blend.
sub = sub5
sub.to_csv("submission.csv", index=False)

# Keep every intermediate blend in its own file. The file number comes from
# the loop counter so each frame gets a distinct name; previously sub4 was
# written to "submission3.csv" and overwrote sub3's output.
for step, frame in enumerate((sub1, sub2, sub3, sub4), start=1):
    frame.to_csv(f"submission{step}.csv", index=False)

!ls