# Description
This notebook works with two simple ML models: KNN and LogisticRegression. The modeling sections below use LightGBM.

Notebook plan:

1. Modules import.
2. Utils.
3. LGBM parameters tuning and modeling.
4. Full model training.

## Modules

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt # plotting
from sklearn.model_selection import train_test_split, cross_validate # creat train and test datasets to modeling
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score # report and metrics modules 


# ML models upload
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

# Additional models
import xgboost as xgb, lightgbm as lgbm


In [None]:
# Silence every Python warning for the rest of the session (the filter is global).
import warnings 
warnings.filterwarnings('ignore')

Do not forget to update the sklearn version.

In [None]:
# Use the %pip magic so the upgrade is installed into the running kernel's environment.
# NOTE: scikit-learn is already imported in an earlier cell, so restart the kernel
# after upgrading for the new version to be picked up.
%pip install -U scikit-learn

## 2. Utils

We use three different functions:

1. reduce_mem_usage - to reduce the dataset's memory size
2. show_proba_calibration_plots - to visualize the key characteristics of learning outcomes
3. get_classification_report - to see simple classification report

In [None]:
def reduce_mem_usage(df):
    """Downcast the columns of ``df`` in place to smaller dtypes and report the saving.

    Per-column rules:

    * ``object`` columns are converted to ``category``.
    * Signed integer columns are cast to the narrowest of int8/int16/int32/int64
      that holds the column's min and max (bounds inclusive).
    * Unsigned integer columns are cast to the narrowest of
      uint8/uint16/uint32/uint64 in the same way.
    * Float columns are cast to float32 when min and max lie strictly inside the
      float32 range (this reduces precision), otherwise to float64.
    * Every other dtype (bool, datetime, category, pandas extension dtypes, ...)
      is left untouched, so calling the function again on its own output is safe.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimise. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # Candidate widths, narrowest first, so the first fit is the smallest one.
    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy integer/float columns are downcast. Anything else
        # (bool, datetime, category, extension dtypes) has no meaningful
        # numeric range check and is kept as it is.
        if not isinstance(col_type, np.dtype) or col_type.kind not in 'iuf':
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == 'f':
            if c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                df[col] = df[col].astype(np.float32)
            else:
                # Also reached for all-NaN / empty columns, where the comparisons are False.
                df[col] = df[col].astype(np.float64)
            continue

        candidates = signed_types if col_type.kind == 'i' else unsigned_types
        for candidate in candidates:
            limits = np.iinfo(candidate)
            # Inclusive bounds: a column whose extreme equals the type limit still fits.
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [None]:
def show_proba_calibration_plots(y_predicted_probs, y_true_labels):
    """Plot threshold-dependent metrics and the score distribution per true class.

    The left panel shows precision, recall and F1 for 50 thresholds evenly
    spaced between 0.1 and 0.9, with a table of the same numbers (sorted by F1,
    best first, rounded to 3 decimals) attached below the axes. The right panel
    shows histograms of the predicted probabilities, split by true label
    (0 vs 1).

    Parameters
    ----------
    y_predicted_probs : array-like supporting ``> scalar`` comparison
        Predicted probabilities of the positive class.
    y_true_labels : array-like
        True labels; the histograms select rows where the label equals 0 or 1.
    """
    # Two columns: predicted probability, true label.
    paired = np.array(list(zip(y_predicted_probs, y_true_labels)))

    thresholds, precisions, recalls, f1_scores = [], [], [], []
    for cutoff in np.linspace(0.1, 0.9, 50):
        # Binarise once per cutoff and reuse for all three metrics.
        hard_labels = list(map(int, y_predicted_probs > cutoff))
        thresholds.append(cutoff)
        precisions.append(precision_score(y_true_labels, hard_labels))
        recalls.append(recall_score(y_true_labels, hard_labels))
        f1_scores.append(f1_score(y_true_labels, hard_labels))

    scores_table = pd.DataFrame({
        'f1': f1_scores,
        'precision': precisions,
        'recall': recalls,
        'probability': thresholds,
    })
    scores_table = scores_table.sort_values('f1', ascending=False).round(3)

    figure = plt.figure(figsize=(25, 12))

    # Left panel: metric curves plus the table of values.
    metrics_ax = figure.add_subplot(1, 2, 1)
    curves = ((precisions, 'Precision'), (recalls, 'Recall'), (f1_scores, 'F1'))
    for values, caption in curves:
        metrics_ax.plot(thresholds, values, label=caption, linewidth=4)
    metrics_ax.set_ylabel('Scores')
    metrics_ax.set_xlabel('Probability threshold')
    metrics_ax.set_title('Probabilities threshold calibration')
    metrics_ax.legend(bbox_to_anchor=(0.25, 0.25))
    metrics_ax.table(
        cellText=scores_table.values,
        colLabels=scores_table.columns,
        colLoc='center',
        cellLoc='center',
        loc='bottom',
        bbox=[0, -1.1, 1, 1],
    )

    # Right panel: probability histograms for each true class.
    hist_ax = figure.add_subplot(1, 2, 2)
    true_column = paired[:, 1]
    hist_ax.hist(paired[true_column == 0, 0],
                 label='Another class', color='royalblue', alpha=1)
    hist_ax.hist(paired[true_column == 1, 0],
                 label='Main class', color='darkcyan', alpha=0.8)
    hist_ax.set_ylabel('Number of examples')
    hist_ax.set_xlabel('Probabilities')
    hist_ax.set_title('Probability histogram')
    hist_ax.legend(bbox_to_anchor=(1, 1))

    plt.show()

In [None]:
def get_classification_report(y_train_true, y_train_pred, y_test_true, y_test_pred):
    """Print classification reports for the train and test splits and a test confusion table.

    Parameters
    ----------
    y_train_true, y_train_pred : array-like
        True and predicted labels for the training split.
    y_test_true, y_test_pred : array-like
        True and predicted labels for the test split; also cross-tabulated
        (true labels as rows, predictions as columns).
    """
    sections = (
        ('TRAIN\n\n', y_train_true, y_train_pred),
        ('TEST\n\n', y_test_true, y_test_pred),
    )
    for heading, truth, predicted in sections:
        print(heading + classification_report(truth, predicted))

    print('CONFUSION MATRIX\n')
    confusion = pd.crosstab(y_test_true, y_test_pred)
    print(confusion)

## Data load and work preparation

Load the data and prepare it for modeling.

In [None]:
# Read the competition files (train set, test set, sample submission) from /kaggle/input.
train, test, sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/tabular-playground-series-nov-2021/train.csv',
        '/kaggle/input/tabular-playground-series-nov-2021/test.csv',
        '/kaggle/input/tabular-playground-series-nov-2021/sample_submission.csv',
    )
)

In [None]:
# Downcast all three frames; reduce_mem_usage prints a before/after report for each.
train, test, sub = (reduce_mem_usage(frame) for frame in (train, test, sub))

In [None]:
# Print pandas' summary of the training frame after downcasting.
train.info()

In [None]:
# Feature names: every column except the first and the last one.
# NOTE(review): presumably these are the id and target columns — confirm against train.csv.
base_features = train.columns[1:-1].tolist()

In [None]:
# Feature matrix and labels from the training frame.
X = train[base_features]
y = train['target']
# Same feature columns taken from the competition test frame (used for the final predictions).
X_test_fin = test[base_features]

In [None]:
# Hold out 30% of the rows. random_state pins the shuffle so the split is the
# same on every Restart & Run All (the value matches `seed`, which is defined
# in the next cell).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, shuffle=True, test_size=0.30, random_state=0
)

In [None]:
# Random seed for the experiments — TODO confirm where it should be applied.
seed = 0
# Number of cross-validation folds; passed as `cv=fold` in the disabled tuning cells below.
fold = 2

## Modeling


In [None]:
# Disabled experiment (kept inside a string literal, so it is not executed):
# baseline LightGBM classifier scored with cross-validated ROC AUC.
'''
model_lgbm = lgbm.LGBMClassifier(
    num_iterations=100,
    objective = "binary",
    num_leaves= 31,
    feature_pre_filter = False
    )
def score(X, y, model_lgbm, cv):
    scoring = ["roc_auc"]
    scores = cross_validate(
        model_lgbm, X, y, scoring=scoring, cv=cv, return_train_score=True
    )
    scores = pd.DataFrame(scores).T
    return scores.assign(
        mean = lambda x: x.mean(axis=1),
        std = lambda x: x.std(axis=1),
    )

scores = score(X, y, model_lgbm, cv=fold)
display(scores)
'''

In [None]:
# Disabled experiment (kept inside a string literal, so it is not executed):
# grid over num_iterations (200..680, step 40) and max_depth (8..14), recording
# the mean cross-validated test ROC AUC for each combination.
'''
def score(X, y, model_lgbm, cv):
    scoring = ["roc_auc"]
    scores = cross_validate(
        model_lgbm, X, y, scoring=scoring, cv=cv, return_train_score=True
    )
    scores = pd.DataFrame(scores).T
    return scores.assign(
        mean = lambda x: x.mean(axis=1),
        std = lambda x: x.std(axis=1),
    )

test_roc_auc_row = []

for num_iter in range(200, 700, 40):
    for max_d in range (8, 15, 1):
        model_lgbm = lgbm.LGBMClassifier(
        num_iterations=num_iter,
        objective = "binary",
        feature_pre_filter = False,
        max_depth = max_d
        )

        res = {}
        res['num_iter'] = num_iter
        res['max_depth'] = max_d
        scores = score(X, y, model_lgbm, cv=fold)
        res['test_roc_auc'] = scores.loc['test_roc_auc','mean']
        print(num_iter, max_d, res['test_roc_auc'])

        test_roc_auc_row.append(res)'''

In [None]:
# Disabled (kept inside a string literal, so it is not executed):
# tabulates the grid-search records and shows the 10 best by test ROC AUC.
'''df = pd.DataFrame(test_roc_auc_row)
df.sort_values(by='test_roc_auc', ascending=False).head(10)'''

## 4. Final model training

In [None]:
# Final LightGBM classifier, trained on the full training set (X, y).
# - `n_estimators` is the scikit-learn-API name for the number of boosting
#   rounds (previously passed through the `num_iterations` alias).
# - `eval_metric` is an argument of fit(), not of the constructor, and only
#   matters when an eval_set is supplied; it is therefore not passed here.
# - `random_state=seed` makes the fit reproducible across runs.
# NOTE(review): 240 / 13 presumably come from the disabled grid search above — confirm.
model_fin = lgbm.LGBMClassifier(n_estimators=240, max_depth=13, random_state=seed)
model_fin.fit(X, y)

In [None]:
# Class probabilities for the competition test set; column 1 is used as the score.
predictions = model_fin.predict_proba(X_test_fin)
# Hard 0/1 labels at a 0.5 cut-off.
# NOTE(review): the tuning cells score ROC AUC; if the submission is evaluated on
# AUC as well, confirm whether the raw probabilities should be submitted instead.
y_pred_f = (predictions[:, 1] >= 0.5).astype(int)

In [None]:
# Put the predicted labels into the sample-submission frame and save it
# without the index column; the frame is displayed as the cell output.
sub['target'] = y_pred_f
sub.to_csv('submission.csv', index=False)
sub