In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
pd.set_option('display.max_columns', None)

import warnings
warnings.filterwarnings('ignore')

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
import time
import pickle

# Set Matplotlib defaults
# Matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8-*" and
# newer releases no longer accept the old names, so use whichever one this
# installation actually provides.
_GRID_STYLE = (
    "seaborn-whitegrid"
    if "seaborn-whitegrid" in plt.style.available
    else "seaborn-v0_8-whitegrid"
)
plt.style.use(_GRID_STYLE)
plt.rc("figure", autolayout=True)
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=14,
    titlepad=10,
)
print('Done!')

[SGD-LINK](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html#sklearn.linear_model.SGDClassifier)

[optuna-searchcv](https://optuna.readthedocs.io/en/stable/reference/generated/optuna.integration.OptunaSearchCV.html#optuna.integration.OptunaSearchCV)

In [None]:
# set up dask
# Upgrades pip, then installs dask-ml; --quiet suppresses the installer output.
# NOTE(review): neither install is version-pinned, so a rerun may resolve to
# different releases than the ones this notebook was developed against.

!pip install --upgrade --quiet pip
!pip install --quiet dask-ml
print('Done!')

In [None]:
from dask.distributed import Client
import dask.dataframe as dd
import joblib

# Create a Dask distributed Client requesting 4 workers; the bare `client`
# expression on the last line renders it as the cell output.
# NOTE(review): the name `client` is rebound to a gswrap client in a later cell.
client = Client(n_workers=4)
client

In [None]:
def load_interim_df(folder):
    """Unpickle every entry of ``folder`` and return the objects as a list.

    Parameters
    ----------
    folder : str
        Directory containing only pickle files. A trailing slash is optional.

    Returns
    -------
    list
        One unpickled object per directory entry, in the order reported by
        ``os.listdir``.

    Raises
    ------
    FileNotFoundError
        If ``folder`` does not exist.
    """
    print('WARNING: Loading Datasets...')

    datas = []

    # NOTE: os.listdir gives no ordering guarantee. The order is deliberately
    # left untouched because callers unpack the result by position; any change
    # here (e.g. sorting) must be coordinated with those call sites.
    for name in os.listdir(folder):
        filename = os.path.join(folder, name)
        # SECURITY: pickle.load can execute arbitrary code - only point this
        # at files from a trusted source.
        # The context manager closes each handle instead of leaking it.
        with open(filename, 'rb') as fh:
            datas.append(pickle.load(fh))

    return datas

In [None]:
folder='../input/mar-tab-final/'

# load_interim_df returns one object per file in the folder, so this positional
# unpacking relies on the order in which the directory entries are listed.
# The first object is discarded.
# NOTE(review): confirm the listing order matches these names on the target machine.
_, X_val, X_train, y_val = load_interim_df(folder)

print(f'x_train shape is {X_train.shape}')
print(f'x_val shape is {X_val.shape}')
print(f'y_val shape is {y_val.shape}')

In [None]:
# Load the test set: the first (presumably only - TODO confirm) pickle in the folder.
folder='../input/test-pkl'
test = load_interim_df(folder)[0]
print(f'test shape is {test.shape}')

In [None]:
# Load the training target: the first (presumably only - TODO confirm) pickle in the folder.
folder='../input/y-train-mar'
y_train = load_interim_df(folder)[0]
print(f'y_train shape is {y_train.shape}')

In [None]:
# Pick a tf.distribute strategy: a TPUStrategy when a TPU can be initialised,
# otherwise TensorFlow's default strategy.
tpu = None
try: # detect TPUs
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver() # TPU detection
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
except Exception as e: # no usable TPU
    # Best-effort detection: report why the TPU path was skipped instead of
    # hiding the failure, then fall back.
    print(f"TPU not available ({type(e).__name__}); falling back to the default strategy.")
    #strategy = tf.distribute.MirroredStrategy() # for GPU or multi-GPU machines
    strategy = tf.distribute.get_strategy() # default strategy that works on CPU and single GPU
    #strategy = tf.distribute.experimental.MultiWorkerMirroredStrategy() # for clusters of multi-GPU machines

print("Number of accelerators: ", strategy.num_replicas_in_sync)

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.metrics import accuracy_score, f1_score, log_loss
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from tqdm import tqdm
import matplotlib.pyplot as plt
from copy import deepcopy

import kerastuner as kt
import optuna
from sklearn import linear_model
from sklearn import model_selection
print('Imported!')

### Optimizing SGDC with Optuna

### Define The objective function

In [None]:
def objective(trial, xt=X_train, yt=y_train, xv=X_val, yv=y_val):
    """Optuna objective: fit one SGDClassifier configuration and score it.

    Parameters
    ----------
    trial : optuna trial
        Used to sample ``loss``, ``alpha``, ``learning_rate`` and, depending on
        those choices, ``eta0``, ``power_t``, ``penalty`` and ``l1_ratio``.
    xt, yt
        Data the classifier is fitted on.
    xv, yv
        Data the fitted classifier is scored on.

    Returns
    -------
    float
        F1 score of the predictions for ``xv`` against ``yv``.
    """
    # Settings that are identical for every trial.
    params = dict(
        max_iter=2000,
        verbose=50,
        n_jobs=-1,
        random_state=123,
        early_stopping=True,
        class_weight='balanced',
        average=True,
    )

    # Core search space.
    loss = trial.suggest_categorical("loss", ["hinge", 'log', 'modified_huber'])
    alpha = trial.suggest_float("alpha", 0.0001, 0.01)
    schedule = trial.suggest_categorical("learning_rate", ["invscaling", "constant", "optimal", "adaptive"])
    params['loss'] = loss
    params['alpha'] = alpha
    params['learning_rate'] = schedule

    # Every schedule except 'optimal' gets a sampled initial rate;
    # 'invscaling' additionally gets a sampled exponent.
    if schedule != 'optimal':
        params['eta0'] = trial.suggest_float('eta0', 1e-4, 1e-2)
    if schedule == 'invscaling':
        params['power_t'] = trial.suggest_float('power_t', 0.2, 0.8)

    # Hinge loss is pinned to l2; the other losses sample the penalty.
    if loss == 'hinge':
        penalty = 'l2'
    else:
        penalty = trial.suggest_categorical("penalty", ["l1", "l2", "elasticnet"])
    params['penalty'] = penalty
    if penalty == 'elasticnet':
        params['l1_ratio'] = trial.suggest_float('l1_ratio', 0.1, 0.6)

    # Echo the configuration, framed by blank lines.
    print(f"\n{params}\n")

    with strategy.scope(), joblib.parallel_backend('dask'):
        clf = SGDClassifier(**params)
        clf.fit(xt, yt)

    # Round the predictions to int32 labels and score them.
    preds = np.rint(clf.predict(xv)).astype('int32')
    return f1_score(yv, preds)

In [None]:
# Run the hyper-parameter search and report the best trial.
with strategy.scope():
    with joblib.parallel_backend('dask'):
        st=time.time()
        # Seed the sampler so the same sequence of trials is proposed on every
        # Restart & Run All; TPESampler is the sampler create_study uses by
        # default for a single-objective study, here just made explicit.
        study = optuna.create_study(
            direction='maximize',
            sampler=optuna.samplers.TPESampler(seed=123),
        )
        study.optimize(objective, n_trials=1500, show_progress_bar=True)
        print("Number of finished trials: ", len(study.trials))
        print("Best trial:")
        # `trial` is read again by the next cell to extract the best params.
        trial = study.best_trial

        print("  Value: {}".format(trial.value))
        print("  Params: ")
        for key, value in trial.params.items():
            print("    {}: {}".format(key, value))
        print(f'Process ran for {time.time()-st} secs!')

### Extract The best Params

In [None]:
# Work on a copy: later cells edit best_params in place, and re-running this
# cell should always restore the parameters exactly as the study found them.
best_params = dict(trial.params)
best_params

### Let's see the top 10 runs

In [None]:
# Create a dataframe from the study.
df = study.trials_dataframe()
df.sort_values(by='value', ascending=False, inplace=True)
df.head(10)

### Deciding the ideal number of iterations to fit the classifier

In [None]:
# trial.params only contains the hyper-parameters Optuna sampled. Every trial
# was additionally scored with these two fixed settings from the objective, so
# they must be restored here or the final model differs from the tuned one.
best_params['class_weight'] = 'balanced'
best_params['average'] = True

# Generous iteration cap with early stopping enabled; the number of epochs
# actually run is read back from the fitted model in a later cell.
best_params['early_stopping'] = True
best_params['max_iter'] = 20000
best_params['verbose'] = int(best_params['max_iter']*0.01)
best_params['random_state'] = 123
best_params['n_jobs'] = -1

In [None]:
# Build a new model

# Instantiate the classifier from best_params and display its full parameter set.
sgd = SGDClassifier(**best_params)
sgd.get_params()

### train on the train set

In [None]:
# Fit on the full training set and report the wall-clock time.
with strategy.scope(), joblib.parallel_backend('dask'):
    st = time.time()
    sgd.fit(X_train, y_train)
    print(f'Took {time.time()-st} secs!')

### Extract the best iter

In [None]:
# Read the iteration count recorded on the fitted model (its n_iter_ attribute).
best_iter = sgd.n_iter_
best_iter

### Set the best iter and re-instantiate the model

In [None]:
# Fix the iteration budget to the count found above and switch early stopping off.
best_params.update(max_iter=best_iter, early_stopping=False, verbose=True)

# Reinstantiate the model
sgd = SGDClassifier(**best_params)
sgd.get_params()

### Use Kfold cross validation with best params on the data

In [None]:
# Define the Kfold strategy

# 10 shuffled folds; the fixed random_state keeps the split assignment repeatable.
folds = KFold(n_splits=10, shuffle=True, random_state=231)
print('Done!')

### Define a fold-averaging class that trains a model on each fold with the initial best params

In [None]:
class AverageFoldsSGDC(object):
    """Fit one copy of an estimator per cross-validation fold and average them.

    Attributes set by ``fit``:
        models     -- list with one fitted estimator per fold
        oof_preds  -- column vector of out-of-fold predictions
        X_train, y_train -- the data passed to ``fit``
    """

    def __init__(self, folds):
        """Store the splitter (an object exposing ``split(X)``) used by ``fit``."""
        self.folds = folds
        self.models = []

    def fit(self, X_train, y_train, model=sgd):
        """Fit an independent copy of ``model`` on every training fold.

        Parameters
        ----------
        X_train
            Features; indexed with ``.iloc``, so a pandas object is expected.
        y_train
            Targets; wrapped in a ``pd.Series`` when not already one.
        model
            Template estimator. It is deep-copied for each fold, so the object
            passed in is never fitted or modified.

        Returns
        -------
        self
        """
        if not isinstance(y_train, pd.Series):
            y_train = pd.Series(y_train)

        # create out-of-folds prediction template
        oof_preds = np.zeros_like(y_train).reshape(-1, 1)

        self.X_train = X_train
        self.y_train = y_train
        # Start from a clean slate so a second fit() does not stack new models
        # on top of the previous run's.
        self.models = []

        # Use the splitter given to the constructor, not a module-level global.
        for train_idx, val_idx in tqdm(self.folds.split(X_train)):
            train_x, val_x = X_train.iloc[train_idx], X_train.iloc[val_idx]
            train_y = y_train.iloc[train_idx]

            # Each fold needs its own estimator. Refitting and appending one
            # shared object would leave `models` holding N references to the
            # last fold's fit, making the "average" a single model.
            fold_model = deepcopy(model)
            fold_model.fit(train_x, train_y.values.ravel())
            self.models.append(fold_model)

            oof_pred = fold_model.predict(val_x).reshape(-1, 1)
            oof_pred = np.rint(oof_pred).astype('int32')
            print('unique is', np.unique(oof_pred))

            oof_preds[val_idx] = oof_pred

        self.oof_preds = oof_preds
        return self

    def predict(self, X_test):
        """Average the per-fold predictions for ``X_test`` and round to int64.

        Returns a 1-D array.

        Raises
        ------
        RuntimeError
            If called before ``fit`` (there are no fold models to average).
        """
        if not self.models:
            raise RuntimeError('AverageFoldsSGDC.predict called before fit.')

        preds = [model.predict(X_test) for model in tqdm(self.models)]
        preds = np.mean(preds, axis=0)
        preds = np.rint(preds).astype('int64')

        if preds.ndim >= 2:
            preds = preds.flatten()

        return preds

### Extend best params with base params


In [None]:
# from itertools import chain

# best_params = dict(chain.from_iterable(d.items() for d in (best_params, base_params)))
# best_params

In [None]:
# Train the fold-averaged model on the training data and time the run.
with strategy.scope(), joblib.parallel_backend('dask'):
    st = time.time()
    model = AverageFoldsSGDC(folds)
    model.fit(X_train, y_train)
    print(f'Took {time.time()-st} secs!')

### Let's see the count of zeros and ones

In [None]:
# Tabulate the out-of-fold predictions: one (value, count) row per distinct value.
unique, counts = np.unique(model.oof_preds, return_counts=True)
frequencies = np.column_stack((unique, counts))
frequencies

In [None]:
# Averaged, rounded predictions of the fold models for the validation set.
pred = model.predict(X_val)

### F1-score

In [None]:
# F1 of the validation predictions against the validation labels.
f1_score(y_val, pred)

### Accuracy

In [None]:
# Accuracy of the validation predictions against the validation labels.
accuracy_score(y_val, pred)

### Distribution Plot

In [None]:
def distribution_plot(true, pred, true_name, pred_name, Title):
    """Overlay kernel-density curves of the true and predicted values.

    Parameters
    ----------
    true, pred : array-like
        Values to compare; each is flattened to 1-D before plotting.
    true_name, pred_name : str
        Legend labels for the two curves.
    Title : str
        Figure title.
    """
    fig, ax = plt.subplots(figsize=(5, 4), dpi=100)

    # sns.distplot is deprecated and absent from recent seaborn releases;
    # kdeplot draws the same density-only curve that distplot(hist=False) did.
    sns.kdeplot(np.ravel(true), color='r', label=true_name, ax=ax)
    sns.kdeplot(np.ravel(pred), color='b', label=pred_name, ax=ax)

    ax.set_title(Title)
    ax.set_xlabel('Features')
    ax.set_ylabel('Target')
    # The curve labels are only visible once a legend is drawn.
    ax.legend()

    plt.show()
    plt.close(fig)

In [None]:
# Compare the validation labels with the validation predictions.
true, true_name = y_val, 'Target'
pred_name = 'y_hat'
Title = 'Target Vs Predictions Plot: SGDClassifier'

distribution_plot(
    true=true,
    pred=pred,
    true_name=true_name,
    pred_name=pred_name,
    Title=Title,
)

### Make a Prediction on Test set

In [None]:
# Averaged, rounded predictions of the fold models for the test set.
prediction = model.predict(test)
print('Done!')

### See spread of prediction

In [None]:
# Tabulate the test predictions: one (value, count) row per distinct value.
unique, counts = np.unique(prediction, return_counts=True)
frequencies = np.column_stack((unique, counts))
frequencies

### Define a Submissions method

In [None]:
def submissions(prediction=prediction):
    """Write ``prediction`` into the global ``sample`` frame and save it.

    Sets the ``target`` column of ``sample`` in place, then writes the frame
    to ``submission.csv`` without the index. ``prediction`` defaults to the
    value the global of that name had when this cell was run.
    """
    sample['target'] = prediction
    sample.to_csv('submission.csv', index=False)

In [None]:
# Load the sample submission file; `submissions()` fills its `target` column.
sample = pd.read_csv('../input/tabular-playground-series-mar-2021/sample_submission.csv')
sample.head()

### Call the submissions method and preview the saved submission file

In [None]:
# Write submission.csv, then read it back to preview the first rows.
submissions()
display(pd.read_csv('submission.csv').head())

### Save the Parameter-Search DataFrame for more analysis

In [None]:
# Install gs-wrap and create the client that save_to_gcp uses for the copy.
# NOTE(review): this rebinds `client`, which an earlier cell bound to the Dask
# distributed Client - a distinct name would avoid the shadowing.
# NOTE(review): the string passed to gswrap.Client is presumably a GCP project
# id hard-coded for one account; confirm before reuse.
!pip install --quiet gs-wrap
import gswrap
import datetime as dt
client = gswrap.Client('vibrant-reach-282320')
print('gswrap ready for use!')

In [None]:
def save_to_gcp(df, folder_name, file_name):
    """Write ``df`` to a local CSV and copy it to GCS under a timestamped name.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to export (written without the index).
    folder_name : str
        Sub-folder below ``gs://kaggle1980/Kaggle/GridSearch/``.
    file_name : str
        Local file name; ``.csv`` is appended when missing.

    The remote object is named ``<file_name without .csv>_<timestamp>.csv``.
    Uses the module-level ``client`` for the copy.
    """
    # Plain condition instead of assert/bare-except: asserts vanish under
    # `python -O` and a bare except would also swallow unrelated errors.
    if not file_name.endswith('.csv'):
        file_name += '.csv'

    # e.g. 2021-03-01_12:34:56 (sub-second part dropped)
    t = dt.datetime.now().strftime('%Y-%m-%d_%H:%M:%S')
    df.to_csv(file_name, index=False)

    # Strip only the final extension so names containing other dots survive
    # intact (split('.')[0] would cut 'run.v2.csv' down to 'run').
    stem = os.path.splitext(file_name)[0]

    with strategy.scope():
        st=time.time()
        print('Copying files...')
        client.cp(src=f"./{file_name}",
                  dst=f"gs://kaggle1980/Kaggle/GridSearch/{folder_name}/{stem}_{t}.csv",
                  multithreaded=True)
        ed=time.time()
        # Report the size of the file that was copied, not the DataFrame's
        # in-memory footprint.
        memory = os.path.getsize(file_name)
        print(f'1 file of size {memory} bytes copied in {ed-st} seconds!')

In [None]:
# Best-effort upload of the trials dataframe: a failure is printed, not raised,
# so the rest of the notebook can still run.
try:
    save_to_gcp(df, 'sgdc', 'sgd_grid')
except Exception as e:
    print(e)

### Save The model

In [None]:
import pickle

# save the model to disk
# NOTE: only the first entry of model.models is persisted, not the whole list.
try:
    filename = 'sgd_model.sav'
    # The context manager flushes and closes the file even if dump() raises;
    # a bare open() inside the call would leak the handle.
    with open(filename, 'wb') as model_file:
        pickle.dump(model.models[0], model_file)
except Exception as e:
    print(e)

### Save the Best Params

In [None]:
# Persist the tuned SGD hyper-parameters to 'sgd_params.pickle'.
# The context manager closes the file even if pickling fails.
with open('sgd_params.pickle', 'wb') as pickle_holder:
    pickle.dump(best_params, pickle_holder)
print('Done!')