In [None]:
import os
import random
import numpy as np
import pandas as pd
from sklearn import datasets
from joblib import Parallel, delayed, dump, load
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression

In [None]:
# Directory holding the competition CSV files.
PATH = '../input/tabular-playground-series-nov-2021'

# Read the training set, the test set and the submission template in one pass.
train, test, sub = (
    pd.read_csv(os.path.join(PATH, csv_name))
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [None]:
# Display the (rows, columns) dimensions of the training frame.
train.shape

In [None]:
# Display the (rows, columns) dimensions of the test frame.
test.shape

In [None]:
# Display the (rows, columns) dimensions of the sample-submission frame.
sub.shape

In [None]:
# Preview the first few rows of the training frame.
train.head()

In [None]:
# Preview the first few rows of the test frame.
test.head()

In [None]:
# Feature column names: every column of `train` except the first and the last.
# NOTE(review): presumably the first column is a row id and the last is the
# `target` label used further down — confirm against the CSV header.
columns = train.columns[1:-1]

In [None]:
# Fit the scaler on the training features only, then reuse those fitted
# statistics for the test features. Re-fitting on the test set would put the
# two frames on different scales, so a model trained on `train` would see
# shifted inputs at inference time.
scaler = preprocessing.RobustScaler()
train[columns] = scaler.fit_transform(train[columns])
test[columns] = scaler.transform(test[columns])

In [None]:
N_FOLDS = 5   # number of cross-validation folds trained and averaged
SEED = 42     # default seed for every random number generator used here

def random_seed(seed=SEED):
    """Seed Python's and NumPy's global random number generators.

    Args:
        seed: Integer seed to apply. Defaults to the notebook-wide ``SEED``.

    Side effects:
        Re-seeds ``random`` and ``numpy.random`` and sets the
        ``PYTHONHASHSEED`` environment variable to ``str(seed)``.
    """
    # Use the argument, not the module constant, so callers can actually
    # choose a different seed.
    random.seed(seed)
    np.random.seed(seed)
    # NOTE(review): Python reads PYTHONHASHSEED at interpreter start-up, so
    # this presumably only influences processes launched afterwards — confirm.
    os.environ['PYTHONHASHSEED'] = str(seed)

def auc_score(y_true, y_pred):
    """Return the ROC AUC of the scores ``y_pred`` against the labels ``y_true``.

    Thin wrapper that forwards both arguments unchanged to ``roc_auc_score``.
    """
    return roc_auc_score(y_true, y_pred)


def run_fold(fold, df):
    """Train and validate the model for one cross-validation fold.

    Rows whose ``kfold`` value equals ``fold`` form the validation set; all
    remaining rows form the training set.

    Args:
        fold: Fold number to hold out; also used in the saved model's file name.
        df: Frame containing a ``target`` column and a ``kfold`` column.
            Features are selected positionally with ``iloc[:, 1:-2]`` — this
            assumes one leading non-feature column and ``target``/``kfold`` as
            the last two columns (TODO confirm against the caller).

    Side effects:
        Saves the fitted model to ``fold{fold}.sav`` and prints the
        validation AUC.
    """
    # Split on the requested fold rather than a constant, so that each call
    # holds out a different slice of the data.
    df_train = df.loc[df['kfold'] != fold].reset_index(drop=True)
    df_valid = df.loc[df['kfold'] == fold].reset_index(drop=True)

    model = LogisticRegression(n_iters=100000, lr=0.0001)
    model.fit(df_train.iloc[:, 1:-2].values, df_train['target'].values)

    # `predict` returns continuous scores, which is what the AUC metric needs.
    predictions = model.predict(df_valid.iloc[:, 1:-2].values)

    dump(model, f'fold{fold}.sav')
    print("LR classification auc: ", auc_score(df_valid['target'].values, predictions))
    

class LogisticRegression:
    """Binary logistic regression fitted by full-batch gradient descent.

    NOTE: this class has the same name as ``LogisticRegression`` imported from
    ``sklearn.linear_model`` at the top of the notebook; once this definition
    has run, the name refers to this class.

    Attributes set by ``fit``:
        weight: 1-D array holding one coefficient per feature.
        bias: Scalar intercept.
    """

    def __init__(self, n_iters = 1, lr = 0.0001):
        """Store the hyper-parameters.

        Args:
            n_iters: Number of gradient-descent steps performed by ``fit``.
            lr: Learning rate (step size) applied to each gradient.
        """
        self.n_iters = n_iters
        self.lr = lr

    def fit(self, X, y):
        """Fit the weights and bias on ``X`` / ``y``.

        Args:
            X: 2-D array-like of shape ``(n_samples, n_features)``.
            y: 1-D array-like of ``n_samples`` targets.

        Returns:
            The fitted estimator (``self``).
        """
        # Accept lists and other array-likes as well as NumPy arrays.
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float)

        n_samples, n_features = X.shape
        self.weight = np.zeros(n_features)
        self.bias = 0.0

        for _ in range(self.n_iters):
            predicted = self._sigmoid(np.dot(X, self.weight) + self.bias)
            error = predicted - y

            # Averaged gradient for the weights and for the bias.
            dw = 1/n_samples * np.dot(X.T, error)
            db = 1/n_samples * np.sum(error)

            self.weight -= self.lr * dw
            self.bias -= self.lr * db
        return self

    def predict(self, X):
        """Return the sigmoid of the linear score for each row of ``X``.

        The result is a continuous score in [0, 1], not a thresholded label.
        """
        linear_model = np.dot(X, self.weight) + self.bias
        return self._sigmoid(linear_model)

    def _sigmoid(self, X):
        """Element-wise logistic function ``1 / (1 + exp(-X))``.

        Evaluated as ``exp(-log(1 + exp(-X)))`` through ``np.logaddexp`` so
        that very negative inputs do not overflow ``exp``.
        """
        z = np.asarray(X, dtype=float)
        return np.exp(-np.logaddexp(0, -z))
    
    
if __name__ == "__main__":
    random_seed(seed = SEED)

    # Shuffle the training rows and renumber the index from zero.
    df = train.sample(frac=1).reset_index(drop=True)

    # Create exactly N_FOLDS splits so that the folds trained below
    # (range(N_FOLDS)) hold out every split once; a larger split count would
    # leave some splits never used for validation.
    kfold = StratifiedKFold(n_splits = N_FOLDS, shuffle=True, random_state=SEED)
    for f, (_, v_) in enumerate(kfold.split(df, df['target'])):
        # Tag each row with the number of the fold in which it is held out.
        df.loc[v_, 'kfold'] = f

    # Train one model per fold, each in its own thread.
    delayed_func = [delayed(run_fold)(fold, df) for fold in range(N_FOLDS)]
    Parallel(n_jobs = N_FOLDS, prefer = 'threads')(delayed_func)


In [None]:
def run_inference(test):
    """Average the predictions of every saved fold model on ``test``.

    Loads ``fold{f}.sav`` for each ``f`` in ``range(N_FOLDS)``, predicts on all
    columns of ``test`` after the first, and returns the sum of each model's
    predictions divided by ``N_FOLDS``.
    """
    features = test.iloc[:,1:].values
    # Models are loaded lazily, one at a time, in fold order.
    fold_models = (load(f'fold{f}.sav') for f in range(N_FOLDS))
    return sum(model.predict(features) / N_FOLDS for model in fold_models)
        

In [None]:
# Re-seed the random number generators before running inference.
random_seed(seed = SEED)
# Fold-averaged predictions for the test set.
test_preds = run_inference(test)
# Put the predictions into the submission template and write it to disk
# without the index column.
sub['target'] = test_preds
sub.to_csv('submission.csv', index=False)