# Non-NN models

In [12]:
# Directory that holds the competition CSV files read by the loaders.
input_path = "./input/"
# Directory that receives the prediction and submission CSV files.
output_path = "./output/pure_non_nn_results/"

In [5]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score

def load_raw_data(train_or_test='train'):
    """Read ``<input_path>/<train_or_test>.csv`` into a DataFrame.

    Args:
        train_or_test: Base name (without extension) of the CSV file to
            read; defaults to ``'train'``.

    Returns:
        The ``pandas.DataFrame`` parsed from that file.
    """
    return pd.read_csv(f'{input_path}/{train_or_test}.csv')

def load_label(train_or_test='train'):
    """Return the ``'state'`` column of the label file as an array.

    Args:
        train_or_test: ``'train'`` selects ``train_labels.csv``; any other
            value selects ``sample_submission.csv``.

    Returns:
        The values of the ``'state'`` column of the selected file.
    """
    if train_or_test == 'train':
        csv_name = 'train_labels.csv'
    else:
        csv_name = 'sample_submission.csv'
    labels = pd.read_csv(input_path + csv_name)
    return labels['state'].values

def competition_metric(y_true, y_score):
    """Score predictions with ROC AUC.

    Args:
        y_true: Ground-truth labels.
        y_score: Predicted scores for the same samples.

    Returns:
        The value returned by ``roc_auc_score(y_true, y_score)``.
    """
    auc = roc_auc_score(y_true, y_score)
    return auc

def evaluate(model, X, y):
    """Compute the competition metric of a fitted classifier on ``(X, y)``.

    Args:
        model: Object exposing ``predict_proba``.
        X: Feature matrix passed to ``model.predict_proba``.
        y: Ground-truth labels for the rows of ``X``.

    Returns:
        ``competition_metric`` of ``y`` against column 1 of the predicted
        probabilities.
    """
    positive_scores = model.predict_proba(X)[:, 1]
    return competition_metric(y, positive_scores)

def to_csv(arr, train_or_test='train', name=None):
    """Write an array of predictions to a ``_v4`` CSV under ``output_path``.

    The file is named ``<name>_<train_or_test>_v4.csv`` when ``name`` is a
    string, and ``<train_or_test>_v4.csv`` otherwise.

    Args:
        arr: Data wrapped in a ``pandas.DataFrame`` before writing.
        train_or_test: Tag placed in the file name; defaults to ``'train'``.
        name: Optional string prefix for the file name.
    """
    # isinstance, unlike an exact ``type(...) == str`` comparison, also
    # accepts str subclasses (e.g. numpy.str_) as a valid prefix.
    prefix = f'{name}_' if isinstance(name, str) else ''
    pd.DataFrame(arr).to_csv(
        f'{output_path}/{prefix}{train_or_test}_v4.csv', index=False
    )

def ans_to_csv(arr, train_or_test='train', name=None):
    """Write predictions into a copy of the label/submission template.

    Reads ``train_labels.csv`` (when ``train_or_test == 'train'``) or
    ``sample_submission.csv`` (otherwise) from ``input_path``, sets its
    ``'state'`` column to ``arr`` and writes the result under ``output_path``
    as ``<name>_<train_or_test>.csv`` when ``name`` is a string, or
    ``<train_or_test>.csv`` otherwise.

    Args:
        arr: Values assigned to the ``'state'`` column.
        train_or_test: Selects the template file and tags the output name.
        name: Optional string prefix for the output file name.
    """
    template = 'train_labels.csv' if train_or_test == 'train' else 'sample_submission.csv'
    df = pd.read_csv(input_path + template)
    df['state'] = arr
    # isinstance, unlike an exact ``type(...) == str`` comparison, also
    # accepts str subclasses (e.g. numpy.str_) as a valid prefix.
    prefix = f'{name}_' if isinstance(name, str) else ''
    df.to_csv(f'{output_path}/{prefix}{train_or_test}.csv', index=False)

def submit(arr):
    """Write ``arr`` as the ``'state'`` column of a submission file.

    The sample submission under ``input_path`` is used as the template and
    the result is saved as ``submission.csv`` under ``output_path``.

    Args:
        arr: Values assigned to the ``'state'`` column.
    """
    template_file = f'{input_path}/sample_submission.csv'
    target_file = f'{output_path}/submission.csv'
    submission = pd.read_csv(template_file)
    submission['state'] = arr
    submission.to_csv(target_file, index=False)

In [None]:


def group_splitter(df, nfold=5, random_state=None, seq_len=60):
    """Yield validation masks for a subject-grouped split into ``nfold`` folds.

    Every unique value of ``df['subject']`` is assigned independently and
    uniformly at random to one fold, so all rows of a subject land in the
    same fold. Fold sizes are therefore not balanced, and a fold can be empty
    when there are few subjects.

    Args:
        df: DataFrame with a ``'subject'`` column. The label mask is built by
            taking every ``seq_len``-th row, so rows are assumed to be stored
            as consecutive runs of ``seq_len`` rows per labelled sequence.
        nfold: Number of folds to generate.
        random_state: Seed forwarded to ``np.random.default_rng``.
        seq_len: Number of rows that share one label. The default of 60 is
            the value that used to be hard-coded.

    Yields:
        ``(mask_df_val, mask_y_val)``: a boolean Series over the rows of
        ``df`` marking validation rows, and that Series subsampled to one
        entry per ``seq_len`` rows for indexing the label array.
    """
    subject_nums = df['subject'].unique()
    rng = np.random.default_rng(random_state)
    subject_to_setnum = rng.integers(0, nfold, subject_nums.shape[0])
    for fold in range(nfold):
        val_subjects = subject_nums[subject_to_setnum == fold]
        mask_df_val = df['subject'].isin(val_subjects)
        # One label per sequence: keep the first row of each seq_len-row run.
        mask_y_val = mask_df_val.iloc[::seq_len]
        yield mask_df_val, mask_y_val

In [None]:
from tbr_apr_mds import CorrExtractor ,ElementaryExtractor, TsfreshExtractor,MBOP

In [None]:
def short_test(x, y, n):
    """Return a small subset of the data for quick experiments.

    Args:
        x: DataFrame with a ``sequence`` column.
        y: Sliceable container of labels.
        n: Cut-off; rows with ``sequence < n`` and the first ``n`` labels
            are kept.

    Returns:
        ``(x_subset, y_subset)``.
    """
    # Filter the frame that was passed in, not the notebook-global ``df``.
    return x.loc[x.sequence < n], y[:n]

In [8]:
# Training split: raw rows plus the 'state' labels from train_labels.csv.
df, y = load_raw_data('train'), load_label('train')
# Test split: load_label('test') reads sample_submission.csv, so y_test holds
# whatever that file's 'state' column contains.
df_test, y_test = load_raw_data('test'), load_label('test')

In [None]:
from lightgbm import LGBMClassifier
from sklearn.pipeline import make_union
from sklearn.metrics import classification_report
# Ensemble members: LightGBM classifiers that differ in seed, depth limit and
# boosting type.
clfs = [
    LGBMClassifier(n_estimators=200, random_state=55, max_depth=10, boosting_type="dart", metric="auc"),
    LGBMClassifier(n_estimators=200, random_state=62, max_depth=10, boosting_type="dart", metric="auc"),
    LGBMClassifier(n_estimators=200, random_state=1, max_depth=-1, boosting_type="dart", metric="auc"),
    LGBMClassifier(n_estimators=200, random_state=42, max_depth=-1, boosting_type="dart", metric="auc"),
    LGBMClassifier(n_estimators=200, random_state=42, max_depth=-1, boosting_type="goss", metric="auc"),
]

# Feature pipeline: the outputs of all extractors are concatenated column-wise.
extractors = [
    ElementaryExtractor(),
    TsfreshExtractor(),
    MBOP(window_size=4, word_size=4, n_bins=10, m_occur=0.006),
]
extractor = make_union(*extractors)

# One row per label, one column per classifier.
train_preds_array = np.zeros((len(y), len(clfs)))
test_preds_array = np.zeros((len(y_test), len(clfs)))

# Only this fold of the 5-fold subject split is trained on; the other folds
# are generated and skipped.
TARGET_FOLD = 4
for fold, (mask_df_val, mask_y_val) in enumerate(group_splitter(df, nfold=5, random_state=21)):
    if fold != TARGET_FOLD:
        continue

    df_train, y_train = df[~mask_df_val], y[~mask_y_val]
    df_val, y_val = df[mask_df_val], y[mask_y_val]
    X_train = extractor.fit_transform(df_train)
    X_val = extractor.transform(df_val)
    print(X_train.shape, X_val.shape)

    # Features of the full train and test frames depend only on the fitted
    # extractor, so compute them once instead of once per classifier.
    X_train_F = extractor.transform(df)
    X_test_F = extractor.transform(df_test)

    for i, clf in enumerate(clfs):
        # NOTE(review): the ``verbose`` keyword of ``fit`` is not accepted by
        # every LightGBM release - confirm against the installed version.
        clf.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=100)
        # The full-train predictions include rows this model was fitted on,
        # so they are not out-of-fold predictions.
        train_preds_array[:, i] += clf.predict_proba(X_train_F)[:, 1]
        test_preds_array[:, i] += clf.predict_proba(X_test_F)[:, 1]
        print(clf)
        print(evaluate(clf, X_train, y_train))
        print(evaluate(clf, X_val, y_val))
        # NOTE(review): ``predict`` presumably already returns class labels,
        # which would make the 0.5 threshold a no-op - confirm.
        print(classification_report(y_val, (clf.predict(X_val) >= 0.5).astype(int), digits=4))

to_csv(train_preds_array, "train")
to_csv(test_preds_array, "test")

In [None]:
# clf = LGBMClassifier(num_leaves=31, max_depth=4, n_estimators=100)

# df_train_final = df
# y_train_final = y
# X_train_final = extractor.fit_transform(df_train_final)
# clf.fit(X_train_final, y_train_final)

# df_test_final = load_raw_data('test')
# X_test_final = extractor.transform(df_test_final)
# y_pred = clf.predict_proba(X_test_final)[:, 1]
# submit(y_pred)