In [None]:
print(open("run_mice.R").read())

### Note: use read.table and write.table instead of read.csv and write.csv.

In [2]:
import os
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from imblearn.pipeline import make_pipeline
from lightgbm import LGBMClassifier
from sklearn.compose import make_column_transformer, ColumnTransformer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import BayesianRidge
from sklearn.naive_bayes import GaussianNB


In [None]:
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

X = df_train.drop(['target', 'id'], axis=1)
y = df_train['target']

X_test = df_test.drop('id', axis=1)

In [3]:
hex_columns = ['f2', 'f3', 'f13', 'f18', 'f20', 'f26']
ordinal_columns = ['f6', 'f4', 'f8', 'f16', 'f17', 'f19', 'f21', 'f25']
categorical_columns = ['f1', 'f5', 'f7', 'f9', 'f11', 'f24']
ordinal_cat_columns = ['f0', 'f12', 'f23', 'f27']
binary_columns = ['f10', 'f28', 'f22', 'f14']
all_columns = ['f' + str(i) for i in range(0, 29)]
removed_cols = []

In [4]:
def conv_hex(df: pd.DataFrame) -> None:
    def conv_hex_map(x):
        try:
            return int(x, 16)
        except ValueError as e:
            return np.nan
        except TypeError as e:
            return np.nan

    for col in hex_columns:
        if col not in list(df.columns):
            continue
        col_loc = list(df.columns).index(col)
        df[col] = df[col].apply(lambda x: conv_hex_map(x))


def conv_bool(df: pd.DataFrame):
    def conv_bool_map(x):
        try:
            if not type(x) == str:
                return x
            if x.lower() == 'f':
                return 0
            elif x.lower() == 't':
                return 1
            return x
        except Exception as e:
            return np.nan
    df['f14'] = df['f14'].apply(lambda x: conv_bool_map(x))


def conv_binary(df: pd.DataFrame):
    def binaryToDecimal(binary):
        try:
            binary = int(binary)
        except:
            return np.nan

        binary1 = binary
        decimal, i, n = 0, 0, 0
        while(binary != 0):
            dec = binary % 10
            decimal = decimal + dec * pow(2, i)
            binary = binary//10
            i += 1
        return decimal
    for col in binary_columns:
        if col not in list(df.columns):
            continue
    df[col] = df[col].apply(lambda x: binaryToDecimal(x))
    
def remove_duplicate_columns(df: pd.DataFrame) -> None:
    cols_to_drop = []
    cols = list(df.columns)
    for col in df.columns:
        if col in cols:
            cols.remove(col)
        for col2 in cols:
            if df[col].equals(df[col2]):
                cols_to_drop.append(col2)

    df.drop(columns=cols_to_drop, inplace=True)

def transform_categorical(df: pd.DataFrame, test=False) -> None:
    cols = list(set(df.columns).intersection(set(categorical_columns).union(set(ordinal_cat_columns))))
    for col in cols:
        df[col] = df[col].astype('category')
        df[col] = df[col].cat.codes

def remove_duplicates(df: pd.DataFrame) -> None:
    df.drop_duplicates(inplace=True)
    
def conv_columns(df: pd.DataFrame, test=False) -> None:
    conv_hex(df)
    conv_bool(df)
    conv_binary(df)
    return transform_categorical(df, test)

In [None]:
def run_mice():
    infile = 'train_processed.csv'
    outfile = 'train_imputed.csv'
    # np.savetxt(infile, data.to_numpy().tolist(), delimiter=",")
    os.system(f'type nul > {outfile}')
    os.system('"C:\Program Files\R\R-4.1.1\bin\Rscript.exe" --vanilla run_mice.R %s %s' % (infile, outfile))
    data_imputed = pd.read_csv(outfile)

In [None]:
def preprocess(df, test=False):
    if test:
        df.loc[26648, 'f9'] = np.nan
        df.loc[20956, 'f15'] = np.nan
        df.loc[21034, 'f15'] = np.nan
        
    remove_duplicate_columns(df)
    remove_duplicates(df)

    conv_columns(df, test)
    
    if test:
        df.to_csv('./test_processed.csv', index=False)
    else:
        df.to_csv('./train_processed.csv', index=False)

In [None]:
# preprocess(df_train)
preprocess(df_test, True)

In [None]:
run_mice()

In [None]:
round(100*(X_test_pre.isnull().sum()/len(X_test_pre.index)),2)

In [None]:
from imblearn.over_sampling import SMOTE

for col in X_train_pre:
    if X_train_pre[col].isnull().sum() > 0:
        X_train_pre[col].fillna(X_train_pre[col].mean(), inplace=True)

sm = SMOTE(sampling_strategy='minority', random_state=7)
X_cols = X_train_pre.columns

X_train_pre, y_train_pre = sm.fit_resample(X_train_pre, y_train)

X_train = pd.DataFrame(X_train, columns=X_cols)

y_train.value_counts()

In [None]:
X_test.head()

In [5]:
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

def split(X : pd.DataFrame, y : pd.DataFrame, size=.2):
    return train_test_split(X, y, test_size=size, stratify=y)

def fit(data : pd.DataFrame, labels : pd.DataFrame, classifier : any, eval_pool = None) -> None:
    if eval_pool is not None:
        classifier.fit(data, labels, eval_set=eval_pool)
    else:
        classifier.fit(data, labels)

def predict_proba(classifier : any, test : pd.DataFrame) -> np.array:
    return classifier.predict_proba(test)

def predict(classifier : any, test : pd.DataFrame) -> np.array:
    return classifier.predict(test)

def print_accuracy(pred : np.array, test : pd.DataFrame, name : str):
    print(name + ' Model accuracy score: {0:0.4f}'.format(roc_auc_score(test, pred[:, 1])))

In [37]:
imp_data = pd.read_csv('train_imputed.csv')

X_train_pre = imp_data.drop(['target', 'id'], axis=1)
y_train = imp_data['target']

df_test = pd.read_csv('test_imputed.csv')
X_test_pre = df_test.drop('id', axis=1)

train_cat = list(set(X_train_split.columns).intersection(set(categorical_columns)))

for col in X_train_pre:
    if X_train_pre[col].isnull().sum() > 0:
        X_train_pre[col].fillna(method='bfill', inplace=True)
    
    # if col in train_cat:
        # X_train_pre[col] = X_train_pre[col].astype('category')
        
for col in X_test_pre:
    if X_test_pre[col].isnull().sum() > 0:
        X_test_pre[col].fillna(method='bfill', inplace=True)
    
    # if col in train_cat:
        # X_train_pre[col] = X_train_pre[col].astype('category')
        
remove_duplicate_columns(X_train_pre)
remove_duplicate_columns(X_test_pre)

# scaler = StandardScaler()

# cols_to_transform = list(set(X_train_pre.columns) - set(train_cat))

# X_train_pre[cols_to_transform] = scaler.fit_transform(X_train_pre[cols_to_transform], y_train)
# X_test_pre[cols_to_transform] = scaler.transform(X_test_pre[cols_to_transform])

X_train_split, X_test_split, y_train_split, y_test_split = split(X_train_pre, y_train, size=.2)
        
# X_train_pre.head()

In [None]:
from catboost import CatBoostClassifier, Pool

train_cat = list(set(X_train_split.columns).intersection(set(categorical_columns)))
test_cat = list(set(X_test_split.columns).intersection(set(categorical_columns)))
cats = []
for col in train_cat:
    cats.append(X_train_pre.columns.get_loc(col))
    
print(cats)
print(train_cat)
print(test_cat)

train_dataset = Pool(X_train_split ,y_train_split, cat_features=train_cat)
test_dataset = Pool(X_test_split, y_test_split, cat_features=test_cat)

model = CatBoostClassifier(loss_function='Logloss', eval_metric='AUC')

eval_pool = Pool(X_test_split, y_test_split, cat_features=cats)

# clf = CatBoostClassifier(loss_function='Logloss',cat_features=cats,eval_metric= 'AUC',depth= 1,learning_rate= 1,l2_leaf_reg= 5,iterations= 2000)

clf = CatBoostClassifier(depth=1, learning_rate=1, iterations=2000, stratified=True)

fit(X_train_pre, y_train, clf, eval_pool)

In [None]:
grid = {'learning_rate': [0.03, 0.1, 1],
        'depth': [1, 2, 4, 6, 10],
        'l2_leaf_reg': [1, 3, 5],
        'iterations': [50, 100, 150, 500, 1000, 2000]}

model.grid_search(grid,train_dataset, verbose=False)

In [None]:
model.get_params()

In [None]:
from xgboost import XGBClassifier

clf = XGBClassifier(max_depth=6, eta=.125, objective='binary:logistic', use_label_encoder=False)

fit(X_train_split, y_train_split, clf)

In [None]:
from lightgbm import LGBMClassifier

clf = LGBMClassifier()
fit(X_train_split, y_train_split, clf)

In [None]:
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, StackingClassifier

clf = AdaBoostClassifier(n_estimators=1000)
fit(X_train_split, y_train_split, clf)

In [None]:
X_train_pre.loc[~(X_train_pre['f18'] == X_train_pre['f26'])]

In [None]:
models = {}

# Logistic Regression
from sklearn.linear_model import LogisticRegression
models['Logistic Regression'] = LogisticRegression()

# Decision Trees
from sklearn.tree import DecisionTreeClassifier
models['Decision Trees'] = DecisionTreeClassifier()

# Random Forest
from sklearn.ensemble import RandomForestClassifier
models['Random Forest'] = RandomForestClassifier()

# Naive Bayes
from sklearn.naive_bayes import GaussianNB
models['Naive Bayes'] = GaussianNB()

# K-Nearest Neighbors
from sklearn.neighbors import KNeighborsClassifier
models['K-Nearest Neighbor'] = KNeighborsClassifier()

from catboost import CatBoostClassifier
models['Catboost'] = CatBoostClassifier(iterations=2000, depth=1, learning_rate=1, verbose=False)

from sklearn.linear_model import LogisticRegression
models['Logistic Regression'] = LogisticRegression()

from lightgbm import LGBMClassifier
models['Light GBM'] = LGBMClassifier()

from sklearn.ensemble import GradientBoostingClassifier
models['GBM'] = GradientBoostingClassifier()

from sklearn.metrics import accuracy_score, precision_score, recall_score

accuracy, precision, recall, auc = {}, {}, {}, {}

for key in models.keys():
    
    # Fit the classifier model
    models[key].fit(X_train_split, y_train_split)
    
    # Prediction 
    predictions = models[key].predict_proba(X_test_split )
    
    # Calculate AUC
    auc[key] = roc_auc_score(y_test_split, predictions[:, 1])
    

df_model = pd.DataFrame(index=models.keys(), columns=['Auc'])
df_model['Auc'] = auc.values()

df_model

In [7]:
%%time

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, StackingClassifier
from catboost import CatBoostClassifier
from sklearn.neural_network import MLPClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

models = {}
# models['rf'] = RandomForestClassifier(random_state=42)
# models['Catboost'] = CatBoostClassifier(boosting_type='Plain', gpu_cat_features_storage = 'CpuPinnedMemory', max_ctr_complexity=1, iterations=1000, depth=1, learning_rate=1, verbose=False, random_state=42, task_type="GPU", devices='0:1')
models['lgbm'] = LGBMClassifier(random_state=42, device='gpu')
# models['gbm'] = GradientBoostingClassifier(min_samples_split=500,min_samples_leaf=50,max_depth=8, subsample=0.8, random_state=42)
# models['xgb'] = XGBClassifier(tree_method='gpu_hist', gpu_id=0, verbosity = 0)
clf = StackingClassifier(estimators = list(models.items()), final_estimator=LogisticRegression(), cv=10)

params = {# 'rf__n_estimators': [5, 10, 100], 
          'lgbm__max_depth': [6,7], 
          'lgbm__num_leaves': [70, 80], 
          # 'gbm__n_estimators': range(20,81,10), 
          # 'gbm__learning_rate': [1, .1, .01]
 #            'xgb__max_depth': [5, 10, 20],
 #            'xgb__n_estimators': [10, 100, 1000],
 #            'xgb__learning_rate': [1, .1, .01]
        }

# grid = GridSearchCV(estimator=clf, param_grid=params, cv=5, n_jobs=-1, scoring='roc_auc', refit=True, verbose=100)
grid = RandomizedSearchCV(estimator=clf, param_distributions=params, n_iter=10, cv=5, n_jobs=-1, scoring='roc_auc', refit=True, verbose=10)
grid.fit(X_train_pre, y_train)



Fitting 5 folds for each of 4 candidates, totalling 20 fits
Wall time: 3min 35s


RandomizedSearchCV(cv=5,
                   estimator=StackingClassifier(cv=10,
                                                estimators=[('lgbm',
                                                             LGBMClassifier(device='gpu',
                                                                            random_state=42))],
                                                final_estimator=LogisticRegression()),
                   n_jobs=-1,
                   param_distributions={'lgbm__max_depth': [6, 7],
                                        'lgbm__num_leaves': [70, 80]},
                   scoring='roc_auc', verbose=10)

In [7]:
%%time


from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, StackingClassifier
from catboost import CatBoostClassifier
from sklearn.neural_network import MLPClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier


models = {}
models['rf'] = RandomForestClassifier(random_state=42)
models['Catboost'] = CatBoostClassifier(boosting_type='Plain', gpu_cat_features_storage = 'CpuPinnedMemory', max_ctr_complexity=1, iterations=1000, depth=1, learning_rate=1, verbose=False, random_state=42, task_type="GPU", devices='0:1')
models['lgbm'] = LGBMClassifier(random_state=42, device='gpu')
models['gbm'] = GradientBoostingClassifier(min_samples_split=500,min_samples_leaf=50,max_depth=8, subsample=0.8, random_state=42)
models['xgb'] = XGBClassifier(tree_method='gpu_hist', gpu_id=0, verbosity = 0, objective='binary:logistic', silent=True)

# clf = StackingClassifier(estimators = list(models.items()), final_estimator=LogisticRegression(), cv=10)




Wall time: 161 ms


In [17]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

def grid(estimator, params, n_jobs=-1):
    grid_cv = GridSearchCV(estimator=estimator, param_grid=params, cv=5, n_jobs=n_jobs, scoring='roc_auc', refit=True, verbose=10)
    grid_cv.fit(X_train_pre, y_train)
    return grid_cv

In [11]:
%%time
lgbm_params = {
        'learning_rate': [1],
        'n_estimators': [24, 32, 52],
        'num_leaves': [16], # large num_leaves helps improve accuracy but might lead to over-fitting
        'boosting_type' : ['dart'], # for better accuracy -> try dart
        'objective' : ['binary'],
        'max_bin':[255], # large max_bin helps improve accuracy but might slow down training progress
        'colsample_bytree' : [0.64],
        'subsample' : [0.7],
}

lgbm_grid = grid(models['lgbm'], lgbm_params)
print(f'Best params lgbm: {lgbm_grid.best_params_}')
print(f'Best score lgbm: {lgbm_grid.best_score_}')

Fitting 5 folds for each of 3 candidates, totalling 15 fits
Best params lgbm: {'boosting_type': 'dart', 'colsample_bytree': 0.64, 'learning_rate': 1, 'max_bin': 255, 'n_estimators': 32, 'num_leaves': 16, 'objective': 'binary', 'subsample': 0.7}
Best score lgbm: 0.7322889897844772
Wall time: 3.15 s


In [19]:
%%time
cat_params = {'iterations': [500, 1000, 2000],
              'depth': [1, 4, 5, 6],
              'loss_function': ['Logloss', 'CrossEntropy'],
              'l2_leaf_reg': np.logspace(-20, -19, 3),
              'leaf_estimation_iterations': [10],
}

cat_grid = grid(models['Catboost'], cat_params, 2)
print(f'Best params lgbm: {cat_grid.best_params_}')
print(f'Best score lgbm: {cat_grid.best_score_}')

Fitting 5 folds for each of 72 candidates, totalling 360 fits
Best params lgbm: {'depth': 1, 'iterations': 2000, 'l2_leaf_reg': 1e-20, 'leaf_estimation_iterations': 10, 'loss_function': 'CrossEntropy'}
Best score lgbm: 0.7598584828689188
Wall time: 1h 19min 39s


In [None]:
%%time
xgb_params = {'min_child_weight': [1, 5, 10],
                'gamma': [0.5, 1, 1.5, 2, 5],
                'subsample': [0.6, 0.8, 1.0],
                'colsample_bytree': [0.6, 0.8, 1.0],
                'max_depth': [3, 4, 5],
              'n_estimators': [10, 100, 500, 1000],
              'learning_rate': [1, .1, .01],
}

xgb_grid = grid(models['xgb'], xgb_params)
print(f'Best params xgb: {xgb_grid.best_params_}')
print(f'Best score xgb: {xgb_grid.best_score_}')

Fitting 5 folds for each of 4860 candidates, totalling 24300 fits


In [33]:
cat_clf_params = {
    'depth': 1, 
    'iterations': 2000, 
    'l2_leaf_reg': 1e-20, 
    'leaf_estimation_iterations': 10, 
    'loss_function': 'CrossEntropy',
    'boosting_type': 'Plain', 
    'gpu_cat_features_storage': 'CpuPinnedMemory',
    'max_ctr_complexity':1,
    'iterations':1000,
    'depth':1, 
    'learning_rate':1, 
    'verbose':False, 
    'random_state':42, 
    'task_type':"GPU", 
    'devices':'0:1'
}

lgbm_clf_params = {
    'boosting_type': 'dart', 
    'colsample_bytree': 0.64, 
    'learning_rate': 1, 
    'max_bin': 255, 
    'n_estimators': 32, 
    'num_leaves': 16, 
    'objective': 'binary', 
    'subsample': 0.7,
    'random_state':42, 
    'device':'gpu'
}

stacking_models = {}

stacking_models['cat'] = CatBoostClassifier(**cat_clf_params)
stacking_models['lgbm'] = LGBMClassifier(**lgbm_clf_params)

clf = StackingClassifier(estimators = list(stacking_models.items()), final_estimator=LogisticRegression(), cv=10)
fit(X_train_pre, y_train, clf)

In [8]:
grid.get_params()

{'cv': 5,
 'error_score': nan,
 'estimator__cv': 10,
 'estimator__estimators': [('lgbm',
   LGBMClassifier(device='gpu', random_state=42))],
 'estimator__final_estimator__C': 1.0,
 'estimator__final_estimator__class_weight': None,
 'estimator__final_estimator__dual': False,
 'estimator__final_estimator__fit_intercept': True,
 'estimator__final_estimator__intercept_scaling': 1,
 'estimator__final_estimator__l1_ratio': None,
 'estimator__final_estimator__max_iter': 100,
 'estimator__final_estimator__multi_class': 'auto',
 'estimator__final_estimator__n_jobs': None,
 'estimator__final_estimator__penalty': 'l2',
 'estimator__final_estimator__random_state': None,
 'estimator__final_estimator__solver': 'lbfgs',
 'estimator__final_estimator__tol': 0.0001,
 'estimator__final_estimator__verbose': 0,
 'estimator__final_estimator__warm_start': False,
 'estimator__final_estimator': LogisticRegression(),
 'estimator__n_jobs': None,
 'estimator__passthrough': False,
 'estimator__stack_method': 'auto

In [34]:
prediction = predict_proba(clf, X_test_split)

print_accuracy(prediction, y_test_split, 'stack')


real_pred = predict_proba(clf, X_test_pre)

f = open("./pred.csv", "w")
f.write("id,target\n")
id_nr = 50000
for v in real_pred[:, 1]:
    f.write(f"{id_nr},{v}\n")
    id_nr += 1
f.close()

stack Model accuracy score: 0.7689


In [None]:
X_train.head()

In [None]:
from catboost import Pool, cv

cv_dataset = Pool(data=X_train_pre,
                  label=y_train,
                  cat_features=list(set(X_train_pre.columns).intersection(set(categorical_columns))))

params = {"iterations": 2000,
          "depth": 2,
          "learning_rate": 1,
          "loss_function": "Logloss",
          "verbose": False,
          "roc_file": "roc-file"}

scores = cv(cv_dataset,
            params,
            fold_count=2, 
            plot="True")

In [None]:
scores