In [None]:
import numpy as np
import pandas as pd
import scipy
import sklearn

In [None]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate,cross_val_score,StratifiedKFold

# Read data

In [None]:
# Load the competition train and test tables, using the `id` column as index.
df_train, df_test = (
    pd.read_csv(csv_path, index_col='id')
    for csv_path in ('/kaggle/input/cat-in-the-dat/train.csv',
                     '/kaggle/input/cat-in-the-dat/test.csv')
)

In [None]:
# Column-name groups used throughout the notebook.
# binvar starts at bin_1, so bin_0 is not part of the list.
binvar = [f'bin_{k}' for k in range(1, 5)]
ordvar = [f'ord_{k}' for k in range(6)]
nomvar = [f'nom_{k}' for k in range(10)]
dmvar = ['day', 'month']

In [None]:
# Keep the target aside, then stack the train features on top of the test
# features so that every encoding below is applied to both at once.
y_train = df_train['target'].copy()
train_test = pd.concat([df_train.drop(columns='target'), df_test])

# Bin vars

In [None]:
# bin_0 is removed from the feature set.
train_test = train_test.drop(columns='bin_0')

In [None]:
# Turn the two letter-coded binary columns into 0/1.
# Values missing from a mapping become NaN (pandas `Series.map` semantics).
binary_codes = {
    'bin_3': {'F': 0, 'T': 1},
    'bin_4': {'N': 0, 'Y': 1},
}
for bin_col, codes in binary_codes.items():
    train_test[bin_col] = train_test[bin_col].map(codes)

# Ord vars

In [None]:
# Shift ord_0 down by one (presumably so the lowest level is 0 - TODO confirm).
train_test['ord_0'] = train_test['ord_0'].sub(1)

In [None]:
# ord_1 levels listed from lowest to highest; the position in the list is the code.
ord1_levels = ['Novice', 'Contributor', 'Expert', 'Master', 'Grandmaster']
ord1dict = {level: rank for rank, level in enumerate(ord1_levels)}
train_test['ord_1'] = train_test['ord_1'].map(ord1dict)

In [None]:
# ord_2 levels listed from lowest to highest; the position in the list is the code.
ord2_levels = ['Freezing', 'Cold', 'Warm', 'Hot', 'Boiling Hot', 'Lava Hot']
ord2dict = {level: rank for rank, level in enumerate(ord2_levels)}
train_test['ord_2'] = train_test['ord_2'].map(ord2dict)

In [None]:
# Encode ord_3, ord_4 and ord_5 with integer codes chosen by OrdinalEncoder,
# then print the category order learnt for each column so it can be inspected.
auto_ord_cols = ordvar[3:]
oe = OrdinalEncoder(categories='auto')
train_test[auto_ord_cols] = oe.fit_transform(train_test[auto_ord_cols])
for position, ord_col in enumerate(auto_ord_cols):
    print(ord_col)
    print(oe.categories_[position])

# Scaling ord vars

In [None]:
# Standardise all ordinal columns; the scaler is fitted on the combined
# train + test frame.
ord_scaler = StandardScaler()
train_test[ordvar] = ord_scaler.fit_transform(train_test[ordvar])

# Nom vars

In [None]:
# Number of distinct values in nom_5 .. nom_9 before the strings are shortened.
train_test.loc[:, nomvar[5:]].nunique()

In [None]:
# Drop the leading characters of each nom_5 .. nom_9 string:
# the first 4 for nom_5 and the first 3 for the other columns.
leading_chars_to_drop = {'nom_5': 4, 'nom_6': 3, 'nom_7': 3, 'nom_8': 3, 'nom_9': 3}
for nom_col, start in leading_chars_to_drop.items():
    train_test[nom_col] = train_test[nom_col].str[start:]

In [None]:
# Number of distinct values in nom_5 .. nom_9 after the strings were shortened.
train_test.loc[:, nomvar[5:]].nunique()

# One-hot encoding nomvars & dmvars

In [None]:
# One-hot encode every nominal column, dropping the first level of each.
# The encoded matrix is kept separately and the raw columns leave the frame.
enc = OneHotEncoder(drop='first', dtype='float64', categories='auto')
nom_matrix = enc.fit_transform(train_test[nomvar])
train_test = train_test.drop(columns=nomvar)

In [None]:
# Same treatment for day and month: one-hot encode with the first level
# dropped, keep the encoded matrix, remove the raw columns.
enc = OneHotEncoder(drop='first', dtype='float64', categories='auto')
dm_matrix = enc.fit_transform(train_test[dmvar])
train_test = train_test.drop(columns=dmvar)

In [None]:
# Show which columns are still in the frame after the nominal and day/month
# columns were removed.
train_test.columns

In [None]:
# Assemble the final design matrix, column-wise, in this order:
# one-hot nominal block | columns still in train_test | one-hot day/month block.
remaining_block = scipy.sparse.coo_matrix(train_test).astype('float64')
stacked_blocks = scipy.sparse.hstack([nom_matrix, remaining_block, dm_matrix])
df_work_sprs = stacked_blocks.tocsr()  # CSR so rows can be sliced
display(df_work_sprs)

In [None]:
# The first len(y_train) rows are the training rows (train was stacked above
# test); everything after them is the test set.
n_train = y_train.shape[0]
X_train, X_test = df_work_sprs[:n_train], df_work_sprs[n_train:]

# Optuna

## Bayesian optimization

In [None]:
import optuna

In [None]:
# 5-fold stratified splitter shared by the hyperparameter search.
kf = StratifiedKFold(n_splits=5)

In [None]:
def objective(trial):
    """Optuna objective: negated mean cross-validated ROC AUC.

    Samples a weight for class 1 from the range 1..2 (class 0 keeps weight 1),
    builds a logistic regression with that class weighting and scores it on
    the module-level ``X_train`` / ``y_train`` using the ``kf`` splitter.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to sample the class weight.

    Returns
    -------
    float
        Minus the mean ROC AUC over the folds, so that a minimizing study
        ends up maximizing the AUC.
    """
    # `suggest_uniform` is deprecated since Optuna 3.0; `suggest_float` samples
    # the same continuous range. The parameter name (typo included) is kept so
    # the keys of `study.best_params` stay the same.
    clasas_weight = trial.suggest_float('clasas_weight', 1, 2)
    model = LogisticRegression(C=0.123456789,
                               class_weight={0: 1, 1: clasas_weight},
                               max_iter=10000, solver='lbfgs', n_jobs=-1)
    fold_auc = cross_val_score(model, X_train, y_train, cv=kf, scoring='roc_auc')
    return -fold_auc.mean()
# Seed the sampler so that re-running the notebook explores the same sequence
# of trials. The study keeps Optuna's default direction (minimize), which
# matches the negated AUC returned by `objective`.
study = optuna.create_study(sampler=optuna.samplers.TPESampler(seed=0))

In [None]:
# Run the search: 50 evaluations of `objective`.
study.optimize(func=objective, n_trials=50)

In [None]:
# Keep the best hyperparameters and report them. The objective is the negated
# AUC, so the sign is flipped back for display.
params = study.best_params
print(params)
print(-study.best_value)

# Cross-validation

In [None]:

# 3-fold cross-validated ROC AUC of the model with a fixed regularisation
# strength and a hard-coded weight for class 1.
C = 0.123456789

clf = LogisticRegression(
    C=C,
    class_weight={0: 1, 1: 1.0391707400969836},
    solver='lbfgs',
    max_iter=1000,
    verbose=0,
    n_jobs=-1,
)

score = cross_validate(clf, X_train, y_train, scoring="roc_auc", cv=3)
fold_auc = score['test_score']
mean = fold_auc.mean()
print(fold_auc)
print('C =', C, f'{mean:.8f}')

# Training

In [None]:

# Fit the final model on the full training set, with the same settings as the
# cross-validation cell (`C` is defined there).
clf = LogisticRegression(
    C=C,
    class_weight={0: 1, 1: 1.0391707400969836},
    solver='lbfgs',
    max_iter=1000,
    verbose=0,
    n_jobs=-1,
)
clf.fit(X_train, y_train)

# Creating submission file

In [None]:
# Column 1 of predict_proba holds the probabilities submitted as predictions.
test_proba = clf.predict_proba(X_test)
y_preds = test_proba[:, 1]

In [None]:
# Load the sample submission (indexed by id) as the template for the output file.
sample_submission_path = '/kaggle/input/cat-in-the-dat/sample_submission.csv'
submission = pd.read_csv(sample_submission_path, index_col='id')

In [None]:
# Attach predictions by id rather than by position. X_test rows follow the
# order of df_test (train_test was built as train rows followed by test rows),
# so the predictions are labelled with df_test.index and pandas aligns them
# with the ids of the sample submission.
submission['target'] = pd.Series(y_preds, index=df_test.index)
if submission['target'].isna().any():
    # Fail loudly instead of writing a file with missing predictions.
    raise ValueError('sample_submission ids do not match the test set ids')
submission.to_csv('submission.csv')