In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import cross_val_score, cross_validate, GridSearchCV

from sklearn.metrics import confusion_matrix, classification_report

In [None]:
# Read the competition's training and test tables from the Kaggle input directory.
train, test = map(
    pd.read_csv,
    (
        "../input/cat-in-the-dat/train.csv",
        "../input/cat-in-the-dat/test.csv",
    ),
)

In [None]:
# Preview the first ten training rows.
train.iloc[:10]

In [None]:
# Preview the first ten test rows.
test.iloc[:10]

## Check for missing values

In [None]:
# Number of missing entries in each training column.
train.isnull().sum()


In [None]:
# Number of missing entries in each test column.
test.isnull().sum()

## One-Hot Encoding

In [None]:
# Total number of distinct values across every column except the first and
# the last (skipped positionally; presumably `id` and `target` -- confirm
# against the CSV layout). This is the width to expect from one-hot encoding
# those columns.
#
# `nunique` is used instead of `np.unique` because it does not need to sort
# each column and does not raise on object columns that mix strings with NaN.
# `dropna=False` keeps NaN counted as one distinct value.
num_cols = int(train.iloc[:, 1:-1].nunique(dropna=False).sum())

print(num_cols)

In [None]:
# Distinct non-null values per column of the training table.
train.nunique(axis=0, dropna=True)

In [None]:
# Every training column except the row identifier and the label is a feature.
features = train.columns.drop(['id', 'target']).to_list()
print(features)

In [None]:
# One-hot encode every feature column.
#
# The `sparse=` keyword was deprecated in scikit-learn 1.2 and removed in 1.4
# (renamed `sparse_output`). Sparse output is the default under either name,
# so the keyword is omitted to keep this cell working on all versions.
# `handle_unknown='ignore'` encodes categories seen only in the test set as
# all-zero rows instead of raising.
encoder = OneHotEncoder(handle_unknown='ignore')

# Fit on the training data only, then reuse the fitted encoder on the test set
# so both matrices share the same columns.
X_train = encoder.fit_transform(train[features])
X_test = encoder.transform(test[features])

y_train = train.target.values

print('X_train shape:', X_train.shape)
print('y_train shape:', y_train.shape)
print('X_test shape: ', X_test.shape)

In [None]:
# Preview the first ten encoded test rows.
# Passing a SciPy sparse matrix straight to `pd.DataFrame(...)` iterates it row
# by row and yields a single object column of 1xN matrices, not the encoded
# table. `from_spmatrix` builds a proper sparse-backed frame, and slicing first
# keeps the preview cheap.
pd.DataFrame.sparse.from_spmatrix(X_test[:10])

# Logistic Regression

In [None]:
# Baseline logistic regression with the liblinear solver; `fit` returns the
# fitted estimator, so construction and fitting are chained.
lr_mod = LogisticRegression(solver='liblinear').fit(X_train, y_train)

# Mean accuracy on the data the model was trained on (an optimistic figure,
# not a validation score).
train_accuracy = lr_mod.score(X_train, y_train)
print(train_accuracy)

In [None]:
# 5-fold cross-validated ROC AUC for the baseline model.
lr_cv_results = cross_val_score(
    lr_mod,
    X_train,
    y_train,
    cv=5,
    scoring='roc_auc',
)

print('Validation AUC by fold: ', lr_cv_results)
print('Average Validation AUC: ', lr_cv_results.mean())

# Logistic Regression: Hyper-Parameter Tuning

In [None]:
%%time 

lr_param_grid = {
    'C': [0.01, 0.1, 1]
}

lr_grid_search = GridSearchCV(lr_mod, lr_param_grid, cv=5, refit='True',n_jobs=-1)
lr_grid_search.fit(X_train, y_train)

lr_gs_res = lr_grid_search.cv_results_

print(lr_gs_res.keys())

In [None]:
# Estimator refit on the full training set with the best grid-search setting.
best_lr_model = lr_grid_search.best_estimator_

# 3-fold cross-validated ROC AUC of the tuned model. `cross_validate` is
# imported in the notebook's import cell rather than mid-notebook.
lr_cv = cross_validate(best_lr_model, X_train, y_train, cv=3, scoring="roc_auc")
score = lr_cv["test_score"].mean()
print(f"{score:.6f}")

# Make predictions

In [None]:
# predict_proba returns one probability column per class; keep the second
# column (presumably the probability of target == 1 -- confirm via
# best_lr_model.classes_).
class_probabilities = best_lr_model.predict_proba(X_test)
pred = class_probabilities[:, 1]
print(pred[:10])

In [None]:
# Pair each test id with its predicted probability for the submission file.
submission = pd.DataFrame({'id': test['id'], 'target': pred})
submission.head(5)

In [None]:
# Write the submission without the DataFrame index column.
submission.to_csv(path_or_buf='my_submission.csv', index=False)