In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler

from sklearn.metrics import roc_auc_score

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Read the pre-processed training and validation splits from disk.
# Both CSVs go through the same reader; order matters: (train, valid).
train, valid = map(
    pd.read_csv,
    (
        '../data/train_data_processed.csv',
        '../data/valid_data_processed.csv',
    ),
)

In [3]:
# Separate the feature matrix (every column except 'y') from the target
# column 'y', for each split. `drop` returns a new frame, so `train` and
# `valid` themselves are left unchanged.
X_train, y_train = train.drop('y', axis=1), train['y']
X_valid, y_valid = valid.drop('y', axis=1), valid['y']

In [4]:
# Sanity check: show the shape of each feature matrix / target vector,
# one per line, in the order train-X, train-y, valid-X, valid-y.
print(
    X_train.shape,
    y_train.shape,
    X_valid.shape,
    y_valid.shape,
    sep='\n',
)

(28824, 62)
(28824,)
(6176, 62)
(6176,)


## feature scaling

In [5]:
# Learn the scaling statistics from the training features only;
# `fit` returns the fitted scaler itself, so it can be bound directly.
scaler = StandardScaler().fit(X_train)

# Apply that same fitted scaler to both splits (train first, then valid).
X_train_std, X_valid_std = (
    scaler.transform(features) for features in (X_train, X_valid)
)

## baseline model - Logistic Regression

In [6]:
# Baseline: logistic regression with library defaults, only the random
# seed pinned. `fit` returns the estimator, so build and train in one step
# on the standardized training features.
lr = LogisticRegression(random_state=2020).fit(X_train_std, y_train)

# Display the fitted estimator as the cell output.
lr

LogisticRegression(random_state=2020)

In [7]:
# Predicted class probabilities for the standardized validation features,
# using the fitted baseline model (one column per class).
y_predict_valid = lr.predict_proba(X=X_valid_std)

In [8]:
# Report validation ROC AUC, scoring with the second probability column
# (index 1) of the predictions.
print(
    "AUC score of baseline Logistic Regression model: ",
    roc_auc_score(y_true=y_valid, y_score=y_predict_valid[:, 1]),
)

AUC score of baseline Logistic Regression model:  0.7900684605814263


### observations:
Given that the original paper (TODO: add link) on this problem reported a best AUC score of 0.8, this result seems quite good for a simple baseline model with no hyperparameter tuning.