In [None]:
import numpy as np 
import pandas as pd

import os
# Print the full path of every file found under the Kaggle input directory.
input_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

In [None]:
# Load the train and test splits from their CSV files.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/car-crash-dataset/train-new.csv",
        "/kaggle/input/car-crash-dataset/test-new.csv",
    )
)

In [None]:
# Preview the first five rows of the test split.
df_test.head(n=5)

In [None]:
# For each training column, print its name followed by its distinct values.
for col_name, col_values in df_train.items():
    print(col_name, col_values.unique())

In [None]:
# Split the predictor columns into categorical features (one-hot encoded
# later) and non-categorical features (used as numeric values).
# "dead" is skipped because it is the target (see y_train / y_test below);
# "caseid" and "injSeverity" are also kept out of the predictors.
cat_feats = []
noncat_feats = []
for c in df_train.columns:
    if c in ["caseid", "dead", "injSeverity"]:
        continue
    # Decide from the column dtype instead of inspecting the first row: a
    # string column whose first value is missing (NaN is a float) would
    # otherwise be treated as numeric, and an empty frame would raise
    # IndexError on `.iloc[0]`.
    if pd.api.types.is_numeric_dtype(df_train[c]):
        noncat_feats.append(c)
    else:
        cat_feats.append(c)
cat_feats, noncat_feats

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Learn the category vocabulary from the training split only, then encode both
# splits with it. With handle_unknown="ignore", a category that appears only in
# the test split is encoded as all zeros instead of raising an error.
onehot = OneHotEncoder(handle_unknown="ignore")
onehot.fit(df_train[cat_feats])
data_onehot_train, data_onehot_test = (
    onehot.transform(split[cat_feats]) for split in (df_train, df_test)
)

In [None]:
# Show the names of the generated one-hot columns.
# `OneHotEncoder.get_feature_names` was deprecated in scikit-learn 1.0 and
# removed in 1.2; use `get_feature_names_out` when it exists and fall back to
# the legacy method only on older releases. Passing `cat_feats` prefixes each
# name with its source column instead of a positional placeholder.
_onehot_names = getattr(onehot, "get_feature_names_out", None) or onehot.get_feature_names
_onehot_names(cat_feats)

In [None]:
# Turn the sparse one-hot matrices into labelled DataFrames and pick out the
# non-categorical columns, ready to be concatenated column-wise.
# `OneHotEncoder.get_feature_names` was deprecated in scikit-learn 1.0 and
# removed in 1.2; use `get_feature_names_out` when it exists and fall back to
# the legacy method only on older releases.
_onehot_names = getattr(onehot, "get_feature_names_out", None) or onehot.get_feature_names
onehot_columns = _onehot_names(cat_feats)

# Reuse the source frame's index so the later `pd.concat(..., axis=1)` aligns
# rows by construction rather than relying on both frames happening to carry
# the same default RangeIndex.
df_cat_train = pd.DataFrame(
    data_onehot_train.toarray(), columns=onehot_columns, index=df_train.index
)
df_noncat_train = df_train[noncat_feats]
df_cat_test = pd.DataFrame(
    data_onehot_test.toarray(), columns=onehot_columns, index=df_test.index
)
df_noncat_test = df_test[noncat_feats]

In [None]:
# Build the design matrices by placing the one-hot columns next to the
# non-categorical columns of each split.
X_train, X_test = (
    pd.concat(parts, axis=1)
    for parts in (
        [df_cat_train, df_noncat_train],
        [df_cat_test, df_noncat_test],
    )
)

# Boolean target: True where the "dead" column holds the string "dead".
y_train = df_train["dead"].eq("dead")
y_test = df_test["dead"].eq("dead")

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise every feature with statistics learned from the training split
# only, then apply the same transformation to the test split.
scaler = StandardScaler().fit(X_train)
X_train_scl = scaler.transform(X_train)
X_test_scl = scaler.transform(X_test)

# StandardScaler disregards NaNs when fitting and keeps them in transform, so
# missing values are still present here. Impute them with the training-column
# mean, which is exactly 0 after standardisation. Filling from training
# statistics (instead of the test-set column means) avoids using test-set
# information for preprocessing, also covers NaNs in the training split, and
# cannot leave NaNs behind when a whole test column is missing.
X_train_scl = np.where(np.isnan(X_train_scl), 0.0, X_train_scl)
X_test_scl = np.where(np.isnan(X_test_scl), 0.0, X_test_scl)

In [None]:
import warnings

from sklearn.exceptions import ConvergenceWarning
from sklearn.linear_model import LogisticRegressionCV


def train():
    """Fit a cross-validated elastic-net logistic regression on the training data.

    Reads the module-level ``X_train_scl`` and ``y_train``. The regularisation
    strength and the L1/L2 mix are chosen by 5-fold cross-validation over the
    estimator's default ``Cs`` grid and the listed ``l1_ratios``.

    ``ConvergenceWarning`` is silenced for the duration of the fit only, using
    the standard-library ``warnings`` machinery; the private helper
    ``sklearn.utils.testing.ignore_warnings`` used previously was removed in
    scikit-learn 0.24, which made this cell fail at import time.

    Returns:
        The fitted ``LogisticRegressionCV`` estimator.
    """
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=ConvergenceWarning)
        model = LogisticRegressionCV(
            l1_ratios=[0.001, 0.01, 0.1],
            penalty="elasticnet",
            solver="saga",
            cv=5,
            # "saga" shuffles the data, so fix the seed to make reruns reproducible.
            random_state=0,
        ).fit(X_train_scl, y_train)
    return model


model = train()

In [None]:
# Score the fitted model on the held-out test split.
model.score(X=X_test_scl, y=y_test)

In [None]:
# Per-class predicted probabilities for the test split (one column per class).
y_preds = model.predict_proba(X=X_test_scl)

In [None]:
# First probability column, i.e. the one for model.classes_[0].
y_preds[:, 0]

In [None]:
from sklearn.metrics import classification_report

# Predict the positive class whenever its probability (second column of
# y_preds) exceeds 0.25, then summarise precision / recall / F1 per class.
predicted_positive = y_preds[:, 1] > 0.25
report_text = classification_report(y_test, predicted_positive)
print(report_text)