In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import model_selection, linear_model, metrics, pipeline, preprocessing, impute


# Import data: for convenience, keep the target column in a separate object.

In [None]:
# Location of the competition files; `path_data` is reused when writing the submission.
path_data = '../input/tabular-playground-series-sep-2021'
X_test = pd.read_csv(path_data + '/test.csv')
train_df = pd.read_csv(path_data + '/train.csv')

# The last column of the training file is treated as the target.
TARGET = train_df.columns[-1]
y_train = train_df[TARGET]
X_train = train_df.drop(columns=TARGET)

print(f'Test shape: {X_test.shape}, X_train shape: {X_train.shape}, y_train shape: {y_train.shape}')
print(f'Count of null-values: {X_train.isna().sum().sum()}')

In [None]:
# Preview the first rows of the training features.
X_train.head()

In [None]:
# Summary statistics of the training features, transposed for display.
X_train.describe().T

# Visualizations

In [None]:
# Plot settings shared by the feature-distribution figures.
bins = 50      # number of histogram bins per feature
plts_grid = 5  # number of subplot columns in each figure grid

In [None]:
# Overlay the train and test histograms of every feature on a grid that is
# `plts_grid` columns wide.
n_rows = int(np.ceil(len(X_train.columns) / plts_grid))  # loop-invariant row count
fig = plt.figure(figsize=(20, 140))
for idx, col in enumerate(X_train.columns):
    ax = fig.add_subplot(n_rows, plts_grid, idx + 1)
    # Draw on the explicit axes; semi-transparent bars keep both
    # distributions visible where they overlap.
    ax.hist(X_train[col], bins=bins, alpha=0.5, label='train')
    ax.hist(X_test[col], bins=bins, alpha=0.5, label='test')
    ax.set_title(col)
    ax.legend()

plt.show()

In [None]:
# One box plot per training feature, arranged on a grid `plts_grid` columns wide.
n_rows = int(np.ceil(len(X_train.columns) / plts_grid))
fig = plt.figure(figsize=(20, 140))
for position, column in enumerate(X_train.columns, start=1):
    ax = fig.add_subplot(n_rows, plts_grid, position)
    sns.boxplot(x=X_train[column], ax=ax)
    ax.set_title(column)
plt.show()

## Checking for missing values

# Here you can see that most rows with target = 1 have missing values.

In [None]:
# Number of missing cells in each training row, stored as a new column.
train_df['count_null'] = train_df.isna().sum(axis=1)

fig, (ax1, ax2) = plt.subplots(1, 2)
fig.set_figwidth(20)
fig.suptitle('Distribution by count of missing values ​​in a row: ')

# Left panel: counts split by target value; right panel: overall counts.
sns.countplot(data=train_df, x='count_null', hue=TARGET, ax=ax1)
ax1.legend(title='Target: ')
sns.countplot(data=train_df, x='count_null', ax=ax2)

plt.show()


In [None]:
# Flag the rows that contain at least one missing value.
train_df['count_null'] = train_df.isna().sum(axis=1)
train_df['has_mis'] = train_df['count_null'].gt(0)

# Row counts by the missing-value flag, split by target value.
fig, ax = plt.subplots(1)
sns.countplot(data=train_df, x='has_mis', hue=TARGET, ax=ax)
ax.set_xlabel("Has missing values? ")
ax.legend(title="Target ")
plt.show()

# Simple data preparation

Add a column with the number of missing values in each row.

In [None]:
# Add the per-row count of missing values as a feature to both feature frames.
for frame in (X_train, X_test):
    frame['n_missing'] = frame.isna().sum(axis=1)

Use **SimpleImputer** to fill in the NaN values in our dataset. 
Note that fit_transform() returns an **ndarray**, not a pd.DataFrame, so the result must be converted back to a pd.DataFrame before DataFrame methods can be applied to it.

In [None]:
# Keep the feature names so the frames can be rebuilt after array-returning transforms.
columns_for_pred  = X_train.columns
columns_for_pred

In [None]:
# Fill missing values with SimpleImputer's default strategy.
si = impute.SimpleImputer()

# Learn the fill values from the training data only, then apply the same
# fitted imputer to the test data so both sets are filled with identical
# statistics (the same fit-on-train / transform-both pattern as the scaler).
X_train = si.fit_transform(X_train)
X_test = si.transform(X_test)

Use **StandardScaler** to scale variables

In [None]:
# Fit the scaler on the training features only, then apply it to both sets
# and restore the column names.
scaler = preprocessing.StandardScaler().fit(X_train)
train_scaled = scaler.transform(X_train)
test_scaled = scaler.transform(X_test)
X_train = pd.DataFrame(train_scaled, columns=columns_for_pred)
X_test = pd.DataFrame(test_scaled, columns=columns_for_pred)

In [None]:
# Sanity check: total number of NaNs remaining in the training features.
print(f'Count of null-values: {X_train.isna().sum().sum()}')

# Logistic regression 

**LogisticRegressionCV** is a logistic regression that uses cross validation. 

In [None]:
# Cross-validated logistic regression classifier with a fixed random_state.
simple_regressor = linear_model.LogisticRegressionCV(random_state=0)
# Display the estimator's parameters before fitting.
simple_regressor.get_params()

In [None]:
%%time
simple_regressor.fit(X = X_train, y= y_train)

In [None]:
# Display the estimator's parameters again, now that it has been fitted.
simple_regressor.get_params()

In [None]:
# Predictions on the training data itself: hard labels plus the
# probability taken from column 1 of predict_proba.
train_class_probabilities = simple_regressor.predict_proba(X_train)
predicted_proba = train_class_probabilities[:, 1]
predicted_values = simple_regressor.predict(X_train)

In [None]:
# Metrics computed from the hard labels in `predicted_values`.
print(f'confusion_matrix: {metrics.confusion_matrix(y_train, predicted_values)}')
label_metrics = ('accuracy_score', 'recall_score', 'precision_score', 'f1_score')
for metric_name in label_metrics:
    metric_value = getattr(metrics, metric_name)(y_train, predicted_values)
    print(f'{metric_name}: {metric_value}')

# ROC AUC is computed from the probabilities rather than the labels.
print(f'roc_auc_score: {metrics.roc_auc_score(y_train, predicted_proba)}')

# Make Submission! 

In [None]:
# Class probabilities for the test set; column 1 is what goes into the submission.
predictes_for_test = simple_regressor.predict_proba(X_test)

In [None]:
# Ids are re-read from test.csv; column 1 of the predicted probabilities
# becomes the 'claim' value.
test_ids = pd.read_csv(path_data + '/test.csv', usecols=['id'])['id']
submission = pd.DataFrame({'id': test_ids, 'claim': predictes_for_test[:, 1]})
submission.to_csv('submission.csv', index=False)