In [None]:
import gc

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sn

from pprint import pprint

from sklearn.metrics import confusion_matrix, roc_curve, auc, roc_auc_score, plot_roc_curve
from sklearn.model_selection import RandomizedSearchCV, train_test_split

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import SGDClassifier

# Reading the dataset

In [None]:
# Directory holding the competition CSV files in the Kaggle runtime.
path = '/kaggle/input/tabular-playground-series-sep-2021/'

# `path` already ends with a slash, so the file name is simply appended.
# The generator is consumed in order: train.csv is read first, then test.csv.
train, test = (pd.read_csv(path + csv_name) for csv_name in ('train.csv', 'test.csv'))

In [None]:
# Report the (rows, columns) shape of each split. Only the shape tuples are
# stored, so no extra reference to the frames is kept alive.
shapes = {'train shape:': train.shape, 'test shape:': test.shape}
for label, shape in shapes.items():
    print(label, shape)

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Preview the first rows of the test frame.
test.head()

# Preparing the data and handling missing values

In [None]:
# Set aside the test identifiers (needed for the submission file) and the
# training target before they are removed from the feature frames.
# NOTE: `id` shadows the Python builtin; the name is kept because the
# submission cell reads it.
id = test['id']
y = train['claim']

# Keep only the feature columns in both frames.
train = train.drop(columns=['id', 'claim'])
test = test.drop(columns=['id'])

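Feature engineering: add row-wise summary statistics (max, min, mean, median, std, mad, skew) and missing-value counts to both train and test.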

In [None]:
# Names of the engineered per-row summary columns, in the order they are appended.
ROW_STAT_COLUMNS = [
    "max_value", "min_value", "num_missing_std", "mean", "median",
    "std", "mad", "skew", "null_value",
]


def add_row_statistics(frame):
    """Return ``frame`` with per-row summary statistics appended as new columns.

    Every statistic is computed from the raw feature columns only. Adding the
    columns one at a time to the same frame would let each new statistic see
    the previously engineered ones (e.g. the row minimum would include
    ``max_value``, the mean would include both, and so on).

    Parameters
    ----------
    frame : pandas.DataFrame
        Numeric feature frame. Columns already named like an engineered
        statistic are dropped first, so re-running the cell gives the same
        result instead of stacking statistics on top of statistics.

    Returns
    -------
    pandas.DataFrame
        A new frame: the raw features followed by the columns listed in
        ``ROW_STAT_COLUMNS``.
    """
    features = frame.drop(columns=ROW_STAT_COLUMNS, errors="ignore")
    missing = features.isna()
    row_mean = features.mean(axis=1)

    return features.assign(
        max_value=features.max(axis=1),
        min_value=features.min(axis=1),
        # Standard deviation of the per-row missing-value indicator.
        num_missing_std=missing.std(axis=1).astype('float'),
        mean=row_mean,
        median=features.median(axis=1),
        std=features.std(axis=1),
        # Mean absolute deviation around the row mean, written out explicitly
        # because DataFrame.mad() was removed in pandas 2.0.
        mad=features.sub(row_mean, axis=0).abs().mean(axis=1),
        skew=features.skew(axis=1),
        null_value=missing.sum(axis=1),
    )


# The same transformation is applied to both splits so their columns line up.
train = add_row_statistics(train)
test = add_row_statistics(test)

Splitting data

In [None]:
# Hold out 10% of the training rows for validation; the fixed seed makes the
# split repeatable.
split_options = dict(test_size=0.1, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(train, y, **split_options)

Transforming data

In [None]:
# Replace every missing value with the constant 0.
numerical_transformer = SimpleImputer(strategy='constant', fill_value=0)

# Fit the imputer on the training split only, then reuse it on the hold-out
# split. Each result is wrapped in a DataFrame labelled with the training
# column names.
imputed_train = pd.DataFrame(
    numerical_transformer.fit_transform(X_train),
    columns=train.columns,
)
imputed_test = pd.DataFrame(
    numerical_transformer.transform(X_test),
    columns=train.columns,
)

In [None]:
# Fit the scaler on the training split only, then apply the same fitted
# scaler to both splits so the hold-out data does not influence the scaling.
scaler = StandardScaler()
scaler.fit(imputed_train)
X_train = scaler.transform(imputed_train)
X_test = scaler.transform(imputed_test)

In [None]:
# Drop the references to the intermediate frames and the full training frame,
# then run a garbage-collection pass.
del imputed_train, imputed_test, train
gc.collect()

# SGD Classifier

In [None]:
# Base estimator for the hyper-parameter search. The seed fixes the data
# shuffling done by SGD, so repeated runs of the notebook give the same models
# (the split and the search below are already seeded with 42).
sgd = SGDClassifier(random_state=42)

**Creating the random search grid**

In [None]:
# Candidate values for each SGDClassifier hyper-parameter explored by the search.
penalty = ['l1']
alpha = [1e-4, 1e-3, 1e-2, 1e-1, 1e0]
# 19 evenly spaced iteration caps: 1000, 1500, ..., 10000.
max_iter = list(range(1000, 10001, 500))
loss = ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron']
learning_rate = ['constant', 'optimal', 'invscaling', 'adaptive']
class_weight = [
    {1: 0.5, 0: 0.5},
    {1: 0.4, 0: 0.6},
    {1: 0.6, 0: 0.4},
    {1: 0.7, 0: 0.3},
]
eta0 = [1, 10, 100]

# Map each estimator parameter name to its list of candidate values.
random_grid = dict(
    penalty=penalty,
    alpha=alpha,
    max_iter=max_iter,
    loss=loss,
    learning_rate=learning_rate,
    class_weight=class_weight,
    eta0=eta0,
)

pprint(random_grid)

In [None]:
# Randomized search over `random_grid`: 10 sampled configurations, 3-fold
# cross-validation, ranked by ROC AUC, using all available cores.
clf = RandomizedSearchCV(
    estimator=sgd,
    param_distributions=random_grid,
    n_iter=10,
    cv=3,
    scoring='roc_auc',
    random_state=42,
    n_jobs=-1,
    verbose=2,
)

In [None]:
%%time
# Run the randomized hyper-parameter search on the training split; the cell
# magic above reports how long the fit takes.
clf.fit(X_train, y_train)

In [None]:
# The training labels are not read again after the fit; drop the reference
# and run a garbage-collection pass.
del y_train
gc.collect()

In [None]:
# Show the best cross-validated score and the parameter set that produced it.
search_summary = (
    ('Best Score: ', clf.best_score_),
    ('Best Params: ', clf.best_params_),
)
for label, value in search_summary:
    print(label, value)

In [None]:
# Class-label predictions of the fitted search object on the hold-out split.
pred = clf.predict(X_test)

# **Measuring performance**

In [None]:
# Score the hold-out split with continuous decision values rather than hard
# 0/1 labels. ROC AUC is a ranking metric: thresholded predictions collapse
# the curve to a single operating point, so the printed AUC would disagree
# with the curve drawn below.
holdout_scores = clf.decision_function(X_test)
fpr, tpr, thresholds = roc_curve(y_test, holdout_scores)
print(auc(fpr, tpr))

# Draw the ROC curve of the fitted search object on the same hold-out split.
# NOTE(review): plot_roc_curve was removed in scikit-learn 1.2; on newer
# versions RocCurveDisplay.from_estimator is the replacement.
plot_roc_curve(clf, X_test, y_test)
plt.show()

In [None]:
# The hold-out evaluation is finished; drop the split arrays and run a
# garbage-collection pass before the test set is processed.
del y_test, X_test, X_train
gc.collect()

# Exporting predictions to appropriate submission format

In [None]:
# Apply the already-fitted imputer and scaler to the competition test set.
# The imputed frame is labelled with the test column names so it carries the
# same feature names as the frame the scaler was fitted on; assigning the
# frame's own columns back to itself would leave the default integer labels.
imputed_test_df = pd.DataFrame(
    numerical_transformer.transform(test),
    columns=test.columns,
)
test_df = scaler.transform(imputed_test_df)

# Class-label predictions for the submission.
y_predicted = clf.predict(test_df)

In [None]:
# Predictions are stored in y_predicted; drop the test frames and run a
# garbage-collection pass.
del test_df, test
gc.collect()

In [None]:
# Assemble the submission table: each test id next to its predicted claim label.
data = dict(id=id, claim=y_predicted)
df = pd.DataFrame(data)
df.head()

In [None]:
# Write the submission file without the index column, then display the frame.
df.to_csv('submission.csv', index=False)
df