In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names_in_folder in os.walk('/kaggle/input'):
    for name in names_in_folder:
        full_path = os.path.join(folder, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### Read train dataset

In [None]:
# Directory holding the competition CSV files.
datasetpath = '/kaggle/input/dont-overfit-ii/'

# Load the training split into a DataFrame.
train_csv_path = os.path.join(datasetpath, 'train.csv')
df = pd.read_csv(train_csv_path)

# Report (rows, columns) so the size of the data is visible up front.
shape_message = "The shape of the dataset is {}.".format(df.shape)
print(shape_message)

In [None]:
# Preview the first five rows of the training data.
df.head(n=5)

### Show some statistics about the data

In [None]:
# Summary statistics per column, with the default quartiles spelled out.
df.describe(percentiles=[0.25, 0.5, 0.75])

In [None]:
# Count missing values per column once, then report both the per-column
# counts and the grand total.
train_null_counts = df.isnull().sum()
print(train_null_counts)
print("sum of sum null columns:", train_null_counts.sum())

In [None]:
# Number of distinct non-null values in each column.
df.nunique(axis=0, dropna=True)

In [None]:
# Variance of every column, displayed from smallest to largest.
column_variances = df.var()
column_variances.sort_values(ascending=True)

### Training of Logistic Model
Logistic regression is used because it is a simple model: to avoid overfitting we need a low-complexity model, and other classifiers tend to be more complex. I also compared it against other classifier models, and logistic regression gave the best result.<br>
**Note**: `solver='liblinear'` is used because, for small datasets, ‘liblinear’ is a good choice, whereas ‘sag’ and ‘saga’ are faster for large ones.<br>
[logisticRegressionDocumentation](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html)

1. Splitting the data, using 80% for training and 20% for validation.

In [None]:
from sklearn.model_selection import train_test_split

# Features are every column except the row identifier and the label.
X = df.drop(['id', 'target'], axis=1)
y = df.loc[:, 'target']

# 80/20 hold-out split, stratified on the label, with a fixed seed so the
# split is the same on every run.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_val, y_train, y_val = train_test_split(X, y, **split_options)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

# Baseline: logistic regression with default regularisation, fitted on the
# training part of the hold-out split.
LogReg = LogisticRegression(solver='liblinear')

LogReg.fit(X_train, y_train)

# Hard class labels for the validation split (kept under its original name).
pred = LogReg.predict(X_val)

# ROC AUC measures how well the model ranks positives above negatives, so it
# must be given continuous scores. Passing 0/1 labels evaluates the ROC curve
# at a single threshold only, and the result is not comparable with the
# 'roc_auc' scoring used for cross-validation elsewhere in this notebook.
# Column 1 of predict_proba is the probability of LogReg.classes_[1].
val_proba = LogReg.predict_proba(X_val)[:, 1]
train_proba = LogReg.predict_proba(X_train)[:, 1]

print("Validation AUCROC score: {:.5f}".format(roc_auc_score(y_val, val_proba)))
print("\nTrain AUCROC score: {:.5f}".format(roc_auc_score(y_train, train_proba)))

2. Splitting the dataset using **StratifiedKFold** with ``10 splits`` for a more accurate score.

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold

# Ten stratified, shuffled folds with a fixed seed.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Score an untuned logistic regression on every fold using ROC AUC.
baseline_model = LogisticRegression(solver='liblinear')
score = cross_val_score(baseline_model, X, y, scoring='roc_auc', cv=skf)

# Summarise the per-fold scores as mean and standard deviation.
mean_auc = score.mean()
std_auc = score.std()

print('Logistic regression model using cv = 10 folds')
print('AUCROC mean(std): {:.5f}({:.5f}) '.format(mean_auc, std_auc))

3. Tuning the logistic regression model using **GridSearchCV** across different values of penalty and C, where ``penalty = ['l1', 'l2']`` and ``C = np.linspace(0.001, 0.3, 1000)``.

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold


# Estimator whose hyper-parameters are tuned.
model = LogisticRegression(solver='liblinear')

# Search space: both penalty types crossed with 1000 evenly spaced values of
# C between 0.001 and 0.3.
C = np.linspace(0.001, 0.3, 1000)
penalty = ['l1', 'l2']
param_grid = {'penalty': penalty, 'C': C}

# Same 10-fold stratified, seeded splitter as in the cross-validation step.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Exhaustive search scored by ROC AUC, run on all cores; train scores are
# kept so they can be compared with the test scores afterwards.
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring="roc_auc",
    cv=skf,
    n_jobs=-1,
    verbose=1,
    return_train_score=True,
)
grid_result = grid_search.fit(X, y)

print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

# Full per-candidate results as a DataFrame; preview the first rows.
results_log = pd.DataFrame(grid_search.cv_results_)
results_log.head()

Reviewing test and train score of GridSearchCV for best parameters.

In [None]:
print("reviewing test and train score.")

# best_index_ is the position of the winning candidate in cv_results_, so the
# row can be picked directly rather than by comparing a column of dicts
# against best_params_ element by element.
score_columns = ["mean_test_score", "std_test_score", "mean_train_score", "std_train_score"]
results_log.iloc[[grid_search.best_index_]][score_columns]

### Read test dataset

In [None]:
# Directory holding the competition CSV files.
datasetpath = '/kaggle/input/dont-overfit-ii/'

# Load the test split into its own DataFrame.
test_csv_path = os.path.join(datasetpath, 'test.csv')
df_test = pd.read_csv(test_csv_path)

# Report (rows, columns) of the test data.
test_shape_message = "The shape of the dataset is {}.".format(df_test.shape)
print(test_shape_message)

In [None]:
# Preview the first five rows of the test data.
df_test.head(n=5)

In [None]:
# Summary statistics per test column, with the default quartiles spelled out.
df_test.describe(percentiles=[0.25, 0.5, 0.75])

Check for **NaNs** in the test dataset

In [None]:
# Count missing values per test column once, then report both the per-column
# counts and the grand total.
test_null_counts = df_test.isnull().sum()
print(test_null_counts)
print("sum of sum null columns:", test_null_counts.sum())

### Predicting 'target' in test data

In [None]:
# Build the feature matrix from the test frame. 'target' is dropped as well
# (ignored when absent) because this cell adds that column to df_test below;
# without it, re-running the cell would pass an extra feature to the model.
X_test = df_test.drop(columns=['id', 'target'], errors='ignore')

# The search was tuned on ROC AUC, which is a ranking metric, so predict the
# probability of the positive class (column 1 = grid_search.classes_[1])
# instead of hard 0/1 labels, which discard the ranking.
# NOTE(review): assumes the submission is scored on AUC and accepts
# probabilities in 'target' - confirm against the competition rules.
y_test_predicted = grid_search.predict_proba(X_test)[:, 1]

df_test['target'] = y_test_predicted

df_test.head()

In [None]:
# Keep only the identifier and the prediction, then write the submission
# file without the DataFrame index.
submission = df_test.loc[:, ['id', 'target']]
submission.to_csv('submission.csv', index=False)