# Importing Libraries and Loading Datasets

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Plot
import seaborn as sns
import matplotlib.pyplot as plt

# Feature Selection
from sklearn.feature_selection import SelectPercentile, f_classif

# Modelling
from sklearn.model_selection import train_test_split

# Regression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_auc_score
from xgboost import XGBRegressor

# Classification
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier

# Cross-Validation
from sklearn.model_selection import StratifiedKFold

In [None]:
# Directory that holds the competition files; defined once so the three
# reads below cannot drift apart if the location changes.
INPUT_DIR = '../input/tabular-playground-series-nov-2021'

submission_data = pd.read_csv(f'{INPUT_DIR}/sample_submission.csv')
train_data = pd.read_csv(f'{INPUT_DIR}/train.csv')
test_data = pd.read_csv(f'{INPUT_DIR}/test.csv')

# Explore Data

In [None]:
# Preview the first rows of the training data
train_data.head()

In [None]:
# Summary statistics of the training data
train_data.describe()

In [None]:
# List the column labels of the training data
train_data.columns
# It appears there is a target column.

# Basic Data Check

Credits to https://www.kaggle.com/raahulsaxena/tps-nov-21-data-check-feature-analysis

Although this data doesn't contain any missing values, duplicated rows, or categorical variables, that notebook helped me a lot in understanding the basics of data checking.

## Handle missing values

In [None]:
# Missing values
# Count NaNs per column and keep only the columns that actually have some.
# (Reducing with `.any().sum()` first would collapse everything to a single
# number, which cannot be filtered per column afterwards.)
missing_values_train = train_data.isna().sum()
missing_values_train = missing_values_train[missing_values_train > 0]
print('Missing values in train data: {0}'.format(missing_values_train.to_dict()))

missing_values_test = test_data.isna().sum()
missing_values_test = missing_values_test[missing_values_test > 0]
print('Missing values in test data: {0}'.format(missing_values_test.to_dict()))

## Handle duplicates

In [None]:
# Duplicates
# Count rows that are exact repeats of an earlier row, for each data set.
duplicates_train = train_data.duplicated().sum()
duplicates_test = test_data.duplicated().sum()

print('Duplicates in train data: {0}'.format(duplicates_train))
print('Duplicates in test data: {0}'.format(duplicates_test))

## Categorical variables

In [None]:
# Number of distinct values per column, smallest first: columns with very
# few distinct values are candidates for being categorical.
categorical_train = train_data.nunique().sort_values(ascending=True)
print('Categorical variables in train data: \n{0}'.format(categorical_train))

categorical_test = test_data.nunique().sort_values(ascending=True)
# Label fixed: this report is for the test data, not the train data.
print('Categorical variables in test data: \n{0}'.format(categorical_test))
# No categorical variable other than the `target`.

## Correlated variables

In [None]:
# Side-by-side correlation heatmaps for the train and test frames.
fig, ax = plt.subplots(1, 2, figsize=(16, 8))
sns.heatmap(train_data.corr(), ax=ax[0])
ax[0].set_title('Train data correlations')
sns.heatmap(test_data.corr(), ax=ax[1])
ax[1].set_title('Test data correlations')
# plt.show() instead of fig.show(): the latter emits a UserWarning on
# non-GUI backends such as the notebook inline backend.
plt.show()
# Variables are not correlated

# Features

In [None]:
# It takes time to handle all of the data, so a smaller portion can be used
# while debugging/testing: set DEBUG_ROWS to e.g. 50000 to keep only the
# first rows, or leave it as None to use everything.
DEBUG_ROWS = None
rows = slice(DEBUG_ROWS)

# Get train data without the target and ids
# (positional: drops the first and the last column)
data = train_data.iloc[rows, 1:-1].copy()
# Get the target
y = train_data.target.iloc[rows].copy()

I am actually planning to do a proper feature analysis, but for now I am keeping it simple.

Credits to https://www.kaggle.com/markosthabit/tbs-november-naive-bayes

In [None]:
# Select features
# Keep the top 20% of the features ranked by the ANOVA F-score (f_classif).
print(f"Data shape before selection: {data.shape}")
FeatureSelection = SelectPercentile(score_func=f_classif, percentile=20)
selected = FeatureSelection.fit_transform(data, y)
print(f"Data shape after selection: {selected.shape}")

# Get the list of the selected features
support_mask = FeatureSelection.get_support()
print(f"Selected Features: {np.where(support_mask)}")

# Take the names straight from the frame instead of rebuilding them as
# f'f{index}', which silently breaks if the columns are not named f0, f1, ...
selected_features = data.columns[support_mask].tolist()

In [None]:
# The selected features are not guaranteed to help,
# so callers can opt out and get every feature instead.
def get_X(use_selected_features=True):
    """Return the training features.

    With ``use_selected_features`` true, only the columns listed in
    ``selected_features`` are returned; otherwise the full ``data`` frame
    is returned as-is.
    """
    return data[selected_features] if use_selected_features else data

# Modelling

I am not sure which modelling approach will give the best results. So, why not try many of them?

Also, different models generate different kinds of predictions. Since we are allowed to submit probabilities, both the binary (0/1) predictions of classification models and the continuous outputs of regression models should be okay to submit. It is also possible to use `predict_proba` instead of `predict` with classification models.

**I am trying what I have learned so far, so please comment if I am doing something wrong or weird :).**

> It actually takes hours to run all those models, so instead of running them every time, I directly show the outputs from my previous runs.

In [None]:
# Break data into two pieces and optionally normalize them.
# Gaussian Naive Bayes and Logistic Regression are run on normalized data.
def split_data(X, y, normalize=False):
    """Split ``X``/``y`` into train and validation parts.

    Parameters
    ----------
    X : feature matrix accepted by ``train_test_split``.
    y : target values aligned with ``X``.
    normalize : bool, default False
        When true, the features are standardised with ``StandardScaler``.

    Returns
    -------
    list
        ``[train_X, val_X, train_y, val_y]``. With ``normalize=True`` the two
        feature parts are the arrays produced by the scaler; otherwise they
        are whatever ``train_test_split`` returns for ``X``.
    """
    train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)
    if normalize:
        # Fit the scaler on the training part only. Fitting it on all rows
        # before splitting leaks validation statistics into training.
        scaler = StandardScaler()
        train_X = scaler.fit_transform(train_X)
        val_X = scaler.transform(val_X)
    return [train_X, val_X, train_y, val_y]

## Regression

In [None]:
def run_regression_algorithm(X, y, model, n, text, early_stopping_rounds = None, run = False):
    """Fit a regressor on a hold-out split and print its validation AUC.

    It takes hours to run all the experiments, so by default this function
    does nothing and the outputs of previous runs are shown in the notebook
    instead. Pass ``run=True`` to actually execute the experiment.

    Parameters
    ----------
    X, y : features and target, forwarded to ``split_data``.
    model : estimator exposing ``fit`` and ``predict``.
    n : value printed right after ``text`` in the result line.
    text : label printed at the start of the result line.
    early_stopping_rounds : optional
        When truthy, it is forwarded to ``model.fit`` together with the
        validation split as ``eval_set`` (intended for XGBRegressor).
        NOTE(review): whether ``fit`` accepts this keyword depends on the
        installed xgboost version - confirm before re-running.
    run : bool, default False
        Execute the experiment only when true.

    Returns
    -------
    None
    """
    if not run:
        return

    # Split data
    train_X, val_X, train_y, val_y = split_data(X, y)
    # Fit model
    if early_stopping_rounds: # For XGBRegressor
        # The validation split both drives early stopping and is scored
        # below, so the reported AUC may be slightly optimistic.
        model.fit(train_X, train_y, early_stopping_rounds=early_stopping_rounds,
                  eval_set=[(val_X, val_y)], verbose=False)
    else:
        model.fit(train_X, train_y)
    # Make predictions
    predictions = model.predict(val_X)
    # Get AUC
    auc = roc_auc_score(val_y, predictions)
    # Print the AUC
    print('{0}{1}AUC:  {2}'.format(text, n, auc))

### Decision Tree Regressor

In [None]:
# Try several tree sizes, from very small to very large
for max_leaf_nodes in [5, 50, 500, 5000]:
    run_regression_algorithm(
        X=get_X(),
        y=y,
        model=DecisionTreeRegressor(max_leaf_nodes=max_leaf_nodes, random_state=0),
        n='{0}  \t\t '.format(max_leaf_nodes),
        text='Max leaf nodes: ',
    )

```
Max leaf nodes: 5           AUC:  0.58807
Max leaf nodes: 50          AUC:  0.62875
Max leaf nodes: 500         AUC:  0.64594
Max leaf nodes: 5000        AUC:  0.63100
```

### Random Forest Regressor

In [None]:
# Random forest with a fixed seed; the result line gets no label prefix
model = RandomForestRegressor(random_state=1)
run_regression_algorithm(X=get_X(), y=y, model=model, n='', text='')

```
AUC:  0.70127
```

### XGBoost XGBRegressor

In [None]:
# Sweep n_estimators from 100 to 900 with early_stopping_rounds=5
for n_estimators in range(100, 1000, 100):
    run_regression_algorithm(
        X=get_X(),
        y=y,
        model=XGBRegressor(n_estimators=n_estimators, learning_rate=0.05, n_jobs=4),
        n='{0}  \t\t '.format(n_estimators),
        text='N estimators: ',
        early_stopping_rounds=5,
    )

```
N estimators: 100  		 AUC:  0.71377
N estimators: 200  		 AUC:  0.71857
N estimators: 300  		 AUC:  0.71919
N estimators: 400  		 AUC:  0.71921
N estimators: 500  		 AUC:  0.71921
N estimators: 600  		 AUC:  0.71921
N estimators: 700  		 AUC:  0.71921
N estimators: 800  		 AUC:  0.71921
N estimators: 900  		 AUC:  0.71921
```

## Regression with Cross-Validation

In [None]:
# Cross-validation, https://www.kaggle.com/hamzaghanmi/make-it-simple/notebook
def run_regression_algoritm_with_cross_validation(X, y, model, early_stopping_rounds = None, run = False):
    """Score a regressor with 15-fold stratified cross-validation.

    It takes hours to run all the experiments, so by default this function
    does nothing and the outputs of previous runs are shown in the notebook
    instead. Pass ``run=True`` to actually execute the experiment.

    Parameters
    ----------
    X, y : features and target; both are indexed with ``.iloc``, so they
        must be pandas objects.
    model : estimator exposing ``fit`` and ``predict``; it is re-fitted on
        every fold.
    early_stopping_rounds : optional
        When truthy, it is forwarded to ``model.fit`` together with the
        validation fold as ``eval_set`` (intended for XGBRegressor).
        NOTE(review): whether ``fit`` accepts this keyword depends on the
        installed xgboost version - confirm before re-running.
    run : bool, default False
        Execute the experiment only when true.

    Returns
    -------
    None
        One line with the fold number and its AUC is printed per fold.
    """
    if not run:
        return

    skf = StratifiedKFold(n_splits=15, random_state=48, shuffle=True)
    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y), start=1):
        train_X, val_X = X.iloc[train_idx], X.iloc[val_idx]
        train_y, val_y = y.iloc[train_idx], y.iloc[val_idx]

        # Fit model
        if early_stopping_rounds: # For XGBRegressor
            # The validation fold both drives early stopping and is scored
            # below, so the reported AUC may be slightly optimistic.
            model.fit(train_X, train_y, early_stopping_rounds=early_stopping_rounds,
                      eval_set=[(val_X, val_y)], verbose=False)
        else:
            model.fit(train_X, train_y)
        # Make predictions
        predictions = model.predict(val_X)
        # Get AUC
        auc = roc_auc_score(val_y, predictions)
        # Print the AUC
        print("Fold: %d  \t\t AUC:  %f" %(fold, auc))

### Decision Tree Regressor

In [None]:
# Cross-validate the decision tree with max_leaf_nodes=500
model = DecisionTreeRegressor(max_leaf_nodes=500, random_state=0)
run_regression_algoritm_with_cross_validation(X=get_X(), y=y, model=model)

```
Fold: 1  		 AUC:  0.648110
Fold: 2  		 AUC:  0.645001
Fold: 3  		 AUC:  0.646716
Fold: 4  		 AUC:  0.651258
Fold: 5  		 AUC:  0.650062
Fold: 6  		 AUC:  0.650459
Fold: 7  		 AUC:  0.646524
Fold: 8  		 AUC:  0.652755
Fold: 9  		 AUC:  0.652514
Fold: 10 		 AUC:  0.647052
Fold: 11 		 AUC:  0.650992
Fold: 12 		 AUC:  0.646726
Fold: 13 		 AUC:  0.644246
Fold: 14 		 AUC:  0.648659
Fold: 15 		 AUC:  0.645532
```

### Random Forest Regressor

In [None]:
# Cross-validate the random forest
model = RandomForestRegressor(random_state=1)
run_regression_algoritm_with_cross_validation(X=get_X(), y=y, model=model)

```
Fold: 1  		 AUC:  0.698372
Fold: 2  		 AUC:  0.699659
Fold: 3  		 AUC:  0.701081
Fold: 4  		 AUC:  0.705221
Fold: 5  		 AUC:  0.703162
Fold: 6  		 AUC:  0.702732
Fold: 7  		 AUC:  0.698388
Fold: 8  		 AUC:  0.704422
Fold: 9  		 AUC:  0.702508
Fold: 10 		 AUC:  0.699164
Fold: 11 		 AUC:  0.703322
Fold: 12 		 AUC:  0.699437
Fold: 13 		 AUC:  0.698628
Fold: 14 		 AUC:  0.703577
Fold: 15 		 AUC:  0.697842
```

### XGBoost XGBRegressor

In [None]:
# Cross-validate XGBRegressor with 500 estimators and early_stopping_rounds=5
model = XGBRegressor(n_estimators=500, learning_rate=0.05, n_jobs=4)
run_regression_algoritm_with_cross_validation(
    X=get_X(), y=y, model=model, early_stopping_rounds=5
)

```
Fold: 1  		 AUC:  0.715399
Fold: 2  		 AUC:  0.716105
Fold: 3  		 AUC:  0.714442
Fold: 4  		 AUC:  0.723253
Fold: 5  		 AUC:  0.719537
Fold: 6  		 AUC:  0.720576
Fold: 7  		 AUC:  0.716693
Fold: 8  		 AUC:  0.722241
Fold: 9  		 AUC:  0.720729
Fold: 10 		 AUC:  0.719843
Fold: 11 		 AUC:  0.721102
Fold: 12 		 AUC:  0.716606
Fold: 13 		 AUC:  0.718105
Fold: 14 		 AUC:  0.721361
Fold: 15 		 AUC:  0.716609
```

## Classification

In [None]:
def run_classification_algoritm(X, y, model, normalize, run = False):
    """Fit a classifier on a hold-out split and print its validation accuracy.

    It takes hours to run all the experiments, so by default this function
    does nothing and the outputs of previous runs are shown in the notebook
    instead. Pass ``run=True`` to actually execute the experiment.

    Parameters
    ----------
    X, y : features and target, forwarded to ``split_data``.
    model : estimator exposing ``fit`` and ``predict``.
    normalize : forwarded to ``split_data`` to request normalized features.
    run : bool, default False
        Execute the experiment only when true.

    Returns
    -------
    None
    """
    if not run:
        return

    # Split data
    train_X, val_X, train_y, val_y = split_data(X, y, normalize)
    # Fit model
    model.fit(train_X, train_y)
    # Make predictions
    predictions = model.predict(val_X)
    # Get the accuracy score
    score = accuracy_score(val_y, predictions)
    # Print the accuracy
    print("Accuracy score:  %f" %(score))

### Gaussian Naive Bayes

In [None]:
# Credits to https://www.kaggle.com/markosthabit/tbs-november-naive-bayes
# https://iq.opengenus.org/gaussian-naive-bayes/
#     Gaussian Naive Bayes is a variant of Naive Bayes that follows
#     Gaussian normal distribution and supports continuous data.
# Run once with normalized data and once without, to compare the two.
model = GaussianNB()
for normalize in (True, False):
    run_classification_algoritm(get_X(), y, model, normalize)

```
Accuracy score:  0.676300
Accuracy score:  0.676300
```

### XGBoost XGBClassifier

In [None]:
# Credits to https://www.kaggle.com/sugamkhetrapal/tps-nov-2021-1-06-xgboost/notebook
# Depth-1 trees with row and column subsampling, on non-normalized data
model = XGBClassifier(
    max_depth=1,
    subsample=0.5,
    colsample_bytree=0.5,
    eval_metric='error',
    use_label_encoder=False,
    random_state=1,
)
run_classification_algoritm(X=get_X(), y=y, model=model, normalize=False)

```
Accuracy score:  0.672873
```

### Logistic Regression

In [None]:
# Credits to https://www.kaggle.com/hamzaghanmi/make-it-simple
# https://kambria.io/blog/logistic-regression-for-machine-learning/
# Uses all features (no selection); run with and without normalization.
model = LogisticRegression(solver='liblinear')
for normalize in (True, False):
    run_classification_algoritm(get_X(False), y, model, normalize)

```
Accuracy score:  0.736847
Accuracy score:  0.736813
```

## Classification with Cross-Validation

In [None]:
# Cross-validation, https://www.kaggle.com/hamzaghanmi/make-it-simple/notebook
def run_classification_algoritm_with_cross_validation(X, y, model, normalize, run = False):
    """Score a classifier with 15-fold stratified cross-validation.

    It takes hours to run all the experiments, so by default this function
    does nothing and the outputs of previous runs are shown in the notebook
    instead. Pass ``run=True`` to actually execute the experiment.

    Parameters
    ----------
    X : features; converted to a NumPy array before splitting.
    y : target; indexed with ``.iloc``, so it must be a pandas object.
    model : estimator exposing ``fit`` and ``predict_proba``; it is
        re-fitted on every fold.
    normalize : when true, features are standardised with ``StandardScaler``.
    run : bool, default False
        Execute the experiment only when true.

    Returns
    -------
    None
        One line with the fold number and its AUC is printed per fold.
    """
    if not run:
        return

    X = np.array(X)

    skf = StratifiedKFold(n_splits=15, random_state=48, shuffle=True)
    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y), start=1):
        train_X, val_X = X[train_idx], X[val_idx]
        train_y, val_y = y.iloc[train_idx], y.iloc[val_idx]

        # Apply standard scaler
        if normalize:
            # Fit on the training fold only; fitting on all rows before the
            # split leaks validation statistics into training.
            scaler = StandardScaler()
            train_X = scaler.fit_transform(train_X)
            val_X = scaler.transform(val_X)

        # Fit the model
        model.fit(train_X, train_y)
        # Make predictions (probability of the positive class)
        predictions = model.predict_proba(val_X)[:,1]
        # Get AUC
        auc = roc_auc_score(val_y, predictions)
        # Print the AUC
        print("Fold: %d  \t\t AUC:  %f" %(fold, auc))

### Gaussian Naive Bayes

In [None]:
# Cross-validate Gaussian Naive Bayes on normalized, selected features
model = GaussianNB()
run_classification_algoritm_with_cross_validation(
    X=get_X(), y=y, model=model, normalize=True
)

```
Fold: 1  		 AUC:  0.709428
Fold: 2  		 AUC:  0.709001
Fold: 3  		 AUC:  0.710660
Fold: 4  		 AUC:  0.714958
Fold: 5  		 AUC:  0.712237
Fold: 6  		 AUC:  0.715526
Fold: 7  		 AUC:  0.708318
Fold: 8  		 AUC:  0.714901
Fold: 9  		 AUC:  0.714512
Fold: 10 		 AUC:  0.711692
Fold: 11 		 AUC:  0.715017
Fold: 12 		 AUC:  0.708338
Fold: 13 		 AUC:  0.710486
Fold: 14 		 AUC:  0.713657
Fold: 15 		 AUC:  0.712419
```

### XGBoost XGBClassifier

In [None]:
# Cross-validate the depth-1 XGBClassifier on non-normalized, selected features
model = XGBClassifier(
    max_depth=1,
    subsample=0.5,
    colsample_bytree=0.5,
    eval_metric='error',
    use_label_encoder=False,
    random_state=1,
)
run_classification_algoritm_with_cross_validation(
    X=get_X(), y=y, model=model, normalize=False
)

```
Fold: 1  		 AUC:  0.710758
Fold: 2  		 AUC:  0.711012
Fold: 3  		 AUC:  0.710687
Fold: 4  		 AUC:  0.717570
Fold: 5  		 AUC:  0.714278
Fold: 6  		 AUC:  0.715372
Fold: 7  		 AUC:  0.712060
Fold: 8  		 AUC:  0.717004
Fold: 9  		 AUC:  0.716060
Fold: 10 		 AUC:  0.714406
Fold: 11 		 AUC:  0.716232
Fold: 12 		 AUC:  0.710764
Fold: 13 		 AUC:  0.711267
Fold: 14 		 AUC:  0.715551
Fold: 15 		 AUC:  0.710048
```

### Logistic Regression

In [None]:
# Cross-validate logistic regression on all features, normalized
model = LogisticRegression(solver='liblinear')
run_classification_algoritm_with_cross_validation(
    X=get_X(False), y=y, model=model, normalize=True
)

```
Fold: 1  		 AUC:  0.748223
Fold: 2  		 AUC:  0.747558
Fold: 3  		 AUC:  0.744476
Fold: 4  		 AUC:  0.751129
Fold: 5  		 AUC:  0.750191
Fold: 6  		 AUC:  0.750874
Fold: 7  		 AUC:  0.746259
Fold: 8  		 AUC:  0.750019
Fold: 9  		 AUC:  0.751094
Fold: 10 		 AUC:  0.749236
Fold: 11 		 AUC:  0.750499
Fold: 12 		 AUC:  0.746850
Fold: 13 		 AUC:  0.747999
Fold: 14 		 AUC:  0.753662
Fold: 15 		 AUC:  0.747682
```

## Final model

So far, so good: the simplest solution gives the best results. So I have picked `LogisticRegression` as the final model.

Credits to https://www.kaggle.com/hamzaghanmi/make-it-simple/notebook

In [None]:
# Create X, do not use features for now.
X = get_X(False)

# Create test X, drop ids.
# For now create it without selected features
# It gives better results this way.
test_X = test_data.iloc[:, 1:]

# Create the model
model = LogisticRegression(solver='liblinear')

# Cross-validation, https://www.kaggle.com/hamzaghanmi/make-it-simple/notebook
# The test prediction is the average of the per-fold model predictions.
test_predictions = np.zeros(test_X.shape[0])
skf = StratifiedKFold(n_splits=15, random_state=48, shuffle=True)
for fold, (train_idx, val_idx) in enumerate(skf.split(X, y), start=1):
    train_y, val_y = y.iloc[train_idx], y.iloc[val_idx]

    # Apply standard scaler, fitted on the training fold only. Fitting it on
    # all rows before the split leaks validation statistics into training
    # and makes the fold AUCs optimistic.
    scaler = StandardScaler()
    train_X = scaler.fit_transform(X.iloc[train_idx])
    val_X = scaler.transform(X.iloc[val_idx])

    # Fit the model
    model.fit(train_X, train_y)
    # Make predictions
    predictions = model.predict_proba(val_X)[:,1]
    # Get AUC
    auc = roc_auc_score(val_y, predictions)
    # Print the AUC
    print("Fold: %d  \t\t AUC:  %f" %(fold, auc))

    # Make predictions, use probability; the test rows are scaled with the
    # same scaler the fold model was trained with.
    test_predictions += model.predict_proba(scaler.transform(test_X))[:,1] / skf.n_splits

# Submission

In [None]:
# Save the predictions in the format used for competition scoring:
# one row per test id with its predicted target probability.
output = pd.DataFrame({'id': test_data['id'], 'target': test_predictions})
output.to_csv('submission.csv', index=False)

In [None]:
# Display the submission frame as a final visual check
output