# Libraries

In [None]:
!pip install --upgrade scikit-learn

In [None]:
import sklearn

In [None]:
print(sklearn.__version__)

In [None]:
import multiprocessing
import pandas as pd 
import numpy as np

#preprocessing 
from sklearn.model_selection import train_test_split, cross_val_score,StratifiedKFold

#scaling 
from sklearn.preprocessing import StandardScaler,MinMaxScaler

# metrics 
from sklearn.metrics import roc_auc_score, classification_report

#models 
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier

## hyperparameter tuning
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV

from keras.wrappers.scikit_learn import KerasClassifier

from tensorflow.keras.models import Sequential 
from tensorflow.keras.layers import Dense,Dropout
from tensorflow.keras.callbacks import EarlyStopping 
from tensorflow.keras.metrics import AUC

import lightgbm as lgb
import xgboost as xgb

from sklearn.ensemble import AdaBoostClassifier,RandomForestClassifier, VotingClassifier

# viz 
import seaborn as sns
import matplotlib.pyplot as plt 

# Quick EDA 

In [None]:
# Load the competition CSVs. index_col=0 moves the first column (the row Id)
# into the index, so it is kept for the submission file but not used as a feature.
train = pd.read_csv("../input/tabular-playground-series-nov-2021/train.csv",index_col=0)
test = pd.read_csv("../input/tabular-playground-series-nov-2021/test.csv",index_col=0)

In [None]:
train.shape

In [None]:
train.isnull().sum().sort_values(ascending = False)

In [None]:
train.describe(exclude="float64")

In [None]:
train["target"].value_counts().plot(kind = "bar")

In [None]:
# Check for duplicates: display every row that repeats an earlier row across all
# columns (an empty frame means there are no exact duplicates).
train[train.duplicated()]

# Feature Engineering 

Idea from this kernel 
https://www.kaggle.com/christoforum/tps-nov-2021-lightgbm-optuna/comments

In [None]:
# Row-wise summary statistics used as extra features.
#
# Every statistic is computed from the ORIGINAL feature columns only:
#   * "target" is excluded. It exists in train but not in test, so folding it
#     into "sum"/"mean"/... leaks the label into the features and makes the
#     train and test features mean different things.
#   * previously engineered columns are excluded, so e.g. "mean" is not a mean
#     over the raw features plus "sum", and re-running this cell is idempotent.
ROW_STATS = ["sum", "mean", "std", "min", "max"]

# test has no "target" column, so its raw columns are exactly the shared features.
feature_cols = [col for col in test.columns if col not in ROW_STATS]


def add_row_stats(frame, cols):
    """Add one column per ROW_STATS entry to `frame`, computed row-wise over `cols`.

    Parameters
    ----------
    frame : pd.DataFrame
        Frame to extend (modified in place and returned).
    cols : list of str
        Raw feature columns the statistics are computed from.

    Returns
    -------
    pd.DataFrame
        The same frame with the statistic columns added in ROW_STATS order.
    """
    base = frame[cols]  # copy of the raw features, taken before any column is added
    for stat in ROW_STATS:
        frame[stat] = getattr(base, stat)(axis=1)
    return frame


train = add_row_stats(train, feature_cols)
test = add_row_stats(test, feature_cols)

# Train test split

In [None]:
# Separate the label column from the predictors.
y = train["target"]
X = train.drop(columns="target")

In [None]:
X.head()

In [None]:
y.value_counts()

In [None]:
# Hold out 30% of the rows for evaluation; random_state=0 fixes the shuffle so
# the split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

# Scaling:
Dependent on model used

* Standardisation will be used for general models
* MinMaxscaler will be used if needed

In [None]:
# One instance of each scaler; only std_scaler is fitted in the cells below.
std_scaler, minmax_scaler = StandardScaler(), MinMaxScaler()

In [None]:
"""minmax_scaler.fit(X_train)
X_train_m = minmax_scaler.transform(X_train)
X_test_m = minmax_scaler.transform(X_test)"""

# Learn the scaling statistics from the training split only, then apply the
# same transformation to the hold-out split.
X_train = std_scaler.fit_transform(X_train)
X_test = std_scaler.transform(X_test)

# Base model selection with Cross Validation 
We will try to identify the best model by fitting each one with its default parameters on the training data and scoring its AUC on the validation data \
Cross-validation scores will also be used to check that we aren't overfitting 

**Hyperparameter tuning:** Tuning takes a long time, and Kaggle limits notebook runtime to 9 hours; we will therefore use the default hyperparameters unless we find obvious optimizations

In [None]:
# NOTE(review): `param` is not passed to any model in the cells shown here and is
# reassigned later in the notebook — looks like a leftover; confirm before relying on it.
param = {'metric': "auc"}

### Instantiate models

In [None]:
# Every estimator with a stochastic component gets the same fixed seed as the
# train/test split (random_state=0) so that scores are reproducible across runs.

# Linear Models
lr = LogisticRegression()
svc = LinearSVC(max_iter=4000, random_state=0)  # used linearSVC as this is faster than SVC()
ridge = RidgeClassifier()
knn = KNeighborsClassifier()

# Trees & boosting
# dtree = DecisionTreeClassifier()  -wont use as we have enough trees
rf = RandomForestClassifier(random_state=0)

lgb_i = lgb.LGBMClassifier(random_state=0)
ada_i = AdaBoostClassifier(random_state=0)    # takes long
xgb_i = xgb.XGBClassifier(n_estimators=1000, random_state=0)

In [None]:
## Cross validation
def model_scoring_CV(model):
    """Print the mean 5-fold cross-validated ROC AUC of `model`.

    The folds are drawn from the notebook-level X_train / y_train; nothing is
    returned.
    """
    fold_scores = cross_val_score(
        model,
        X_train,
        y_train,
        scoring="roc_auc",
        cv=5,
    )
    print("mean CV roc_auc", fold_scores.mean())
    
# Basic model
def model_scoring(model):
    """Fit `model` on the training split and print hold-out diagnostics.

    Uses the notebook-level X_train / y_train / X_test / y_test.

    ROC AUC is computed from continuous scores (positive-class
    ``predict_proba`` when available, otherwise ``decision_function``) instead
    of thresholded 0/1 predictions: hard labels reduce the ROC curve to a single
    operating point and misreport how well the model ranks the samples. The
    predicted labels are used as a last resort only when the estimator exposes
    neither method.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn compatible classifier.

    Returns
    -------
    estimator
        The same object, fitted on the training split.
    """
    model.fit(X_train, y_train)

    # Hard labels: needed for the classification report (computed once, reused).
    y_pred = model.predict(X_test)

    # Continuous scores: needed for a meaningful ROC AUC.
    if hasattr(model, "predict_proba"):
        y_score = model.predict_proba(X_test)[:, 1]
    elif hasattr(model, "decision_function"):
        y_score = model.decision_function(X_test)
    else:
        y_score = y_pred

    print("Training score:", model.score(X_train, y_train))
    print("Test auc score", roc_auc_score(y_test, y_score))
    print("\n")
    print(classification_report(y_test, y_pred))

    return model

# Linear Models

In [None]:
# Logistic Regression 
#model_scoring_CV(lr)

lr_model = model_scoring(lr)

In [None]:
# Support Vector Classifier 
#model_scoring_CV(svc)  # has convergence warnings

SVC_model = model_scoring(svc) 

**Unbelievable score**: this looks to be the best model; however, after a submission using only SVC we got roughly 55% AUC \
This could indicate that:
1. we have leakage - however, we are using cross-validation, so this doesn't seem to be the case, or
2. our training data is biased towards linear modelling and/or doesn't represent the full dataset.

In [None]:
#model_scoring_CV(ridge)

ridge_model = model_scoring(ridge)

# Trees and Boosting 
### Due to the time taken to run tree models we can use either:
* optimized methods built into the algorithm (e.g. XGBoost with DMatrix objects) 
* HalvingGridSearchCV - an experimental sklearn estimator that applies successive halving of the data (essentially reduces run time) 

## XGBoost

In [None]:
# Fit XGBoost while tracking AUC on the hold-out split; training stops once that
# metric has not improved for 10 consecutive rounds.
# NOTE(review): the early-stopping set is the same X_test / y_test that the next
# cell reports on, so the reported hold-out score is likely optimistic.
# NOTE(review): passing eval_metric / early_stopping_rounds to fit() may not be
# accepted by newer XGBoost releases — confirm against the installed version.
xgb_i.fit(X_train,
          y_train,
          eval_metric= ["auc"],
          early_stopping_rounds=10,
          eval_set= [(X_test, y_test)]
         )
# Hard class labels for the hold-out split.
y_pred = xgb_i.predict(X_test)

In [None]:
# ROC AUC must be computed from continuous scores, so use the positive-class
# probability rather than the thresholded 0/1 predictions.
xgb_test_proba = xgb_i.predict_proba(X_test)[:, 1]

print("Training score:", xgb_i.score(X_train, y_train))
print("Test auc score", roc_auc_score(y_test, xgb_test_proba))
print("\n")
# The classification report is label-based, so it still uses hard predictions.
print(classification_report(y_test, xgb_i.predict(X_test)))

## Halving GridSearch CV 
Due to the number of observations & features, certain models take too long to converge / complete. 

We will use an experimental package, **HalvingGridSearchCV**, which applies successive halving. \
As per the [documentation](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.HalvingGridSearchCV.html?highlight=halvinggrid#sklearn.model_selection.HalvingGridSearchCV):

*The search strategy starts evaluating all the candidates with a small amount of resources and iteratively selects the best candidates, using more and more resources*

In [None]:
"""param_xgb = {"eta": [0, 0.001,0.01,0.1,1],
             'max_depth': [2, 4, 6],
             'eval_metric' : ['auc'],
             'n_estimators': [xgb_model.best_iteration]         
            }"""

# NOTE(review): `param` is not passed to any call in the cells shown here (the
# Halving_CV calls build their grids inline) — possibly a leftover; confirm.
param = {"eval_metric" : ["auc"]}

In [None]:
def Halving_CV(model, param):
    """Run a successive-halving grid search and print train / hold-out diagnostics.

    Uses the notebook-level X_train / y_train / X_test / y_test.

    Parameters
    ----------
    model : estimator
        Unfitted scikit-learn compatible classifier.
    param : dict
        Parameter grid handed to HalvingGridSearchCV.

    Returns
    -------
    HalvingGridSearchCV
        The fitted search object (refit on the full training split).

    Notes
    -----
    * "Best score" is the mean cross-validated ROC AUC of the winning candidate
      (``best_score_``), not a score on the data the model was trained on.
    * ROC AUC values are computed from continuous scores (``predict_proba`` or
      ``decision_function``) rather than hard labels, which would reduce the ROC
      curve to a single operating point.
    """
    clf = HalvingGridSearchCV(model, param, cv=5, scoring="roc_auc")
    clf.fit(X_train, y_train)

    def _scores(features):
        """Continuous positive-class scores from the refit best estimator."""
        if hasattr(clf, "predict_proba"):
            return clf.predict_proba(features)[:, 1]
        if hasattr(clf, "decision_function"):
            return clf.decision_function(features)
        return clf.predict(features)  # last resort: labels only

    # Hard labels are only needed for the classification report; predict once.
    test_labels = clf.predict(X_test)

    print("Best score:", clf.best_score_)
    print("train auc score", roc_auc_score(y_train, _scores(X_train)))
    print("Test auc score", roc_auc_score(y_test, _scores(X_test)))
    print(classification_report(y_test, test_labels))

    return clf

In [None]:
# Single-candidate grid: a depth-8 forest that considers 80 features per split.
rf_grid = {"max_depth": [8], "max_features": [80]}
rf_model = Halving_CV(rf, rf_grid)

In [None]:
#ada_model = Halving_CV(ada_i,{})

# Light GBM
Let's apply LightGBM again using its optimized dataset and with evaluation data + early stopping 

In [None]:
# Parameters for the native lgb.train API.
param_lgb = dict(
    metric="auc",
    learning_rate=0.01,
    # boosting="dart",
    max_depth=10,
    num_leaves=30,
)

In [None]:
"""lgb_train = lgb.Dataset(X_train, label= y_train)
lgb_test = lgb.Dataset(X_test, label=y_test)

lgb_model = lgb.train(params= param_lgb, train_set=lgb_train, valid_sets=[lgb_test], early_stopping_rounds=10, num_boost_round = 10000)"""

In [None]:
"""lgb_model.best_score"""

In [None]:
## Round probabilities
#### lgb can only predict on the raw data (not the lgb transformed data)
"""lgb_pred_train = (lgb_model.predict(X_train)>0.5).astype("int32")
lgb_pred = (lgb_model.predict(X_test)>0.5).astype("int32")"""

In [None]:
"""print("Train auc score", roc_auc_score(y_train, lgb_pred_train))
print("Test auc score", roc_auc_score(y_test, lgb_pred))
print(classification_report(y_test, lgb_pred))"""

## Using the .Fit call (not .train)
We use .fit as this is compatible with Voting Classifier 

In [None]:
# Large round budget with a small learning rate; early stopping decides how many
# rounds are actually used.
lgb_i = lgb.LGBMClassifier(learning_rate=0.01, max_depth=10, num_leaves=30, n_estimators=10000)

# Early stopping is requested through the callback API: the
# `early_stopping_rounds` fit keyword is deprecated/removed in newer LightGBM
# releases, while the callback works on old and new versions alike.
# NOTE(review): the early-stopping set is the same hold-out split used for
# reporting below, so the reported hold-out score is likely optimistic.
lgb_i.fit(
    X_train,
    y_train,
    eval_metric="auc",
    eval_set=[(X_test, y_test)],
    callbacks=[lgb.early_stopping(stopping_rounds=10)],
)

In [None]:
# Hard class-label predictions from the fitted LightGBM classifier on both splits.
lgb_pred_train = lgb_i.predict(X_train)
lgb_pred = lgb_i.predict(X_test)

In [None]:
# ROC AUC must be computed from continuous scores: use the positive-class
# probability instead of the thresholded 0/1 predictions.
lgb_proba_train = lgb_i.predict_proba(X_train)[:, 1]
lgb_proba_test = lgb_i.predict_proba(X_test)[:, 1]

print("Train auc score", roc_auc_score(y_train, lgb_proba_train))
print("Test auc score", roc_auc_score(y_test, lgb_proba_test))
# The classification report is label-based, so it keeps using the hard predictions.
print(classification_report(y_test, lgb_pred))

# K-Nearest Neighbors

In [None]:
#knn_model = Halving_CV(knn,{})

KNN takes too long to run - removed, as its AUC was tested in isolation and was very low

# Deep Learning (ANN)

In [None]:
def create_model():
    """Build and compile the feed-forward binary classifier.

    Architecture: three ReLU hidden layers (100, 50 and 20 units), each followed
    by 50% dropout, then a single sigmoid output unit. Compiled with the Adam
    optimizer, binary cross-entropy loss and an AUC metric.

    Returns the compiled (not yet fitted) Sequential model.
    """
    stack = []
    for width in (100, 50, 20):
        stack.append(Dense(width, activation="relu"))
        stack.append(Dropout(0.5))
    stack.append(Dense(1, activation="sigmoid"))

    network = Sequential(stack)
    network.compile(
        loss="binary_crossentropy",
        optimizer="adam",
        metrics=[AUC()],
    )
    return network

In [None]:
# Train for up to 2000 epochs; EarlyStopping ends training after 20 epochs
# without improvement on the validation data.
deep_model = create_model()
deep_model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=2000,
    batch_size=128,
    callbacks=[EarlyStopping(patience=20)],
    use_multiprocessing=True,
)

In [None]:
# Per-epoch training curves (plotted in the next cell).
history = pd.DataFrame(deep_model.history.history)

# Continuous sigmoid outputs: ROC AUC has to be computed from these, not from
# thresholded labels (which reduce the ROC curve to a single operating point).
train_proba = deep_model.predict(X_train).ravel()
test_proba = deep_model.predict(X_test).ravel()

# Hard labels at the 0.5 threshold, used only for the classification report.
y_deep = (test_proba > 0.5).astype("int32")

print("Best score:", deep_model.evaluate(X_train, y_train))
print("train auc score", roc_auc_score(y_train, train_proba))
print("Test auc score", roc_auc_score(y_test, test_proba))
print(classification_report(y_test, y_deep))

In [None]:
history.plot()

### Keras doesn't interact with Voting Classifier 
We therefore need to use the keras sklearn wrapper to make the model compatible

In [None]:
# Wrap the Keras builder in a scikit-learn style estimator; the keyword arguments
# are the training settings the wrapper is configured with.
# NOTE(review): ann_model is never fitted in the cells shown here and only
# appears in a commented-out VotingClassifier entry below — confirm it is still needed.
# NOTE(review): keras.wrappers.scikit_learn may not exist in recent
# Keras/TensorFlow releases — confirm against the installed version.
ann_model  = KerasClassifier(build_fn=create_model, epochs = 2000, validation_data =(X_test, y_test) , batch_size = 128,callbacks=EarlyStopping(patience=10))
# Explicitly tag the wrapper as a classifier — presumably so that
# VotingClassifier accepts it; confirm.
ann_model._estimator_type = "classifier"

# Voting Classifier 
We can now create an ensemble model with a few of our trained models \
We will do something interesting here and try to ensemble our models into TREES, LINEAR and ANN groups
note: we could have merged ANN and Linear; however, ANN only outputs predicted probabilities and LinearSVC only has hard classification outputs

**Type of voting used:**
*If ‘hard’, uses predicted class labels for ***majority*** rule voting. Else if ‘soft’, predicts the class label based on the argmax of the sums of the predicted probabilities, which is recommended for an ensemble of well-calibrated classifiers*

We will use "hard" as our models aren't optimized

## Linear - hard voting
Hard voting, as this will give us the best output and LinearSVC doesn't have a "predict_proba" method, so it can't be used for soft voting

In [None]:
# Majority-vote ensemble of the three linear models.
linear_members = [
    ("lr", lr_model),
    ("svc", SVC_model),
    ("ridge", ridge_model),
    # ("ann", ann_model),
]
vc_linear = VotingClassifier(estimators=linear_members, voting="hard")

In [None]:
vc_linear.fit(X_train,y_train)

In [None]:
# Majority-vote labels on both splits. Hard voting has no probability output,
# so the AUC values below are computed from 0/1 labels.
train_votes = vc_linear.predict(X_train)
y_vc = vc_linear.predict(X_test)

train_accuracy = vc_linear.score(X_train, y_train)
print("Best score:", train_accuracy)
print("train auc score", roc_auc_score(y_train, train_votes))
print("Test auc score", roc_auc_score(y_test, y_vc))
print(classification_report(y_test, y_vc))

## Tree Voting - hard voting

In [None]:
# VotingClassifier refits a fresh clone of every member with a plain fit(X, y),
# so the early stopping used when lgb_i / xgb_i were trained above is NOT applied
# again: the clones would train their full n_estimators budget (10000 / 1000
# rounds). Freeze the number of rounds at the best iteration found earlier.
lgb_rounds = getattr(lgb_i, "best_iteration_", None) or lgb_i.n_estimators
xgb_best = getattr(xgb_i, "best_iteration", None)  # 0-based when set
xgb_rounds = xgb_i.n_estimators if xgb_best is None else xgb_best + 1

lgb_vote = lgb.LGBMClassifier(**lgb_i.get_params()).set_params(n_estimators=lgb_rounds)
xgb_vote = xgb.XGBClassifier(**xgb_i.get_params()).set_params(n_estimators=xgb_rounds)

# rf_model is the HalvingGridSearchCV object returned by Halving_CV; voting on
# it would re-run the whole search inside the ensemble. Use the estimator the
# search selected instead (same hyperparameters, single fit).
vc_tree = VotingClassifier(
    estimators=[
        ("lgb", lgb_vote),
        ("xgb", xgb_vote),
        ("rf", rf_model.best_estimator_),
       # ("ada",ada_model)
    ],
    voting='hard')

In [None]:
vc_tree.fit(X_train,y_train)

In [None]:
# Evaluate the tree ensemble. The original cell referenced an undefined name
# `vc`, which raises NameError on a fresh kernel; the ensemble is `vc_tree`.
# Hard voting has no probability output, so the AUC values below are computed
# from 0/1 labels.
y_vc = vc_tree.predict(X_test)

print("Best score:", vc_tree.score(X_train, y_train))
print("train auc score", roc_auc_score(y_train, vc_tree.predict(X_train)))
print("Test auc score", roc_auc_score(y_test, y_vc))
print(classification_report(y_test, y_vc))

# Submissions

In [None]:
# standard scaling: reuse the scaler already fitted on the training split
# (transform only, no refit) so the test set is scaled the same way
s_test = std_scaler.transform(test)

### Linear

In [None]:
# Majority-vote labels from the linear ensemble, written out under the test ids.
lin_pred = vc_linear.predict(s_test)

sub = pd.DataFrame({"target": lin_pred}, index=test.index)

sub.to_csv("submission_linear.csv")

In [None]:
sub.sample(10)

### Tree 

In [None]:
# Majority-vote labels from the tree ensemble, written out under the test ids.
tree_pred = vc_tree.predict(s_test)

sub = pd.DataFrame({"target": tree_pred}, index=test.index)

sub.to_csv("submission_tree.csv")

In [None]:
sub.sample(10)

### ANN

In [None]:
# Predict with the trained neural network. The original cell called
# vc_linear.predict here, so "submission_ann.csv" was a copy of the linear
# submission. Labels use the same 0.5 threshold as the ANN evaluation above,
# flattened to 1-D to match the other submissions.
ann_pred = (deep_model.predict(s_test) > 0.5).astype("int32").ravel()

sub = pd.DataFrame(ann_pred, columns=["target"])
sub.set_index(test.index,inplace=True)

sub.to_csv("submission_ann.csv")

In [None]:
sub.sample(10)