In [None]:
import eli5
import numpy as np
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt
import plotly.graph_objs as go
import plotly.offline as py
import plotly.tools as tls

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold, cross_val_score, GridSearchCV, RepeatedStratifiedKFold

from eli5.sklearn import PermutationImportance
from sklearn.preprocessing import PolynomialFeatures
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from mlxtend.plotting import plot_sequential_feature_selection as plot_sfs
from sklearn.feature_selection import GenericUnivariateSelect, SelectPercentile, SelectKBest, f_classif, RFE, mutual_info_classif


from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier




In [None]:
# Read the competition's train and test tables from the Kaggle input directory.
train_csv = '/kaggle/input/dont-overfit-ii/train.csv'
test_csv = '/kaggle/input/dont-overfit-ii/test.csv'
train, test = pd.read_csv(train_csv), pd.read_csv(test_csv)

In [None]:
def display_all(df):
    """Show `df` with the row and column display limits temporarily raised to 1000."""
    wide_view = ("display.max_rows", 1000, "display.max_columns", 1000)
    with pd.option_context(*wide_view):
        display(df)
            
def reduce_memory_usage(df, verbose=True):
    """
    Downcast numeric columns of `df` in place to the narrowest dtype that holds
    their observed min/max, and return the same dataframe.

    Only columns whose dtype is one of int16/int32/int64/float16/float32/float64
    are considered. Integer columns try int8 -> int16 -> int32 -> int64, float
    columns try float16 -> float32 -> float64. Bounds are inclusive, so a value
    that sits exactly on a dtype limit (e.g. 127 for int8) still fits.

    Note: float columns are narrowed on range alone; float16/float32 keep fewer
    significant digits than float64, so values may be rounded.

    When `verbose` is true the resulting size and percentage reduction are printed.
    """
    start_mem = df.memory_usage().sum() / 1024 ** 2
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32, np.float64)
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min, c_max = df[col].min(), df[col].max()
        if str(col_type)[:3] == 'int':
            candidates, limits_of = int_candidates, np.iinfo
        else:
            candidates, limits_of = float_candidates, np.finfo
        # First (narrowest) candidate whose range contains the column wins.
        # NaN/inf extrema fail every comparison, leaving the column untouched.
        for candidate in candidates:
            limits = limits_of(candidate)
            if c_min >= limits.min and c_max <= limits.max:
                df[col] = df[col].astype(candidate)
                break
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the ratio so a zero-sized frame cannot divide by zero.
        reduction = (((start_mem - end_mem)/start_mem))*100 if start_mem else 0.0
        print(f'Mem. usage decreased to {end_mem} Mb {reduction} % reduction.')
    return df
                    
def describe_table(df):
    """Print the shape of `df`, then display and return a per-column summary.

    The summary has one row per column with: the column name, its dtype, the
    number of missing values, the number of unique values and the values found
    in the first three rows. Rows are taken by position (`iloc`), so the
    function works for any index; if the frame has fewer than three rows the
    missing "... Values" columns are filled with NaN.
    """
    print(f'Dataset Shape is: {df.shape}')
    summary = pd.DataFrame({'Name': df.columns, 'dtypes': df.dtypes.values})
    summary['Missing'] = df.isnull().sum().values
    summary['Uniques'] = df.nunique().values
    row_labels = ('First Values', 'Second Values', 'Third Values')
    for position, label in enumerate(row_labels):
        summary[label] = df.iloc[position].values if position < len(df) else np.nan
    display(summary)

    return summary

def train_model(X, X_test, y, params, folds, n_splits, model_type='lgb', plot_feature_importance=False, averaging='usual', model=None):
    """Cross-validate one model type and average its test predictions over the folds.

    Parameters
    ----------
    X, X_test : numpy array or pandas DataFrame
        Train / test feature matrices. Rows are selected by position.
    y : numpy array or pandas Series
        Binary target aligned with the rows of `X`.
    params : dict or None
        Forwarded to `lgb.train`, `xgb.train` or `CatBoostClassifier`;
        not used by the 'sklearn' and 'glm' branches.
    folds : splitter
        Object whose `split(X, y)` yields `(train_index, valid_index)` pairs.
    n_splits : int
        Kept for backward compatibility only. Averages use the number of folds
        that `folds.split` actually produced, which differs from `n_splits`
        for repeated splitters.
    model_type : {'lgb', 'xgb', 'cat', 'sklearn', 'glm'}
    plot_feature_importance : bool
        Only used for 'lgb'.
    averaging : {'usual', 'rank'}
        Add raw test predictions or their ranks before averaging.
    model : estimator, optional
        Used by the 'sklearn' branch; it is re-fitted in place on every fold
        and must implement `predict_proba`.

    Returns
    -------
    (oof, prediction, scores), except for 'lgb' with `plot_feature_importance`
    set, which returns (oof, prediction, feature_importance).

    Raises
    ------
    ValueError
        If `model_type` or `averaging` is not one of the supported values.

    Notes
    -----
    The 'lgb', 'xgb' and 'cat' branches and the importance plot use the names
    `lgb`, `xgb`, `CatBoostClassifier` and `sns`, which must be imported by the
    notebook before those branches are used.
    """
    if model_type not in ('lgb', 'xgb', 'cat', 'sklearn', 'glm'):
        raise ValueError(f'Unknown model_type: {model_type!r}')
    if averaging not in ('usual', 'rank'):
        raise ValueError(f'Unknown averaging: {averaging!r}')

    def _take_rows(data, rows):
        # Positional row selection for both pandas objects and numpy arrays.
        return data.iloc[rows] if hasattr(data, 'iloc') else data[rows]

    feature_names = list(X.columns) if hasattr(X, 'columns') else None
    oof = np.zeros(len(X))
    prediction = np.zeros(len(X_test))
    scores = []
    importance_frames = []
    n_folds_run = 0
    for fold_n, (train_index, valid_index) in enumerate(folds.split(X, y)):
        n_folds_run += 1
        X_train, X_valid = _take_rows(X, train_index), _take_rows(X, valid_index)
        y_train, y_valid = _take_rows(y, train_index), _take_rows(y, valid_index)

        if model_type == 'lgb':
            train_data = lgb.Dataset(X_train, label=y_train)
            valid_data = lgb.Dataset(X_valid, label=y_valid)

            model = lgb.train(params,
                    train_data,
                    num_boost_round=2000,
                    valid_sets = [train_data, valid_data],
                    verbose_eval=500,
                    early_stopping_rounds = 200)

            y_pred_valid = model.predict(X_valid)
            # `lgb.train` returns a Booster, whose attribute is `best_iteration`.
            y_pred = model.predict(X_test, num_iteration=model.best_iteration)

        if model_type == 'xgb':
            train_data = xgb.DMatrix(data=X_train, label=y_train, feature_names=feature_names)
            valid_data = xgb.DMatrix(data=X_valid, label=y_valid, feature_names=feature_names)

            watchlist = [(train_data, 'train'), (valid_data, 'valid_data')]
            model = xgb.train(dtrain=train_data, num_boost_round=20000, evals=watchlist, early_stopping_rounds=200, verbose_eval=500, params=params)
            y_pred_valid = model.predict(xgb.DMatrix(X_valid, feature_names=feature_names), ntree_limit=model.best_ntree_limit)
            y_pred = model.predict(xgb.DMatrix(X_test, feature_names=feature_names), ntree_limit=model.best_ntree_limit)

        if model_type == 'cat':
            model = CatBoostClassifier(iterations=20000,  eval_metric='AUC', **params)
            model.fit(X_train, y_train, eval_set=(X_valid, y_valid), cat_features=[], use_best_model=True, verbose=False)

            # Score AUC on probabilities, not on hard class labels.
            y_pred_valid = model.predict_proba(X_valid)[:, 1]
            y_pred = model.predict_proba(X_test)[:, 1]

        if model_type == 'sklearn':
            model.fit(X_train, y_train)
            # Probabilities for validation too, matching the test predictions;
            # hard labels would collapse the ROC curve to a single point.
            y_pred_valid = model.predict_proba(X_valid)[:, 1]
            y_pred = model.predict_proba(X_test)[:, 1]

        if model_type == 'glm':
            model_results = sm.GLM(y_train, X_train, family=sm.families.Binomial()).fit()
            y_pred_valid = model_results.predict(X_valid)
            y_pred = model_results.predict(X_test)

        y_pred_valid = np.asarray(y_pred_valid).reshape(-1,)
        y_pred = np.asarray(y_pred).reshape(-1,)

        oof[valid_index] = y_pred_valid
        scores.append(roc_auc_score(y_valid, y_pred_valid))

        if averaging == 'usual':
            prediction += y_pred
        elif averaging == 'rank':
            prediction += pd.Series(y_pred).rank().values

        if model_type == 'lgb':
            # Per-fold feature importance; combined once after the loop.
            importance_frames.append(pd.DataFrame({
                "feature": feature_names if feature_names is not None else list(range(X_train.shape[1])),
                "importance": model.feature_importance(),
                "fold": fold_n + 1,
            }))

    # Average over the folds that actually ran (repeated CV yields more than n_splits).
    if n_folds_run:
        prediction /= n_folds_run

    print('CV mean score: {0:.4f}, std: {1:.4f}.'.format(np.mean(scores), np.std(scores)))

    if model_type == 'lgb':
        feature_importance = pd.concat(importance_frames, axis=0) if importance_frames else pd.DataFrame()
        if n_folds_run:
            feature_importance["importance"] /= n_folds_run
        if plot_feature_importance:
            cols = feature_importance[["feature", "importance"]].groupby("feature").mean().sort_values(
                by="importance", ascending=False)[:50].index

            best_features = feature_importance.loc[feature_importance.feature.isin(cols)]

            plt.figure(figsize=(16, 12));
            sns.barplot(x="importance", y="feature", data=best_features.sort_values(by="importance", ascending=False));
            plt.title('LGB Features (avg over folds)');

            return oof, prediction, feature_importance
        return oof, prediction, scores
    else:
        return oof, prediction, scores
    
def get_correlation(df=None):
    """Get correlation between features.

    Parameters
    ----------
    df : pandas.DataFrame, optional
        Frame whose columns are correlated. Defaults to the notebook-level
        `train` dataframe, which keeps existing `get_correlation()` calls working.

    Returns
    -------
    corr_matrix : pandas.DataFrame
        Result of `df.corr()`.
    corr : pandas.DataFrame
        Long-format table of absolute correlations sorted in descending order,
        with the pairs of a column with itself removed. The two column-name
        fields are called 'level_0' and 'level_1'.
    """
    if df is None:
        df = train
    corr_matrix = df.corr()
    corr = corr_matrix.abs().unstack().sort_values(kind="quicksort", ascending=False).reset_index()
    corr = corr[corr['level_0'] != corr['level_1']]
    return corr_matrix, corr

def standardized_features(x):
    """Fit a fresh StandardScaler on `x` and return the transformed values."""
    return StandardScaler().fit_transform(x)

def remove_columns_null(df, null_per=0.9, target=''):
    """Drop, in place, every column whose fraction of nulls exceeds `null_per`.

    A column named `target` (when a non-empty name is given) is never dropped.
    The same, modified dataframe is returned.
    """
    n_rows = df.shape[0]
    drop_cols = []
    for col in df:
        null_share = df[col].isnull().sum() / n_rows
        if null_share > null_per:
            drop_cols.append(col)
    # The target column is protected even when it is mostly null.
    if target and target in drop_cols:
        drop_cols.remove(target)
    df.drop(columns=drop_cols, inplace=True)
    return df
    
def remove_columns_top_values(df, top_values_per=0.9, target=''):
    """Drop, in place, columns dominated by a single value.

    A column is dropped when the share of its most frequent value (nulls are
    counted as a value) exceeds `top_values_per`. A column named `target`
    (when a non-empty name is given) is kept. Returns the same dataframe.
    """
    drop_cols = []
    for col in df.columns:
        value_shares = df[col].value_counts(dropna=False, normalize=True)
        # value_counts sorts by frequency, so the first entry is the top share.
        if value_shares.values[0] > top_values_per:
            drop_cols.append(col)
    if target and target in drop_cols:
        drop_cols.remove(target)
    df.drop(columns=drop_cols, inplace=True)
    return df

def convert_categorical(df, target=''):
    """Convert categorical to labels.

    Every object- or string-typed column other than `target` is replaced, in
    place, by the integer codes produced by a `LabelEncoder` fitted on that
    column. Other columns are left untouched. Returns the same dataframe.

    Requires `LabelEncoder` (sklearn.preprocessing) to be imported at module level.
    """
    for col in df.columns:
        if col == target:
            continue
        col_dtype = df[col].dtype
        # Accept both the classic object dtype and pandas' dedicated string dtypes.
        if pd.api.types.is_object_dtype(col_dtype) or pd.api.types.is_string_dtype(col_dtype):
            col_val = list(df[col].values)
            df[col] = LabelEncoder().fit_transform(col_val)
    return df

def pca(df, cols, n_components, prefix='PCA_', rand_seed=4):
    """Replace the columns `cols` of `df` by their first `n_components` principal components.

    The components are fitted on `df[cols]`, named `<prefix>0`, `<prefix>1`, ...
    and appended to the remaining columns. The result is returned as a new
    dataframe that keeps the index of `df`; the input frame is not modified.

    Requires `PCA` (sklearn.decomposition) to be imported at module level.
    """
    reducer = PCA(n_components=n_components, random_state=rand_seed)
    principal_components = reducer.fit_transform(df[cols])
    # Reuse df's index so the concat below aligns row by row for any index.
    principal_df = pd.DataFrame(principal_components, index=df.index)
    principal_df.rename(columns=lambda x: str(prefix)+str(x), inplace=True)
    df = pd.concat([df.drop(columns=cols), principal_df], axis=1)
    return df

In [None]:
# Transposed preview of the first rows, so each feature becomes a row.
display_all(train.head().transpose())

In [None]:
# Per-column summary of the training data (dtype, missing, uniques, first rows).
des_tab = describe_table(df=train)

In [None]:
# Class balance of the label column.
train.loc[:, 'target'].value_counts()

In [None]:
# Pairwise correlations of the training columns; show the ten strongest pairs.
corr_matrix, corr = get_correlation()
corr.iloc[:10]

In [None]:
# Feature matrices: drop the row id (and the label for train), then scale.
# Train and test are each scaled with their own freshly fitted scaler.
y_train = train['target']
x_train = standardized_features(train.drop(columns=['id', 'target']))
x_test = standardized_features(test.drop(columns=['id']))

# 20-fold stratified CV, repeated 20 times with a fixed seed.
folds = RepeatedStratifiedKFold(n_splits=20, n_repeats=20, random_state=42)

# L1-regularised logistic regression baseline.
model = LogisticRegression(C=0.1, penalty='l1', solver='liblinear', class_weight='balanced')
oof, prediction_lr, scores = train_model(
    x_train, x_test, y_train,
    params=None, folds=folds, n_splits=20,
    model_type='sklearn', model=model,
)


# **Eli5 Feature Importance**

In [None]:
# Weights of the fitted logistic regression as a dataframe; the next cell
# selects features from it.
eli5_features = eli5.format_as_dataframe(eli5.explain_weights(model, top=100))

# Keep the explanation as the cell's last expression so the notebook renders
# it (as a leading statement its value was silently discarded).
eli5.explain_weights(model, top=50)

In [None]:
# Skip the intercept row and strip the first character of each eli5 feature
# label (labels are assumed to look like 'x12') to obtain column names.
non_bias_rows = eli5_features[eli5_features['feature'] != '<BIAS>']
top_features = [label[1:] for label in non_bias_rows['feature']]

x_train = standardized_features(train[top_features])
x_test = standardized_features(test[top_features])

# Same baseline model, now fitted on the selected columns only.
model = LogisticRegression(C=0.1, penalty='l1', solver='liblinear', class_weight='balanced')
oof, prediction_lr, scores = train_model(
    x_train, x_test, y_train,
    params=None, folds=folds, n_splits=20,
    model_type='sklearn', model=model,
)


# **Eli5 Permutation Importance**

In [None]:
model_perm = PermutationImportance(model, random_state=1).fit(x_train, y_train)
# Explicit display: this is not the last expression of the cell, so its value
# would otherwise never be rendered.
display(eli5.show_weights(model_perm, top=50))

# Select from the permutation importances just computed (previously the plain
# model weights were re-read, so `model_perm` never influenced the selection).
eli5_features = eli5.format_as_dataframe(eli5.explain_weights(model_perm, top=100))

# `model` was fitted on the columns listed in the current `top_features`, so
# an eli5 label 'x<k>' is a position in that list, not a column name of `train`.
fitted_columns = list(top_features)
top_features = [fitted_columns[int(feat[1:])]
                for feat in eli5_features[eli5_features['feature'] != '<BIAS>']['feature']]
x_train = standardized_features(train[top_features])
x_test = standardized_features(test[top_features])

model = LogisticRegression(class_weight='balanced', penalty='l1', C=0.1,solver='liblinear')
oof, prediction_lr, scores = train_model(x_train, x_test, y_train, params=None, model_type='sklearn',
                                         folds=folds, n_splits=20, model=model)


# **Mlxtend Sequential Feature Selector**

In [None]:
# Start again from the full feature set (earlier cells narrowed x_train / x_test).
y_train = train['target']
x_train = standardized_features(train.drop(columns=['id', 'target']))
x_test = standardized_features(test.drop(columns=['id']))

model = LogisticRegression(C=0.1, penalty='l1', solver='liblinear', class_weight='balanced')
oof, prediction_lr, scores = train_model(
    x_train, x_test, y_train,
    params=None, folds=folds, n_splits=20,
    model_type='sklearn', model=model,
)



In [None]:
# Forward (non-floating) sequential selection of 10 to 15 features, scored by
# ROC AUC on the shared CV splitter.
sfs_options = dict(
    k_features=(10,15),
    forward=True,
    floating=False,
    verbose=2,
    scoring='roc_auc',
    cv=folds,
    n_jobs=-1,
)
sfs = SFS(model, **sfs_options)
sfs = sfs.fit(x_train, y_train)

In [None]:
# Names of the features kept by the sequential selector.
selected_feature_names = sfs.k_feature_names_
print('Mlxtend Feature Names:-', selected_feature_names)


In [None]:
# Refit the baseline on the columns chosen by the sequential selector.
top_features = list(sfs.k_feature_names_)
x_train = standardized_features(train[top_features])
x_test = standardized_features(test[top_features])

# Hyper-parameters of the logistic regression, kept in one place.
kwargs = {'C':0.1,
        'class_weight':'balanced',
        'penalty':'l1', 
        'solver':'liblinear'}
model = LogisticRegression(**kwargs)
oof_lr, prediction_lr, _ = train_model(
    x_train, x_test, y_train,
    params=None, folds=folds, n_splits=20,
    model_type='sklearn', model=model,
)


In [None]:
submission = pd.DataFrame()
submission['id'] = test['id']
# Predict on `x_test`, the standardized version of test[top_features]: the
# model was fitted on standardized features, so raw columns would be on a
# different scale than the one it was trained with.
submission['target'] = model.predict_proba(x_test)[:, 1]
submission.to_csv('lr_mlxtend.csv', index=False)

In [None]:
# Quick look at the first rows of the submission file.
submission.iloc[:5]

In [None]:
# Selector score per number of selected features, with std-dev bands.
sfs_metrics = sfs.get_metric_dict()
fig = plot_sfs(sfs_metrics, kind='std_dev')

plt.ylim(0.8, 1)
plt.title('Sequential Forward Selection (StdDev)')
plt.grid()
plt.show()


# **Sklearn Feature Selection**

In [None]:
# Full, standardized feature set plus a fresh CV splitter and baseline model
# for the scikit-learn feature-selection experiments.
y_train = train['target']
x_train = standardized_features(train.drop(columns=['id', 'target']))
x_test = standardized_features(test.drop(columns=['id']))
folds = RepeatedStratifiedKFold(n_splits=20, n_repeats=20, random_state=42)
model = LogisticRegression(C=0.1, penalty='l1', solver='liblinear', class_weight='balanced')

In [None]:
# Mean CV score for recursive feature elimination at 10, 15, ..., 300 kept features.
scores_lst = []
for i in range(10, 301, 5):
    # The feature count is passed by keyword; a positional value is not
    # accepted by current scikit-learn releases.
    rfe = RFE(model, n_features_to_select=i, step=1)
    x_train_rfe = rfe.fit_transform(x_train, y_train.values.astype(int))
    x_test_rfe = rfe.transform(x_test)
    oof, prediction, scores = train_model(x_train_rfe, x_test_rfe, y_train, params=None, model_type='sklearn',
                                         folds=folds, n_splits=20, model=model)
    scores_lst.append(np.mean(scores))

In [None]:
# Recursive feature elimination down to 20 features, then CV on that subset.
# The feature count is passed by keyword; a positional value is not accepted
# by current scikit-learn releases.
rfe = RFE(model, n_features_to_select=20, step=1)
x_train_rfe = rfe.fit_transform(x_train, y_train.values.astype(int))
x_test_rfe = rfe.transform(x_test)
oof, prediction, scores = train_model(x_train_rfe, x_test_rfe, y_train, folds=folds, 
                                      params=None, model_type='sklearn', n_splits=20, model=model) 

# **Modeling**

In [None]:
def get_estimator(params, etype):
    """ Returns estimator based on given estimator type.

    Parameters
    ----------
    params : dict
        Keyword arguments forwarded to the estimator's constructor.
    etype : str
        One of 'lr' (LogisticRegression), 'knn' (KNeighborsClassifier),
        'gb' (GaussianNB), 'svm' (SVC), 'ad' (AdaBoostClassifier),
        'et' (ExtraTreesClassifier), 'rf' (RandomForestClassifier),
        'sgd' (SGDClassifier).

    Raises
    ------
    ValueError
        If `etype` is not one of the keys above (previously this surfaced as
        an UnboundLocalError on the return statement).
    """
    estimators = {
        'lr': LogisticRegression,
        'knn': KNeighborsClassifier,
        'gb': GaussianNB,
        'svm': SVC,
        'ad': AdaBoostClassifier,
        'et': ExtraTreesClassifier,
        'rf': RandomForestClassifier,
        'sgd': SGDClassifier,
    }
    if etype not in estimators:
        raise ValueError(f'Unknown estimator type {etype!r}; expected one of {sorted(estimators)}')

    return estimators[etype](**params)

def grid_search(x, y, params, model, folds):
    """Grid-search `model` over `params` with ROC AUC scoring on `folds`.

    Returns the tuple (best score, best parameter dict) of the fitted search.
    """
    search_options = dict(param_grid=params, cv=folds,
                          scoring="roc_auc", verbose=1, n_jobs=-1)
    searcher = GridSearchCV(model, **search_options)
    searcher.fit(x, y)
    best = (searcher.best_score_, searcher.best_params_)
    return best

In [None]:
# Full standardized feature set and the logistic-regression baseline that the
# following model cells are compared against.
y_train = train['target']
x_train = standardized_features(train.drop(columns=['id', 'target']))
x_test = standardized_features(test.drop(columns=['id']))

model = LogisticRegression(C=0.1, penalty='l1', solver='liblinear', class_weight='balanced')
oof, prediction_lr, scores = train_model(
    x_train, x_test, y_train,
    params=None, folds=folds, n_splits=20,
    model_type='sklearn', model=model,
)

In [None]:
# Settings that are fixed during the search and must also be used by the
# final model.
base_params = {'solver':'liblinear', 'max_iter':10000}
lr = get_estimator(base_params, 'lr')
params = {
    'class_weight': ['balanced', None],
    'penalty': ['l2'],
    'C': [0.001, 0.01, 0.08, 0.1, 0.15, 1, 10, 100]
}
gs_score, gs_params = grid_search(x_train, y_train, params, lr, folds)
# Rebuild with the fixed settings plus the tuned ones; using `gs_params` alone
# would fall back to the estimator's default solver and iteration limit.
lr = get_estimator({**base_params, **gs_params}, 'lr')
oof_lr, prediction_lr, scores_lr = train_model(x_train, x_test, y_train, folds=folds, 
                                    params=None, model_type='sklearn',model=lr, n_splits=20)

In [None]:
# k-nearest neighbours: tune on the shared CV splitter, then cross-validate
# the best configuration.
knn = get_estimator({}, 'knn')
params = dict(
    n_neighbors=[2, 3, 5, 10, 20],
    weights=['uniform', 'distance'],
    leaf_size=[5, 10, 30],
)
gs_score, gs_params = grid_search(x_train, y_train, params, knn, folds)
knn = get_estimator(gs_params, 'knn')
oof_knn, predictions_knn, scores_knn = train_model(
    x_train, x_test, y_train,
    params=None, folds=folds, n_splits=20,
    model_type='sklearn', model=knn,
)


In [None]:
# Gaussian naive Bayes with default settings (the 'gb' key of get_estimator).
gb = get_estimator({}, 'gb')
oof_gb, predictions_gb, scores_gb = train_model(
    x_train, x_test, y_train,
    params=None, folds=folds, n_splits=20,
    model_type='sklearn', model=gb,
)

In [None]:

# Settings fixed during the search; `probability=True` is needed because
# train_model calls predict_proba on the estimator.
base_params = {'probability':True, 'gamma':'scale'}
svm = get_estimator(base_params, 'svm')
params = {'C': [0.001, 0.01, 0.1, 1.0, 10.0],
        'kernel': ['linear', 'poly', 'rbf'],
         }
gs_score, gs_params = grid_search(x_train, y_train, params, svm, folds)
# Carry every fixed setting (not only `probability`) over to the final model
# so it matches the configuration that was searched.
gs_params = {**base_params, **gs_params}
svm = get_estimator(gs_params, 'svm')

In [None]:
# Cross-validate the tuned SVM.
oof_svm, predictions_svm, scores_svm = train_model(
    x_train, x_test, y_train,
    params=None, folds=folds, n_splits=20,
    model_type='sklearn', model=svm,
)

In [None]:
# AdaBoost: tune the number of estimators and the learning rate, then
# cross-validate the best configuration.
ad = get_estimator({}, 'ad')
params = dict(
    n_estimators=[5, 10, 20, 50, 100],
    learning_rate=[0.001, 0.01, 0.1, 1, 10],
)
gs_score, gs_params = grid_search(x_train, y_train, params, ad, folds)
ad = get_estimator(gs_params, 'ad')
oof_ad, predictions_ad, scores_ad = train_model(
    x_train, x_test, y_train,
    params=None, folds=folds, n_splits=20,
    model_type='sklearn', model=ad,
)

In [None]:
# Extra-trees: tune forest size and depth, then cross-validate the best
# configuration.
et = get_estimator({}, 'et')
params = dict(
    n_estimators=[10, 50, 100, 1000],
    max_depth=[None, 3, 5, 15],
)
gs_score, gs_params = grid_search(x_train, y_train, params, et, folds)
et = get_estimator(gs_params, 'et')
oof_et, predictions_et, scores_et = train_model(
    x_train, x_test, y_train,
    params=None, folds=folds, n_splits=20,
    model_type='sklearn', model=et,
)


In [None]:
# Random forest: tune forest size and depth, then cross-validate the best
# configuration.
rf = get_estimator({}, 'rf')
params = dict(
    n_estimators=[10, 50, 100, 1000],
    max_depth=[None, 3, 5, 15],
)
gs_score, gs_params = grid_search(x_train, y_train, params, rf, folds)
rf = get_estimator(gs_params, 'rf')
oof_rf, predictions_rf, scores_rf = train_model(
    x_train, x_test, y_train,
    params=None, folds=folds, n_splits=20,
    model_type='sklearn', model=rf,
)

In [None]:
# Settings fixed during the search and reused for the final SGD model.
sgd_fixed = {'eta0': 1, 'max_iter': 1000, 'tol': 0.0001}
sgd = SGDClassifier(**sgd_fixed)
params = {'loss': ['log', 'modified_huber'],
          'penalty': ['l1', 'l2', 'elasticnet'],
          'alpha': [0.001, 0.01],
          'l1_ratio': [0, 0.15, 0.5, 1.0],
          'learning_rate': ['optimal', 'invscaling', 'adaptive']
        }
gs_score, gs_params = grid_search(x_train, y_train, params, sgd, folds)
gs_params.update(sgd_fixed)
sgd = get_estimator(gs_params, 'sgd')
# Stored under *_sgd names: the previous *_rf targets overwrote the
# random-forest results computed in the cell above.
oof_sgd, predictions_sgd, scores_sgd = train_model(x_train, x_test, y_train, folds=folds, 
                                                  params=None, n_splits=20, model=sgd, model_type='sklearn')

In [None]:
# Keep the 15 features with the highest ANOVA F-score and cross-validate the
# current `model` on them.
selector = SelectKBest(score_func=f_classif, k=15)
y_train_int = y_train.values.astype(int)
X_trainK = selector.fit_transform(x_train, y_train_int)
X_testK = selector.transform(x_test)
oof_lr_1, prediction_lr_1, scores = train_model(
    X_trainK, X_testK, y_train,
    params=None, folds=folds, n_splits=20,
    model_type='sklearn', model=model,
)

In [None]:
# Binomial GLM (statsmodels) on the same SelectKBest feature subset.
oof_glm, prediction_glm, scores = train_model(
    X_trainK, X_testK, y_train,
    params=None, folds=folds, n_splits=20,
    model_type='glm',
)


# **Feature Engineering**

In [None]:
# Fresh standardized feature set, CV splitter and baseline model for the
# feature-engineering experiments.
y_train = train['target']
x_train = standardized_features(train.drop(columns=['id', 'target']))
x_test = standardized_features(test.drop(columns=['id']))

folds = RepeatedStratifiedKFold(n_splits=20, n_repeats=20, random_state=42)

model = LogisticRegression(C=0.1, penalty='l1', solver='liblinear', class_weight='balanced')

In [None]:
# Degree-2 polynomial expansion: fit the transformer on the training matrix
# and reuse it for the test matrix instead of re-fitting it on test data.
poly = PolynomialFeatures(2)
x_train_poly = poly.fit_transform(x_train)
x_test_poly = poly.transform(x_test)


In [None]:
# Correlation of every polynomial feature with the target.
poly_frame = pd.DataFrame(x_train_poly)
cor_poly = poly_frame.corrwith(y_train)

In [None]:
corr_scores = []
# Rank the polynomial features once, weakest to strongest absolute correlation.
# NaN correlations are dropped first: sort_values places NaN last, so they
# would otherwise be picked up by the "top N" slice.
ranked_cols = cor_poly.abs().dropna().sort_values().index.to_numpy()
for i in range(10, 510, 5):
    print(i)
    top_corr_cols = list(ranked_cols[-i:])
    x_train_tmp = x_train_poly[:, top_corr_cols]
    x_test_tmp = x_test_poly[:, top_corr_cols]
    oof_poly, prediction_poly, scores = train_model(x_train_tmp, x_test_tmp, y_train,folds=folds,
                                                   params=None, model_type='sklearn', model=model, n_splits=20)
    corr_scores.append(scores)
    

In [None]:
# Mean CV score against the number of top-correlated polynomial features.
data = [go.Scatter(
        x = list(range(10, 510, 5)),
        y = [np.round(np.mean(i), 4) for i in corr_scores],
        name = 'CV Scores'
)]
layout = go.Layout(dict(title= 'Top N poly features vs CV',
                    xaxis= dict(title= 'Top N features'),
                    yaxis= dict(title= 'CV Score')
                       ))
# `py` is plotly.offline (imported at the top of the notebook). The figure is
# passed as a dict; the previous `dict=(data=..., ...)` was a syntax error.
py.init_notebook_mode(connected=True)
py.iplot(dict(data=data, layout=layout), filename='basic-line')


In [None]:
# Distance-to-neighbours features computed against the training points.
# `n_neighbors` is passed by keyword; a positional value is not accepted by
# current scikit-learn releases.
neigh = NearestNeighbors(n_neighbors=5, n_jobs=-1)
neigh.fit(x_train)

# Query the fitted training points without passing X, so that a point is not
# counted as its own neighbour. Passing x_train explicitly made every nearest
# distance 0, which test rows (not part of the index) never get.
dists, _ = neigh.kneighbors(n_neighbors=5)
mean_dist = dists.mean(axis=1).reshape(-1, 1)
max_dist = dists.max(axis=1).reshape(-1, 1)
min_dist = dists.min(axis=1).reshape(-1, 1)
# x_train.std(1) is evaluated on the matrix before the new columns are appended.
x_train = np.hstack((x_train, mean_dist, max_dist,
                   min_dist, x_train.std(1).reshape(-1, 1)))

test_dists, _ = neigh.kneighbors(x_test, n_neighbors=5)
test_mean_dist = test_dists.mean(axis=1).reshape(-1, 1)
test_max_dist = test_dists.max(axis=1).reshape(-1, 1)
test_min_dist = test_dists.min(axis=1).reshape(-1, 1)
x_test = np.hstack((x_test, test_mean_dist, test_max_dist,
                   test_min_dist, x_test.std(1).reshape(-1, 1)))

x_train = standardized_features(x_train)
x_test = standardized_features(x_test)

model = LogisticRegression(class_weight='balanced', penalty='l1', C=0.1, solver='liblinear')
oof, prediction_lr, scores = train_model(x_train, x_test, y_train, params=None, model_type='sklearn',
                                        folds=folds, n_splits=20, model=model)

In [None]:
# Shape of the training matrix after the previous cell appended its four
# extra columns (mean / max / min neighbour distance and per-row std).
x_train.shape