In [1]:
import pandas as pd, numpy as np, time
import lightgbm as lgb
import xgboost as xgb
import catboost as cb
from sklearn import metrics
from collections import Counter
from sklearn.model_selection import train_test_split, GridSearchCV

In [2]:
def auc(model, train, test): 
    return (metrics.roc_auc_score(y_train, model.predict_proba(train)[:,1]),
            metrics.roc_auc_score(y_test, model.predict_proba(test)[:,1]))

In [3]:
train=pd.read_csv('X_train.csv')
test=pd.read_csv('X_test.csv')
y_train=np.array(pd.read_csv('y_train.csv'))
y_test=np.array(pd.read_csv('y_test.csv'))

### XGBoost

In [4]:
model = xgb.XGBClassifier()
model.fit(train, np.array(y_train))

auc(model, train, test)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(0.7037043613821493, 0.6957299448706683)

### LightGBM

In [5]:
model2 = lgb.LGBMClassifier()
model2.fit(train, y_train)

auc(model2, train, test)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


(0.7556001645011479, 0.7227598890343905)

### Catboost

In [15]:
clf = cb.CatBoostClassifier(iterations=30, verbose=None)
clf.fit(train, y_train[:,0])

auc(clf, train, test)

0:	learn: 0.6804429	total: 99.2ms	remaining: 2.88s
1:	learn: 0.6686114	total: 138ms	remaining: 1.93s
2:	learn: 0.6575421	total: 176ms	remaining: 1.58s
3:	learn: 0.6472172	total: 217ms	remaining: 1.41s
4:	learn: 0.6376007	total: 256ms	remaining: 1.28s
5:	learn: 0.6286250	total: 296ms	remaining: 1.18s
6:	learn: 0.6202610	total: 335ms	remaining: 1.1s
7:	learn: 0.6123011	total: 375ms	remaining: 1.03s
8:	learn: 0.6049711	total: 418ms	remaining: 974ms
9:	learn: 0.5981509	total: 461ms	remaining: 923ms
10:	learn: 0.5918098	total: 503ms	remaining: 870ms
11:	learn: 0.5857917	total: 546ms	remaining: 820ms
12:	learn: 0.5801642	total: 587ms	remaining: 768ms
13:	learn: 0.5748992	total: 624ms	remaining: 713ms
14:	learn: 0.5698774	total: 664ms	remaining: 664ms
15:	learn: 0.5652531	total: 700ms	remaining: 613ms
16:	learn: 0.5608902	total: 739ms	remaining: 565ms
17:	learn: 0.5567509	total: 781ms	remaining: 521ms
18:	learn: 0.5530245	total: 821ms	remaining: 476ms
19:	learn: 0.5495389	total: 859ms	remaini

(0.666772632095846, 0.6643034936867621)

## NEW TASK!
#### slides from Lecture 3 can help you :)

### Average of predictions

In [19]:
### YOUR CODE HERE
def auc_mean(models, train, test):
    mean_pred_train = np.concatenate([model.predict_proba(train)[:,1].reshape(-1,1) for model in models],axis=1).mean(axis=1)
    mean_pred_test = np.concatenate([model.predict_proba(test)[:,1].reshape(-1,1) for model in models],axis=1).mean(axis=1)
    return (metrics.roc_auc_score(y_train, mean_pred_train),
            metrics.roc_auc_score(y_test, mean_pred_test))

auc_mean([model, model2, clf], train, test)

(0.7280287695136911, 0.7089696008955761)

### Weighted average of predictions

In [22]:
from sklearn.linear_model import LogisticRegressionCV

In [23]:
### YOUR CODE HERE
def auc_weighted(models, train, test):
    preds_train = np.concatenate([model.predict_proba(train)[:,1].reshape(-1,1) for model in models],axis=1)
    preds_test = np.concatenate([model.predict_proba(test)[:,1].reshape(-1,1) for model in models],axis=1)
    
    logit = LogisticRegressionCV(cv=5)
    logit.fit(preds_train, y_train)
    
    w_avg_pred_train = logit.predict_proba(preds_train)[:,1]
    w_avg_pred_test = logit.predict_proba(preds_test)[:,1]
    
    return (metrics.roc_auc_score(y_train, w_avg_pred_train),
            metrics.roc_auc_score(y_test, w_avg_pred_test))

auc_weighted([model, model2, clf], train, test)

  y = column_or_1d(y, warn=True)


(0.7686800040215928, 0.7234560605877884)

### Bagging 

In [26]:
from sklearn.ensemble import RandomForestClassifier

In [29]:
### YOUR CODE HERE
def auc_bagged(train, test):
    
    rfc = RandomForestClassifier()
    bags = 10
    seed = 1
    bagged_preds_train = np.zeros(train.shape[0])
    bagged_preds_test = np.zeros(test.shape[0])
    for n in range(bags):
        rfc.set_params(random_state=seed+n)
        rfc.fit(train, y_train)
        preds_train = rfc.predict_proba(train)[:,1]
        preds_test = rfc.predict_proba(test)[:,1]
        bagged_preds_train+=preds_train
        bagged_preds_test+=preds_test
    
    return (metrics.roc_auc_score(y_train, bagged_preds_train/bags),
            metrics.roc_auc_score(y_test, bagged_preds_test/bags))

auc_bagged(train, test)

  # This is added back by InteractiveShellApp.init_path()
  # This is added back by InteractiveShellApp.init_path()
  # This is added back by InteractiveShellApp.init_path()
  # This is added back by InteractiveShellApp.init_path()
  # This is added back by InteractiveShellApp.init_path()
  # This is added back by InteractiveShellApp.init_path()
  # This is added back by InteractiveShellApp.init_path()
  # This is added back by InteractiveShellApp.init_path()
  # This is added back by InteractiveShellApp.init_path()
  # This is added back by InteractiveShellApp.init_path()


(1.0, 0.7197720597377896)

### Stacking

In [30]:
from sklearn.model_selection import train_test_split

In [36]:
### YOUR CODE HERE

### YOUR CODE HERE
def auc_stacking(models0, train, test):
    
    train1, train2,y_train1, y_train2 = train_test_split(train, y_train, test_size=0.5)
    models = []
    for model in models0:
        try:
            m = model.fit(train1, y_train1.reshape(-1,1))
        except:
            m = model.fit(train1, y_train1[:,0])
        models.append(m)
        
    preds_train1 = np.concatenate([model.predict_proba(train1)[:,1].reshape(-1,1) for model in models],axis=1)
    preds_train2 = np.concatenate([model.predict_proba(train2)[:,1].reshape(-1,1) for model in models],axis=1)
    preds_test = np.concatenate([model.predict_proba(test)[:,1].reshape(-1,1) for model in models],axis=1)
    
    logit = LogisticRegressionCV(cv=5)
    logit.fit(preds_train2, y_train2)
    
    w_avg_pred_train = logit.predict_proba(preds_train2)[:,1]
    w_avg_pred_test = logit.predict_proba(preds_test)[:,1]
    
    return (metrics.roc_auc_score(y_train2, w_avg_pred_train),
            metrics.roc_auc_score(y_test, w_avg_pred_test))

auc_stacking([model, model2, clf], train, test)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


0:	learn: 0.6802321	total: 26.4ms	remaining: 766ms
1:	learn: 0.6681502	total: 53.5ms	remaining: 749ms
2:	learn: 0.6568942	total: 80ms	remaining: 720ms
3:	learn: 0.6464790	total: 107ms	remaining: 692ms
4:	learn: 0.6368768	total: 132ms	remaining: 661ms
5:	learn: 0.6277459	total: 159ms	remaining: 635ms
6:	learn: 0.6192242	total: 185ms	remaining: 608ms
7:	learn: 0.6111550	total: 211ms	remaining: 580ms
8:	learn: 0.6036505	total: 240ms	remaining: 561ms
9:	learn: 0.5967091	total: 269ms	remaining: 538ms
10:	learn: 0.5903241	total: 294ms	remaining: 508ms
11:	learn: 0.5842382	total: 321ms	remaining: 482ms
12:	learn: 0.5783760	total: 348ms	remaining: 455ms
13:	learn: 0.5730202	total: 375ms	remaining: 428ms
14:	learn: 0.5680148	total: 402ms	remaining: 402ms
15:	learn: 0.5632601	total: 429ms	remaining: 375ms
16:	learn: 0.5587985	total: 458ms	remaining: 350ms
17:	learn: 0.5546345	total: 484ms	remaining: 323ms
18:	learn: 0.5506236	total: 511ms	remaining: 296ms
19:	learn: 0.5471718	total: 536ms	remain

  y = column_or_1d(y, warn=True)


(0.717676559782316, 0.7149900112653964)