In [1]:
#Import basic necessary datasets
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from IPython.core.interactiveshell import InteractiveShell #To print multiple outputs
InteractiveShell.ast_node_interactivity = 'all'

In [2]:
train_merged = pd.read_csv('train_merged.csv')
test_merged = pd.read_csv('test_merged.csv')
sample = pd.read_csv('sample_submission.csv')

In [3]:
feature_cols = train_merged.columns.drop(['id', 'send_date','email_body', 'subject', 'email_url','is_open','is_click'])
feature_cols

Index(['user_id', 'campaign_id', 'communication_type', 'total_links',
       'no_of_internal_links', 'no_of_images', 'no_of_sections', 'day_of_week',
       'hour', 'day', 'month', 'IsWeekend'],
      dtype='object')

In [7]:
df_majority = train_merged[train_merged.is_click==0]
df_minority = train_merged[train_merged.is_click==1]

In [4]:
# Separate input features (X) and target variable (y)
X = train_merged[feature_cols]
y = train_merged.is_click

In [12]:
x_train, x_test, y_train, y_test = train_test_split(X, y)
x_train.shape
y_train.shape
x_test.shape
y_test.shape

(767393, 12)

(767393,)

(255798, 12)

(255798,)

In [24]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

# Train model
logreg1 = LogisticRegression().fit(X, y)
 
# Predict on training set
x_pred = logreg1.predict(X)
 
# Is our model still predicting just one class?
np.unique(x_pred)
 
# How's our accuracy?
metrics.accuracy_score(y, x_pred)

array([0], dtype=int64)

0.9875077087269142

In [29]:
y_pred_prob = logreg1.predict_proba(X)
y_pred_prob[:,1]
metrics.roc_auc_score(y, y_pred_prob[:,1])

array([0.01200864, 0.02631284, 0.00598568, ..., 0.01956273, 0.00894045,
       0.01713344])

0.4911967732260011

In [40]:
train_merged.is_click.value_counts()

0    1010409
1      12782
Name: is_click, dtype: int64

In [41]:
from sklearn.utils import resample
# Separate majority and minority classes
df_majority = train_merged[train_merged.is_click==0]
df_minority = train_merged[train_merged.is_click==1]
 
# Upsample minority class
df_minority_upsampled = resample(df_minority, 
                                 replace=True,     # sample with replacement
                                 n_samples=710409,    # to match majority class
                                 random_state=123) # reproducible results
 
# Combine majority class with upsampled minority class
df_upsampled = pd.concat([df_majority, df_minority_upsampled])
 
# Display new class counts
df_upsampled.is_click.value_counts()

0    1010409
1     710409
Name: is_click, dtype: int64

In [42]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
# Separate input features (X) and target variable (y)
y = df_upsampled.is_click
X = df_upsampled[feature_cols]

#train-test split
x_train, x_test, y_train, y_test = train_test_split(X, y)

# Train model
logreg2 = LogisticRegression().fit(x_train, y_train)

#ROC-AUC
y_pred_prob = logreg2.predict_proba(x_test)
metrics.roc_auc_score(y_test, y_pred_prob[:,1])

0.5046740347702273

In [26]:
sub_pred = logreg2.predict_proba(test_merged[feature_cols])
sample['is_click'] = sub_pred
sample.head()
sample.to_csv('sample_submission.csv', index=False)

Unnamed: 0,id,is_click
0,63_122715,0.568788
1,56_76206,0.490403
2,57_96189,0.458042
3,56_166917,0.494294
4,56_172838,0.494548


In [27]:
from sklearn.ensemble import RandomForestClassifier

# Train model
rfc = RandomForestClassifier()
rfc.fit(x_train, y_train)

#Accuracy

y_pred = rfc.predict(x_test)
metrics.accuracy_score(y_test, y_pred)

# Predict on training set
y_pred_prob = rfc.predict_proba(x_test)
 
# How's our accuracy?
metrics.roc_auc_score(y_test, y_pred_prob[:,1])

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

0.9905464118526143

0.991492537904436

In [28]:
sub_pred = rfc.predict_proba(test_merged[feature_cols])
sample['is_click'] = sub_pred
sample.head()
sample.to_csv('sample_submission.csv', index=False)

Unnamed: 0,id,is_click
0,63_122715,0.9
1,56_76206,1.0
2,57_96189,1.0
3,56_166917,1.0
4,56_172838,1.0


In [9]:
# Downsample majority class
from sklearn.utils import resample
df_majority_downsampled = resample(df_majority, 
                                 replace=False,    # sample without replacement
                                 n_samples=12782,     # to match minority class
                                 random_state=123) # reproducible results
 
# Combine minority class with downsampled majority class
df_downsampled = pd.concat([df_majority_downsampled, df_minority])
 
# Display new class counts
df_downsampled.is_click.value_counts()

1    12782
0    12782
Name: is_click, dtype: int64

In [38]:
#train-test split
y = df_downsampled.is_click
X = df_downsampled[feature_cols]

In [60]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaled = scaler.fit_transform(X)
x_scaled = pd.DataFrame(columns=X.columns, data=scaled)
x_scaled.head()

Unnamed: 0,user_id,campaign_id,communication_type,total_links,no_of_internal_links,no_of_images,no_of_sections,day_of_week,hour,day,month,IsWeekend
0,-0.607569,-0.031347,0.320859,0.438439,0.372118,0.634016,0.801996,-1.707826,0.94978,-0.561829,-1.095316,0.514824
1,-1.43855,-0.702305,-1.392145,-1.13319,-1.089942,-1.603898,-0.97905,-0.708127,0.188296,-0.71505,-0.430675,0.514824
2,0.641776,0.751438,-1.392145,1.088045,1.189152,1.244356,-0.97905,-0.208277,-0.31936,2.349365,1.563247,0.514824
3,-1.7181,-1.037785,0.89186,-1.25892,-1.261949,-1.807344,-0.97905,-1.707826,-0.31936,1.736482,0.898607,0.514824
4,-1.57774,1.198744,-1.392145,0.77372,0.823637,0.634016,-0.97905,1.291272,1.711264,-0.255388,0.566286,-1.942412


In [39]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, y)

In [62]:
from sklearn.model_selection import cross_val_score

# Train model
logreg3 = LogisticRegression()

#cross-validation
lrscore = cross_val_score(logreg3, x_train, y_train, cv=10, scoring='accuracy')
lrscore.mean()

# Predict on training set
logreg3.fit(x_train, y_train)
y_pred_prob = logreg3.predict_proba(x_test)
metrics.roc_auc_score(y_test, y_pred_prob[:,1])

0.5456103231012018

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

0.5754871716524761

In [63]:
test_scaled = pd.DataFrame(columns=feature_cols, data=scaler.fit_transform(test_merged[feature_cols]))
sub_pred = logreg3.predict_proba(test_scaled)
sample['is_click'] = sub_pred
sample.head()
sample.to_csv('sample_submission.csv', index=False)

Unnamed: 0,id,is_click
0,63_122715,0.665816
1,56_76206,0.647458
2,57_96189,0.403236
3,56_166917,0.644889
4,56_172838,0.644721


In [14]:
from imblearn.ensemble import BalancedBaggingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
bbc = BalancedBaggingClassifier(base_estimator=LogisticRegression(),
                                ratio='majority',
                                replacement=True,
                                random_state=0)
bbc_score = cross_val_score(bbc, x_train, y_train, cv=10, scoring='accuracy')
bbc_score.mean()
bbc.fit(x_train, y_train)
y_pred_prob = bbc.predict_proba(x_test)
metrics.roc_auc_score(y_test,y_pred_prob[:,1])

0.5358584614759024

BalancedBaggingClassifier(base_estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
             bootstrap=True, bootstrap_features=False, max_features=1.0,
             max_samples=1.0, n_estimators=10, n_jobs=1, oob_score=False,
             random_state=0, ratio='majority', replacement=True, verbose=0,
             warm_start=False)

0.5648524477750839

In [43]:
from sklearn.ensemble import AdaBoostClassifier
abc = AdaBoostClassifier()
abc_score = cross_val_score(abc, x_train, y_train, cv=10, scoring='accuracy')
abc_score.mean()
abc.fit(x_train, y_train)
y_pred_prob = abc.predict_proba(x_test)
metrics.roc_auc_score(y_test,y_pred_prob[:,1])

0.5589117866088679

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=50, random_state=None)

0.5759711141476029

In [46]:
from sklearn.ensemble import ExtraTreesClassifier
etc = ExtraTreesClassifier()
etc_score = cross_val_score(etc, x_train, y_train, cv=10, scoring='accuracy')
etc_score.mean()
etc.fit(x_train, y_train)
y_pred_prob = etc.predict_proba(x_test)
metrics.roc_auc_score(y_test,y_pred_prob[:,1])

0.5270441800833721

ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

0.5332121675163946

In [49]:
from sklearn.model_selection import GridSearchCV
etc = ExtraTreesClassifier()
criteria = ['entropy','gini']
#state = np.array(list(range(25)))
depth = np.arange(1,11)
params = {'criterion':criteria, 'max_depth':depth}
grid = GridSearchCV(etc, params, scoring='roc_auc', refit=True)
grid.fit(x_train,y_train)
print('Best roc_auc: {:.4}, with best{}'.format(grid.best_score_, grid.best_params_))
etc_pred_prob = grid.predict_proba(x_test)
metrics.roc_auc_score(y_test,etc_pred_prob[:,1])

GridSearchCV(cv=None, error_score='raise',
       estimator=ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=None, max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'criterion': ['entropy', 'gini'], 'max_depth': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=0)

Best roc_auc: 0.5905, with best{'criterion': 'gini', 'max_depth': 6}


0.584867594265928

In [50]:
sub_pred = grid.predict_proba(test_merged[feature_cols])
sample['is_click'] = sub_pred
sample.head()
sample.to_csv('sample_submission.csv', index=False)

Unnamed: 0,id,is_click
0,63_122715,0.516037
1,56_76206,0.535007
2,57_96189,0.481492
3,56_166917,0.527826
4,56_172838,0.525384


In [37]:
from sklearn.model_selection import GridSearchCV
c_values = [0.1,0.2, 0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]
state = np.array(list(range(25)))
params = {'C':c_values, 'random_state':state}
grid = GridSearchCV(logreg3, params, scoring='roc_auc', refit=True)
grid.fit(x_train,y_train)
print('Best log_loss: {:.4}, with best{}'.format(grid.best_score_, grid.best_params_))
rfc_pred = grid.predict_proba(x_test)
metrics.roc_auc_score(y_test, rfc_pred[:,1])

GridSearchCV(cv=None, error_score='raise',
       estimator=LogisticRegression(C=0.2, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0], 'random_state': array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=0)

Best log_loss: 0.5538, with best{'C': 0.3, 'random_state': 0}


0.5543485012820399

In [33]:
sub_pred = logreg3.predict_proba(test_merged[feature_cols])
sample['is_click'] = sub_pred
sample.head()
sample.to_csv('sample_submission.csv', index=False)

Unnamed: 0,id,is_click
0,63_122715,0.485816
1,56_76206,0.477598
2,57_96189,0.288331
3,56_166917,0.479273
4,56_172838,0.479382


In [50]:
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier()
dtc_score = cross_val_score(dtc, x_train, y_train, cv=10, scoring='accuracy')
dtc_score.mean()
dtc.fit(x_train,y_train)
y_pred_prob = dtc.predict_proba(x_test)
metrics.roc_auc_score(y_test, y_pred_prob[:,1])

0.5231845533328846

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

0.53075325940568

In [52]:
from sklearn.neighbors import KNeighborsClassifier
knc = KNeighborsClassifier()
knc_score = cross_val_score(knc, x_train, y_train, cv=10, scoring='accuracy')
knc_score.mean()
knc.fit(x_train,y_train)
y_pred_prob = knc.predict_proba(x_test)
metrics.roc_auc_score(y_test, y_pred_prob[:,1])

0.5146836422608636

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

0.5445910925639091

In [56]:
from sklearn.naive_bayes import MultinomialNB
mnb = MultinomialNB()
mnb_score = cross_val_score(mnb, x_train, y_train, cv=10, scoring='accuracy')
mnb_score.mean()
mnb.fit(x_train,y_train)
y_pred_prob = mnb.predict_proba(x_test)
metrics.roc_auc_score(y_test, y_pred_prob[:,1])

0.5015389176557442

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

0.5240942569745914

In [40]:
from sklearn.ensemble import GradientBoostingClassifier
gbc = GradientBoostingClassifier()
gbc_score = cross_val_score(gbc, x_train, y_train, cv=10, scoring='accuracy')
gbc_score.mean()
gbc.fit(x_train, y_train)
gbc_pred_prob = gbc.predict_proba(x_test)
metrics.roc_auc_score(y_test, gbc_pred_prob[:,1])

0.5553129288862861

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=None, subsample=1.0, verbose=0,
              warm_start=False)

0.5796659859048011

In [52]:
from sklearn.model_selection import GridSearchCV
gbc = GradientBoostingClassifier()
lr = [0.01,0.02, 0.03,0.04,0.05,0.06,0.07,0.08,0.09,0.1]
depth = np.array(np.arange(1,11))
estimators = [50, 100]
params = {'learning_rate':lr, 'max_depth':depth, 'n_estimators':estimators}
grid = GridSearchCV(gbc, params, scoring='roc_auc', refit=True)
grid.fit(x_train,y_train)
print('Best log_loss: {:.4}, with best{}'.format(grid.best_score_, grid.best_params_))
gbc_pred = grid.predict_proba(x_test)
metrics.roc_auc_score(y_test, gbc_pred[:,1])

KeyboardInterrupt: 

In [41]:
test_pred = gbc.predict_proba(test_merged[feature_cols])
sample['is_click'] = test_pred
sample.head()
sample.to_csv('sample_submission.csv', index=False)

Unnamed: 0,id,is_click
0,63_122715,0.537394
1,56_76206,0.556923
2,57_96189,0.497147
3,56_166917,0.508117
4,56_172838,0.508117


In [15]:
# Some useful parameters which will come in handy later on
from sklearn.cross_validation import KFold;
ntrain = x_train.shape[0]
ntest = x_test.shape[0]
SEED = 0 # for reproducibility
NFOLDS = 5 # set folds for out-of-fold prediction
kf = KFold(ntrain, n_folds= NFOLDS, random_state=SEED)

# Class to extend the Sklearn classifier
class SklearnHelper(object):
    def __init__(self, clf, seed=0, params=None):
        params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        return self.clf.predict(x)
    
    def fit(self,x,y):
        return self.clf.fit(x,y)
    
    def feature_importances(self,x,y):
        print(self.clf.fit(x,y).feature_importances_)
    
# Class to extend XGboost classifer



In [16]:
def get_oof(clf, x_train, y_train, x_test):
    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest,))
    oof_test_skf = np.empty((NFOLDS, ntest))

    for i, (train_index, test_index) in enumerate(kf):
        x_tr = x_train[train_index]
        y_tr = y_train[train_index]
        x_te = x_train[test_index]

        clf.train(x_tr, y_tr)

        oof_train[test_index] = clf.predict(x_te)
        oof_test_skf[i, :] = clf.predict(x_test)

    oof_test[:] = oof_test_skf.mean(axis=0)
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)

In [17]:
# Put in our parameters for said classifiers
# Random Forest parameters
rf_params = {
    'n_jobs': -1,
    'n_estimators': 500,
     'warm_start': True, 
     #'max_features': 0.2,
    'max_depth': 6,
    'min_samples_leaf': 2,
    'max_features' : 'sqrt',
    'verbose': 0
}

# Extra Trees Parameters
et_params = {
    'n_jobs': -1,
    'n_estimators':500,
    #'max_features': 0.5,
    'max_depth': 8,
    'min_samples_leaf': 2,
    'verbose': 0
}

# AdaBoost parameters
ada_params = {
    'n_estimators': 500,
    'learning_rate' : 0.75
}

# Gradient Boosting parameters
gb_params = {
    'n_estimators': 500,
     #'max_features': 0.2,
    'max_depth': 5,
    'min_samples_leaf': 2,
    'verbose': 0
}

# Support Vector Classifier parameters 
svc_params = {
    'kernel' : 'linear',
    'C' : 0.025
    }

In [18]:
# Create 5 objects that represent our 4 models
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.svm import SVC

rf = SklearnHelper(clf=RandomForestClassifier, seed=SEED, params=rf_params)
et = SklearnHelper(clf=ExtraTreesClassifier, seed=SEED, params=et_params)
ada = SklearnHelper(clf=AdaBoostClassifier, seed=SEED, params=ada_params)
gb = SklearnHelper(clf=GradientBoostingClassifier, seed=SEED, params=gb_params)

In [19]:
# Create Numpy arrays of train, test and target ( Survived) dataframes to feed into our models
x_train = x_train.values.astype(np.float) # Creates an array of the train data
x_test = x_test.values.astype(np.float) # Creats an array of the test data
y_train = y_train.values.astype(np.float)

In [20]:
# Create our OOF train and test predictions. These base results will be used as new features
et_oof_train, et_oof_test = get_oof(et, x_train, y_train, x_test) # Extra Trees
rf_oof_train, rf_oof_test = get_oof(rf,x_train, y_train, x_test) # Random Forest
ada_oof_train, ada_oof_test = get_oof(ada, x_train, y_train, x_test) # AdaBoost 
gb_oof_train, gb_oof_test = get_oof(gb,x_train, y_train, x_test) # Gradient Boost

print("Training is complete")

  warn("Warm-start fitting without increasing n_estimators does not "


Training is complete


In [21]:
rf_feature = rf.feature_importances(x_train,y_train)
et_feature = et.feature_importances(x_train, y_train)
ada_feature = ada.feature_importances(x_train, y_train)
gb_feature = gb.feature_importances(x_train,y_train)

  warn("Warm-start fitting without increasing n_estimators does not "


[0.24083032 0.08614631 0.07502916 0.11015213 0.11952176 0.06248752
 0.00963149 0.03509298 0.05818505 0.07539816 0.119537   0.00798812]
[0.16978194 0.09227231 0.11479066 0.06506903 0.07865277 0.07476457
 0.02199358 0.05334768 0.07379205 0.07165786 0.15987091 0.02400664]
[0.802 0.028 0.01  0.012 0.02  0.02  0.002 0.004 0.028 0.018 0.056 0.   ]
[0.73794566 0.04194542 0.01698657 0.01778552 0.01669469 0.02736902
 0.00934063 0.0299195  0.03313029 0.03501887 0.03065963 0.00320422]


In [24]:
rf_features=[0.24083032, 0.08614631, 0.07502916, 0.11015213, 0.11952176, 0.06248752, 0.00963149, 0.03509298, 0.05818505, 0.07539816, 0.119537, 0.00798812]
et_features=[0.16978194, 0.09227231, 0.11479066, 0.06506903, 0.07865277, 0.07476457, 0.02199358, 0.05334768, 0.07379205, 0.07165786, 0.15987091, 0.02400664]
ada_features=[0.802, 0.028, 0.01,  0.012, 0.02,  0.02,  0.002, 0.004, 0.028, 0.018, 0.056, 0.]
gb_features=[0.73794566, 0.04194542, 0.01698657, 0.01778552, 0.01669469, 0.02736902, 0.00934063, 0.0299195, 0.03313029, 0.03501887, 0.03065963, 0.00320422]

In [27]:
cols = X.columns.values
# Create a dataframe with features
feature_dataframe = pd.DataFrame( {'features': cols,
     'Random Forest feature importances': rf_features,
     'Extra Trees  feature importances': et_features,
      'AdaBoost feature importances': ada_features,
    'Gradient Boost feature importances': gb_features
    })

In [28]:
# Create the new column containing the average of values

feature_dataframe['mean'] = feature_dataframe.mean(axis= 1) # axis = 1 computes the mean row-wise
feature_dataframe

Unnamed: 0,AdaBoost feature importances,Extra Trees feature importances,Gradient Boost feature importances,Random Forest feature importances,features,mean
0,0.802,0.169782,0.737946,0.24083,user_id,0.487639
1,0.028,0.092272,0.041945,0.086146,campaign_id,0.062091
2,0.01,0.114791,0.016987,0.075029,communication_type,0.054202
3,0.012,0.065069,0.017786,0.110152,total_links,0.051252
4,0.02,0.078653,0.016695,0.119522,no_of_internal_links,0.058717
5,0.02,0.074765,0.027369,0.062488,no_of_images,0.046155
6,0.002,0.021994,0.009341,0.009631,no_of_sections,0.010741
7,0.004,0.053348,0.02992,0.035093,day_of_week,0.03059
8,0.028,0.073792,0.03313,0.058185,hour,0.048277
9,0.018,0.071658,0.035019,0.075398,day,0.050019


In [29]:
base_predictions_train = pd.DataFrame( {'RandomForest': rf_oof_train.ravel(),
     'ExtraTrees': et_oof_train.ravel(),
     'AdaBoost': ada_oof_train.ravel(),
      'GradientBoost': gb_oof_train.ravel()
    })
base_predictions_train.shape
base_predictions_train.head()

(19173, 4)

Unnamed: 0,AdaBoost,ExtraTrees,GradientBoost,RandomForest
0,0.0,0.0,0.0,0.0
1,1.0,1.0,1.0,1.0
2,1.0,1.0,1.0,1.0
3,1.0,1.0,0.0,1.0
4,0.0,0.0,1.0,0.0


In [30]:
x_train = np.concatenate(( et_oof_train, rf_oof_train, ada_oof_train, gb_oof_train), axis=1)
x_test = np.concatenate(( et_oof_test, rf_oof_test, ada_oof_test, gb_oof_test), axis=1)

In [31]:
import xgboost as xgb
gbm = xgb.XGBClassifier(
    #learning_rate = 0.02,
 n_estimators= 1000,
 max_depth= 4,
 min_child_weight= 2,
 #gamma=1,
 gamma=0.9,                        
 subsample=0.8,
 colsample_bytree=0.8,
 objective= 'binary:logistic',
 nthread= -1,
 scale_pos_weight=1).fit(x_train, y_train)
predictions = gbm.predict_proba(x_test)

ModuleNotFoundError: No module named 'xgboost'

In [None]:
sample['is_click'] = predictions
sample.head()
sample.to_csv('sample_submission.csv', index=False)

In [34]:
from sklearn.ensemble import RandomForestClassifier

# Train model
rfc = RandomForestClassifier()
rfc.fit(x_train, y_train)

#Accuracy

y_pred = rfc.predict(x_test)
metrics.accuracy_score(y_test, y_pred)

# Predict on training set
y_pred_prob = rfc.predict_proba(x_test)
 
# How's our accuracy?
metrics.roc_auc_score(y_test, y_pred_prob[:,1])

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

0.5310593021436395

0.5467898378441058

In [None]:
from sklearn.svm import SVC

# Separate input features (X) and target variable (y)
y = train_merged.is_click
X = train_merged[feature_cols]
 
# Train model
svc = SVC(kernel='linear', 
            class_weight='balanced', # penalize
            probability=True)
 
svc.fit(X, y)
 
# Predict on training set
y_pred = svc.predict(X)
 
# Is our model still predicting just one class?
np.unique(y_pred)
 
# How's our accuracy?
metrics.accuracy_score(y, y_pred)
 
# What about AUROC?
y_pred_prob = svc.predict_proba(X)
metrics.roc_auc_score(y, y_pred_prob)

In [17]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.model_selection import train_test_split

# Separate input features (X) and target variable (y)
y = train_merged.is_click
X = train_merged[feature_cols]

# Train model
#rfc = RandomForestClassifier()
#rfc.fit(x_train, y_train)

#Accuracy

y_train_pred = rfc.predict(x_train)
y_test_pred = rfc.predict(x_test)
metrics.accuracy_score(y_train,y_train_pred)
metrics.accuracy_score(y_test, y_test_pred)

# Predict on training set
y_train_pred_prob = rfc.predict_proba(x_train)
y_test_pred_prob = rfc.predict_proba(x_test)
 
# How's our accuracy?
metrics.roc_auc_score(y_train, y_train_pred_prob[:,1])
metrics.roc_auc_score(y_test, y_test_pred_prob[:,1])

0.9962965520926045

0.9784243817387157

0.9995802532633448

0.5058981934968262

In [6]:
np.unique(y_pred_prob)

array([0. , 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. ])