In [97]:
import pandas as pd
import numpy as np
from scipy.optimize import minimize
from sklearn.model_selection import StratifiedShuffleSplit 
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
import os

In [98]:
train = pd.read_csv("data/train.csv")
print("Training set has {0[0]} rows and {0[1]} columns".format(train.shape))


print(train.head())


Training set has 61878 rows and 95 columns
   id  feat_1  feat_2  feat_3  feat_4  feat_5  feat_6  feat_7  feat_8  feat_9  \
0   1       1       0       0       0       0       0       0       0       0   
1   2       0       0       0       0       0       0       0       1       0   
2   3       0       0       0       0       0       0       0       1       0   
3   4       1       0       0       1       6       1       5       0       0   
4   5       0       0       0       0       0       0       0       0       0   

   ...  feat_85  feat_86  feat_87  feat_88  feat_89  feat_90  feat_91  \
0  ...        1        0        0        0        0        0        0   
1  ...        0        0        0        0        0        0        0   
2  ...        0        0        0        0        0        0        0   
3  ...        0        1        2        0        0        0        0   
4  ...        1        0        0        0        0        1        0   

   feat_92  feat_93   target  


In [99]:
y = train['target'].values
X = train.drop(['target','id'], axis=1).values




In [100]:
type(X)

numpy.ndarray

In [102]:
print(len(X),len(y))

61878 61878


In [103]:
### we need a test set that we didn't train on to find the best weights for combining the classifiers
sss = StratifiedShuffleSplit(n_splits=3, test_size=0.5, random_state=0)
sss.get_n_splits(X, y)



3

numpy.ndarray

In [105]:
print(sss)       
for train_index, test_index in sss.split(X, y):
    print("TRAIN:", train_index, "TEST:", test_index)

X_train, X_test = X[train_index], X[test_index]
y_train, y_test = y[train_index], y[test_index]

StratifiedShuffleSplit(n_splits=3, random_state=0, test_size=0.5,
            train_size=None)
TRAIN: [16365 12045 51942 ... 41566 18861 53743] TEST: [26592   605  6645 ... 49392 48873 28562]
TRAIN: [18431 31941 15809 ...  6861  1398 29000] TEST: [  985 46992 33850 ... 43781 13446  1872]
TRAIN: [ 8488 24999  4214 ... 19774 20536 38938] TEST: [40170 12652 28899 ... 20065 20650 44413]


In [107]:
X_train.shape, X_test.shape,y_train.shape, y_test.shape

((30939, 93), (30939, 93), (30939,), (30939,))

In [108]:
### building the classifiers
clfs = []

rfc1 = RandomForestClassifier(n_estimators=50, random_state=4141, n_jobs=-1)
rfc1.fit(X_train, y_train)
print('RFC LogLoss {score}'.format(score=log_loss(y_test, rfc.predict_proba(X_test))))
clfs.append(rfc1)


RFC LogLoss 0.4394301388153072


In [109]:
### usually you'd use xgboost and neural nets here

logreg = LogisticRegression()
logreg.fit(X_train, y_train)
print('LogisticRegression LogLoss {score}'.format(score=log_loss(y_test, logreg.predict_proba(X_test))))
clfs.append(logreg)

rfc2 = RandomForestClassifier(n_estimators=50, random_state=1337, n_jobs=-1)
rfc2.fit(X_train, y_train)
print('RFC2 LogLoss {score}'.format(score=log_loss(y_test, rfc2.predict_proba(X_test))))
clfs.append(rfc2)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression LogLoss 0.6455660580111905
RFC2 LogLoss 0.7210416613009594


In [110]:
### finding the optimum weights

predictions = []
for clf in clfs:
    predictions.append(clf.predict_proba(X_test))

def log_loss_func(weights):
    ''' scipy minimize will pass the weights as a numpy array '''
    final_prediction = 0
    for weight, prediction in zip(weights, predictions):
            final_prediction += weight*prediction

    return log_loss(y_test, final_prediction)
    


In [111]:
#the algorithms need a starting value, right not we chose 0.5 for all weights
#its better to choose many random starting points and run minimize a few times
starting_values = [0.5]*len(predictions)
print('Starting values : {starting_values}', starting_values)

#adding constraints  and a different solver as suggested by user 16universe
#https://kaggle2.blob.core.windows.net/forum-message-attachments/75655/2393/otto%20model%20weights.pdf?sv=2012-02-12&se=2015-05-03T21%3A22%3A17Z&sr=b&sp=r&sig=rkeA7EJC%2BiQ%2FJ%2BcMpcA4lYQLFh6ubNqs2XAkGtFsAv0%3D
cons = ({'type':'eq','fun':lambda w: 1-sum(w)})
print('Constraints: {cons}',cons)
#our weights are bound between 0 and 1
bounds = [(0,1)]*len(predictions)
print('Bounds: {bounds}',bounds)
#Minimize a scalar function of one or more variables using Sequential Least Squares Programming (SLSQP)

res = minimize(log_loss_func, starting_values, method='SLSQP', bounds=bounds, constraints=cons)

print('Ensamble Score: {best_score}'.format(best_score=res['fun']))
print('Best Weights: {weights}'.format(weights=res['x']))

Starting values : {starting_values} [0.5, 0.5, 0.5]
Constraints: {cons} {'type': 'eq', 'fun': <function <lambda> at 0x19D1F6F0>}
Bounds: {bounds} [(0, 1), (0, 1), (0, 1)]
Ensamble Score: 0.5796827721549798
Best Weights: [0.24765056 0.50836595 0.24398349]
