# !Link to challenge!

# https://www.kaggle.com/t/b3ced76a60b94572a90740756f778fc8

### Metric

For binary classification with a true label y $\in \{0,1\}$ and a probability estimate p = $\operatorname{Pr}(y = 1)$, the log loss per sample is the negative log-likelihood of the classifier given the true label:
$$
L_{\log}(y, p) = -\log \operatorname{Pr}(y|p) = -(y \log (p) + (1 - y) \log (1 - p))$
$$

This extends to the multiclass case as follows. Let the true labels for a set of samples be encoded as a 1-of-K binary indicator matrix Y, i.e., $y_{i,k} = 1$ if sample i has label k taken from a set of K labels. Let P be a matrix of probability estimates, with $p_{i,k} = \operatorname{Pr}(t_{i,k} = 1)$. Then the log loss of the whole set is

$$
L_{\log}(Y, P) = -\log \operatorname{Pr}(Y|P) = - \frac{1}{N} \sum_{i=0}^{N-1} \sum_{k=0}^{K-1} y_{i,k} \log p_{i,k}
$$

# Grading

#### Firstly, to get any mark, you must beat medium baseline score

Your grade after challenge ends will be calculated as this:
$$
Grade = \frac{score - mid\_baseline\_score}{\#1\_score - mid\_baseline\_score} * 10
$$

where score will be taken from private part results.

## About

In this notebook we prepare a simple solution.

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss

### Read training and test files

In [4]:
data = pd.read_csv('training.csv')
test = pd.read_csv('test.csv')

In [8]:
data.head()

Unnamed: 0,TrackP,TrackNDoFSubdetector2,BremDLLbeElectron,MuonLooseFlag,FlagSpd,SpdE,EcalDLLbeElectron,DLLmuon,RICHpFlagElectron,EcalDLLbeMuon,...,TrackNDoF,RICHpFlagMuon,RICH_DLLbeKaon,RICH_DLLbeElectron,HcalE,MuonFlag,FlagMuon,PrsE,RICH_DLLbeMuon,RICH_DLLbeProton
0,74791.156263,15.0,0.232275,1.0,1.0,3.2,-2.505719,6.604153,1.0,1.92996,...,28.0,1.0,-7.2133,-0.2802,5586.589846,1.0,1.0,10.422315,-2.081143e-07,-24.8244
1,2738.489989,15.0,-0.357748,0.0,1.0,3.2,1.864351,0.263651,1.0,-2.061959,...,32.0,1.0,-0.324317,1.707283,-7e-06,0.0,1.0,43.334935,2.771583,-0.648017
2,2161.409908,17.0,-999.0,0.0,0.0,-999.0,-999.0,-999.0,0.0,-999.0,...,27.0,0.0,-999.0,-999.0,-999.0,0.0,0.0,-999.0,-999.0,-999.0
3,15277.73049,20.0,-0.638984,0.0,1.0,3.2,-2.533918,-8.724949,1.0,-3.253981,...,36.0,1.0,-35.202221,-14.742319,4482.803707,0.0,1.0,2.194175,-3.070819,-29.291519
4,7563.700195,19.0,-0.638962,0.0,1.0,3.2,-2.087146,-7.060422,1.0,-0.995816,...,33.0,1.0,25.084287,-10.272412,5107.55468,0.0,1.0,1.5e-05,-5.373712,23.653087


In [32]:
data[data.DLLmuon < -998].Label.head()

2         Ghost
111       Ghost
125        Kaon
139      Proton
268    Electron
Name: Label, dtype: object

In [6]:
test.head()

Unnamed: 0,TrackP,TrackNDoFSubdetector2,BremDLLbeElectron,MuonLooseFlag,FlagSpd,SpdE,EcalDLLbeElectron,DLLmuon,RICHpFlagElectron,EcalDLLbeMuon,...,RICHpFlagMuon,RICH_DLLbeKaon,RICH_DLLbeElectron,HcalE,MuonFlag,FlagMuon,PrsE,RICH_DLLbeMuon,RICH_DLLbeProton,ID
0,55086.199233,18.0,-0.438763,0.0,1.0,3.2,-1.843821,-4.579244,1.0,-1.732886,...,1.0,18.674086,-1.355015,24510.990244,0.0,1.0,9.325265,-0.250015,35.408585,0
1,3393.820071,17.0,-0.554341,0.0,1.0,0.0,-0.883237,-6.203035,1.0,-0.097206,...,1.0,16.536804,-17.601196,778.675303,0.0,1.0,-6e-06,-6.646096,14.011904,1
2,18341.359361,12.0,-0.554339,0.0,1.0,0.0,-2.653786,-3.922639,1.0,0.936484,...,1.0,-1.306109,-4.536409,7915.21242,0.0,1.0,1.371346,-2.132609,-5.617409,2
3,27486.710933,7.0,-0.492411,1.0,1.0,3.2,-999.0,2.034453,1.0,-999.0,...,1.0,-4.222793,3.149207,-999.0,1.0,1.0,61.985428,0.946207,-8.657193,3
4,6842.249996,16.0,0.098706,0.0,1.0,3.2,2.644499,-1.471364,1.0,-2.90947,...,1.0,-3.425113,23.147387,-1.3e-05,0.0,1.0,2.468453,2.614987,-5.713513,4


In [72]:
data.shape

(1200000, 50)

In [128]:
ss = data.isin([-999.0])
ss

Unnamed: 0,TrackP,TrackNDoFSubdetector2,BremDLLbeElectron,MuonLooseFlag,FlagSpd,SpdE,EcalDLLbeElectron,DLLmuon,RICHpFlagElectron,EcalDLLbeMuon,...,TrackNDoF,RICHpFlagMuon,RICH_DLLbeKaon,RICH_DLLbeElectron,HcalE,MuonFlag,FlagMuon,PrsE,RICH_DLLbeMuon,RICH_DLLbeProton
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,True,False,False,True,True,True,False,True,...,False,False,True,True,True,False,False,True,True,True
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
5,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
6,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
7,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
8,False,False,True,False,False,False,True,False,False,True,...,False,False,False,False,True,False,False,False,False,False
9,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [133]:
ss = data.isin([-999.0])
s = ss.apply(pd.Series.value_counts, axis=1)
s[True].value_counts()

2.0     550902
14.0    119868
3.0     114948
4.0      63425
5.0      56974
15.0     34372
8.0      32122
11.0     22141
1.0      15380
24.0     13308
12.0     12034
7.0      10754
6.0       8781
13.0      7101
9.0       6186
10.0      4620
17.0      1003
19.0       480
16.0        89
23.0        83
18.0         3
Name: True, dtype: int64

In [139]:
s[True].value_counts(dropna = False)

 2.0     550902
NaN      125426
 14.0    119868
 3.0     114948
 4.0      63425
 5.0      56974
 15.0     34372
 8.0      32122
 11.0     22141
 1.0      15380
 24.0     13308
 12.0     12034
 7.0      10754
 6.0       8781
 13.0      7101
 9.0       6186
 10.0      4620
 17.0      1003
 19.0       480
 16.0        89
 23.0        83
 18.0         3
Name: True, dtype: int64

In [258]:
data2 = data[(s[True] < 1)  | (s[True].isna())]

In [259]:
data2.shape

(125426, 50)

In [376]:
for i in range(50):
    print(ss.iloc[:, i].value_counts())

False    1200000
Name: TrackP, dtype: int64
False    1200000
Name: TrackNDoFSubdetector2, dtype: int64
False    950439
True     249561
Name: BremDLLbeElectron, dtype: int64
False    1200000
Name: MuonLooseFlag, dtype: int64
False    1200000
Name: FlagSpd, dtype: int64
False    1023833
True      176167
Name: SpdE, dtype: int64
False    981143
True     218857
Name: EcalDLLbeElectron, dtype: int64
False    1186609
True       13391
Name: DLLmuon, dtype: int64
False    1200000
Name: RICHpFlagElectron, dtype: int64
False    981143
True     218857
Name: EcalDLLbeMuon, dtype: int64
False    1200000
Name: TrackQualitySubdetector2, dtype: int64
False    1200000
Name: FlagPrs, dtype: int64
False    1186609
True       13391
Name: DLLelectron, dtype: int64
False    1186609
True       13391
Name: DLLkaon, dtype: int64
False    981143
True     218857
Name: EcalE, dtype: int64
False    1200000
Name: TrackQualityPerNDoF, dtype: int64
False    1186609
True       13391
Name: DLLproton, dtype: int64
False

In [111]:
s[-999].value_counts()

2.0     4578
14.0    1057
3.0      904
4.0      552
5.0      490
8.0      275
15.0     271
11.0     184
1.0      135
24.0     121
12.0     101
7.0       88
6.0       70
9.0       55
13.0      54
10.0      44
17.0       6
16.0       2
19.0       1
Name: -999.0, dtype: int64

In [8]:
data.loc[:10]

Unnamed: 0,TrackP,TrackNDoFSubdetector2,BremDLLbeElectron,MuonLooseFlag,FlagSpd,SpdE,EcalDLLbeElectron,DLLmuon,RICHpFlagElectron,EcalDLLbeMuon,...,TrackNDoF,RICHpFlagMuon,RICH_DLLbeKaon,RICH_DLLbeElectron,HcalE,MuonFlag,FlagMuon,PrsE,RICH_DLLbeMuon,RICH_DLLbeProton
0,74791.156263,15.0,0.232275,1.0,1.0,3.2,-2.505719,6.604153,1.0,1.92996,...,28.0,1.0,-7.2133,-0.2802,5586.589846,1.0,1.0,10.422315,-2.081143e-07,-24.8244
1,2738.489989,15.0,-0.357748,0.0,1.0,3.2,1.864351,0.263651,1.0,-2.061959,...,32.0,1.0,-0.324317,1.707283,-7e-06,0.0,1.0,43.334935,2.771583,-0.648017
2,2161.409908,17.0,-999.0,0.0,0.0,-999.0,-999.0,-999.0,0.0,-999.0,...,27.0,0.0,-999.0,-999.0,-999.0,0.0,0.0,-999.0,-999.0,-999.0
3,15277.73049,20.0,-0.638984,0.0,1.0,3.2,-2.533918,-8.724949,1.0,-3.253981,...,36.0,1.0,-35.202221,-14.742319,4482.803707,0.0,1.0,2.194175,-3.070819,-29.291519
4,7563.700195,19.0,-0.638962,0.0,1.0,3.2,-2.087146,-7.060422,1.0,-0.995816,...,33.0,1.0,25.084287,-10.272412,5107.55468,0.0,1.0,1.5e-05,-5.373712,23.653087
5,62641.62109,17.0,0.976355,0.0,1.0,3.2,-2.649216,-3.767491,1.0,1.282086,...,40.0,1.0,29.475203,-3.059098,20529.441404,0.0,1.0,2.468433,-1.194598,1.010202
6,18872.81057,14.0,2.345886,0.0,1.0,3.2,-3.027858,-5.173245,1.0,0.750181,...,26.0,1.0,26.711504,-3.326296,19248.388672,0.0,1.0,2.742722,-1.859796,13.021704
7,1993.550048,3.0,0.170659,0.0,1.0,0.0,1.864349,0.101,1.0,0.382705,...,13.0,0.0,6e-06,-37.474493,694.30664,0.0,1.0,-1.5e-05,-0.244894,6e-06
8,90635.296871,8.0,-999.0,0.0,1.0,3.2,-999.0,2e-06,1.0,-999.0,...,22.0,1.0,-1.552902,0.561498,-999.0,0.0,1.0,119.85675,0.08879832,-3.197502
9,11633.669941,16.0,0.976349,0.0,1.0,0.0,-2.479154,-0.631769,1.0,0.449661,...,28.0,1.0,9.489098,-0.643303,913.806574,0.0,1.0,22.764572,-0.2466028,9.954897


In [71]:
data.loc[(data.BremDLLbeElectron != -999.0)]

Unnamed: 0,TrackP,TrackNDoFSubdetector2,BremDLLbeElectron,MuonLooseFlag,FlagSpd,SpdE,EcalDLLbeElectron,DLLmuon,RICHpFlagElectron,EcalDLLbeMuon,...,TrackNDoF,RICHpFlagMuon,RICH_DLLbeKaon,RICH_DLLbeElectron,HcalE,MuonFlag,FlagMuon,PrsE,RICH_DLLbeMuon,RICH_DLLbeProton
0,74791.156263,15.0,0.232275,1.0,1.0,3.2,-2.505719,6.604153,1.0,1.929960,...,28.0,1.0,-7.213300e+00,-0.280200,5586.589846,1.0,1.0,10.422315,-2.081143e-07,-2.482440e+01
1,2738.489989,15.0,-0.357748,0.0,1.0,3.2,1.864351,0.263651,1.0,-2.061959,...,32.0,1.0,-3.243169e-01,1.707283,-0.000007,0.0,1.0,43.334935,2.771583e+00,-6.480169e-01
3,15277.730490,20.0,-0.638984,0.0,1.0,3.2,-2.533918,-8.724949,1.0,-3.253981,...,36.0,1.0,-3.520222e+01,-14.742319,4482.803707,0.0,1.0,2.194175,-3.070819e+00,-2.929152e+01
4,7563.700195,19.0,-0.638962,0.0,1.0,3.2,-2.087146,-7.060422,1.0,-0.995816,...,33.0,1.0,2.508429e+01,-10.272412,5107.554680,0.0,1.0,0.000015,-5.373712e+00,2.365309e+01
5,62641.621090,17.0,0.976355,0.0,1.0,3.2,-2.649216,-3.767491,1.0,1.282086,...,40.0,1.0,2.947520e+01,-3.059098,20529.441404,0.0,1.0,2.468433,-1.194598e+00,1.010202e+00
6,18872.810570,14.0,2.345886,0.0,1.0,3.2,-3.027858,-5.173245,1.0,0.750181,...,26.0,1.0,2.671150e+01,-3.326296,19248.388672,0.0,1.0,2.742722,-1.859796e+00,1.302170e+01
7,1993.550048,3.0,0.170659,0.0,1.0,0.0,1.864349,0.101000,1.0,0.382705,...,13.0,0.0,5.960523e-06,-37.474493,694.306640,0.0,1.0,-0.000015,-2.448940e-01,5.960523e-06
9,11633.669941,16.0,0.976349,0.0,1.0,0.0,-2.479154,-0.631769,1.0,0.449661,...,28.0,1.0,9.489098e+00,-0.643303,913.806574,0.0,1.0,22.764572,-2.466028e-01,9.954897e+00
10,3432.929934,8.0,-0.542416,0.0,1.0,0.0,-999.000000,6.701406,1.0,-999.000000,...,27.0,1.0,5.715908e+00,-9.147492,-999.000000,0.0,0.0,2.742733,3.250301e+01,6.394708e+00
11,21985.539079,18.0,0.140034,0.0,1.0,3.2,-2.947558,-2.670759,1.0,-1.506076,...,33.0,1.0,1.720200e+00,2.555100,7930.257817,0.0,1.0,1.371361,4.344000e-01,7.361140e+01


In [64]:
(data.BremDLLbeElectron == -999).count()

1200000

In [62]:
np.max(data['BremDLLbeElectron'].value_counts())

249561

In [67]:
data2 = data.groupby('BremDLLbeElectron').count()

In [70]:
data2.shape

(950150, 49)

### Look at the labels set

In [10]:
set(data.Label)

{'Electron', 'Ghost', 'Kaon', 'Muon', 'Pion', 'Proton'}

### Define training features

Exclude `Label` from the features set

In [5]:
features = list(set(data.columns) - {'Label'})
features

['TrackNDoFSubdetector1',
 'RICH_DLLbeProton',
 'TrackNDoFSubdetector2',
 'DLLmuon',
 'FlagRICH1',
 'FlagSpd',
 'FlagRICH2',
 'DLLelectron',
 'GhostProbability',
 'TrackP',
 'RICHpFlagProton',
 'TrackDistanceToZ',
 'HcalDLLbeElectron',
 'TrackQualityPerNDoF',
 'FlagPrs',
 'TrackNDoF',
 'MuonLLbeMuon',
 'FlagBrem',
 'EcalDLLbeElectron',
 'EcalDLLbeMuon',
 'FlagHcal',
 'EcalShowerLongitudinalParameter',
 'RICHpFlagKaon',
 'MuonLLbeBCK',
 'Calo3dFitQuality',
 'FlagMuon',
 'BremDLLbeElectron',
 'MuonFlag',
 'MuonLooseFlag',
 'DLLproton',
 'RICH_DLLbeMuon',
 'RICH_DLLbeKaon',
 'PrsDLLbeElectron',
 'DLLkaon',
 'EcalE',
 'FlagEcal',
 'RICHpFlagElectron',
 'Calo2dFitQuality',
 'HcalE',
 'TrackQualitySubdetector1',
 'PrsE',
 'HcalDLLbeMuon',
 'RICH_DLLbeElectron',
 'RICH_DLLbeBCK',
 'TrackPt',
 'RICHpFlagMuon',
 'TrackQualitySubdetector2',
 'RICHpFlagPion',
 'SpdE']

### Divide training data into 2 parts

In [6]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier

In [None]:
training_data, validation_data = train_test_split(data, random_state=11, train_size=0.1, test_size = 0.9)

In [16]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

X_train = scaler.fit_transform(training_data[features])
X_val = scaler.fit_transform(validation_data[features])

In [17]:
len(training_data), len(validation_data)

(120000, 1080000)

In [48]:
tree = DecisionTreeClassifier()
tree.fit(X_train, training_data.Label)
proba = tree.predict_proba(X_val)
log_loss(validation_data.Label, proba)

12.831347312800409

In [51]:
clf = ExtraTreesClassifier(n_estimators=500, criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=5)
clf.fit(X_train, training_data.Label)
proba = clf.predict_proba(X_val)
log_loss(validation_data.Label, proba)

0.68939210045137778

In [382]:
data2 = data[(s[True] < 16)  | (s[True].isna())]
data3 = data[s[True] > 15]
print(data2.shape)
print(data3.shape)

(1185034, 50)
(14966, 50)


In [387]:
training_data, aaa = train_test_split(data2, random_state=11, train_size=0.2, test_size = 0.0)
bbb, validation_data = train_test_split(data, random_state=250, train_size=0.00, test_size = 0.2)

X_train = scaler.fit_transform(training_data[features])
X_val = scaler.fit_transform(validation_data[features])

In [388]:
print(X_train.shape)
print(X_val.shape)

(237006, 49)
(240000, 49)


In [386]:
X_train = scaler.fit_transform(data[features])
X_train.shape

(1200000, 49)

In [389]:
clf = RandomForestClassifier(n_estimators=100, criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=5)
clf.fit(X_train, training_data.Label)
proba = clf.predict_proba(X_val)
log_loss(validation_data.Label, proba)

1.0413836076932936

In [None]:
clf = KNeighborsClassifier(n_neighbors=5, leaf_size=30,
                           p=2, metric_params=None, n_jobs=4)
clf.fit(X_train, training_data.Label)
proba = clf.predict_proba(X_val)
log_loss(validation_data.Label, proba)

In [30]:
param_grid ={'min_samples_leaf': [1, 10]}
gscv = GridSearchCV(clf, param_grid, scoring='neg_log_loss', cv=3)
#gscv = GridSearchCV(clf, param_grid, scoring='neg_log_loss', cv=3, n_jobs=-1, verbose=1)
gscv.fit(X_train, training_data.Label)

GridSearchCV(cv=3, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=6, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'min_samples_leaf': [1, 10]}, pre_dispatch='2*n_jobs',
       refit=True, return_train_score=True, scoring='neg_log_loss',
       verbose=0)

In [28]:
gscv.best_params_

{'n_estimators': 2}

In [10]:
from sklearn.ensemble import GradientBoostingClassifier as XGBClassifier

In [27]:
clf2 = XGBClassifier(learning_rate= 0.3, n_estimators=50, subsample=1.0,
                            min_samples_split=2, min_samples_leaf = 3, 
                           min_weight_fraction_leaf=0.0, max_depth=3,
                           min_impurity_split=None, init=None, random_state=None,
                           max_features=None, verbose=0, max_leaf_nodes=None, warm_start=False)

clf2.fit(X_train, training_data.Label)
proba = clf2.predict_proba(X_val)
print(log_loss(validation_data.Label, proba))

In [29]:
from sklearn.model_selection import GridSearchCV
param_grid ={'max_depth': np.linspace(1,9,5)}
gscv = GridSearchCV(clf2, param_grid, scoring='neg_log_loss', cv=3)
#gscv = GridSearchCV(clf, param_grid, scoring='neg_log_loss', cv=3, n_jobs=-1, verbose=1)
gscv.fit(X_train, training_data.Label)

KeyboardInterrupt: 

In [30]:
gscv.fit(X_train, training_data.Label)

KeyboardInterrupt: 

In [28]:
gscv.best_params_

0.74428425925925923

### Simple logistic regression forest from `sklearn` training

train multiclassification model

In [46]:
from xgboost import XGBClassifier,XGBRegressor
from catboost import CatBoostClassifier, CatBoostRegressor
from lightgbm import LGBMClassifier, LGBMRegressor
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor

ImportError: No module named xgboost

In [45]:
!pip install xgboost > xgboost.log

[31mCommand "python setup.py egg_info" failed with error code 1 in /private/var/folders/_9/tp1qlhsd16ldfxdjgfhzlr3h0000gn/T/pip-build-OtOfJB/xgboost/[0m


In [336]:
training_data, validation_data = train_test_split(data, random_state=11, train_size=0.1, test_size = 0.1)
X_train = scaler.fit_transform(training_data[features])
print(X_train.shape)
clf = RandomForestClassifier(n_estimators=100, criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=5)
clf.fit(X_train, training_data.Label)

(120000, 49)


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=5, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

Train best model:

### Evaluate predictions on the validation sample

In [30]:
# predict each track
proba = model.predict_proba(X_val)

### Log loss on the cross validation sample

In [31]:
log_loss(validation_data.Label, proba)

0.69576085141765509

## Prepare submission to kaggle

In [337]:
# predict test sample
X_test = scaler.fit_transform(test[features])
kaggle_proba = clf.predict_proba(X_test)
kaggle_ids = test.ID

In [338]:
from IPython.display import FileLink

def create_solution(ids, proba, names, filename='baseline.csv'):
    """saves predictions to file and provides a link for downloading """
    solution = pd.DataFrame({'ID': ids})
    
    for name in ['Ghost', 'Electron', 'Muon', 'Pion', 'Kaon', 'Proton']:
        solution[name] = proba[:, np.where(names == name)[0]]
    
    solution.to_csv('{}'.format(filename), index=False)
    return FileLink('{}'.format(filename))
    
create_solution(kaggle_ids, kaggle_proba, clf.classes_)