In [57]:
%matplotlib inline
from sklearn.cross_validation import train_test_split
import pandas as pd
import numpy as np
import string
import seaborn as sns
from sklearn.pipeline import Pipeline
from sklearn.grid_search import GridSearchCV

In [58]:
train = pd.read_csv('input/tr.csv')
test = pd.read_csv('input/test.csv')
all_data = pd.concat([train, test])

In [59]:
from sklearn.preprocessing import LabelEncoder
def encode(train, test):
    le = LabelEncoder().fit(train.species) 
    labels = le.transform(train.species)           # encode species strings
    classes = list(le.classes_)                    # save column names for submission
    test_ids = test.id                             # save test ids for submission
    
    train = train.drop(['species', 'id'], axis=1)  
    test = test.drop(['id'], axis=1)
    
    return train, labels, test, test_ids, classes

train, labels, test, test_ids, classes = encode(train, test)
train.head(1)

Unnamed: 0,margin1,margin2,margin3,margin4,margin5,margin6,margin7,margin8,margin9,margin10,...,texture55,texture56,texture57,texture58,texture59,texture60,texture61,texture62,texture63,texture64
0,0.007812,0.023438,0.023438,0.003906,0.011719,0.009766,0.027344,0.0,0.001953,0.033203,...,0.007812,0.0,0.00293,0.00293,0.035156,0.0,0.0,0.004883,0.0,0.025391


In [60]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler().fit(train)
train = sc.transform(train)



from sklearn.cross_validation import StratifiedShuffleSplit
sss = StratifiedShuffleSplit(labels, 10, test_size=0.20, random_state=23)

for train_index, val_index in sss:
    X_train, X_val = train[train_index], train[val_index]
    y_train, y_val = labels[train_index], labels[val_index]

In [61]:
from sklearn.linear_model import LogisticRegression as lr
params = {
    'C':[1000, 2000, 3000], 
    'tol': [0.001, 0.01]
    }

gs = GridSearchCV(lr(penalty='l2', solver='newton-cg', multi_class='multinomial'), param_grid=params, scoring='log_loss')

In [62]:
gs.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=1, penalty='l2', random_state=None, solver='newton-cg',
          tol=0.0001, verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'tol': [0.001, 0.01], 'C': [1000, 2000, 3000]},
       pre_dispatch='2*n_jobs', refit=True, scoring='log_loss', verbose=0)

In [63]:
gs.best_params_ 

{'C': 2000, 'tol': 0.001}

In [64]:
from sklearn.metrics import accuracy_score, log_loss

best_lr = gs.best_estimator_
predictions = best_lr.predict(X_val)
acc = accuracy_score(y_val, predictions)
prob = best_lr.predict_proba(X_val)
logloss = log_loss(y_val, prob)
print(acc)
print(logloss)

0.989898989899
0.0221098564026


In [66]:
lr_for_submission = lr(C=2000, penalty='l2', tol=0.001, solver='newton-cg', multi_class='multinomial')
lr_for_submission.fit(train, labels)

LogisticRegression(C=2000, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='multinomial',
          n_jobs=1, penalty='l2', random_state=None, solver='newton-cg',
          tol=0.001, verbose=0, warm_start=False)

In [67]:
scaler = StandardScaler().fit(test)
test = scaler.transform(test)
test_predictions = lr_for_submission.predict_proba(test)

In [68]:
submission = pd.DataFrame(test_predictions, columns=classes)

In [69]:
submission.insert(0, 'id', test_ids)

In [70]:
submission.tail()

Unnamed: 0,id,Acer_Capillipes,Acer_Circinatum,Acer_Mono,Acer_Opalus,Acer_Palmatum,Acer_Pictum,Acer_Platanoids,Acer_Rubrum,Acer_Rufinerve,...,Salix_Fragilis,Salix_Intergra,Sorbus_Aria,Tilia_Oliveri,Tilia_Platyphyllos,Tilia_Tomentosa,Ulmus_Bergmanniana,Viburnum_Tinus,Viburnum_x_Rhytidophylloides,Zelkova_Serrata
589,1576,1.202762e-07,0.9999278,2.553694e-08,1.587918e-10,4.054814e-05,1.18832e-09,2.745718e-10,6.067553e-07,1.940357e-06,...,9.179105e-09,5.487835e-09,9.850891e-09,1.101469e-09,1.093571e-10,8.685867e-10,3.375683e-07,3.880784e-11,6.979956e-10,6.386308e-06
590,1577,3.89318e-08,4.110406e-08,2.110079e-10,1.124857e-07,3.761114e-09,2.845141e-11,5.056677e-09,1.353143e-05,5.353361e-05,...,6.044481e-09,2.596397e-09,6.18357e-06,7.140593e-10,1.75457e-05,1.566897e-05,2.286367e-07,5.894726e-11,8.792628e-10,1.609163e-05
591,1579,8.139623e-10,2.498155e-10,4.556931e-11,3.116939e-13,2.49286e-09,4.643892e-09,1.567921e-12,2.161275e-09,1.907297e-10,...,8.578305e-14,1.197044e-13,6.751291e-10,6.630584e-11,4.156543e-10,4.657194e-15,2.617162e-13,5.128736e-11,4.080694e-10,6.168079e-09
592,1580,7.517675e-10,1.791131e-10,2.369757e-08,3.282074e-09,9.795146e-09,1.313835e-11,3.969928e-08,2.205755e-07,4.711533e-11,...,3.679447e-08,5.48217e-07,4.762261e-12,2.085401e-07,5.871543e-12,2.207742e-08,1.850052e-10,4.692398e-10,9.631175e-13,6.984634e-11
593,1583,8.846914e-13,6.852121e-09,4.107826e-09,1.484884e-09,4.685332e-10,5.264904e-07,1.997298e-08,5.679432e-09,1.016609e-09,...,1.511257e-10,1.333899e-11,3.545895e-11,3.80979e-10,2.188963e-09,2.722873e-09,1.66948e-11,1.651252e-10,2.028744e-12,5.074998e-09


In [71]:
submission.to_csv('lr_2.csv', index=False)

In [None]:
preds.shape
withSpecies = pd.DataFrame(preds, columns= train.species.unique()).sort_index(axis=1)
withSpecies['id'] = test.id

In [None]:
cols = list(withSpecies)
cols.insert(0, cols.pop(cols.index('id')))
withSpecies = withSpecies.ix[:, cols]

In [None]:
withSpecies.to_csv('one.csv', index=False)