In [1]:
%matplotlib inline
from sklearn.cross_validation import train_test_split
from nltk.corpus import stopwords
import pandas as pd
import numpy as np
import string
import seaborn as sns
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.grid_search import GridSearchCV

In [3]:
train = pd.read_csv('input/tr.csv')
test = pd.read_csv('input/test.csv')
all_data = pd.concat([train, test])

In [4]:
from sklearn.preprocessing import LabelEncoder
def encode(train, test):
    le = LabelEncoder().fit(train.species) 
    labels = le.transform(train.species)           # encode species strings
    classes = list(le.classes_)                    # save column names for submission
    test_ids = test.id                             # save test ids for submission
    
    train = train.drop(['species', 'id'], axis=1)  
    test = test.drop(['id'], axis=1)
    
    return train, labels, test, test_ids, classes

train, labels, test, test_ids, classes = encode(train, test)
train.head(1)

Unnamed: 0,margin1,margin2,margin3,margin4,margin5,margin6,margin7,margin8,margin9,margin10,...,texture55,texture56,texture57,texture58,texture59,texture60,texture61,texture62,texture63,texture64
0,0.007812,0.023438,0.023438,0.003906,0.011719,0.009766,0.027344,0.0,0.001953,0.033203,...,0.007812,0.0,0.00293,0.00293,0.035156,0.0,0.0,0.004883,0.0,0.025391


In [4]:
from sklearn.cross_validation import StratifiedShuffleSplit
sss = StratifiedShuffleSplit(labels, 10, test_size=0.10, random_state=23)

for train_index, test_index in sss:
    X_train, X_test = train.values[train_index], train.values[test_index]
    y_train, y_test = labels[train_index], labels[test_index]

In [5]:
sss_cal = StratifiedShuffleSplit(y_train, 10, test_size=0.15, random_state=32)
for train_i, cal_i in sss_cal:
    X_clf, X_cal = X_train[train_i], X_train[cal_i]
    y_clf, y_cal = y_train[train_i], y_train[cal_i]
    

In [32]:
from sklearn.ensemble import RandomForestClassifier as Rf
params = {
    'n_estimators':[100, 250, 400], 
    'criterion': ['gini', 'entropy'],
    'max_depth':[None, 1, 3, 5, 10, 20]
    }

gs = GridSearchCV(Rf(), param_grid=params, scoring='log_loss')

In [33]:
gs.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'max_depth': [None, 1, 3, 5, 10, 20], 'n_estimators': [100, 250, 400], 'criterion': ['gini', 'entropy']},
       pre_dispatch='2*n_jobs', refit=True, scoring='log_loss', verbose=0)

In [37]:
gs.best_params_ 

{'criterion': 'gini', 'max_depth': None, 'n_estimators': 400}

In [38]:
from sklearn.metrics import accuracy_score, log_loss

best_rf = gs.best_estimator_
predictions = best_rf.predict(X_test)
acc = accuracy_score(y_test, predictions)
prob = best_rf.predict_proba(X_test)
logl = log_loss(y_test, prob)
print(acc)
print(logl)

0.969696969697
0.757715436382


In [39]:
rf_for_cal = Rf(criterion='gini', max_depth= None, n_estimators= 400)

In [40]:
from sklearn.calibration import CalibratedClassifierCV
cal_cv = CalibratedClassifierCV(base_estimator=rf_for_cal, method='isotonic')

In [41]:
cal_cv.fit(X_train, y_train)

CalibratedClassifierCV(base_estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=400, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
            cv=3, method='isotonic')

In [42]:
sig_clf_probs = cal_cv.predict_proba(X_test)

In [43]:
log_loss(y_test, sig_clf_probs)

0.13874800404617035

In [44]:
cal_cv.fit(train.values, labels)

CalibratedClassifierCV(base_estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=400, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
            cv=3, method='isotonic')

In [45]:
cal_test_predictions = cal_cv.predict_proba(test)

In [46]:
submission = pd.DataFrame(cal_test_predictions, columns=classes)

In [47]:
submission.insert(0, 'id', test_ids)

In [48]:
submission.tail()

Unnamed: 0,id,Acer_Capillipes,Acer_Circinatum,Acer_Mono,Acer_Opalus,Acer_Palmatum,Acer_Pictum,Acer_Platanoids,Acer_Rubrum,Acer_Rufinerve,...,Salix_Fragilis,Salix_Intergra,Sorbus_Aria,Tilia_Oliveri,Tilia_Platyphyllos,Tilia_Tomentosa,Ulmus_Bergmanniana,Viburnum_Tinus,Viburnum_x_Rhytidophylloides,Zelkova_Serrata
589,1576,0.0,0.997475,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
590,1577,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
591,1579,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
592,1580,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
593,1583,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [49]:
submission.to_csv('cal_rf_3.csv', index=False)

In [None]:
preds.shape
withSpecies = pd.DataFrame(preds, columns= train.species.unique()).sort_index(axis=1)
withSpecies['id'] = test.id

In [None]:
cols = list(withSpecies)
cols.insert(0, cols.pop(cols.index('id')))
withSpecies = withSpecies.ix[:, cols]

In [None]:
withSpecies.to_csv('one.csv', index=False)