In [1]:
import numpy as np
import pandas as pd
import matplotlib.pylab as plt
import seaborn as sns
%matplotlib inline

#### Read data 

In [2]:
# All six input files are tab-separated. Load the three training parts and
# the three test parts into individually named frames.
train_1, train_2, train_3 = (
    pd.read_csv(path, sep='\t')
    for path in ('data/train1.csv', 'data/train2.csv', 'data/train3.csv')
)

test_1, test_2, test_3 = (
    pd.read_csv(path, sep='\t')
    for path in ('data/test1.csv', 'data/test2.csv', 'data/test3.csv')
)

#### Prepare columns, feature engineering 

In [3]:
def _add_missing_columns(base, *others):
    """Return a copy of ``base`` extended with the columns it lacks.

    Every frame in ``others`` is scanned in order and each column whose name
    is absent from ``base`` is copied over. When two of the extra frames share
    such a column, the later frame's values win.
    """
    combined = base.copy()
    for other in others:
        for name in other.columns:
            if name not in base.columns:
                combined[name] = other[name]
    return combined


# Part 1 is the base frame; parts 2 and 3 only contribute new columns.
train = _add_missing_columns(train_1, train_2, train_3)
test = _add_missing_columns(test_1, test_2, test_3)

In [4]:
def prepare_columns(data):
    """Drop unused columns, normalise column names and one-hot encode flags.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the raw feature columns. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame without the unused columns, with the three awkward
        column names renamed, and with every column listed in
        ``dummy_columns`` replaced by its ``pd.get_dummies`` expansion.
    """
    unused_columns = ['mean_scale', 'mean_angle', 'mymax(angles_abs)',
                      'mymin(angles_abs)', 'set_index', 'set_name']

    renamed_columns = {'getParamOrderAdd(angles)': 'angle_ordered',
                       'getParamOrderMult(scales)': 'scale_ordered',
                       # The source column name carries a leading space.
                       ' mymin(angles_abs)_relative': 'mymin(angles_abs)_relative'}

    dummy_columns = ['correct_scale', 'correct_angle', 'has_median_angle',
                     'has_extreme_angle', 'mean_scale_more_then_1',
                     'angle_ordered', 'scale_ordered',
                     'has_median_angle_scaled', 'has_min_related_angle']

    # drop()/rename() without ``inplace`` return new frames, so the caller's
    # frame keeps its original columns and the function can be called again
    # on the same raw input.
    cleaned = data.drop(unused_columns, axis=1).rename(columns=renamed_columns)

    return pd.get_dummies(cleaned, columns=dummy_columns)

In [5]:
# Apply the shared column clean-up / one-hot encoding to both splits.
# NOTE(review): both names are rebound to the encoded result, so this cell is
# not idempotent - running it a second time passes already-processed frames
# back into prepare_columns. Re-run the merge cell before re-running this one.
train = prepare_columns(train)
test = prepare_columns(test)

In [6]:
# Separate the target column from the features and cast everything to float64.
train_x = train.drop('image_id', axis=1).astype(np.float64)

# get_dummies ran on train and test independently, so a category level that
# occurs in only one of them would leave the two frames with different (or
# differently ordered) columns. Reindexing onto the training columns, with
# absent columns filled with 0, guarantees the scaler and every model see
# the same feature layout for both splits.
test = (test.drop('image_id', axis=1)
            .reindex(columns=train_x.columns, fill_value=0)
            .astype(np.float64))

In [7]:
from sklearn.preprocessing import StandardScaler
# Estimate the scaling statistics on the training rows only, then apply that
# same transformation to both the training and the test features.
sc = StandardScaler().fit(train_x)
train_x = sc.transform(train_x)
test = sc.transform(test)

In [8]:
# Five-way target: the 'image_id' column as an int32 array.
train_y_5 = train['image_id'].astype(np.int32).values

# Coarse three-way target: labels 1 and 2 -> 1, label 3 -> 2, anything else -> 3.
coarse_label = {1: 1, 2: 1, 3: 2}
train_y_3 = [coarse_label.get(x, 3) for x in train_y_5]

### Models 

In [39]:
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import cross_val_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Perceptron
from sklearn.linear_model import Ridge
from sklearn.linear_model import SGDClassifier

from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier

from sklearn.svm import SVC

from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid

from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB

In [124]:
# Hold out 33% of the training rows (fixed seed) using the 3-class target.
x_tr, x_cv, y_tr, y_cv = train_test_split(
    train_x,
    train_y_3,
    test_size=0.33,
    random_state=42
)

In [40]:
def get_value_counts(preds, n):
    """Return how often the predicted labels occur, most frequent first.

    Parameters
    ----------
    preds : iterable of numbers
        Predicted labels. Values outside ``1..n`` are discarded.
    n : int
        Number of valid classes; also the length of the result.

    Returns
    -------
    list of int
        Exactly ``n`` counts in the order produced by
        ``Series.value_counts`` (descending), right-padded with zeros for
        labels that never occur. The counts are NOT indexed by label.
    """
    valid_labels = list(range(1, n + 1))

    # A real list (rather than a lazy ``filter`` object) behaves the same on
    # Python 2 and Python 3, and filtering for every ``n`` - not only 3 and
    # 5 - guarantees that at most ``n`` distinct labels are counted.
    kept = [p for p in preds if p in valid_labels]

    vals = pd.Series(kept).value_counts().values.tolist()
    vals.extend([0] * (n - len(vals)))
    return vals
    
def get_numbers_module(preds):
    """Total deviation from "each label exactly once" over groups of five.

    ``preds`` is cut into consecutive groups of five items (a trailing
    incomplete group is ignored). For every group and every label 1..5 the
    absolute difference between the label's occurrence count and 1 is added
    to the returned total.
    """
    full_groups = len(preds) // 5
    total = 0
    for g in range(full_groups):
        # Element-wise indexing (not slicing) so any indexable input works.
        group = [preds[g * 5 + offset] for offset in range(5)]
        total += sum(abs(group.count(label) - 1) for label in range(1, 6))
    return total
    
def check_models_score(cls):
    """Print a comparison table for a list of estimators.

    For each estimator in ``cls`` one row is printed containing, for the
    3-class target (``train_y_3``) and the 5-class target (``train_y_5``):

    * ``absnum`` - ``get_numbers_module`` of the rounded test predictions,
    * ``mse``    - mean squared error between the reference counts and the
                   counts returned by ``get_value_counts``,
    * ``cv``     - mean 3-fold ``cross_val_score`` on the training data.

    Reads the module-level ``train_x``, ``train_y_3``, ``train_y_5`` and
    ``test``. Every estimator is refitted in place and is left fitted on the
    5-class target when the function returns.
    """
    # Reference counts the predicted label distribution is compared with
    # (presumably the expected number of test rows per class - confirm).
    # Listed largest first, matching the descending order in which
    # get_value_counts returns its counts.
    etal_3 = [548, 548, 274]
    etal_5 = [274, 274, 274, 274, 274]

    row_format = '%f \t %f \t %f \t %f \t %f \t %f'
    print('absnum_3 \t absnum_5 \t mse_3 \t mse_5 \t cv_3 \t cv_5')
    for cl in cls:
        cv3 = cross_val_score(cl, train_x, train_y_3, cv=3).mean()
        cv5 = cross_val_score(cl, train_x, train_y_5, cv=3).mean()

        # Predictions are rounded to the nearest integer before counting.
        cl.fit(train_x, train_y_3)
        pred3 = [np.rint(x) for x in cl.predict(test)]
        mse3 = mean_squared_error(etal_3, get_value_counts(pred3, 3))
        absnum3 = get_numbers_module(pred3)

        cl.fit(train_x, train_y_5)
        pred5 = [np.rint(x) for x in cl.predict(test)]
        mse5 = mean_squared_error(etal_5, get_value_counts(pred5, 5))
        absnum5 = get_numbers_module(pred5)

        # Format inside the call: ``print(fmt) % values`` only works as a
        # Python 2 print statement and raises TypeError on Python 3.
        print(row_format % (absnum3, absnum5, mse3, mse5, cv3, cv5))

#### Linear models

In [41]:
# One default-configured instance per linear model that is scored below.
log_r = LogisticRegression()
# ElasticNet is imported with the other linear models and passed to
# check_models_score as ``elastic`` in the following cell, so it needs an
# instance here; without it that cell fails on a fresh kernel.
elastic = ElasticNet()
lasso = Lasso()
lin_r = LinearRegression()
perc = Perceptron()
ridge = Ridge()
sgd = SGDClassifier()

In [42]:
# Score the linear models; check_models_score prints one row per estimator,
# in the order given here.
# NOTE(review): ElasticNet, Lasso, LinearRegression and Ridge are regressors,
# so their cv_* columns are presumably R^2 rather than accuracy and are not
# directly comparable with the classifier rows - confirm.
check_models_score([log_r, elastic, lasso, lin_r, perc, ridge, sgd])

absnum_3 	 absnum_5 	 mse_3 	 mse_5 	 cv_3 	 cv_5
1212.000000 	 576.000000 	 312.000000 	 5432.400000 	 0.985656 	 0.905314
2188.000000 	 1143.000000 	 348712.666667 	 55741.000000 	 0.511157 	 0.653116
2192.000000 	 2192.000000 	 350354.666667 	 300304.000000 	 -0.000412 	 0.233291
1474.000000 	 1056.000000 	 16436.666667 	 42090.000000 	 0.779314 	 0.544285
1270.000000 	 794.000000 	 10766.000000 	 5864.000000 	 0.971384 	 0.885386
1511.000000 	 1045.000000 	 21274.666667 	 42747.800000 	 0.942933 	 0.902968
1272.000000 	 754.000000 	 3330.666667 	 15057.200000 	 0.974283 	 0.894444


#### Ensembles

In [43]:
# Default-configured tree / boosting ensembles.
ada = AdaBoostClassifier()
bag = BaggingClassifier()
extr = ExtraTreesClassifier()
gbc = GradientBoostingClassifier()
rf = RandomForestClassifier()

# Both voting ensembles combine the same five named members and differ only
# in the voting rule. Each receives its own list object.
ensemble_members = [('ada', ada), ('bag', bag), ('extr', extr), ('gbc', gbc), ('rf', rf)]
vc_ens_soft = VotingClassifier(estimators=list(ensemble_members), voting='soft')
vc_ens_hard = VotingClassifier(estimators=list(ensemble_members), voting='hard')

In [45]:
# Score the individual ensembles followed by the soft- and hard-voting
# combinations; rows are printed in list order.
check_models_score([ada, bag, extr, gbc, rf, vc_ens_soft, vc_ens_hard])

absnum_3 	 absnum_5 	 mse_3 	 mse_5 	 cv_3 	 cv_5
1600.000000 	 2004.000000 	 37204.666667 	 238942.800000 	 0.982806 	 0.652657
2038.000000 	 1412.000000 	 271530.666667 	 91195.200000 	 0.991403 	 0.916546
1360.000000 	 606.000000 	 9068.666667 	 6342.400000 	 0.982978 	 0.899638
1802.000000 	 1002.000000 	 186452.666667 	 36532.800000 	 0.988455 	 0.899396
1314.000000 	 828.000000 	 7922.666667 	 18900.800000 	 0.988579 	 0.910990
1484.000000 	 1286.000000 	 21248.666667 	 73712.000000 	 0.991354 	 0.922343
1372.000000 	 940.000000 	 12158.000000 	 35303.200000 	 0.994252 	 0.928140


#### SVM 

In [46]:
# Support vector classifier with the library's default hyperparameters.
svc = SVC()

In [47]:
# Single-row score table for the SVM.
check_models_score([svc])

absnum_3 	 absnum_5 	 mse_3 	 mse_5 	 cv_3 	 cv_5
1340.000000 	 330.000000 	 6044.666667 	 2114.800000 	 0.977181 	 0.910870


#### Distance models

In [48]:
# Distance-based classifiers with default hyperparameters.
# NOTE(review): ``knn`` is rebound inside the KNN grid-search cell at the end
# of the notebook, so its meaning depends on which cells ran last.
knn = KNeighborsClassifier()
nc = NearestCentroid()

In [49]:
# Score the two distance-based classifiers (rows in list order).
check_models_score([knn, nc])

absnum_3 	 absnum_5 	 mse_3 	 mse_5 	 cv_3 	 cv_5
1170.000000 	 350.000000 	 116.666667 	 1605.200000 	 0.979982 	 0.913889
1146.000000 	 430.000000 	 1292.666667 	 1932.400000 	 0.968314 	 0.870531


#### Naive Bayes 

In [50]:
# Naive Bayes variants with default hyperparameters.
ber = BernoulliNB()
gnb = GaussianNB()

In [51]:
# Score the two Naive Bayes classifiers (rows in list order).
check_models_score([ber, gnb])

absnum_3 	 absnum_5 	 mse_3 	 mse_5 	 cv_3 	 cv_5
1180.000000 	 512.000000 	 11624.000000 	 3700.800000 	 0.965416 	 0.867754
1290.000000 	 918.000000 	 2818.666667 	 23903.600000 	 0.851367 	 0.701087


### Compare the models with the best MSE

In [52]:
# Re-score a subset of the estimators defined in the sections above so their
# rows appear together in one table (rows in list order).
check_models_score([ber, gnb, knn, nc, svc, extr, log_r, perc, sgd])

absnum_3 	 absnum_5 	 mse_3 	 mse_5 	 cv_3 	 cv_5
1180.000000 	 512.000000 	 11624.000000 	 3700.800000 	 0.965416 	 0.867754
1290.000000 	 918.000000 	 2818.666667 	 23903.600000 	 0.851367 	 0.701087
1170.000000 	 350.000000 	 116.666667 	 1605.200000 	 0.979982 	 0.913889
1146.000000 	 430.000000 	 1292.666667 	 1932.400000 	 0.968314 	 0.870531
1340.000000 	 330.000000 	 6044.666667 	 2114.800000 	 0.977181 	 0.910870
1202.000000 	 628.000000 	 760.666667 	 9544.000000 	 0.971434 	 0.928019
1212.000000 	 576.000000 	 312.000000 	 5432.400000 	 0.985656 	 0.905314
1270.000000 	 794.000000 	 10766.000000 	 5864.000000 	 0.971384 	 0.885386
1198.000000 	 686.000000 	 2238.000000 	 6355.600000 	 0.971458 	 0.896739


### XGB one-vs-rest 

In [51]:
import xgboost as xgb

# Settings shared by every binary booster trained in the cells below.
params = dict(
    objective="binary:logistic",
    booster="gbtree",
    eval_metric="auc",
    eta=0.05,
    max_depth=6,
    subsample=0.9,
    colsample_bytree=0.8,
    silent=1
)

# Number of boosting rounds passed to xgb.train.
num_trees = 100

#### 1-vs-other 

In [54]:
# Sanity check: number of training labels.
len(train_y_5)

350

In [60]:
# Positions of the training rows labelled 1 or 2, and a binary target over
# exactly those rows (1 where the label is 1, else 0).
# The next cell repeats both assignments before it trains.
tr_id_1or2 = [pos for pos, label in enumerate(train_y_5) if label in (1, 2)]
train_y_is_1 = [int(label == 1) for label in train_y_5[tr_id_1or2]]

In [62]:
# Restrict training to the rows labelled 1 or 2; label 1 is the positive class.
tr_id_1or2 = [pos for pos, label in enumerate(train_y_5) if label in (1, 2)]
train_y_is_1 = [int(label == 1) for label in train_y_5[tr_id_1or2]]
x_tr, x_cv, y_tr, y_cv = train_test_split(
    train_x[tr_id_1or2],
    train_y_is_1,
    test_size=0.33,
    random_state=42
)

# Train the booster, reporting AUC on the held-out and the training part.
dtrain, dvalid = xgb.DMatrix(x_tr, y_tr), xgb.DMatrix(x_cv, y_cv)
watchlist = [(dvalid, 'eval'), (dtrain, 'train')]
gbm1 = xgb.train(params, dtrain, num_trees, evals=watchlist, verbose_eval=True)

[0]	eval-auc:0.876880	train-auc:0.968021
[1]	eval-auc:0.905075	train-auc:0.991363
[2]	eval-auc:0.922932	train-auc:0.993697
[3]	eval-auc:0.941729	train-auc:0.995331
[4]	eval-auc:0.925752	train-auc:0.995798
[5]	eval-auc:0.937030	train-auc:0.995798
[6]	eval-auc:0.930451	train-auc:0.995798
[7]	eval-auc:0.930451	train-auc:0.995798
[8]	eval-auc:0.935150	train-auc:0.995798
[9]	eval-auc:0.942669	train-auc:0.995798
[10]	eval-auc:0.952068	train-auc:0.995798
[11]	eval-auc:0.940789	train-auc:0.995798
[12]	eval-auc:0.937030	train-auc:0.995798
[13]	eval-auc:0.940789	train-auc:0.995798
[14]	eval-auc:0.937030	train-auc:0.995798
[15]	eval-auc:0.950188	train-auc:0.995798
[16]	eval-auc:0.940789	train-auc:0.996265
[17]	eval-auc:0.937030	train-auc:0.996265
[18]	eval-auc:0.938910	train-auc:0.997666
[19]	eval-auc:0.937030	train-auc:0.997199
[20]	eval-auc:0.937030	train-auc:0.997199
[21]	eval-auc:0.931391	train-auc:0.996732
[22]	eval-auc:0.931391	train-auc:0.996732
[23]	eval-auc:0.931391	train-auc:0.997666
[2

#### 2-vs-other 

In [63]:
# Same rows as the label-1 model (tr_id_1or2 from the cell above), but with
# label 2 as the positive class.
train_y_is_2 = [int(label == 2) for label in train_y_5[tr_id_1or2]]
x_tr, x_cv, y_tr, y_cv = train_test_split(
    train_x[tr_id_1or2],
    train_y_is_2,
    test_size=0.33,
    random_state=42
)

# Train the booster, reporting AUC on the held-out and the training part.
dtrain, dvalid = xgb.DMatrix(x_tr, y_tr), xgb.DMatrix(x_cv, y_cv)
watchlist = [(dvalid, 'eval'), (dtrain, 'train')]
gbm2 = xgb.train(params, dtrain, num_trees, evals=watchlist, verbose_eval=True)

[0]	eval-auc:0.876880	train-auc:0.968021
[1]	eval-auc:0.905075	train-auc:0.991363
[2]	eval-auc:0.922932	train-auc:0.993697
[3]	eval-auc:0.941729	train-auc:0.995331
[4]	eval-auc:0.925752	train-auc:0.995798
[5]	eval-auc:0.937030	train-auc:0.995798
[6]	eval-auc:0.930451	train-auc:0.995798
[7]	eval-auc:0.930451	train-auc:0.995798
[8]	eval-auc:0.935150	train-auc:0.995798
[9]	eval-auc:0.942669	train-auc:0.995798
[10]	eval-auc:0.952068	train-auc:0.995798
[11]	eval-auc:0.940789	train-auc:0.995798
[12]	eval-auc:0.937030	train-auc:0.995798
[13]	eval-auc:0.940789	train-auc:0.995798
[14]	eval-auc:0.937030	train-auc:0.995798
[15]	eval-auc:0.950188	train-auc:0.995798
[16]	eval-auc:0.940789	train-auc:0.996265
[17]	eval-auc:0.937030	train-auc:0.996265
[18]	eval-auc:0.938910	train-auc:0.997666
[19]	eval-auc:0.937030	train-auc:0.997199
[20]	eval-auc:0.937030	train-auc:0.997199
[21]	eval-auc:0.931391	train-auc:0.996732
[22]	eval-auc:0.931391	train-auc:0.996732
[23]	eval-auc:0.931391	train-auc:0.997666
[2

#### 3-vs-other 

In [332]:
# Binary target over ALL training rows: 1 where the label is 3, else 0.
# It has to be built here because no other cell defines ``train_y_is_3``;
# it is paired with the full ``train_x``, so it covers every row.
train_y_is_3 = [1 if x == 3 else 0 for x in train_y_5]
x_tr, x_cv, y_tr, y_cv = train_test_split(train_x, train_y_is_3, test_size = 0.33, random_state = 42)

# Train the booster, reporting AUC on the held-out and the training part.
dtrain = xgb.DMatrix(x_tr, y_tr)
dvalid = xgb.DMatrix(x_cv, y_cv)
watchlist = [(dvalid, 'eval'), (dtrain, 'train')]
gbm3 = xgb.train(params, dtrain, num_trees, evals=watchlist, verbose_eval=True)

[0]	eval-auc:1	train-auc:0.999761
[1]	eval-auc:1	train-auc:0.999761
[2]	eval-auc:1	train-auc:0.99988
[3]	eval-auc:1	train-auc:0.99988
[4]	eval-auc:1	train-auc:0.99988
[5]	eval-auc:1	train-auc:0.99988
[6]	eval-auc:1	train-auc:0.99988
[7]	eval-auc:1	train-auc:0.99988
[8]	eval-auc:1	train-auc:0.99988
[9]	eval-auc:1	train-auc:0.99988
[10]	eval-auc:1	train-auc:0.99988
[11]	eval-auc:1	train-auc:0.99988
[12]	eval-auc:1	train-auc:0.99988
[13]	eval-auc:1	train-auc:0.99988
[14]	eval-auc:1	train-auc:0.99988
[15]	eval-auc:1	train-auc:0.99988
[16]	eval-auc:1	train-auc:0.99988
[17]	eval-auc:1	train-auc:0.99988
[18]	eval-auc:1	train-auc:0.99988
[19]	eval-auc:1	train-auc:0.99988
[20]	eval-auc:1	train-auc:1
[21]	eval-auc:1	train-auc:1
[22]	eval-auc:1	train-auc:1
[23]	eval-auc:1	train-auc:1
[24]	eval-auc:1	train-auc:1
[25]	eval-auc:1	train-auc:1
[26]	eval-auc:1	train-auc:1
[27]	eval-auc:1	train-auc:1
[28]	eval-auc:1	train-auc:1
[29]	eval-auc:1	train-auc:1
[30]	eval-auc:1	train-auc:1
[31]	eval-auc:1	trai

#### 4-vs-other 

In [65]:
# Restrict training to the rows labelled 4 or 5; label 4 is the positive class.
tr_id_4or5 = [pos for pos, label in enumerate(train_y_5) if label in (4, 5)]
train_y_is_4 = [int(label == 4) for label in train_y_5[tr_id_4or5]]
x_tr, x_cv, y_tr, y_cv = train_test_split(
    train_x[tr_id_4or5],
    train_y_is_4,
    test_size=0.33,
    random_state=42
)

# Train the booster, reporting AUC on the held-out and the training part.
dtrain, dvalid = xgb.DMatrix(x_tr, y_tr), xgb.DMatrix(x_cv, y_cv)
watchlist = [(dvalid, 'eval'), (dtrain, 'train')]
gbm4 = xgb.train(params, dtrain, num_trees, evals=watchlist, verbose_eval=True)

[0]	eval-auc:0.954887	train-auc:0.978525
[1]	eval-auc:0.968985	train-auc:0.995798
[2]	eval-auc:0.990601	train-auc:0.995798
[3]	eval-auc:0.990601	train-auc:0.996265
[4]	eval-auc:0.990601	train-auc:0.996265
[5]	eval-auc:0.990601	train-auc:0.996732
[6]	eval-auc:0.986842	train-auc:0.998366
[7]	eval-auc:0.986842	train-auc:0.998366
[8]	eval-auc:0.986842	train-auc:0.998366
[9]	eval-auc:0.986842	train-auc:0.998366
[10]	eval-auc:0.986842	train-auc:0.998133
[11]	eval-auc:0.986842	train-auc:0.997899
[12]	eval-auc:0.986842	train-auc:0.997899
[13]	eval-auc:0.986842	train-auc:0.997899
[14]	eval-auc:0.986842	train-auc:0.998133
[15]	eval-auc:0.986842	train-auc:0.998599
[16]	eval-auc:0.990601	train-auc:0.998133
[17]	eval-auc:0.986842	train-auc:0.998133
[18]	eval-auc:0.990601	train-auc:0.998133
[19]	eval-auc:0.992481	train-auc:0.998133
[20]	eval-auc:0.990601	train-auc:0.998599
[21]	eval-auc:0.992481	train-auc:0.998133
[22]	eval-auc:0.992481	train-auc:0.998599
[23]	eval-auc:0.992481	train-auc:0.998599
[2

#### 5-vs-other 

In [66]:
# Same rows as the label-4 model (tr_id_4or5 from the cell above), but with
# label 5 as the positive class.
train_y_is_5 = [int(label == 5) for label in train_y_5[tr_id_4or5]]
x_tr, x_cv, y_tr, y_cv = train_test_split(
    train_x[tr_id_4or5],
    train_y_is_5,
    test_size=0.33,
    random_state=42
)

# Train the booster, reporting AUC on the held-out and the training part.
dtrain, dvalid = xgb.DMatrix(x_tr, y_tr), xgb.DMatrix(x_cv, y_cv)
watchlist = [(dvalid, 'eval'), (dtrain, 'train')]
gbm5 = xgb.train(params, dtrain, num_trees, evals=watchlist, verbose_eval=True)

[0]	eval-auc:0.954887	train-auc:0.978525
[1]	eval-auc:0.968985	train-auc:0.995798
[2]	eval-auc:0.990601	train-auc:0.995798
[3]	eval-auc:0.990601	train-auc:0.996265
[4]	eval-auc:0.990601	train-auc:0.996265
[5]	eval-auc:0.990601	train-auc:0.996732
[6]	eval-auc:0.986842	train-auc:0.998366
[7]	eval-auc:0.986842	train-auc:0.998366
[8]	eval-auc:0.986842	train-auc:0.998366
[9]	eval-auc:0.986842	train-auc:0.998366
[10]	eval-auc:0.986842	train-auc:0.998133
[11]	eval-auc:0.986842	train-auc:0.997899
[12]	eval-auc:0.986842	train-auc:0.997899
[13]	eval-auc:0.986842	train-auc:0.997899
[14]	eval-auc:0.986842	train-auc:0.998133
[15]	eval-auc:0.986842	train-auc:0.998599
[16]	eval-auc:0.990601	train-auc:0.998133
[17]	eval-auc:0.986842	train-auc:0.998133
[18]	eval-auc:0.990601	train-auc:0.998133
[19]	eval-auc:0.992481	train-auc:0.998133
[20]	eval-auc:0.990601	train-auc:0.998599
[21]	eval-auc:0.992481	train-auc:0.998133
[22]	eval-auc:0.992481	train-auc:0.998599
[23]	eval-auc:0.992481	train-auc:0.998599
[2

### Make prediction 

#### Predict the coarse classes: {1, 2} / 3 / {4, 5}

In [132]:
from sklearn.decomposition import PCA
from sklearn.svm import SVC
# The pipeline below also needs these names, which the model-import cell
# does not bring in.
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

# L1-penalised linear SVM selects features, then a random forest classifies.
# ``dual=False`` is required: LinearSVC does not support the L1 penalty in
# its default dual formulation and would raise when the pipeline is fitted.
clf = Pipeline([
  ('feature_selection', SelectFromModel(LinearSVC(penalty="l1", dual=False))),
  ('classification', RandomForestClassifier())
])

In [130]:
# Wrap the predictions in a Series so they can be counted and updated by index.
# NOTE(review): ``preds_5`` is only assigned inside the KNN grid-search cell
# at the end of this notebook, so on a fresh top-to-bottom run it does not
# exist yet at this point - the cell relies on out-of-order execution.
pd_preds = pd.Series(preds_5)

In [131]:
# Distribution of the predicted labels.
pd_preds.value_counts()

4    309
5    306
1    292
2    233
3    230
dtype: int64

In [69]:
# Index labels of the rows in each coarse group: code 1 -> {1, 2},
# code 2 -> {3}, code 3 -> {4, 5}.
# NOTE(review): this reads pd_preds as 3-class codes, while the cell that
# builds pd_preds fills it from ``preds_5`` - confirm which predictions are
# meant to be in pd_preds when this runs.
idx_1or2 = pd_preds.index[(pd_preds == 1).values].values
idx_3 = pd_preds.index[(pd_preds == 2).values].values
idx_4or5 = pd_preds.index[(pd_preds == 3).values].values

# Translate the coarse codes into final labels. The three index arrays were
# captured before any write, so the two assignments do not interfere. The
# value 5 is only a placeholder that is overwritten once the {4, 5} rows
# have been resolved.
pd_preds[idx_3] = 3
pd_preds[idx_4or5] = 5

In [71]:
# Feature rows of the test matrix for the two groups that still need a
# finer label.
test_1or2, test_4or5 = test[idx_1or2], test[idx_4or5]

#### Predict 1 class in 1-2 classes 

In [72]:
# Score the {1, 2} test rows with the label-1 booster.
dtest = xgb.DMatrix(test_1or2)
XGB1 = gbm1.predict(dtest)
# Clamp negative scores to zero.
# NOTE(review): with the "binary:logistic" objective set in ``params`` the
# scores are presumably probabilities, which would make this a no-op - confirm.
indices = XGB1 < 0
XGB1[indices] = 0

#### Predict 2 class in 1-2 classes 

In [73]:
# Score the same rows with the label-2 booster. This relies on ``dtest``
# still holding the {1, 2} test rows built in the preceding cell.
XGB2 = gbm2.predict(dtest)
# Clamp negative scores to zero (see the review note on the label-1 scores).
indices = XGB2 < 0
XGB2[indices] = 0

In [74]:
# How many rows each booster scores so that the value rounds to 1.
# Written as print(...) calls with a single argument: identical output on
# Python 2, and no SyntaxError on Python 3.
print(len([x for x in XGB1 if np.rint(x) == 1]))
print(len([x for x in XGB2 if np.rint(x) == 1]))

293
245


In [75]:
# Resolve every {1, 2} row to a single label from the two booster scores:
# when exactly one booster's rounded score is 1 that booster's label wins,
# otherwise the larger raw score decides (ties go to label 2).
preds_1or2 = []
for i, x1 in enumerate(XGB1):
    x2 = XGB2[i]
    rounded = (np.rint(x1), np.rint(x2))

    if rounded == (0, 1):
        x = 2
    elif rounded == (1, 0):
        x = 1
    else:
        x = 2 if x2 >= x1 else 1

    preds_1or2.append(x)

In [76]:
# Number of rows resolved to label 1 and to label 2.
# print(...) calls with a single argument behave the same on Python 2 and 3.
print(len([x for x in preds_1or2 if x == 1]))
print(len([x for x in preds_1or2 if x == 2]))

293
245


#### Predict 4 class in 4-5 classes 

In [77]:
# Score the {4, 5} test rows with the label-4 booster. Note that ``dtest``
# is rebound here and no longer holds the {1, 2} rows.
dtest = xgb.DMatrix(test_4or5)
XGB4 = gbm4.predict(dtest)
# Clamp negative scores to zero.
# NOTE(review): with the "binary:logistic" objective set in ``params`` the
# scores are presumably probabilities, which would make this a no-op - confirm.
indices = XGB4 < 0
XGB4[indices] = 0

#### Predict 5 class in 4-5 classes 

In [78]:
# Score the same rows with the label-5 booster. This relies on ``dtest``
# still holding the {4, 5} test rows built in the preceding cell.
XGB5 = gbm5.predict(dtest)
# Clamp negative scores to zero (see the review note on the label-4 scores).
indices = XGB5 < 0
XGB5[indices] = 0

In [79]:
# How many rows each booster scores so that the value rounds to 1.
# Written as print(...) calls with a single argument: identical output on
# Python 2, and no SyntaxError on Python 3.
print(len([x for x in XGB4 if np.rint(x) == 1]))
print(len([x for x in XGB5 if np.rint(x) == 1]))

330
213


In [80]:
# Resolve every {4, 5} row to a single label from the two booster scores:
# when exactly one booster's rounded score is 1 that booster's label wins,
# otherwise the larger raw score decides (ties go to label 5).
preds_4or5 = []
for i in range(len(XGB5)):
    x1, x2 = XGB4[i], XGB5[i]
    rounded = (np.rint(x1), np.rint(x2))

    if rounded == (0, 1):
        x = 5
    elif rounded == (1, 0):
        x = 4
    else:
        x = 5 if x2 >= x1 else 4

    preds_4or5.append(x)

In [81]:
# Number of rows resolved to label 4 and to label 5.
# print(...) calls with a single argument behave the same on Python 2 and 3.
print(len([x for x in preds_4or5 if x == 4]))
print(len([x for x in preds_4or5 if x == 5]))

330
213


In [82]:
# Write the resolved fine-grained labels back at the positions of their
# coarse group, using the index arrays idx_1or2 / idx_4or5 computed earlier.
# For the {4, 5} rows this replaces the placeholder value set earlier.
pd_preds[idx_1or2] = preds_1or2
pd_preds[idx_4or5] = preds_4or5

In [83]:
# Final distribution of the five predicted labels.
pd_preds.value_counts()

4    330
1    293
3    289
2    245
5    213
dtype: int64

#### Submit file 

In [90]:
# Cut the predictions into consecutive groups of five. Items are looked up
# one at a time through the Series index, exactly as a running loop would.
group_size = 5
n_grouped = len(pd_preds) - len(pd_preds) % group_size
test_sets = [
    [pd_preds[i] for i in range(start, start + group_size)]
    for start in range(0, n_grouped, group_size)
]
# Trailing predictions that do not fill a whole group.
cur_list = [pd_preds[i] for i in range(n_grouped, len(pd_preds))]

def to_str(l):
    """Render the items of ``l`` as one space-separated string."""
    return ' '.join(map(str, l))

# One space-separated line of labels per group of five predictions.
test_sets_str = [to_str(x) for x in test_sets]

submit = pd.read_csv('data/sample_submission.csv')
# Item assignment always writes the column. Attribute assignment
# (``submit.day = ...``) only does so when the column already exists;
# otherwise it just sets a Python attribute that never reaches the CSV.
submit['day'] = test_sets_str
submit.to_csv('submission.csv', index=False)

In [92]:
class_labels = range(1, 6)

# Total deviation from "each label exactly once" over all groups of five.
c = sum(abs(s.count(x) - 1) for s in test_sets for x in class_labels)

# Groups in which some label occurs more than twice.
cnt_bad = sum(1 for s in test_sets
              if any(s.count(x) > 2 for x in class_labels))

# Groups that contain every one of the five labels.
cnt_good = sum(1 for s in test_sets
               if all(x in s for x in class_labels))

# Joining the values first and printing one string gives the same
# "c cnt_good cnt_bad" line on Python 2 and Python 3.
print(' '.join(str(v) for v in (c, cnt_good, cnt_bad)))

338 112 0


In [71]:
# Hyperparameter grid for the k-nearest-neighbours classifier.
neighbours = [3, 4, 5, 6, 7, 8, 9, 10]
weights = ['uniform', 'distance']
algorithms = ['ball_tree', 'kd_tree', 'brute']
# Reference label counts passed to the MSE (same values as etal_5 in
# check_models_score).
mse_etal = [274, 274, 274, 274, 274]
class_labels = range(1, 6)

# print(...) with a single, pre-formatted argument works on Python 2 and 3;
# the statement form and ``print (fmt) % values`` do not.
print('abs_num \t mse_5 \t cnt_good \t cnt_bad \t validation \t model')
for n in neighbours:
    for w in weights:
        for alg in algorithms:
            str_model = ' '.join([str(n), w, alg])
            # NOTE: rebinds the module-level ``knn`` and ``preds_5``.
            knn = KNeighborsClassifier(n_neighbors=n, weights=w, algorithm=alg)
            cv = cross_val_score(knn, train_x, train_y_5, cv=3, scoring='accuracy').mean()

            knn.fit(train_x, train_y_5)
            preds_5 = knn.predict(test)

            # Consecutive groups of five predictions; an incomplete trailing
            # group is ignored.
            n_grouped = len(preds_5) - len(preds_5) % 5
            test_sets = [list(preds_5[start:start + 5])
                         for start in range(0, n_grouped, 5)]

            # Total deviation from "each label exactly once" per group.
            c = sum(abs(s.count(x) - 1) for s in test_sets for x in class_labels)

            mse_cur = mean_squared_error(mse_etal, get_value_counts(preds_5, 5))

            # Groups in which some label occurs more than twice.
            cnt_bad = sum(1 for s in test_sets
                          if any(s.count(x) > 2 for x in class_labels))

            # Groups that contain every one of the five labels.
            cnt_good = sum(1 for s in test_sets
                           if all(x in s for x in class_labels))

            print('%f \t %f \t %f \t %f \t %f \t %s' % (c, mse_cur, cnt_good, cnt_bad, cv, str_model))

abs_num 	 mse_5 	 cnt_good 	 cnt_bad 	 validation 	 model
372.000000 	 1872.800000 	 101.000000 	 2.000000 	 0.905556 	 3 uniform ball_tree
372.000000 	 1872.800000 	 101.000000 	 2.000000 	 0.905556 	 3 uniform kd_tree
372.000000 	 1872.800000 	 101.000000 	 2.000000 	 0.905556 	 3 uniform brute
370.000000 	 1851.600000 	 101.000000 	 1.000000 	 0.908454 	 3 distance ball_tree
370.000000 	 1851.600000 	 101.000000 	 1.000000 	 0.908454 	 3 distance kd_tree
370.000000 	 1851.600000 	 101.000000 	 1.000000 	 0.908454 	 3 distance brute
370.000000 	 2299.200000 	 98.000000 	 4.000000 	 0.916667 	 4 uniform ball_tree
370.000000 	 2299.200000 	 98.000000 	 4.000000 	 0.916667 	 4 uniform kd_tree
370.000000 	 2299.200000 	 98.000000 	 4.000000 	 0.916667 	 4 uniform brute
366.000000 	 1816.400000 	 103.000000 	 4.000000 	 0.908333 	 4 distance ball_tree
366.000000 	 1816.400000 	 103.000000 	 4.000000 	 0.908333 	 4 distance kd_tree
366.000000 	 1816.400000 	 103.000000 	 4.000000 	 0.90833