In [76]:
import utils
import predict
import time as tm
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.datasets import make_classification
import scipy
from sklearn.model_selection import train_test_split

In [77]:
# Dictionary size handed to the loader; a later cell divides the element count
# of X_test by this value, so it is presumably also the feature width -- TODO confirm.
dictSize = 225

# Read the "train" data as a (features, labels) pair.
X, y = utils.loadData("train", dictSize=dictSize)

In [78]:
# Convert the feature matrix to a dense ndarray for the estimators below.
# Using the instance method (rather than the unbound csr_matrix.toarray) does
# not depend on scipy.sparse having been imported implicitly, and the guard
# makes the cell idempotent: re-running it after X is already dense is a no-op
# instead of an AttributeError.
if hasattr(X, "toarray"):
    X = X.toarray()

In [79]:
# Hold out 33% of the samples for evaluation; the fixed seed keeps the split
# identical across runs. The same split is reused by the random forest below.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

# Decision Tree
clf_DT = DTC(random_state=0)
# fit() returns the estimator itself, so op_DT and clf_DT name the same fitted tree.
op_DT = clf_DT.fit(X_train, y_train)

In [80]:
# Sanity check on the held-out split: total element count divided by dictSize.
# This equals the number of test rows provided each row has dictSize columns;
# true division is why the value prints as a float.
print(X_test.size / dictSize)
# Ground-truth labels of the held-out samples.
print(y_test)

3300.0
[3. 2. 3. ... 9. 4. 3.]


In [88]:
# Per-class probability estimates for every held-out sample.
op_Pred_DT = op_DT.predict_proba(X_test)

# Number of ranked labels kept per sample; later cells reuse this value.
k = 5

# Negating the probabilities turns argsort's ascending order into a
# best-first ranking; keep the first k column indices of each row.
top_n_pred = (-op_Pred_DT).argsort(axis=1)[:, :k]

# Map column indices back to the label values known to the fitted tree.
class_labels_DT = clf_DT.classes_
yPred_DT = class_labels_DT[top_n_pred]
# NOTE(review): an unpruned tree tends to put all probability mass on a single
# class, in which case ranks 2..k are tie-broken among zero-probability classes
# and carry little information -- confirm against the printed yPred_DT.

In [89]:
# Random Forest
# Seed the forest explicitly: bootstrap sampling and feature sub-sampling are
# random, so without random_state the metrics reported below change on every
# Restart & Run All. This matches the seeded split and the seeded decision tree.
clf_RFC = RFC(random_state=0)
# fit() returns the estimator itself, so op_RFC and clf_RFC name the same fitted forest.
op_RFC = clf_RFC.fit(X_train, y_train)

In [90]:
# Per-class probability estimates from the forest for every held-out sample.
op_Pred_RFC = op_RFC.predict_proba(X_test)

# Best-first ranking of class columns (argsort on the negated probabilities),
# truncated to the k labels per sample chosen in the decision-tree cell.
top_k_pred_RFC = (-op_Pred_RFC).argsort(axis=1)[:, :k]

# Map column indices back to the label values known to the fitted forest.
class_labels_RFC = clf_RFC.classes_
yPred_RFC = class_labels_RFC[top_k_pred_RFC]

In [91]:
# Evaluation of the decision tree's top-k label lists.
# Each helper returns one score per cutoff; index i holds the value at k = i + 1.
preck = utils.getPrecAtK(y_test, yPred_DT, k)
# Macro-averaged variant; per the original note it is slower because the
# helper loops over labels.
mpreck = utils.getMPrecAtK(y_test, yPred_DT, k)

# Sanity expectation from the assignment description: neither metric should
# decrease as k grows, i.e. prec@i >= prec@j and mprec@i >= mprec@j for i > j
# (consecutive cutoffs may tie).

print(
    "prec@1: %0.3f" % preck[0],
    "prec@3: %0.3f" % preck[2],
    "prec@5: %0.3f" % preck[4],
)
# A small mprec is not alarming -- rare error classes are hard to get right.
print(
    "mprec@1: %0.3e" % mpreck[0],
    "mprec@3: %0.3e" % mpreck[2],
    "mprec@5: %0.3e" % mpreck[4],
)
# Full per-cutoff vectors for reference.
print("prec matrix", preck)
print("mprec matrix", mpreck)

prec@1: 0.740 prec@3: 0.804 prec@5: 0.805
mprec@1: 5.811e-03 mprec@3: 4.774e-02 mprec@5: 9.941e-02
prec matrix [0.74030303 0.79939394 0.80393939 0.80393939 0.80545455]
mprec matrix [0.00581107 0.02643503 0.04773651 0.07787835 0.09940824]


In [92]:
# Evaluation of the random forest's top-k label lists.
# Each helper returns one score per cutoff; index i holds the value at k = i + 1.
# Note: preck / mpreck are reassigned here and no longer hold the decision-tree scores.
preck = utils.getPrecAtK(y_test, yPred_RFC, k)
# Macro-averaged variant; per the original note it is slower because the
# helper loops over labels.
mpreck = utils.getMPrecAtK(y_test, yPred_RFC, k)

# Sanity expectation from the assignment description: neither metric should
# decrease as k grows, i.e. prec@i >= prec@j and mprec@i >= mprec@j for i > j
# (consecutive cutoffs may tie).

print(
    "prec@1: %0.3f" % preck[0],
    "prec@3: %0.3f" % preck[2],
    "prec@5: %0.3f" % preck[4],
)
# A small mprec is not alarming -- rare error classes are hard to get right.
print(
    "mprec@1: %0.3e" % mpreck[0],
    "mprec@3: %0.3e" % mpreck[2],
    "mprec@5: %0.3e" % mpreck[4],
)
# Full per-cutoff vectors for reference.
print("prec matrix", preck)
print("mprec matrix", mpreck)

prec@1: 0.772 prec@3: 0.918 prec@5: 0.949
mprec@1: 5.695e-03 mprec@3: 4.508e-02 mprec@5: 1.017e-01
prec matrix [0.77181818 0.87575758 0.91818182 0.93757576 0.94878788]
mprec matrix [0.00569501 0.03093158 0.04507819 0.08312165 0.1017259 ]


In [93]:
# Ground-truth labels first, for a side-by-side look at the tree's output.
print(y_test)
# Hard (top-1) predictions of the fitted tree; as the cell's last expression
# the array is shown as the cell output.
op_DT.predict(X_test)

[3. 2. 3. ... 9. 4. 3.]


array([ 3.,  2.,  3., ...,  9.,  4., 12.])

In [94]:
# Ranked top-k label matrix of the decision tree: one row per held-out sample,
# best-ranked label in the first column.
print(yPred_DT)

[[ 3. 22.  1. 27. 28.]
 [ 2.  1. 27. 28. 29.]
 [ 3.  1. 27. 28. 29.]
 ...
 [ 9.  1. 27. 28. 29.]
 [ 4.  1. 27. 28. 29.]
 [12. 27. 28. 29. 30.]]
