In [31]:
import numpy as np
from collections import Counter
from scipy.sparse import csr_matrix
from sklearn.preprocessing import normalize
from sklearn.metrics import f1_score

# Read the raw training file; every line becomes one whitespace-split record.
with open("train.dat", "r") as fh:
    data = list(fh)

records = [line.split() for line in data]

def filterLen(docs, minlen):
    """Return a new list of documents keeping only tokens of length >= minlen.

    docs is an iterable of token sequences; the input is not modified and
    documents that lose every token are kept as empty lists.
    """
    filtered = []
    for doc in docs:
        kept = []
        for token in doc:
            if len(token) >= minlen:
                kept.append(token)
        filtered.append(kept)
    return filtered

# The first token of every training line is the integer class label; the
# documents themselves keep only tokens of at least five characters.
trainingOutput = [int(fields[0]) for fields in records]
trainingData = filterLen(records, 5)

# imbalanced data check: slots 0-3 count labels 1-4, slot 4 collects every
# other label value.
classCnt = [0] * 5
for label in trainingOutput:
    slot = label - 1 if label in (1, 2, 3, 4) else 4
    classCnt[slot] += 1
print(classCnt)

# Per-class weight = (count a perfectly balanced class would have) / (actual
# count), so under-represented classes get weights above 1.
weightedAvg = [len(trainingOutput)/5/cnt for cnt in classCnt]
print(weightedAvg)


[3163, 1494, 1925, 3051, 4805]
[0.9129307619348719, 1.9327978580990628, 1.500051948051948, 0.946443788921665, 0.6009573361082206]


In [32]:
idx = {}  # term -> column id; filled by buildDictionary, read by buildMatrix
nnz = 0   # running sum of distinct-term counts over all documents seen so far
def buildDictionary(record):
    """Add every unseen term of the given documents to the global vocabulary.

    record is an iterable of token lists. Each term not yet in the
    module-level ``idx`` gets the next free column id. The module-level
    ``nnz`` is increased by the number of distinct terms in each document.

    Ids continue from the current vocabulary size, so calling this more than
    once (or re-running the cell) never gives two terms the same column.
    """
    global nnz
    for d in record:
        nnz += len(set(d))
        for w in d:
            if w not in idx:
                # len(idx) is evaluated before the insertion, so it is always
                # the smallest id that is not in use yet.
                idx[w] = len(idx)
    
def buildMatrix(record, vocab=None):
    """Build a CSR term-count matrix for the given documents.

    Parameters
    ----------
    record : sequence of token lists, one per document (one matrix row each).
    vocab : optional dict mapping term -> column id. Defaults to the
        module-level ``idx`` filled by buildDictionary.

    Returns
    -------
    scipy.sparse.csr_matrix of shape (len(record), len(vocab)) with dtype
    double, whose entry (i, j) is how often the term with id j occurs in
    document i. Terms missing from the vocabulary are ignored.

    The buffers are sized from the documents actually passed in, so the
    function also works for data the vocabulary was not built from.
    """
    if vocab is None:
        vocab = idx  # module-level vocabulary
    nrows = len(record)
    ncols = len(vocab)

    ind = []                                    # column id of each stored value
    val = []                                    # term count of each stored value
    ptr = np.zeros(nrows + 1, dtype=np.int64)   # row start offsets into ind/val

    for i, d in enumerate(record):
        for term, count in Counter(d).items():
            col = vocab.get(term)
            if col is None:
                # Out-of-vocabulary term: skip it without consuming a slot,
                # so the row pointer and the stored entries stay aligned.
                continue
            ind.append(col)
            val.append(count)
        ptr[i + 1] = len(ind)

    mat = csr_matrix(
        (np.asarray(val, dtype=np.double), np.asarray(ind, dtype=np.int64), ptr),
        shape=(nrows, ncols),
        dtype=np.double,
    )
    mat.sort_indices()

    return mat

In [33]:
# Build the vocabulary from the training documents, turn them into a sparse
# term-count matrix and normalise each row.
buildDictionary(trainingData)
trainingMat = normalize(buildMatrix(trainingData), copy=True)

In [34]:
def knnPredict(testData, trainingData, trainingOutput, k, epsilon, classWeights=None):
    """Classify each test row by a similarity-weighted vote of its nearest training rows.

    Parameters
    ----------
    testData : scipy sparse matrix, one row per document to classify.
    trainingData : scipy sparse matrix with the same number of columns, one
        row per training document.
    trainingOutput : sequence of class labels (1-based integers) aligned with
        the rows of trainingData.
    k : maximum number of neighbours that may vote; values larger than the
        number of training rows are capped.
    epsilon : minimum similarity a neighbour needs in order to vote. If no
        neighbour reaches it, the single most similar training row votes alone.
    classWeights : optional per-class multipliers applied to the vote totals
        (entry c weights class c+1). Defaults to the module-level
        ``weightedAvg``.

    Returns
    -------
    list of int: the predicted 1-based class label for every test row.

    Raises
    ------
    ValueError if there is no neighbour to vote (k < 1 or no training rows).
    """
    if classWeights is None:
        classWeights = weightedAvg  # module-level weights from the training labels
    nClasses = len(classWeights)

    # Use the sparse matrix's own product: np.dot is not aware of scipy
    # sparse matrices.
    similarities = testData.dot(trainingData.T)
    rows, columns = similarities.shape

    k = min(k, columns)  # never ask for more neighbours than training rows
    if k < 1:
        raise ValueError("knnPredict needs k >= 1 and at least one training row")

    testOutput = []
    for r in range(rows):
        sims = similarities[r].toarray().ravel()
        # Training-row indices of the k most similar rows, best first.
        nearest = sims.argsort()[::-1][:k].tolist()
        voters = [j for j in nearest if sims[j] >= epsilon]
        if not voters:
            voters = nearest[:1]

        votes = [0.0] * nClasses
        for j in voters:
            classNum = int(trainingOutput[j])
            if 1 <= classNum <= nClasses:  # labels outside the range are ignored
                votes[classNum - 1] += float(sims[j])

        weighted = [votes[c] * classWeights[c] for c in range(nClasses)]
        # sorted() is stable, so on ties the highest class index wins.
        ranking = sorted(range(nClasses), key=weighted.__getitem__)
        testOutput.append(ranking[-1] + 1)

    return testOutput

In [35]:
from sklearn.metrics import precision_recall_fscore_support as score

# Read the test documents; no label is parsed from these lines.
with open("test.dat", "r") as fh:
    data = list(fh)

records = [line.split() for line in data]
testData = filterLen(records, 5)

# Vectorise against the training vocabulary and normalise each row, the same
# way the training matrix was prepared.
testMat = normalize(buildMatrix(testData), copy=True)

k = 355
epsilon = 0.2
testOutput = knnPredict(testMat, trainingMat, trainingOutput, k, epsilon)

# One predicted label per line, in input order.
with open("output.dat", "w") as fh:
    fh.writelines(str(label) + '\n' for label in testOutput)
print('Done..')

# Disabled evaluation snippet, kept as a bare string literal so it never runs.
# NOTE(review): it scores testOutput against trainingOutput, so it is
# presumably only meaningful when the predictions were produced for the
# training documents themselves -- confirm before re-enabling.
'''
print(f1_score(trainingOutput, testOutput, average='weighted'))
precision, recall, fscore, support = score(trainingOutput, testOutput)

print('precision: {}'.format(precision))
print('recall: {}'.format(recall))
print('fscore: {}'.format(fscore))
print('support: {}'.format(support))
'''

Done..


"\nprint(f1_score(trainingOutput, testOutput, average='weighted'))\nprecision, recall, fscore, support = score(trainingOutput, testOutput)\n\nprint('precision: {}'.format(precision))\nprint('recall: {}'.format(recall))\nprint('fscore: {}'.format(fscore))\nprint('support: {}'.format(support))\n"