There are 4 input files:
- training_data.csv: 1st column is the name of the app, the remaining columns are the tf-idf value 
- training_desc.csv: 1st column is the name of the app, the 2nd column is the description of the app
- training_labels.csv: 1st column is the name of the app, the 2nd column is the label
- test_data.csv: 1st column is the name of the app, the remaining columns are the tf-idf value

There is 1 output file:
- “predicted_labels.csv” in the same format as training_labels





#1. Import the necessary libraries and set the file paths


In [1]:
import os
import time
# Folder configuration for this notebook.
#Please fill in the input folder path and the output folder path
input_path = '/Users/ppha/Desktop/UniSyd/Semester3/MachineLearning/Assignment1/assignment1_2017S1'
output_path = '/Users/ppha/Desktop/UniSyd/Semester3/MachineLearning/Assignment1/460110536_460114051_460232702/Code/Output'
# Make the input folder the working directory so that the relative CSV file
# names opened later resolve inside it.
os.chdir(input_path)
import pandas as pd
import numpy as np
import matplotlib.pylab as pl
%matplotlib inline
from collections import OrderedDict

In [2]:
# Names of the input CSV files. They are relative, so they are looked up in the
# current working directory when opened.
path_train_data = 'training_data.csv'
path_label_data= 'training_labels.csv'
path_test_data =  'test_data.csv'


#2. Functions to read in the file data

In [3]:
def readin_data(line):
    """Parse one CSV row of tf-idf values into a binary presence vector.

    Args:
        line: One raw text line of the form ``name,v1,v2,...``.

    Returns:
        A ``(name_app, b_value)`` tuple. ``name_app`` is the first field as a
        string. ``b_value`` is an integer NumPy array with 0 wherever the
        field is exactly the text ``'0'`` and 1 everywhere else.
    """
    name_app, *fields = line.strip().split(',')
    # Force a string dtype so that a row without any value fields still yields
    # an (empty) string array that can be compared element-wise with '0'.
    value = np.array(fields, dtype=str)
    # Convert to only 0 and 1 values. The comparison is textual, so only the
    # exact token '0' is treated as zero.
    # The ``np.int`` alias was removed in NumPy 1.24; the builtin ``int`` is
    # the type it used to point to.
    b_value = (value != '0').astype(int)
    return str(name_app), b_value
        
def readin_labels(line):
    """Split one label row into the app name and its label text.

    Args:
        line: One raw text line of the form ``name,label``.

    Returns:
        A ``(name_app, value)`` tuple. ``value`` is the string form of the
        *list* of fields after the name (for example ``"['Games']"``), not
        the bare label.
    """
    app_name, *label_fields = line.strip().split(',')
    # The remaining fields are stringified as a list on purpose: this keeps
    # the exact text format the function has always returned.
    return str(app_name), str(label_fields)

#3. Read in the training data as train:dict and the training labels as labels:dict

In [4]:
# Map every app name to its label text, in the order of the label file.
labels = OrderedDict()
# The context manager closes the file even if parsing a line raises.
with open(path_label_data, 'r') as train_labels:
    # Iterate the handle directly instead of loading every line up front.
    for line in train_labels:
        name_app, category = readin_labels(line)
        labels[name_app] = category

In [5]:
#This execution may take 1 minute
# Map every app name to its binary feature vector, in the order of the file.
train = OrderedDict()
# The context manager closes the file even if parsing a line raises.
with open(path_train_data, 'r') as train_data:
    start_time = time.time()
    # Stream the file line by line rather than holding all raw lines in memory.
    for line in train_data:
        name_app, tfidf = readin_data(line)
        train[name_app] = tfidf
print("--- %s seconds ---" % (time.time() - start_time))

--- 54.040756940841675 seconds ---


#3. Map two dictionaries and convert it to dataframe for better visualization at each step

In [6]:
# Join the two dictionaries on app name: key = name_app, value = [tfidf, label].
# Iterating over `train` instead of a set union of the keys keeps the file
# order, so the row order (and with it the cross-validation folds) is the same
# on every run. Apps that are missing from either dictionary are skipped,
# because a row without a vector or without a label cannot be trained on.
output = {k: [train[k], labels[k]] for k in train if k in labels}
# Convert from dict to dataframe: column 0 holds the vector, column 1 the label.
df = pd.DataFrame.from_dict(output, orient='index')


#4. Perform 10-fold validation

In [7]:
def cross_validation(df, k=10, n=0):
    """Split ``df`` into ``k`` consecutive folds and hold out fold ``n``.

    Fold sizes follow the ``np.array_split`` rule: the first ``len(df) % k``
    folds get one extra row. The folds are cut with ``iloc`` because calling
    ``np.array_split`` on a DataFrame relies on ``DataFrame.swapaxes``, which
    recent pandas versions deprecate.

    Args:
        df: The full data set.
        k: Number of folds; must be positive.
        n: Zero-based index of the fold used as the test set.

    Returns:
        A ``(test_set, train_set)`` tuple. If ``n`` is not a valid fold index
        the test set is an empty DataFrame and the train set holds every row.

    Raises:
        ValueError: If ``k`` is not positive.
    """
    if k <= 0:
        raise ValueError('number sections must be larger than 0.')
    if not 0 <= n < k:
        # No fold is held out, so everything is training data.
        return pd.DataFrame(), df.copy()

    base, extra = divmod(len(df), k)
    # Every fold before `n` contributes `base` rows, plus one more for each of
    # the leading `extra` folds.
    start = n * base + min(n, extra)
    stop = start + base + (1 if n < extra else 0)

    test_set = df.iloc[start:stop]
    # Concatenating the rows on both sides of the held-out fold also works
    # when k == 1, where there are no training rows at all.
    train_set = pd.concat([df.iloc[:start], df.iloc[stop:]])
    return test_set, train_set

#5. Calculate Multinomial Naive Bayes

In [8]:
def arrange_data(train_df):
    """Collect the column-0 entries of ``train_df`` per column-1 value.

    Args:
        train_df: Frame whose column ``0`` holds the items and whose column
            ``1`` holds the category each item belongs to.

    Returns:
        A frame with one row per distinct category, sorted by category.
        Column ``1`` is the category and column ``0`` is the list of all
        items that belong to it.
    """
    # Bucket the items by category, then turn the category index back into a
    # regular column so rows are numbered 0..n-1.
    return train_df.groupby(1, sort=True)[0].apply(list).reset_index()

def cal_prob(new_df,test,alpha = 1):
    """Pick the most likely category for every test vector.

    For each class the score of a test vector is the log prior of the class
    plus the sum of the log feature probabilities of the features that are
    present (equal to 1) in the vector.

    Args:
        new_df: Frame with one row per class. Column ``1`` is the class label
            and column ``0`` is the list of binary feature vectors of the
            training items in that class.
        test: Frame whose column ``0`` holds one binary feature vector per row.
        alpha: Additive smoothing constant.

    Returns:
        A pandas Series numbered ``0..len(test)-1`` holding the predicted
        class label of every test row. It is empty when there are no classes
        or no test rows.
    """
    class_rows = [np.asarray(row) for row in new_df[0]]
    test_vectors = list(test[0])
    if not class_rows or not test_vectors:
        return pd.Series([], dtype=object)

    # The prior must be normalised by the number of training items, not by
    # the number of classes (which is what len(new_df) would give).
    total_items = sum(len(rows) for rows in class_rows)

    # Stack the test vectors once so every class is scored with one product.
    X = np.asarray(test_vectors)

    scores = []
    for rows in class_rows:
        # Number of training items in this class.
        num_items = len(rows)

        # Log prior of the class.
        class_prob = np.log(num_items / total_items)

        # Number of items in the class in which each feature is present,
        # plus alpha for smoothing.
        count_attr = (rows == 1).sum(axis=0) + alpha

        # A feature is either present or absent, so the smoothed denominator
        # grows by alpha once per outcome. Without this the estimate could
        # exceed 1.
        attr_prob = np.log(count_attr / (num_items + 2 * alpha))

        scores.append(class_prob + X.dot(attr_prob))

    # Rows are classes and columns are test items; take the best class per
    # column.
    best_class = np.asarray(scores).argmax(axis=0)
    Category = pd.Series(best_class).map(new_df[1])
    return Category

In [9]:
def predict(train_set,test_set):
    """Predict a category for every row of ``test_set``.

    The training rows are first grouped per category with ``arrange_data``;
    the grouped frame and the test rows are then scored by ``cal_prob`` with
    a smoothing constant of 1.

    Args:
        train_set: Labelled rows used to estimate the model.
        test_set: Rows to classify.

    Returns:
        Whatever ``cal_prob`` returns for the grouped training data.
    """
    grouped_by_class = arrange_data(train_set)
    return cal_prob(grouped_by_class, test_set, alpha=1)


#6. Calculate accuracy, only do this when you want to check the accuracy

In [10]:
def accuracy(test_set,cate_prob):
    """Return the fraction of rows whose predicted category is correct.

    Args:
        test_set: Frame whose column ``1`` holds the true category per row.
        cate_prob: Predicted categories, in the same order as the rows of
            ``test_set``.

    Returns:
        Number of matching predictions divided by ``len(test_set)``, or
        ``0.0`` when ``test_set`` is empty.
    """
    if len(test_set) == 0:
        # Avoid dividing by zero when there is nothing to score.
        return 0.0
    # Compare by position. Looking rows up with an integer key would be taken
    # as a label lookup when the frame is indexed by something else, such as
    # app names.
    actual = list(test_set[1])
    predicted = list(cate_prob)
    count = sum(1 for truth, guess in zip(actual, predicted) if truth == guess)
    return count / len(test_set)

In [17]:
# The 10-fold cross-validation run below is wrapped in a string literal, so it
# is not executed. Remove the surrounding triple quotes to run it.
'''
#This task might take 10 minutes to compute the accuracy for the whole set
#Get the trainset and test set
for i in range(0,10):
    n = i
    test_set = pd.DataFrame()
    train_set = pd.DataFrame()
    test_set,train_set = cross_validation(df,k=10,n=n)
    start_time = time.time()
    cate_prob = predict(train_set,test_set)
    acc = accuracy(test_set,cate_prob)
    print(i,'Accuracy is: ', acc)
    print(i,"Processing time is %s seconds ---" % (time.time() - start_time))

'''

'\n#This task might take 10 minutes to compute the accuracy for the whole set\n#Get the trainset and test set\nfor i in range(0,10):\n    n = i\n    test_set = pd.DataFrame()\n    train_set = pd.DataFrame()\n    test_set,train_set = cross_validation(df,k=10,n=n)\n    start_time = time.time()\n    cate_prob = predict(train_set,test_set)\n    acc = accuracy(test_set,cate_prob)\n    print(i,\'Accuracy is: \', acc)\n    print(i,"Processing time is %s seconds ---" % (time.time() - start_time))\n\n'

#7. Make prediction for the test_set

In [12]:
# Map every test app name to its binary feature vector, in file order.
test_dict = OrderedDict()
# The context manager closes the file even if parsing a line raises.
with open(path_test_data, 'r') as test_file:
    # Stream the file line by line rather than loading all raw lines first.
    for line in test_file:
        name_app, tfidf = readin_data(line)
        test_dict[name_app] = tfidf


In [13]:
# Pair every test vector with its row position: each value becomes
# [vector, position]. Only existing keys are reassigned, so the dictionary
# keeps its size and order while it is being walked.
for position, app_name in enumerate(test_dict):
    test_dict[app_name] = [test_dict[app_name], position]
    

In [14]:
# Build the test frame with one row per app name; every dictionary value
# becomes that row's cells.
test_df = pd.DataFrame.from_dict(test_dict,orient ='index')

In [15]:
# Train on the full labelled frame and classify every test row, timing the run.
train_data, test_data = df, test_df
start_time = time.time()
prediction = predict(train_data, test_data)
elapsed = time.time() - start_time

print("--- %s seconds ---" % elapsed)

--- 92.44650602340698 seconds ---


In [16]:
output_file = os.path.join(output_path, 'predicted_labels.csv')

# The context manager flushes and closes the file even if a write fails.
with open(output_file, 'w') as output_test:
    for i, name in enumerate(test_dict):
        # Labels carry the text of a one-element list such as "['Games']".
        # Drop the leading "['" and the trailing "']" to get the bare label.
        pred = prediction[i][2:-2]
        output_test.write(name + ',' + pred + '\n')


