In [1]:
import os
import numpy as np
import random
import math

In [2]:
def load_file(file_path):
    """
    List the entries of a dataset directory as full paths.

    parameter: file path, str; the directory to list. A trailing path
        separator is optional.

    return: list of str, one path per directory entry, in the (arbitrary)
        order reported by os.listdir
    """
    # os.path.join adds the separator only when it is missing: callers that
    # pass 'dir/' get exactly the same paths as with plain concatenation,
    # while callers that pass 'dir' no longer get the directory name glued
    # onto the file name.
    return [os.path.join(file_path, name) for name in os.listdir(file_path)]

In [3]:
def gen_dict_from_file(file_path):
    '''
    Build a word-count dictionary for one text file.

    Each line is lower-cased and split on whitespace; every resulting token
    is counted. Undecodable bytes are dropped (errors='ignore').

    The dictionary is seeded with the key "?? ??????????????" at count 1.
    initialize() seeds the same key in the weights, so it presumably acts as
    a constant bias feature -- NOTE(review): the key text looks mis-encoded;
    confirm the intended spelling.

    parameter: file_path, str, path of the text file to read
    return: dict mapping each token to its number of occurrences, plus the
        seeded key described above
    '''
    result = {"?? ??????????????": 1}

    # The context manager closes the handle even if reading raises; the
    # handle used to be left open for every file in the dataset.
    with open(file_path, 'r', errors='ignore') as file:
        for line in file:
            for word in line.rstrip('\n').lower().split():
                result[word] = result.get(word, 0) + 1

    return result
    

In [4]:
def initialize(dict_list):
    '''
    Create a zeroed weight table covering the whole vocabulary.

    parameter:
        dict_list: list of items whose element at index 0 is a word
            dictionary (the callers pass (dictionary, label) pairs); only
            that dictionary's keys are read

    result: dict mapping the key "?? ??????????????" and every word found in
        any of the dictionaries to 0
    '''
    vocabulary = (token for sample in dict_list for token in sample[0])

    weights = {"?? ??????????????": 0}
    weights.update(dict.fromkeys(vocabulary, 0))
    return weights

In [5]:
def sigmoid(x, w):
    """
    Score a word dictionary against the weights.

    z is the sum of x[word] * w[word] over the words present in both
    dictionaries, clamped to [-10, 10]. The return value is 1 / (1 + e^z);
    note the positive exponent, so the result shrinks as z grows.

    Param:
        x: dictionary, word -> value
        w: weight, dictionary word -> weight
    Return:
        1 / (1 + exp(z)) for the clamped z
    """
    z = 0
    for token, count in x.items():
        if token in w:
            z += count * w[token]

    # Clamp the score before it is exponentiated.
    z = 10 if z > 10 else (-10 if z < -10 else z)

    return 1 / (1 + np.exp(z))
    

In [6]:
def predict(x, w):
    '''
    Classify one sample by thresholding sigmoid(x, w) at 0.5.

    input:
        x: dictionary handed to sigmoid as the sample
        w: dictionary handed to sigmoid as the weights

    output:
        0 when sigmoid(x, w) is strictly greater than 0.5, otherwise 1
    '''
    return 0 if sigmoid(x, w) > 0.5 else 1
    

In [7]:
def test(dict_list, weight):
    '''
    parameter: 
        dict_list: list of dictionaries from each dataset in testset
        weight: train   
    '''
    correct = 0
    for element in dict_list:
        x, label = element[0], element[1]
        predict_label= predict(x, weight)
        if predict_label == label:
            correct += 1
            
    accuracy = correct / len(dict_list)
    
    return accuracy
    

In [8]:
def train(dict_list, weight, iteration, eta, regular):
    """
    Fit the weights with full-batch gradient steps plus weight shrinkage.

    Param:
        dict_list: list of (x, label) items; x is a word -> value dictionary
            and label is the numeric target
        weight: dictionary word -> weight; it is updated in place
        iteration: number of passes over dict_list
        eta: step size applied to both the gradient and the shrinkage term
        regular: shrinkage coefficient; every pass subtracts
            regular * weight * eta from each weight
    Return:
        the same weight dictionary that was passed in, after the updates
    """
    for _ in range(iteration):
        dw = dict.fromkeys(weight, 0)

        for element in dict_list:
            x, label = element[0], element[1]
            p = sigmoid(x, weight)
            # sigmoid() returns 1 / (1 + e^z), so (1 - p) is the usual
            # logistic value; the error term is the same for every word of
            # this sample and is computed once.
            error = label - (1 - p)
            # Walk the sample's own words instead of the whole vocabulary:
            # only words present in both contribute, and each word's
            # contributions are still accumulated in sample order, so the
            # sums are unchanged while the work drops from |vocabulary| to
            # |sample| per example.
            for word, count in x.items():
                if word in dw:
                    dw[word] += count * error

        for word, grad in dw.items():
            weight[word] += eta * grad - regular * weight[word] * eta

    return weight

In [9]:
def develop_data(data, a):
    """
    Load one split of a dataset as labelled word dictionaries.

    Files are read from '{data}/{a}/ham/' and '{data}/{a}/spam/'. The number
    of entries found in each directory is printed.

    Param:
        data: str, dataset root directory
        a: str, name of the split sub-directory (the callers use 'train' and
            'test')
    Return:
        list of (dictionary, label) tuples: every ham file first with label
        1, followed by every spam file with label 0
    """
    ham_dir = f'{data}/{a}/ham/'
    spam_dir = f'{data}/{a}/spam/'

    ham_files = load_file(ham_dir)
    spam_files = load_file(spam_dir)

    print(f'{data}/{a}/ham/: {len(ham_files)}')
    print(f'{data}/{a}/spam/: {len(spam_files)}')

    labelled = [(gen_dict_from_file(path), 1) for path in ham_files]
    labelled += [(gen_dict_from_file(path), 0) for path in spam_files]
    return labelled

In [None]:
# Experiment driver: for each dataset, pick the shrinkage strength (lambda)
# on a held-out slice of the training data, retrain on the full training set
# with the winner, and report accuracy on the test set.
dataset = ['dataset 1', 'dataset 2', 'dataset 3']

# Seeding the shuffle makes the train/validation split -- and with it the
# chosen lambda and the printed accuracies -- repeat across
# "Restart Kernel -> Run All", as long as the directory listing order
# returned by os.listdir stays the same.
SEED = 42

# Share of the shuffled training data used for fitting; the remainder is the
# validation slice used to compare lambdas.
TRAIN_FRACTION = 0.7

iterations = 400
eta = 1e-4
# Candidate lambdas. A wider grid that was tried earlier:
# [0.1, 1, 3, 5, 10, 20, 30, 50, 100]
panalize_param = [0.01, 0, 10]

for data in dataset:
    print(data)

    train_data = develop_data('./datasets/'+data, 'train')

    # Re-seed per dataset so each split is independent of how many datasets
    # were processed before it.
    random.seed(SEED)
    random.shuffle(train_data)

    split_point = math.floor(len(train_data) * TRAIN_FRACTION)

    best_acc = 0
    best_panalize_param = 0
    weight_candidate = []

    for lam in panalize_param:
        # Every candidate starts from fresh zero weights over the full
        # training vocabulary and is fitted on the first slice only.
        trained = train(train_data[:split_point],
                        initialize(train_data),
                        iterations,
                        eta,
                        lam)
        weight_candidate.append(trained)

        # Validate
        accuracy = test(train_data[split_point:], trained)

        print(f'acc: {accuracy}, lambda: {lam}')
        if accuracy > best_acc:
            best_acc = accuracy
            best_panalize_param = lam

    print(f'best lambda: {best_panalize_param}')

    # Retrain from scratch on all training data with the selected lambda.
    weight_final = train(train_data, initialize(train_data), iterations, eta, best_panalize_param)

    test_data = develop_data('./datasets/'+data, 'test')
    accuracy_final = test(test_data, weight_final)

    print(
        f'-----------------------\n'
        f'Acc on test: {accuracy_final}\n'
        f'\n#######################\n'
    )
    
    

dataset 1
./datasets/dataset 1/train/ham/: 340
./datasets/dataset 1/train/spam/: 123
acc: 0.8920863309352518, lambda: 0.01
