In [1]:
import random
import math
import numpy as np
import os

In [2]:
def load_file(file_path):
    """
    List the entries of a dataset directory as paths.

    parameter: file_path, str, directory to list; a trailing path separator
               is optional

    return: list of str, one path per entry returned by os.listdir, each
            joined onto file_path (order is whatever os.listdir yields)
    """
    # os.path.join inserts the separator only when it is missing, so both
    # 'train/ham' and 'train/ham/' produce valid file paths.
    return [os.path.join(file_path, name) for name in os.listdir(file_path)]

# a = load_file('./datasets/dataset 1/train/ham')
# print(len(a))

In [3]:
def gen_dict_from_file(file_path):
    '''
    Build a word-count dictionary for one text file.

    parameter: file_path, str, path of the file to read (undecodable bytes
               are ignored)
    return: dict mapping every lowercased, whitespace-separated token to the
            number of times it occurs in the file. The dict always starts
            with the constant entry "????????????????": 1 (presumably the
            bias feature -- confirm).
    '''
    result = {"????????????????": 1}

    # The context manager closes the handle even if reading raises.
    with open(file_path, 'r', errors='ignore') as file:
        for line in file:
            for word in line.rstrip('\n').lower().split():
                result[word] = result.get(word, 0) + 1

    return result

In [4]:
def initialize(dict_list):
    '''
    Create a zero weight for every word seen in the samples.

    parameter:
        dict_list: list of samples; only element [0] of each sample (the
                   word-count dictionary) is read

    result: dict mapping "????????????????" and every word found in the
            samples to 0
    '''
    W = {"????????????????": 0}
    for sample in dict_list:
        # dict.fromkeys gives one zero entry per word of this sample.
        W.update(dict.fromkeys(sample[0], 0))
    return W

In [5]:
def predict(x, w):
    """
    Classify one sample with the sign of its weighted word counts.

    x maps words to counts and w maps words to weights; words of x that are
    missing from w contribute nothing. Returns 1 when the weighted sum is
    strictly positive, otherwise -1 (a sum of exactly 0 gives -1).
    """
    score = 0
    for token, count in x.items():
        if token not in w:
            continue
        score += count * w[token]

    return 1 if score > 0 else -1

In [6]:
def test(dict_list, weight):
    '''
    parameter: 
        dict_list: list of dictionaries from each dataset in testset
        weight: train   
    '''
    correct = 0
    for element in dict_list:
        x, label = element[0], element[1]
        predict_label= predict(x, weight)
        if predict_label == label:
            correct += 1
            
    accuracy = correct / len(dict_list)
    
    return accuracy

In [7]:
def train(dict_list, weight, iteration, eta):
    """
    Run perceptron updates over the samples, modifying weight in place.

    Param:
        dict_list: list of samples; element [0] is the word-count dictionary
                   x and element [1] is its label
        weight: dict of word -> weight; updated in place
        iteration: number of full passes over dict_list
        eta: learning rate multiplying every update
    Return:
        the same weight dict that was passed in
    """
    for _ in range(iteration):
        for element in dict_list:
            x, label = element[0], element[1]
            error = label - predict(x, weight)
            if error == 0:
                # Correct prediction: every update term would be zero.
                continue
            for word in x:
                # .get tolerates words absent from weight (predict already
                # ignores them) instead of raising KeyError.
                weight[word] = weight.get(word, 0) + eta * error * x[word]

    return weight

In [8]:
def develop_data(data, a):
    """
    Load one split of a dataset as labelled word-count dictionaries.

    parameter:
        data: str, dataset root directory
        a: str, name of the sub-directory under data that holds the 'ham'
           and 'spam' folders

    return: list of (word_count_dict, label) tuples, all ham files first
            with label 1, followed by all spam files with label -1

    Prints the number of entries found in each of the two folders.
    """
    ham_paths = load_file(f'{data}/{a}/ham/')
    spam_paths = load_file(f'{data}/{a}/spam/')

    print(f'{data}/{a}/ham/: {len(ham_paths)}')
    print(f'{data}/{a}/spam/: {len(spam_paths)}')

    # Ham files are parsed before spam files, matching the output order.
    labelled = [(gen_dict_from_file(path), 1) for path in ham_paths]
    labelled.extend((gen_dict_from_file(path), -1) for path in spam_paths)

    return labelled

In [9]:
dataset = ['dataset 1', 'dataset 2', 'dataset 3']

# Seed the shuffle so the train/validation split, the selected number of
# iterations and the reported accuracies are reproducible on a fresh kernel.
SEED = 42
random.seed(SEED)

for data in dataset:
    print(data)

    train_data = develop_data('./datasets/'+data, 'train')

    random.shuffle(train_data)

    # First 70% of the shuffled samples are used for fitting, the rest for
    # validation.
    split_point = math.floor(len(train_data) * 0.7)

    # Candidate numbers of training passes, compared on the validation part.
    iterations = [1, 10, 30, 50]

    # One fresh all-zero weight dict per candidate (train mutates in place).
    weight_candidate = [initialize(train_data) for _ in range(len(iterations))]

    # Start below any reachable accuracy so the first candidate is always
    # recorded; with best_acc = 0 and all-zero validation accuracy the final
    # model would have been trained for 0 iterations.
    best_acc = -1
    best_iterations = iterations[0]
    eta = 1e-4

    for i in range(len(iterations)):

        weight_candidate[i] = train(train_data[:split_point], weight_candidate[i], iterations[i], eta)

        # Validate
        accuracy = test(train_data[split_point:], weight_candidate[i])

        print(f'acc: {accuracy}, # iter: {iterations[i]}')
        # Strict '>' keeps the smallest iteration count on ties.
        if accuracy > best_acc:
            best_acc = accuracy
            best_iterations = iterations[i]

    print(f'best iter: {best_iterations}')

    # Retrain from scratch on the full training set with the selected setting.
    weight_final = train(train_data, initialize(train_data), best_iterations, eta)

    test_data = develop_data('./datasets/'+data, 'test')

    accuracy_final = test(test_data, weight_final)

    print(
        f'-----------------------\n'
        f'Acc on test: {accuracy_final}\n'
        f'\n#######################\n'
    )

dataset 1
./datasets/dataset 1/train/ham/: 340
./datasets/dataset 1/train/spam/: 123
acc: 0.7553956834532374, lambda: 1
acc: 0.9136690647482014, lambda: 10
acc: 0.9136690647482014, lambda: 30
acc: 0.9136690647482014, lambda: 50
best iter: 10
./datasets/dataset 1/test/ham/: 348
./datasets/dataset 1/test/spam/: 130
-----------------------
Acc on test: 0.8807531380753139

#######################

dataset 2
./datasets/dataset 2/train/ham/: 319
./datasets/dataset 2/train/spam/: 131
acc: 0.837037037037037, lambda: 1
acc: 0.8962962962962963, lambda: 10
acc: 0.9185185185185185, lambda: 30
acc: 0.9185185185185185, lambda: 50
best iter: 30
./datasets/dataset 2/test/ham/: 307
./datasets/dataset 2/test/spam/: 149
-----------------------
Acc on test: 0.9035087719298246

#######################

dataset 3
./datasets/dataset 3/train/ham/: 133
./datasets/dataset 3/train/spam/: 402
acc: 0.7018633540372671, lambda: 1
acc: 0.906832298136646, lambda: 10
acc: 0.8944099378881988, lambda: 30
acc: 0.894409937

In [13]:
# Scratch demo of a single-line progress counter: '\r' moves the cursor back
# to the start of the line and end="" suppresses the newline, so on outputs
# that honor '\r' each value replaces the previous one.
for i in range(100):
    # Empty busy loop used only as a delay between counter updates.
    for _ in range(10000000):
        pass
    print('\r',i, end="")

 99

In [20]:
# Scratch check: the '.2f' format spec prints a float with two decimals.
a = 10.123142345
two_decimals = f'{a:.2f}'
print(two_decimals)

10.12


In [21]:
# Scratch check: a scientific-notation literal such as 1e-4 is a float.
literal_type = type(1e-4)
print(literal_type)

<class 'float'>


In [27]:
# Scratch check: build floats from scientific-notation strings '1e<i>'.
# The former trailing print(f'{}') was removed: an f-string replacement field
# must contain an expression, so that line made the whole cell a SyntaxError.
for i in [-3, -2, -1, 1, 2, 3]:
    print(float('1e'+str(i)))

0.001
0.01
0.1
10.0
100.0
1000.0
