In [25]:
from statistics import fmean, pstdev
import numpy as np
# Fixed seed so that Restart Kernel -> Run All reproduces the same random
# initial weights (and therefore the same fitted model) on every run.
SEED = 42
rng = np.random.default_rng(SEED)
from tqdm import tqdm, trange

In [26]:
def normalization_factors(xss):
    """Return one ``(mean, population std-dev)`` pair per column of ``xss``.

    ``xss`` is an iterable of equally long rows; the rows are transposed so
    that the statistics are computed column by column.
    """
    column_stats = []
    for column in zip(*xss):
        mean = fmean(column)
        spread = pstdev(column)
        column_stats.append((mean, spread))
    return column_stats

def normalized_input(xss, stats):
    """Standardize every row of ``xss`` using per-column ``(mean, std)`` pairs.

    Each value becomes ``(value - mean) / std``. A column whose std is not
    positive is mapped to ``0`` to avoid dividing by zero.
    """
    normalized_rows = []
    for row in xss:
        scaled_row = []
        for value, (mean, std) in zip(row, stats):
            if std > 0:
                scaled_row.append((value - mean) / std)
            else:
                scaled_row.append(0)
        normalized_rows.append(scaled_row)
    return normalized_rows

In [27]:
# read file with multiple attributes

def read_input_file(filename):
    """Load a comma-separated training file and standardize its attributes.

    The first line is treated as a header and skipped. On every remaining
    non-blank line all fields are parsed as floats; the last field is the
    target value and the preceding fields are the attributes.

    Parameters
    ----------
    filename : path of the training file.

    Returns
    -------
    (x, y, stats)
        ``x``: array of standardized attributes, ``y``: array of targets,
        ``stats``: the per-column ``(mean, std)`` pairs used to standardize
        ``x``, so the same scaling can be applied to other data.

    Raises
    ------
    ValueError
        If a field on a non-blank line cannot be parsed as a float.
    """
    x = []
    y = []

    with open(filename) as f:
        next(f, None)  # skip the header line (no-op on an empty file)
        for line in f:
            if not line.strip():
                # Tolerate blank lines (e.g. an extra newline at EOF);
                # float('') would otherwise raise ValueError.
                continue
            # float() ignores surrounding whitespace, including the newline.
            *attributes, target = (float(field) for field in line.split(','))
            x.append(attributes)
            y.append(target)

    # normalize input to have mean 0 and standard deviation 1
    stats = normalization_factors(x)
    x = normalized_input(x, stats)

    return np.array(x), np.array(y), stats

In [28]:
# name of the sub-directory of datasets/ that the train, test and output
# file paths below are built from
dataset = 'demo'

In [29]:
# load the training set: standardized attributes (x), outputs (y) and the
# per-column (mean, std) pairs that were used for the standardization
filename = f'datasets/{dataset}/train.txt'

x, y, stats = read_input_file(filename)

In [30]:
# Empty tqdm's private `_instances` collection.
# NOTE(review): presumably a workaround for stale progress bars left behind
# by an interrupted run — confirm it is still needed.
tqdm._instances.clear()

In [31]:
# build model
# batch gradient descent on the mean squared error loss

def linear_regression(x, y, alpha=0.01, epochs=None):
    """Fit ``y ~ x @ w + b`` with full-batch gradient descent.

    Weights and bias start from uniform random values in [-1, 1) drawn from
    the module-level ``rng``; every epoch takes one gradient step on the
    mean squared error over the whole training set.

    Parameters
    ----------
    x : 2-D array, one row per sample and one column per attribute.
    y : 1-D array with one target per row of ``x``.
    alpha : learning rate (default 0.01).
    epochs : number of gradient steps. ``None`` (default) uses
        ``100000 // n_samples``, but never fewer than one step, so a very
        large training set no longer returns the untrained random weights.

    Returns
    -------
    (w, b) : the fitted weight vector (one entry per attribute) and bias.

    Raises
    ------
    ZeroDivisionError
        If ``x`` has zero rows and ``epochs`` is ``None``.
    """
    n = x.shape[0]
    if epochs is None:
        epochs = max(1, 100000 // n)

    w = rng.uniform(-1, 1, x.shape[1])
    b = rng.uniform(-1, 1)

    for _ in trange(epochs):
        residual = x @ w + b - y
        # gradients of the (halved) mean squared error w.r.t. w and b
        partial_w = x.T @ residual / n
        partial_b = np.sum(residual) / n
        w -= alpha * partial_w
        b -= alpha * partial_b

    return w, b

In [32]:
# fit the model on the training set: w is the weight vector (one entry per
# attribute), b is the bias
w, b = linear_regression(x, y)

100%|████████████████████████████████████████████████████████| 5000/5000 [00:00<00:00, 10304.74it/s]


In [33]:
# read test file

def read_test_file(test_file, stats):
    """Load a comma-separated test file and standardize it with ``stats``.

    The first line is treated as a header and skipped. On every remaining
    non-blank line all fields are parsed as floats and kept as attributes.

    Parameters
    ----------
    test_file : path of the test file.
    stats : per-column ``(mean, std)`` pairs, applied through
        ``normalized_input``.

    Returns
    -------
    Array of standardized attribute rows.

    Raises
    ------
    ValueError
        If a field on a non-blank line cannot be parsed as a float.
    """
    t = []

    with open(test_file) as f:
        next(f, None)  # skip the header line (no-op on an empty file)
        for line in f:
            if not line.strip():
                # Tolerate blank lines (e.g. an extra newline at EOF);
                # float('') would otherwise raise ValueError.
                continue
            # float() ignores surrounding whitespace, including the newline.
            t.append([float(field) for field in line.split(',')])

    return np.array(normalized_input(t, stats))

In [34]:
# load the test attributes, standardized with the (mean, std) pairs in stats
filename = f'datasets/{dataset}/test.txt'
t = read_test_file(filename, stats)

In [35]:
# predict an output for every test row and keep each prediction as text
res = list(map(str, np.dot(t, w) + b))

In [36]:
# echo the predictions, one per line, and write the same text to out.txt
with open(f'datasets/{dataset}/out.txt', 'w') as f:
    print(*res, sep='\n')
    # end='' keeps the file free of a trailing newline, as before
    print(*res, sep='\n', end='', file=f)

0.5256746948201834
1.4259716265258837
2.5513427911580093
3.0014912570108594
3.676713955790135
0.7507489277466086
1.2008973935994585
2.3262685582315843
2.7764170240844344
3.9017881887165604
