# Data analytics and visualization for logistics.py

In [None]:
import sys
import random
import numpy as np
import pandas as pd
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype

In [None]:
# Experiment setup: parse the run parameters, load and shuffle the data set,
# split it into training / validation / testing sets and draw initial weights.
try:
    # Get Dataset File
    # a. The path to a file containing a data set (e.g., monks1.csv)
    file_path = "occupancy.csv"

    # b. The learning rate 𝜂 to use during stochastic gradient descent
    learning_rate = float("0.01")

    # c. The percentage of instances to use for a training set
    training_set_percent = float("0.6")
    # Ensure training set percent is a valid percent that can be used
    if 0 > training_set_percent or training_set_percent > 1:
        print("Invalid percent. Please choose a value between 0 and 1")
        # sys.exit() rather than the interactive-only exit() helper from `site`
        sys.exit(1)

    # d. The percentage of instances to use for a validation set
    validation_set_percent = float("0.2")

    # Ensure validation set percent is a valid percent that can be used
    if 0 > validation_set_percent or validation_set_percent > 1:
        print("Invalid percent. Please choose a value between 0 and 1")
        sys.exit(1)

    # Check that the two percentages leave room for a testing set
    if training_set_percent + validation_set_percent == 1:
        print("Fair warning ... you don't have a testing set...\nPlease try again and leave room for a testing set :)")
        sys.exit(1)
    elif training_set_percent + validation_set_percent > 1:
        print(f"The percentage of the training set plus the validation set is equal to: {training_set_percent + validation_set_percent}\nPlease only input values who's sum is less than 1")
        sys.exit(1)

    # Store the size of the testing set
    testing_set_percent = 1 - training_set_percent - validation_set_percent

    # e. A random seed as an integer
    randomSeed = int("12345")

    # Print all input values given for user to see
    print(f"Inputs:\nFile: {file_path}\nLearning rate: {learning_rate}")
    print(f"Training Set Percent: {training_set_percent}\nValidation Set Percent: {validation_set_percent}\nTesting Set Percent: {testing_set_percent}")
    print(f"Random Seed: {randomSeed}\n")

    # Read in dataset
    df = pd.read_csv(file_path)

    # Shuffle the dataframe. frac=1 keeps every row; the seed makes the
    # shuffle reproducible.
    shuffled_df = df.sample(frac=1, random_state=randomSeed)

    print(f"Number of Instances in Dataframe: {len(df)}")

    # Row positions at which the validation set and the testing set begin.
    # Rows [0, training_end) form the training set, rows
    # [training_end, validation_end) the validation set, and whatever remains
    # is the testing set.
    instance_count = len(shuffled_df)
    training_end = int(training_set_percent * instance_count)
    validation_end = int((training_set_percent + validation_set_percent) * instance_count)
    splits_indices = [training_end, validation_end]
    print(f"Splits indexes they begin at: {splits_indices}\n")

    # Positional slicing instead of np.split on a DataFrame; .copy() makes
    # each split an independent frame so later column assignments do not
    # write into (or warn about) a slice of shuffled_df.
    training_set = shuffled_df.iloc[:training_end].copy()
    validation_set = shuffled_df.iloc[training_end:validation_end].copy()
    testing_set = shuffled_df.iloc[validation_end:].copy()

    print(f"Length of training: {len(training_set)}")
    print(f"Length of validiation set: {len(validation_set)}")
    print(f"Length of testing: {len(testing_set)}")

    # Seed NumPy's global generator as well, so the random weights drawn here
    # (and any later np.random draws) are reproducible for a given seed.
    np.random.seed(randomSeed)

    # 1. Choose random values for all weights (often between -0.1 and 0.1)
    weights = np.random.uniform(-0.1, 0.1, training_set.shape[1])

except (IndexError, ValueError, FileNotFoundError) as e:
    # Bad numeric input, a bad seed, or a missing data file: report and stop.
    print(f"Error. Message below:\n{e}\nPlease try again.")
    sys.exit(1)

In [None]:
# Build `new_dataframe` from the training set: nominal (string) columns are
# one-hot encoded, every other column is min-max scaled to the range [0, 1].
# NOTE: scaled columns are also written back into `training_set`, which later
# cells read through `training_set.to_numpy()`. Only `training_set` is
# rescaled in this cell.
encoded_columns = []
for column_name in training_set.columns:
    column = training_set[column_name]
    # Nominal value: expand into one indicator column per category
    if is_string_dtype(column) and not is_numeric_dtype(column):
        encoded_columns.append(pd.get_dummies(column, prefix=column_name, prefix_sep='.'))
        continue

    min_value = column.min()
    max_value = column.max()
    # Min-max scaling divides by (max - min), so the only case that must be
    # skipped is a constant column. A column whose min or max happens to be 0
    # is still scaled.
    if max_value != min_value:
        training_set[column_name] = (column - min_value) / (max_value - min_value)
    encoded_columns.append(training_set[column_name])

# Concatenate once instead of growing the frame inside the loop
new_dataframe = pd.concat(encoded_columns, axis=1) if encoded_columns else pd.DataFrame()

In [None]:
def net_calculate(weights, x_instance):
    """Return the net input: weights[0] plus the weighted sum of the instance.

    weights[0] is added on its own (it has no matching instance value);
    every other weights[i] is multiplied by x_instance[i]. x_instance[0]
    is never read.
    """
    total = weights[0]
    # Pair each remaining weight with the instance value at the same index
    for position, weight in enumerate(weights[1:], start=1):
        total += weight * x_instance[position]
    return total

In [None]:
def sigmoid(net):
    """Return the logistic sigmoid of a scalar `net`.

    The exponential is always taken of a non-positive number, so it cannot
    overflow for inputs of large magnitude.
    """
    if net < 0:
        # Large negative input: result approaches 0
        exp_net = np.exp(net)
        return exp_net / (1 + exp_net)
    # Zero or positive input: result approaches 1 as net grows
    exp_neg_net = np.exp(-net)
    return 1 / (1 + exp_neg_net)

In [None]:
# Sanity check: run the first training instance through the net and sigmoid
# using the initial weights, then show how many values the instance holds.
np_ts = training_set.to_numpy()
first_instance = np_ts[0]
net = net_calculate(weights, first_instance)
out = sigmoid(net)
print(f"Net: {net}\nOut: {out}")
len(first_instance)

In [None]:
def stochastic_gradient_descent(training_set, validation_set, learning_rate,
                                target_accuracy=0.99, max_epochs=500):
    """Train the weights of a single sigmoid unit with stochastic gradient descent.

    Column 0 of each instance is used as the target value; the remaining
    columns are the attributes. weights[0] has no matching attribute and
    weights[i] pairs with column i.

    Args:
        training_set: DataFrame whose rows drive the weight updates.
        validation_set: DataFrame used to measure accuracy after every epoch.
        learning_rate: Step size applied to each gradient.
        target_accuracy: Training stops once validation accuracy exceeds
            this value.
        max_epochs: Upper bound on the number of passes over training_set.

    Returns:
        The weight array after the last completed epoch.

    Raises:
        ValueError: If validation_set has no rows, since accuracy could not
            be computed.
    """
    np_ts = training_set.to_numpy()
    np_vs = validation_set.to_numpy()
    # Fail before training instead of dividing by zero after the first epoch
    if len(np_vs) == 0:
        raise ValueError("validation_set is empty; accuracy cannot be computed")

    # 1. Choose random values for all weights (often between -0.1 and 0.1)
    weights = np.random.uniform(-0.1, 0.1, training_set.shape[1])

    # 2. Until either the accuracy on the validation set > target_accuracy
    #    or we have run max_epochs epochs
    accuracy = 0
    epochs = 0
    while accuracy <= target_accuracy and epochs < max_epochs:
        # A. For each instance x in the training set
        for instance in np_ts:
            target = instance[0]
            # Net value between the instance and the current weights
            net = net_calculate(weights=weights, x_instance=instance)
            # Output of the unit for that net value
            out_value = sigmoid(net=net)
            # I. Gradient of w0 (no attribute factor)
            grad_w0 = -1 * out_value * (1 - out_value) * (target - out_value)
            weights[0] -= learning_rate * grad_w0
            # II. Gradient of each wi, scaled by attribute i. The output was
            #     computed before any weight changed, so all updates for this
            #     instance use the same out_value.
            for attr_count in range(1, len(instance)):
                grad_wi = -instance[attr_count] * out_value * (1 - out_value) * (target - out_value)
                weights[attr_count] -= learning_rate * grad_wi
        epochs += 1

        # B. Measure accuracy on the validation set. A prediction is correct
        #    only when it equals the label, so a label other than 0/1 is
        #    never counted as a hit.
        correct = 0
        for instance in np_vs:
            net = net_calculate(weights=weights, x_instance=instance)
            out_value = sigmoid(net=net)
            predict = 1 if out_value > 0.5 else 0
            if predict == instance[0]:
                correct += 1
        accuracy = correct / len(np_vs)
        print(f"Completed Epoch:{epochs}\nAccuracy: {accuracy}\nWeights: {weights}\n")
    return weights


In [8]:
# Train on the training split and score each epoch on the validation split.
# As the last expression of the cell, the returned weights are displayed.
stochastic_gradient_descent(training_set, validation_set, learning_rate)

Completed Epoch:1
Accuracy: 0.7624027237354085
Weights: [-1.76696697 -0.36912332 -0.82155955 -0.17659908 -0.23855238 -0.57275872]

Completed Epoch:2
Accuracy: 0.7624027237354085
Weights: [-2.08868346 -0.45063984 -0.93596577 -0.18092816 -0.26433184 -0.65715762]

Completed Epoch:3
Accuracy: 0.7624027237354085
Weights: [-2.27277232 -0.49697436 -0.99871458 -0.1831482  -0.27828278 -0.70294248]

Completed Epoch:4
Accuracy: 0.7624027237354085
Weights: [-2.40191006 -0.52933385 -1.04163455 -0.1846188  -0.28774801 -0.73404044]

Completed Epoch:5
Accuracy: 0.7624027237354085
Weights: [-2.50137135 -0.55417404 -1.07410326 -0.18570918 -0.29486695 -0.75744412]

Completed Epoch:6
Accuracy: 0.7624027237354085
Weights: [-2.58224265 -0.57431766 -1.10013975 -0.18657118 -0.30054992 -0.7761341 ]

Completed Epoch:7
Accuracy: 0.7624027237354085
Weights: [-2.65037488 -0.59125058 -1.12182917 -0.18728153 -0.3052667  -0.79165024]

Completed Epoch:8
Accuracy: 0.7624027237354085
Weights: [-2.70923275 -0.60585079 -1

array([-4.50921179, -1.04043463, -1.64385954, -0.20301834, -0.41422036,
       -1.14925779])