# Data analytics and visualization for logistics.py

In [1]:
import sys
import random
import numpy as np
import pandas as pd
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype

In [2]:
try:
    # Get Dataset File
    # a.The path to a file containing a data set (e.g., monks1.csv)
    file_path = "seismic.csv"

    # b. The learning rate 𝜂 to use during stochastic gradient descent
    learning_rate = float("0.01")

    #c. The percentage of instances to use for a training set
    training_set_percent = float("0.6")
    # Ensure training set percent is a valid percent that can be used
    if 0 > training_set_percent or training_set_percent > 1:
        print("Invalid percent. Please choose a value between 0 and 1")
        exit(1)

    #d. The percentage of instances to use for a validation set
    validation_set_percent = float("0.2")

    # Ensure validation set percent is a valid percent that can be used
    if 0 > validation_set_percent or validation_set_percent > 1:
        print("Invalid percent. Please choose a value between 0 and 1")
        exit(1)

    # Check that the values don't exceed 100%
    if training_set_percent + validation_set_percent == 1:
        print("Fair warning ... you don't have a testing set...\nPlease try again and leave room for a testing set :)")
        exit(1)
    elif training_set_percent + validation_set_percent > 1:
        print(f"The percentage of the training set plus the validation set is equal to: {training_set_percent + validation_set_percent}\nPlease only input values who's sum is less than 1")
        exit(1)

    # Store the size of the testing set
    testing_set_percent = 1 - training_set_percent - validation_set_percent

    #e. A random seed as an integer
    randomSeed = int("12")

    # Print all input values given for user to see
    print(f"Inputs:\nFile: {file_path}\nLearning rate: {learning_rate}")
    print(f"Training Set Percent: {training_set_percent}\nValidation Set Percent: {validation_set_percent}\nTesting Set Percent: {testing_set_percent}")
    print(f"Random Seed: {randomSeed}\n")

    # Read in dataset
    df = pd.read_csv(file_path)

    # shuffle the dataframe. Use random seed from input and fraction 1 as we want the whole dataframe
    shuffled_df = df.sample(frac=1,random_state=randomSeed)

    print(f"Number of Instances in Dataframe: {len(df)}")

    # Applies the splits to the training set and validation set. The last argument wil what remained whihc is the testing set
    # Note, Numpy split does it where there are equal parts. First one says take the first <test_set_percent> amount of my dataframe
    # Second says, Take <training_set_percent + validation_set_percent> as training_set_percent is already taken so that leaves just the validation set amount
    # These points go by indices so that index of where to start the validation set is the sum of the two. The remaining amount is the left over argument that is the part of the dataframe not taken.
    # This results in that being the testing set
    splits_indices = [int(training_set_percent * len(df)), int((training_set_percent + validation_set_percent) * len(df))]
    print(f"Splits indexes they begin at: {splits_indices}\n")
    training_set, validation_set, testing_set = np.split(shuffled_df, splits_indices)

    print(f"Length of training: {len(training_set)}")
    print(f"Length of validiation set: {len(validation_set)}")
    print(f"Length of testing: {len(testing_set)}")


except IndexError as e:
    print(f"Error. Message below:\n{e}\nPlease try again.")
    exit(1)
except ValueError as e:
    print(f"Error. Message below:\n{e}\nPlease try again.")
    exit(1)
except FileNotFoundError as e:
    print(f"Error. Message below:\n{e}\nPlease try again.")
    exit(1)

Inputs:
File: seismic.csv
Learning rate: 0.01
Training Set Percent: 0.6
Validation Set Percent: 0.2
Testing Set Percent: 0.2
Random Seed: 12

Number of Instances in Dataframe: 2584
Splits indexes they begin at: [1550, 2067]

Length of training: 1550
Length of validiation set: 517
Length of testing: 517


In [3]:
print(f"Column {df.columns[0]} numerical type? {is_numeric_dtype(df[df.columns[0]])}")
print(f"Column {df.columns[2]} numerical type? {is_numeric_dtype(df[df.columns[2]])}")
print(f"Column {df.columns[0]} string type? {is_string_dtype(df[df.columns[0]])}")
print(f"Column {df.columns[1]} string type? {is_string_dtype(df[df.columns[1]])}")

Column label numerical type? True
Column seismoacoustic numerical type? False
Column label string type? False
Column seismic string type? True


In [4]:
for x in df.columns:
    # For Nominial Value
    if is_string_dtype(df[x]) and not is_numeric_dtype(df[x]):
        print(f"Column '{x}' is a String type")
    # For Numerical Value
    elif not is_string_dtype(df[x]) and is_numeric_dtype(df[x]):
        print(f"Column '{x}' is a Numerical type")

Column 'label' is a Numerical type
Column 'seismic' is a String type
Column 'seismoacoustic' is a String type
Column 'shift' is a String type
Column 'genergy' is a Numerical type
Column 'gpuls' is a Numerical type
Column 'gdenergy' is a Numerical type
Column 'gdpuls' is a Numerical type
Column 'ghazard' is a String type
Column 'nbumps' is a Numerical type
Column 'nbumps2' is a Numerical type
Column 'nbumps3' is a Numerical type
Column 'nbumps4' is a Numerical type
Column 'nbumps5' is a Numerical type
Column 'nbumps6' is a Numerical type
Column 'nbumps7' is a Numerical type
Column 'nbumps89' is a Numerical type
Column 'energy' is a Numerical type
Column 'maxenergy' is a Numerical type


In [5]:
print(training_set.columns)

Index(['label', 'seismic', 'seismoacoustic', 'shift', 'genergy', 'gpuls',
       'gdenergy', 'gdpuls', 'ghazard', 'nbumps', 'nbumps2', 'nbumps3',
       'nbumps4', 'nbumps5', 'nbumps6', 'nbumps7', 'nbumps89', 'energy',
       'maxenergy'],
      dtype='object')


In [6]:
# Process to encode nominal columns or calculate the values for 
new_dataframe = pd.DataFrame()
for x in training_set.columns:
    # For Nominial Value
    if is_string_dtype(training_set[x]) and not is_numeric_dtype(training_set[x]):
        new_dataframe = pd.concat([new_dataframe, pd.get_dummies(training_set[x], prefix=x, prefix_sep='.')],axis=1)
        # training_set.drop([x],axis=1, inplace=True)
    else: 
        max_value = max(training_set[x])
        min_value = min(training_set[x])
        if max_value != 0 and min_value != 0:
            training_set[x] = training_set[x].apply(lambda x: (x - min_value)/(max_value - min_value))
        new_dataframe = pd.concat([new_dataframe, training_set[x]],axis=1)
        # training_set.drop([x],axis=1, inplace=True)

In [7]:
print(new_dataframe.columns)
training_set= new_dataframe
training_set

Index(['label', 'seismic.a', 'seismic.b', 'seismoacoustic.a',
       'seismoacoustic.b', 'seismoacoustic.c', 'shift.N', 'shift.W', 'genergy',
       'gpuls', 'gdenergy', 'gdpuls', 'ghazard.a', 'ghazard.b', 'ghazard.c',
       'nbumps', 'nbumps2', 'nbumps3', 'nbumps4', 'nbumps5', 'nbumps6',
       'nbumps7', 'nbumps89', 'energy', 'maxenergy'],
      dtype='object')


Unnamed: 0,label,seismic.a,seismic.b,seismoacoustic.a,seismoacoustic.b,seismoacoustic.c,shift.N,shift.W,genergy,gpuls,...,nbumps,nbumps2,nbumps3,nbumps4,nbumps5,nbumps6,nbumps7,nbumps89,energy,maxenergy
613,1,0,1,1,0,0,0,1,0.005068,0.025174,...,4,1,2,1,0,0,0,0,25200.0,20000.0
2185,0,1,0,0,0,1,1,0,0.011311,0.134845,...,2,1,1,0,0,0,0,0,2500.0,2000.0
863,0,1,0,1,0,0,0,1,0.055521,0.052094,...,1,1,0,0,0,0,0,0,500.0,500.0
2137,0,1,0,1,0,0,0,1,0.006334,0.084247,...,1,1,0,0,0,0,0,0,600.0,600.0
1660,0,0,1,0,1,0,0,1,0.008738,0.213609,...,1,1,0,0,0,0,0,0,200.0,200.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
800,0,1,0,1,0,0,0,1,0.008797,0.069043,...,4,3,1,0,0,0,0,0,2800.0,1000.0
1973,0,0,1,1,0,0,0,1,0.025481,0.149053,...,0,0,0,0,0,0,0,0,0.0,0.0
866,0,1,0,1,0,0,0,1,0.047065,0.051346,...,0,0,0,0,0,0,0,0,0.0,0.0
2415,0,0,1,0,1,0,0,1,0.010819,0.066550,...,1,1,0,0,0,0,0,0,200.0,200.0


In [8]:
training_set.head(1)

Unnamed: 0,label,seismic.a,seismic.b,seismoacoustic.a,seismoacoustic.b,seismoacoustic.c,shift.N,shift.W,genergy,gpuls,...,nbumps,nbumps2,nbumps3,nbumps4,nbumps5,nbumps6,nbumps7,nbumps89,energy,maxenergy
613,1,0,1,1,0,0,0,1,0.005068,0.025174,...,4,1,2,1,0,0,0,0,25200.0,20000.0


In [9]:
training_set.columns

Index(['label', 'seismic.a', 'seismic.b', 'seismoacoustic.a',
       'seismoacoustic.b', 'seismoacoustic.c', 'shift.N', 'shift.W', 'genergy',
       'gpuls', 'gdenergy', 'gdpuls', 'ghazard.a', 'ghazard.b', 'ghazard.c',
       'nbumps', 'nbumps2', 'nbumps3', 'nbumps4', 'nbumps5', 'nbumps6',
       'nbumps7', 'nbumps89', 'energy', 'maxenergy'],
      dtype='object')