# V.1: Exploring the green reds

a) plot a scatterplot matrix

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import itertools
import random
%matplotlib inline

In [None]:
data = pd.read_csv("input/winequality-red.csv")
data.shape

In [None]:
data.head()

In [None]:
data.columns

In [None]:
data.shape

In [None]:
def plot_scatter_matrix(data, good_threshold, bad_threshold, save_plot=False, quality=True):
    """Draw a scatterplot matrix contrasting good and bad samples.

    Rows with ``quality >= good_threshold`` are drawn in green and rows with
    ``quality <= bad_threshold`` in red; every other row is left out.

    Args:
        data: DataFrame that contains a 'quality' column.
        good_threshold: Minimum quality for a row to count as good.
        bad_threshold: Maximum quality for a row to count as bad.
        save_plot: When true, the figure is also written to 'scatter_plot.png'.
        quality: When false, the 'quality' column itself is left out of the grid.

    Returns:
        The matplotlib figure holding the grid.
    """
    good_data = data.loc[data['quality'] >= good_threshold]
    bad_data = data.loc[data['quality'] <= bad_threshold]

    if not quality:
        # Keyword form: the positional axis argument was removed in pandas 2.0.
        good_data = good_data.drop(columns='quality')
        bad_data = bad_data.drop(columns='quality')
    names = good_data.columns

    n_features = len(names)
    # squeeze=False keeps `axes` two-dimensional even for a 1x1 grid.
    fig, axes = plt.subplots(nrows=n_features, ncols=n_features,
                             figsize=(20, 20), squeeze=False)
    fig.subplots_adjust(hspace=0.05, wspace=0.05)

    for ax in axes.flat:
        # Hide all ticks and labels
        ax.xaxis.set_visible(False)
        ax.yaxis.set_visible(False)

    # Each off-diagonal cell is drawn exactly once: the column feature goes on
    # the x axis and the row feature on the y axis. iloc selects by position.
    marker_style = dict(linestyle='none', marker='.', mfc='none')
    for row in range(n_features):
        for col in range(n_features):
            if row == col:
                continue
            cell = axes[row, col]
            cell.plot(bad_data.iloc[:, col], bad_data.iloc[:, row], color='red', **marker_style)
            cell.plot(good_data.iloc[:, col], good_data.iloc[:, row], color='green', **marker_style)

    # Label the diagonal subplots with the feature names.
    for i, label in enumerate(names):
        axes[i, i].annotate(label, (0.5, 0.5), xycoords='axes fraction',
                            ha='center', va='center')

    if save_plot:
        fig.savefig('scatter_plot.png')
    return fig


In [None]:
good_threshold = 8
bad_threshold = 3

fig = plot_scatter_matrix(data,good_threshold,bad_threshold,save_plot=False)
fig.suptitle('Simple Scatterplot Matrix')
plt.show()

b) Observations:
 1-Dimensional Analysis
 
    
    1) Lower volatile acidity trended to higher quality
    2) Low chlorides trended to higher quality
    3) Higher alcohol content trended to higher quality
    4) Higher sulphates trended to higher quality
    5) Lower density trended to higher quality
    6) Lower pH trended to higher quality
    7) Most low quality wines had low citric acid
    8) Lower sulfur dioxide trended to lower quality
 
 2-Dimensional Analysis
    * plots with greater separation between green and red show that these two factors have a relationship to quality 
    * poolings of green or red are also good indicators to relationship of quality
    
    1) Higher alcohol content with low volatile acidity led to higher ratings
    2) Lower density and higher citric acid trended to higher ratings
    3) High citric acid and low chlorides showed higher ratings
    
I think that volatile acidity, chlorides, sulphates, and alcohol content will be the most useful factors in deciding the quality of a wine, because their good and bad points are tightly grouped and there is a large disparity between high- and low-quality wines.
    

# V.2 Learning to Perceptron

# a) Implement a perceptron that:
    
       * Has randomly initialized weights and bias
       * Uses the Rosenblatt perceptron learning rule (with changeable learning rate)
       * Utilizes the heaviside step activation function (discrete version)

Predictions

Formula for activation function:
    * activation = bias + (weight(1) * x(1)) + ... + (weight(n) * x(n))

Heaviside step activation function:
    * if activation is >= 0 it predicts 1, otherwise it predicts 0.
    * simple binary classification

In [None]:
def predict(row, weights):
    """Classify one sample with a Heaviside step perceptron.

    ``weights[0]`` is the bias and ``weights[1:]`` pair up with the feature
    values of ``row``; the last element of ``row`` is the label and is ignored.

    Returns:
        1.0 when the weighted sum is non-negative, otherwise 0.0.
    """
    total = weights[0]
    # Features occupy every position of the row except the trailing label.
    for idx, value in enumerate(row[:-1], start=1):
        total += weights[idx] * value

    # Step transfer function: squash the weighted sum to a hard 1 / 0.
    if total >= 0.0:
        return 1.0
    return 0.0

# b) Implement a function to train your perceptron

    • Have a way to specify number of training epochs
    • Train your perceptron until it makes no errors if training epochs is set to 0.
    • Have a way to specify learning rate.
    • Return a list of python tuples containing (performance)

Training

There are 3 nested loops in the function (steps 1-3), plus the work done for each sample (steps 4-5):

    1) Loop over each epoch.
    2) Loop over each row in the training data per epoch.
    3) Loop over each weight per feature in row.
    4) Calculate the prediction with the respective weights to input
    5) Recalculate the weights using this function:
            New Weight = Old Weight + (l_rate * (x) * error)

In [None]:
# Train a perceptron with the Rosenblatt learning rule: after every sample,
# each weight moves by l_rate * input * (target - prediction).
def _perceptron_epoch(train, weights, l_rate):
    """Run one pass over ``train``, updating ``weights`` in place.

    Returns the sum of absolute (target - prediction) errors seen in the pass.
    """
    sum_error = 0.0
    for row in train:
        error = row[-1] - predict(row, weights)
        sum_error += abs(error)
        # weights[0] is the bias: it is updated as if its input were always 1.
        weights[0] = weights[0] + l_rate * error
        for i in range(len(row) - 1):
            weights[i + 1] = weights[i + 1] + (l_rate * row[i] * error)
    return sum_error


def train_perceptron(train, l_rate, n_epoch):
    """Train a perceptron on ``train`` and record its progress per epoch.

    Args:
        train: List of rows; each row holds the feature values followed by the
            target label in the last position.
        l_rate: Learning rate applied to every weight update.
        n_epoch: Number of epochs to run. When it is 0 or negative, training
            continues until an epoch produces zero error, which may never
            happen if the data cannot be separated by a line.

    Returns:
        A list of ``(epoch, sum_error, weights, bias)`` tuples, one per epoch.
    """
    performance = []

    # Fixed seed so every run starts from the same "random" weights.
    random.seed(1)
    weights = [random.uniform(0, 1) for _ in range(len(train[0]))]

    def run_epoch(epoch):
        sum_error = _perceptron_epoch(train, weights, l_rate)
        print('>epoch=%d, lrate=%.3f, error=%.3f' % (epoch, l_rate, sum_error))
        # weights[1:] is a new list, so each record keeps its own snapshot.
        performance.append((epoch, sum_error, weights[1:], weights[0]))
        return sum_error

    if n_epoch > 0:
        for epoch in range(n_epoch):
            run_epoch(epoch)
    else:
        epoch = 0
        while run_epoch(epoch) != 0:
            epoch += 1

    print(weights)
    return performance

In [None]:
def dataframe_to_list(data):
    """Return the rows of ``data`` as a list of lists, one inner list per row."""
    rows = data.values
    return rows.tolist()

In [None]:
def parse_dataframe(data, good_threshold, bad_threshold, columns, set_range=False):
    """Build a labelled training table from the clearly good and bad rows.

    A 'good_bad' column is added: 1 where ``quality >= good_threshold``, and
    otherwise 0 (default) or -1 (when ``set_range`` is set). Only rows with
    ``quality >= good_threshold`` or ``quality <= bad_threshold`` are kept,
    restricted to ``columns`` and re-indexed from 0. ``data`` is not modified.
    """
    # Label used for every row that does not reach the good threshold.
    negative_label = 0 if set_range == False else -1
    labels = [1 if score >= good_threshold else negative_label
              for score in data['quality']]
    labelled = data.assign(good_bad=labels)

    # Keep only the extremes: good enough or bad enough, nothing in between.
    is_good = data['quality'] >= good_threshold
    is_bad = data['quality'] <= bad_threshold
    selected = labelled.loc[is_good | is_bad, columns]

    return selected.reset_index(drop=True)


In [None]:
test_train = parse_dataframe(data, good_threshold=8, bad_threshold=3, columns=['pH', 'alcohol', 'good_bad'])
test_train = dataframe_to_list(test_train)
test_train

In [None]:
performance = train_perceptron(test_train, l_rate=5, n_epoch=0)

In [None]:
# for row in test_train:
#     prediction = predict(row, weights)
#     print("Expected=%d, Predicted=%d" % (row[-1], prediction))

In [None]:
#Format (epoch, error, weights, bias)
print(performance[0])

# c) Plotting performance
    
    • The first plot should plot the number of errors your perceptron made as a function of epoch. Be careful with how you calculate errors!
    • The second plot should plot the decision boundary of your perceptron and also show ‘good’ and ‘bad’ wine data points on the final training epoch. This second plot should also shade ‘good’ and ‘bad’ areas!
    • Your function should allow the user to specify a specific epoch and see what the decision boundary of the perceptron was on that epoch. If a negative epoch is given, cause the plots to show the last epoch.

In [None]:
def frange(x, y, jump):
    """Yield x, x + jump, x + 2 * jump, ... while the value stays below y.

    The next value is produced by repeated addition, so floating point
    rounding accumulates from one step to the next.
    """
    current = x
    while current < y:
        yield current
        current += jump

def eval_y_per_x(formula, x_values):
    """Evaluate ``formula`` once per entry of ``x_values``.

    ``formula`` is a Python expression string that refers to the current
    value by the name ``x``. Returns the results as a list, in input order.
    """
    # eval() looks up ``x`` from the loop variable below. The string is
    # executed as Python code, so only pass formulas built by this notebook.
    return [eval(formula) for x in x_values]

def graph(formula, wine_data, ax2, step=.1):
    """Draw the decision boundary on ``ax2`` and shade both sides of it.

    Args:
        formula: Expression string in ``x`` giving the boundary's y value.
        wine_data: DataFrame whose ``alcohol`` column sets the x range.
        ax2: Matplotlib axes to draw on.
        step: Spacing between sampled x values (defaults to the original 0.1).

    NOTE: the colours are fixed. The band between the line and its lowest
    point is shaded green and the band up to its highest point pink; the
    function does not check which side the classifier actually labels good.
    """
    alcohol = wine_data.alcohol.tolist()
    lowest = min(alcohol)
    # Fall back to a single sample when the range is empty (min == max);
    # otherwise min()/max() below would be called on an empty list.
    xs = list(frange(lowest, max(alcohol), step)) or [lowest]
    ys = eval_y_per_x(formula, xs)

    # fill_between accepts a scalar bound, so the extremes are computed once.
    y_low, y_high = min(ys), max(ys)
    ax2.plot(xs, ys, linestyle='dashed', label='decision boundary')
    ax2.fill_between(xs, ys, y_low, facecolor='lightgreen', alpha=0.5)
    ax2.fill_between(xs, ys, y_high, facecolor='pink', alpha=0.5)

In [None]:
#if epoch is equal to -1 then it shows the line for last epoch
#please put a valid epoch under the number of epoch's attempted

def plot_performance(performance, wine_data, good_thresh, bad_thresh, epoch, save_plot=False):
    """Plot the error per epoch and the pH/alcohol decision boundary.

    Args:
        performance: List of ``(epoch, error, weights, bias)`` tuples, where
            ``weights`` is ordered ``[pH weight, alcohol weight]``.
        wine_data: DataFrame with 'quality', 'alcohol' and 'pH' columns.
        good_thresh: Minimum quality for a wine to be drawn as good.
        bad_thresh: Maximum quality for a wine to be drawn as bad.
        epoch: Index of the epoch whose boundary is drawn; a negative value
            selects the last recorded epoch.
        save_plot: When true, the figure is saved to 'batchsize_35_graph.png'.

    Returns:
        None. If ``epoch`` is past the recorded epochs a message is printed
        and nothing is drawn.
    """
    # Filter with the frame that was passed in, not the notebook-level `data`.
    good_data = wine_data.loc[wine_data['quality'] >= good_thresh]
    bad_data = wine_data.loc[wine_data['quality'] <= bad_thresh]

    epochs = [entry[0] for entry in performance]
    errors = [entry[1] for entry in performance]

    if epoch >= len(epochs):
        print("Please Put Valid Epoch. If you wish for last epoch put -1")
        return

    if epoch < 0:
        epoch = len(epochs) - 1

    fig, (ax1, ax2) = plt.subplots(figsize=(12, 4), ncols=2)
    ax1.plot(epochs, errors)
    ax1.set_title('Error / Epoch Graph')
    ax1.set_ylabel('Sum_Error')
    ax1.set_xlabel('Epoch #')

    ax2.scatter(good_data.alcohol.tolist(), good_data.pH.tolist(), color='green', label='good')
    ax2.scatter(bad_data.alcohol.tolist(), bad_data.pH.tolist(), color='red', label='bad')

    # Boundary: w_pH * pH + w_alcohol * alcohol + bias = 0, solved for pH.
    weights, bias = performance[epoch][2], performance[epoch][3]
    formula = '({}*x + {})/-{}'.format(weights[1], bias, weights[0])

    graph(formula, wine_data, ax2)
    ax2.set_title('Decision boundary on epoch: {}'.format(epoch))
    ax2.set_ylabel('pH')
    ax2.set_xlabel('alcohol')
    ax2.legend()
    if save_plot:
        plt.savefig('batchsize_35_graph.png')

In [None]:
plot_performance(performance, data, good_thresh=8, bad_thresh=3, epoch=10, save_plot=False)

d) The reason it takes so many epochs to train is that the data is not normalized (i.e. rescaled to lie between 0 and 1). It takes a lot longer for the bias and weights to calibrate for inputs that are larger than one.


# Feature Scaling

Function for normalizing is ((X - min) / (max - min))    

In [None]:
# Min-max scale every feature column to [0, 1]; 'quality' keeps its raw scores.
norm_data = data.copy()
features = list(norm_data.columns)
for feature in features:
    column = norm_data[feature]
    maxim, minim = column.max(), column.min()
    print(feature, maxim, minim)
    if feature == 'quality':
        continue
    norm_data[feature] = (column - minim) / (maxim - minim)
    
# #Double Check if it was normalized
# norm_data.describe()

In [None]:
norm_train1 = parse_dataframe(norm_data, 8, 3, ['pH', 'alcohol', 'good_bad'])
norm_train1 = dataframe_to_list(norm_train1)

In [None]:
norm_train1

In [None]:
l_rate = 5
n_epoch = 0
performance = train_perceptron(norm_train1, l_rate, n_epoch)

In [None]:
plot_performance(performance, norm_data, 8, 3, -1, save_plot=False)

# V.3 My fair ADALINE

# a) Results after lowering good threshold to 7 and raising bad threshold to 4
       
       * Look at the graph below and see how interlaced both datasets are. There is no way to make a linear classifier that can differentiate between the two without errors
       * Technical term is linearly inseparable problems

In [None]:
# This set is fed to train_perceptron, whose step activation only outputs 0 or 1,
# so the targets must be 0/1 as well (with -1 targets a bad wine could never be
# scored as correct).
norm_train_new_threshold = parse_dataframe(norm_data, good_threshold=7, bad_threshold=4, columns=['pH', 'alcohol', 'good_bad'])
norm_train_new_threshold = dataframe_to_list(norm_train_new_threshold)
print(len(norm_train_new_threshold))

In [None]:
performance = train_perceptron(norm_train_new_threshold, l_rate=.1215, n_epoch=250)

In [None]:
plot_performance(performance, norm_data, good_thresh=7, bad_thresh=4, epoch=-1, save_plot=False)

# b) Implement an ADALINE that:

    • Has randomly initialized weights and bias
    • Uses a linear activation function and some kind of quantizer
    • Uses the Widrow-Hoff learning rule

 Quantizer = converts the activation to 1 or -1 based on its sign
 
 Widrow-Hoff learning = uses activation value instead of a threshold value for error calculation
 
 
 Stochastic Gradient Descent
    
    Gradient Descent is the process of minimizing error by following the gradients of the cost function.
   
 Similarities between perceptron and adaline model:

    * they are both binary classifiers
    * both have a linear decision boundary
    * both can learn iteratively, sample by sample
    * both use a threshold function
    
 Difference between perceptron and adaline model
    
    * the perceptron computes its error from the thresholded class label, whereas ADALINE computes it from the raw activation
    * ADALINE's activation function returns continuous (float) values rather than a hard class label
    * These continuous values can be used to learn how close or far this prediction was to the correct value
    

In [None]:
def predict_continuous(row, weights):
    """Return the linear activation for ``row`` and its quantized class.

    ``weights[0]`` is the bias; the remaining weights pair up with the feature
    values of ``row`` (its last element is the label and is ignored).

    Returns:
        A tuple ``(activation, label)`` where ``label`` is 1.0 when the
        activation is non-negative and -1.0 otherwise.
    """
    net_input = weights[0]
    for idx, value in enumerate(row[:-1], start=1):
        net_input += weights[idx] * value

    # Quantizer: collapse the continuous activation to one of the two classes.
    label = 1.0 if net_input >= 0.0 else -1.0
    return net_input, label

# c) Create a training function for adaline

    • Take in your red wine data as a parameter
    • Have a way to specify number of training epochs
    • If training epochs is set to 0, your ADALINE should train until it converges
    on a good set of weights.
    • Have a way to specify learning rate.
    • Have an option to perform either online learning or batch learning

In [None]:
# Estimate ADALINE weights with the Widrow-Hoff (delta) rule, either online
# (batch=1) or in mini-batches.
def train_aladine(train, l_rate, n_epoch, batch=1):
    """Train an ADALINE on ``train`` and record its progress per epoch.

    The error used for learning is ``target - activation`` (the continuous
    output). For every sample, ``error * input`` is added to a per-weight
    accumulator; once ``batch`` samples have been seen the weights are moved
    by ``l_rate`` times the accumulated values and the accumulator is cleared.
    A partially filled batch at the end of an epoch carries over into the
    next epoch.

    Args:
        train: List of rows; features first, target label last.
        l_rate: Learning rate.
        n_epoch: Number of epochs. When 0 or negative, training runs until an
            epoch's error is larger than the previous epoch's error.
        batch: Number of samples per weight update (1 = online learning).

    Returns:
        A list of ``(epoch, sum_error, weights, bias)`` tuples, one per epoch,
        where ``sum_error`` is the sum of ``|target - quantized prediction|``.
    """
    # Fixed seed so every run starts from the same "random" weights.
    random.seed(2)
    weights = [random.uniform(0, 1) for _ in range(len(train[0]))]

    performance = []
    # gradient[0] belongs to the bias, gradient[i + 1] to feature i.
    gradient = [0.0] * len(weights)
    batch_counter = 0

    def run_epoch(epoch):
        nonlocal batch_counter
        sum_error = 0.0
        for row in train:
            batch_counter += 1
            activation, prediction = predict_continuous(row, weights)
            delta = row[-1] - activation
            sum_error += abs(row[-1] - prediction)

            # Accumulate each sample's own contribution (error * input).
            gradient[0] += delta
            for i in range(len(row) - 1):
                gradient[i + 1] += delta * row[i]

            if batch_counter == batch:
                batch_counter = 0
                for j, accumulated in enumerate(gradient):
                    weights[j] = weights[j] + l_rate * accumulated
                    gradient[j] = 0.0
        print('>epoch=%d, lrate=%.3f, error=%.3f' % (epoch, l_rate, sum_error))
        performance.append((epoch, sum_error, weights[1:], weights[0]))
        return sum_error

    if n_epoch > 0:
        for epoch in range(n_epoch):
            run_epoch(epoch)
    else:
        # Keep going while the error does not get worse; the epoch on which
        # it rises is still recorded before stopping.
        previous_error = 99999
        epoch = 0
        while True:
            sum_error = run_epoch(epoch)
            epoch += 1
            if sum_error > previous_error:
                break
            previous_error = sum_error

    print(weights)
    return performance

In [None]:
norm_train3 = parse_dataframe(norm_data, good_threshold=7, bad_threshold=4, columns=['pH', 'alcohol', 'good_bad'], set_range=True)
norm_train3 = dataframe_to_list(norm_train3)

In [None]:
performance = train_aladine(norm_train3, l_rate=.006, n_epoch=400, batch=35)

In [None]:
plot_performance(performance, norm_data, good_thresh=7, bad_thresh=4, epoch=-1, save_plot=False)

In [None]:
# bias = performance[-1][3]
# weights = list(performance[-1][2])
# print(bias, weights)

# weights.insert(0, bias)
# print(weights)

# for row in norm_train3:
#     probability, prediction = predict(row, weights)
#     print("Expected=%d, Predicted=%d" % (row[-1], prediction))

# d)
    
    batch size = 1, learning rate = .00212, epoch = 1000
        these hyperparameters actually did very well but didn't give me my best results, because a batch size of 1 lets every single data point change the weights, so the line can't move into the center of the plot and capture more reds and greens at the cost of a few individual errors
        
    batch size = 15, learning rate = .00212, epoch = 1000
        this did the best it was a good balance of constant learning rate and a solid batch size that allows the line to move into the mass without too much concern for error over each individual input value.
        
    batch size = 35, learning_rate = 0.006, epoch = 1000
        these settings did okay but the decision line began to get lost within the plot and looks like it couldn't converge on a good set of weights because of how large the batches are. lowest error I got was about 90 with these settings.

# V.4 Advanced wine sampling and resampling

a) Write a function that uses the holdout method to partition the red wine data into a
training and a validation set. The function should take a parameter to adjust the
proportion of training to validation data. It should return a tuple containing:
(training_pandas_dataframe, validation_pandas_dataframe)

In [None]:
# california_housing_dataframe = california_housing_dataframe.reindex(
#     np.random.permutation(california_housing_dataframe.index))

def split_train_validation(data, percent_train_data):
    """Split ``data`` into a leading training part and a trailing validation part.

    Args:
        data: DataFrame to split; rows keep their current order.
        percent_train_data: Fraction of rows (strictly between 0 and 1) that
            goes into the training part.

    Returns:
        A tuple ``(training_dataframe, validation_dataframe)``.

    Raises:
        Warning: If ``percent_train_data`` is not strictly between 0 and 1.
    """
    if percent_train_data <= 0 or percent_train_data >= 1:
        raise Warning("The percentage must be between 0 and 1")

    # Position of the first validation row.
    cut = round(len(data) * percent_train_data)
    training_part = data.iloc[:cut, :]
    validation_part = data.iloc[cut:, :]
    return training_part, validation_part


train_data, validation_data = split_train_validation(norm_data, percent_train_data=0.8)
print("Total_data {}, training_data {}, validation_data {}".format(len(norm_data), len(train_data), len(validation_data)))

b) Write a function that generates a k-fold cross validation dataset.

What is k-fold? It is a way of creating training and validation while using your whole data set.
The K stands for the number of groups to split the data set. A proper k-fold cross validation will separate 1 group into validation and the rest into training. 

In [None]:
def k_fold_dataset(data, k, randomize=False):
    """Generate the train/validation pairs for k-fold cross validation.

    ``data`` is cut into ``k`` consecutive groups; each group serves once as
    the validation set while the remaining groups together form the training
    set.

    Args:
        data: DataFrame to partition.
        k: Number of folds.
        randomize: When true, the rows are shuffled (and re-indexed from 0)
            before being partitioned.

    Returns:
        A list of ``(training_dataframe, validation_dataframe)`` tuples, one
        per fold.

    Raises:
        Warning: If ``k`` is 0.
    """
    if k == 0:
        raise Warning("the k-fold is 0!")

    if randomize:
        data = data.sample(frac=1).reset_index(drop=True)

    # Split into k groups; group i ends at row round(len(data) / k * i).
    groups = []
    start = 0
    for i in range(1, k + 1):
        stop = round(len(data) / k * i)
        groups.append(data.iloc[start:stop, :])
        start = stop

    folds = []
    for held_out, validation_data in enumerate(groups):
        others = [group for j, group in enumerate(groups) if j != held_out]
        # DataFrame.append was removed in pandas 2.0; concat is the replacement.
        # With a single fold there is nothing left to train on.
        training_data = pd.concat(others) if others else data.iloc[0:0]
        folds.append((training_data, validation_data))
    return folds

In [None]:
norm_train4 = parse_dataframe(norm_data, good_threshold=7, bad_threshold=4, columns=['pH', 'alcohol', 'good_bad'], set_range=True)

In [None]:
a = k_fold_dataset(norm_train4, 10, randomize=True)
count = 0
print(len(norm_train4))
for i,x in a:
    count +=1
    print (i, x)

c) What effects does changing learning rate and number of training epochs have on the
    ADALINE when evaluated via k-fold cross-validation? To address this question,
    you should write (or modify) a function that will train and assess the ADALINE
    on each training and cross-validation fold produced by your k-fold function.

In [None]:
#evaluate model with different k folds
#So i have to train of training_data then I have to run a percent correct on the test data as an evaluation tool
#Write a function that will train on each training and cross validation fold. Basically a for loop that will go through training data
#I should give both the validation and training to the function and compare the losses for each.

In [None]:
# Train an ADALINE (Widrow-Hoff rule) on one fold and score it on the
# held-out validation rows after every epoch.
def train_weights_with_validation(train, validation, l_rate, n_epoch, batch=1):
    """Train an ADALINE on ``train`` while tracking validation error.

    For every sample, ``(target - activation) * input`` is added to a
    per-weight accumulator; once ``batch`` samples have been seen the weights
    are moved by ``l_rate`` times the accumulated values and the accumulator
    is cleared. A partially filled batch carries over into the next epoch.

    Args:
        train: List of training rows; features first, target label last.
        validation: List of validation rows in the same layout.
        l_rate: Learning rate.
        n_epoch: Number of epochs. When 0 or negative, training runs until an
            epoch's training error is larger than the previous epoch's.
        batch: Number of samples per weight update (1 = online learning).

    Returns:
        A list of ``(epoch, sum_train_error, mean_valid_error, weights, bias)``
        tuples, one per epoch. The training error is a raw sum of
        ``|target - quantized prediction|``; the validation error is that sum
        divided by ``len(validation)``.

    Raises:
        ZeroDivisionError: If ``validation`` is empty.
    """
    performance = []

    # Fixed seed so every fold starts from the same "random" weights.
    random.seed(1)
    weights = [random.uniform(0, 1) for _ in range(len(train[0]))]

    # gradient[0] belongs to the bias, gradient[i + 1] to feature i.
    gradient = [0.0] * len(weights)
    batch_counter = 0

    def run_epoch(epoch):
        nonlocal batch_counter
        sum_train_error = 0.0
        sum_valid_error = 0.0

        # Predict each training row and update the weights after each batch.
        for row in train:
            batch_counter += 1
            activation, prediction = predict_continuous(row, weights)
            delta = row[-1] - activation
            sum_train_error += abs(row[-1] - prediction)

            gradient[0] += delta
            for i in range(len(row) - 1):
                gradient[i + 1] += delta * row[i]

            if batch_counter == batch:
                batch_counter = 0
                for j, accumulated in enumerate(gradient):
                    weights[j] = weights[j] + l_rate * accumulated
                    gradient[j] = 0.0

        # Score the held-out rows with the weights reached in this epoch.
        for row in validation:
            _, prediction = predict_continuous(row, weights)
            sum_valid_error += abs(row[-1] - prediction)

        performance.append((epoch, sum_train_error, sum_valid_error / len(validation),
                            weights[1:], weights[0]))
        return sum_train_error

    if n_epoch > 0:
        for epoch in range(n_epoch):
            run_epoch(epoch)
    else:
        # Training continues until the training error begins to go back up;
        # the epoch on which it rises is still recorded.
        previous_error = 99999
        epoch = 0
        while True:
            sum_train_error = run_epoch(epoch)
            epoch += 1
            if sum_train_error > previous_error:
                break
            previous_error = sum_train_error

    return performance

In [None]:
# Train on every fold and average the validation error of the final epoch.
sum_valid_error = 0.0
count = 0

for count, (train, valid) in enumerate(a, start=1):
    # The fold label printed is zero-based.
    print("{}-Fold Cross Validation".format(count - 1))
    performance = train_weights_with_validation(
        dataframe_to_list(train),
        dataframe_to_list(valid),
        l_rate=.5,
        n_epoch=100,
        batch=1,
    )
    # Index 2 of a performance record is the mean validation error.
    valid_error = performance[-1][2]
    print("Valid Error : {}".format(valid_error))
    sum_valid_error += valid_error
print(sum_valid_error/count)

c) observations: 

    * When I changed the learning rate to .02 from .2 the validation error dropped down from .3785 to .3071 which is a huge improvement. If I put the learning rate at .5 the validation error goes up to .42857 because it trains too quickly on the dataset and doesn't fine tune the weights.
    
    * larger epoch number led to larger validation error because of overfitting.


In [None]:
norm_data.columns

In [None]:
norm_train5 = parse_dataframe(norm_data, 7, 4, ['fixed acidity', 'volatile acidity', 'citric acid',
       'chlorides', 'density', 'pH', 'sulphates', 'alcohol', 'good_bad'])
norm_train5 = dataframe_to_list(norm_train5)
performance = train_perceptron(norm_train5, l_rate=.01, n_epoch=100)

In [None]:
norm_train5 = parse_dataframe(norm_data, 7, 4, ['fixed acidity', 'volatile acidity', 'citric acid',
       'chlorides', 'density', 'pH', 'sulphates', 'alcohol', 'good_bad'], set_range=True)
norm_train5 = dataframe_to_list(norm_train5)
performance = train_aladine(norm_train5, l_rate=.05, n_epoch=75, batch=10)

In [None]:
# bias = performance[-1][3]
# weights = list(performance[-1][2])
# print(bias, weights)

# weights.insert(0, bias)

# for row in norm_train5:
#     prediction = predict(row, weights)
#     print("Expected=%d, Predicted=%d" % (row[-1], prediction))

a) 

    * The perceptron model trains well on all the data, but the ADALINE is having trouble. With all the data and a learning rate of .01 the perceptron trains decently and gets around 28 wrong in a dataset of 280.
    * ADALINE doesn't produce outputs as good as the perceptron's on the same dataset, but it does train if I put n_epoch at 30 and l_rate at .05 for a batch size of 10 or less.

b) 

    * For 1 dimension it is a point on a line
    * For 2 dimensions it is a line in a plane
    * For 3 dimensions it is a plane in a 3 dimensional space
    * I can't visually conceptualize beyond 3 dimensions lol. 
      https://www.youtube.com/watch?v=JkxieS-6WuA

# V.6 Marvin’s rebuttal

a) While not a wine. . . find a way to successfully classify the Pan-Galactic Gargle Blaster
dataset. Show that your perceptron or ADALINE successfully classifies the PanGalactic
Gargle Blaster data set by plotting the decision boundary and also show
‘good’ and ‘bad’ Gargle Blaster data points.

In [None]:
gargle_blaster = pd.read_csv("input/Pan Galactic Gargle Blaster.csv")
gargle_blaster

In [None]:
plot_scatter_matrix(gargle_blaster, 8, 3, quality=False)

Normalize the dataset
Put all the data points about the origin (0, 0) and then square everything
This will allow for me to fit a linear equation to divide the data

In [None]:
# Min-max scale each feature, shift it to be centred on 0.5 and square it;
# the 'quality' column is left untouched.
norm_gargle = gargle_blaster.copy()
features = list(norm_gargle.columns)
for feature in features:
    column = norm_gargle[feature]
    maxim, minim = max(column), min(column)
    print(feature, maxim, minim)
    if feature == 'quality':
        continue
    norm_gargle[feature] = column.apply(
        lambda x: ((x - minim) / (maxim - minim) - 0.5)**2
    )
    
# #Double Check if it was normalized
# norm_gargle.describe()

In [None]:
norm_gargle.describe()

In [None]:
def plot_data(gargle_blaster, good_thresh, bad_thresh):
    """Scatter the good and bad Gargle Blaster samples on one set of axes.

    Rows with ``quality >= good_thresh`` are drawn in green and rows with
    ``quality <= bad_thresh`` in red. The columns are selected by name so
    that the points always match the axis labels, whatever the column order
    of the frame is.
    """
    good_data = gargle_blaster.loc[gargle_blaster['quality'] >= good_thresh]
    bad_data = gargle_blaster.loc[gargle_blaster['quality'] <= bad_thresh]

    fig, ax2 = plt.subplots(figsize=(4, 4), ncols=1)

    x_name, y_name = 'fallian marsh gas', 'wonderflonium'
    marker_style = dict(linestyle='none', marker='.', mfc='none')
    ax2.plot(good_data[x_name], good_data[y_name], color='green', label='good', **marker_style)
    ax2.plot(bad_data[x_name], bad_data[y_name], color='red', label='bad', **marker_style)

    ax2.set_ylabel(y_name)
    ax2.set_xlabel(x_name)
    ax2.legend()


In [None]:
plot_data(norm_gargle, 6, 5)

In [None]:
gargle_train = parse_dataframe(norm_gargle, 6, 5, ['wonderflonium', 'fallian marsh gas','good_bad'])
gargle_train = dataframe_to_list(gargle_train)
gargle_train


In [None]:
performance = train_perceptron(gargle_train, l_rate=.505, n_epoch=25)

In [None]:
def graph_gargle(formula, gargle, ax2, step=.1):
    """Draw the decision boundary on ``ax2`` and shade both sides of it.

    Args:
        formula: Expression string in ``x`` giving the boundary's y value.
        gargle: DataFrame whose 'fallian marsh gas' column sets the x range.
        ax2: Matplotlib axes to draw on.
        step: Spacing between sampled x values (defaults to the original 0.1).

    NOTE: the colours are fixed. The band between the line and its lowest
    point is shaded green and the band up to its highest point pink; the
    function does not check which side the classifier actually labels good.
    """
    marsh_gas = gargle['fallian marsh gas'].tolist()
    lowest = min(marsh_gas)
    # Fall back to a single sample when the range is empty (min == max);
    # otherwise min()/max() below would be called on an empty list.
    xs = list(frange(lowest, max(marsh_gas), step)) or [lowest]
    ys = eval_y_per_x(formula, xs)

    # fill_between accepts a scalar bound, so the extremes are computed once.
    y_low, y_high = min(ys), max(ys)
    ax2.plot(xs, ys, linestyle='dashed', label='decision boundary')
    ax2.fill_between(xs, ys, y_low, facecolor='lightgreen', alpha=0.5)
    ax2.fill_between(xs, ys, y_high, facecolor='pink', alpha=0.5)

In [None]:
#if epoch is equal to -1 then it shows the line for last epoch
#please put a valid epoch under the number of epoch's attempted
import math 

def plot_gargle(performance, gargle_blaster, good_thresh, bad_thresh, epoch, save_plot=False):
    """Plot the error per epoch and the Gargle Blaster decision boundary.

    Args:
        performance: List of ``(epoch, error, weights, bias)`` tuples, where
            ``weights`` is ordered ``[wonderflonium, fallian marsh gas]``.
        gargle_blaster: DataFrame with 'quality', 'wonderflonium' and
            'fallian marsh gas' columns.
        good_thresh: Minimum quality for a sample to be drawn as good.
        bad_thresh: Maximum quality for a sample to be drawn as bad.
        epoch: Index of the epoch whose boundary is drawn; a negative value
            selects the last recorded epoch.
        save_plot: When true, the figure is saved to 'gargle_graph.png'.

    Returns:
        None. If ``epoch`` is past the recorded epochs a message is printed
        and nothing is drawn.
    """
    good_data = gargle_blaster.loc[gargle_blaster['quality'] >= good_thresh]
    bad_data = gargle_blaster.loc[gargle_blaster['quality'] <= bad_thresh]

    epochs = [entry[0] for entry in performance]
    errors = [entry[1] for entry in performance]

    if epoch >= len(epochs):
        print("Please Put Valid Epoch. If you wish for last epoch put -1")
        return

    if epoch < 0:
        epoch = len(epochs) - 1

    fig, (ax1, ax2) = plt.subplots(figsize=(12, 4), ncols=2)
    ax1.plot(epochs, errors)
    ax1.set_title('Error / Epoch Graph')
    ax1.set_ylabel('Sum_Error')
    ax1.set_xlabel('Epoch #')

    # Select the columns by name so the points line up with the boundary
    # (x = fallian marsh gas, y = wonderflonium) and with the axis labels.
    x_name, y_name = 'fallian marsh gas', 'wonderflonium'
    marker_style = dict(linestyle='none', marker='.', mfc='none')
    ax2.plot(good_data[x_name], good_data[y_name], color='green', label='good', **marker_style)
    ax2.plot(bad_data[x_name], bad_data[y_name], color='red', label='bad', **marker_style)

    # Boundary: w_wonderflonium * y + w_marsh_gas * x + bias = 0, solved for y.
    weights, bias = performance[epoch][2], performance[epoch][3]
    formula = '({}*x + {})/-{}'.format(weights[1], bias, weights[0])

    graph_gargle(formula, gargle_blaster, ax2)
    ax2.set_title('Decision boundary on epoch: {}'.format(epoch))
    ax2.set_ylabel(y_name)
    ax2.set_xlabel(x_name)
    ax2.legend()
    if save_plot:
        plt.savefig('gargle_graph.png')

In [None]:
gargle_blaster.head()

In [None]:
plot_gargle(performance, norm_gargle, good_thresh=6, bad_thresh=5, epoch=-1, save_plot=False)