In [1]:
#importing the libraries that will be used 
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt

In [2]:
#the data available is as a text file so i've assigned the column headers for the dataframe 
data = pd.read_csv('data_banknote_authentication.txt',header=None)
data.columns = ["variance", "skewness", "curtosis", "entropy ","class"]

In [3]:
df = data.copy(deep=True)

In [4]:
#making the class labels -1 where we have 0 in the dataset
df.loc[df['class'] == 0] = -1

In [5]:
# here the we can see a significant difference in the max values
df.describe()

Unnamed: 0,variance,skewness,curtosis,entropy,class
count,1372.0,1372.0,1372.0,1372.0,1372.0
mean,-1.386115,-0.997144,0.399741,-1.109658,-0.110787
std,1.326021,3.602272,3.840276,1.385713,0.994207
min,-7.0421,-13.7731,-5.2861,-7.5887,-1.0
25%,-1.5636,-1.0,-1.0,-1.0,-1.0
50%,-1.0,-1.0,-1.0,-1.0,-1.0
75%,-1.0,-0.247285,-0.084047,-0.99397,1.0
max,2.3917,9.6014,17.9274,2.1353,1.0


1.a

In [6]:
#takes two arguments m = array of mistakes k = artificial limit and return if the algorithm has converged
def converge(m,k):
    for i in range(len(m)-k+1):
        temp = [] # temorary array to check the sum
        for j in range(i,i+k):#we take every elements in the range(i,i+k) from mistakes array and store it in the temp and if the sum of element of array temp is zero then all the elements were zero
            temp.append(m[j])# meaning there were no updates for previos k iterations at the current iteration of the algorithm                            
        if(sum(temp)==0):#if sum is 0 meaning the algorithm has converged and we return true and the algorithm below stops loop and returns the current w
            return True
    return False # if not converged then we continue the algorithm 

In [7]:
# this is the Stochastic sub gradient descent which takes 4 arguments x: dataset,y=label,epoch = number of times the 
#for loop runs and return the learned weights
def ssgd(X,Y,epoch,k):
    mistakes = []
    W = np.zeros(X.shape[1])# here we make an array of zeros of size equal to the number of columns of the passed dataset
    for j in range(epoch):
        index = np.random.randint(0,X.shape[0])# here we find a random index between 0 and number of rows of the given dataset
        x = X.iloc[index].to_numpy()#after finding the index we make an array x with the row of the dataset at that index
        y = Y[index]# we take the class label from the same index of passed label     
        value = np.dot(W,x)*y
        if(value <= 0): #here we check if we made a mistake , here i've used <= rather < because the if the value is 0 then the weights will never change
            mistakes.append(1) # we append a 1 if there's no mistake in that iteration          
            W = W + y*x
        else:
            mistakes.append(0)# we append a 0 if there's no mistake in that iteration
        if(converge(mistakes,k)==True):# if converged we return that W
            return W
    return W

1.b

In [8]:
#here only the features need to be normalised the class label is skipped, z-score normalisation is used where the 
# mean of the column is subtracted from each datapoint and divided by the standard deviation of the column 
new_col = ["variance", "skewness", "curtosis", "entropy "]
for col in new_col:
    df[col] = (df[col]-df[col].mean())/df[col].std()

In [9]:
#this function takes two arguments w is the calculated weight and x is the row of the dataset for which the weight w is 
#calculated and returns the lable for the given weight and and the row 
def predict(w,x):
    if np.dot(w,x)<0:#here if the dot product is less 0 we predict -1 label else we predict 1 label
        return -1
    return 1

In [10]:
#this is for test/train split
new_df = df.sample(frac=1)#first we shuffle the whole dataset
train_size = int(0.8 * len(df))# this is size upon which we will split the entire dataset
train_set = new_df[:train_size].copy()# train set which will include the top 80% data
test_set = new_df[train_size:].copy()# test set which will include the  last 20% data

In [11]:
score = [] # this list will score the error values when we will test with the k-th held out set
global_weights = [] # this list will store the weights which was used for predicting the label

In [12]:
#this function stores the error in each iteration of the loop in the score list above and takes two arguments
# test is the unseen test set on which the error will be calculated and w is the learned weight of that fold 
def evaluation(test,weights):
    error = 0 # to get the total error for the current test set
    act = []# stores the actual labels
    pred = [] # stores the predicted labels
    for i,j in test.iterrows():# we iterate over every row
        x = j.tolist()# we make a list of the current row
        actual_y = int(x[-1]) # we get the actual label from the list above
        x = x[:-1] # now that we have taken the actual class label we will discard the last element to match the requirements
        pred_y = predict(weights,x) # here we will get the prediction for the learned weight and the current row
        act.append(actual_y)#storing the label of the current row
        pred.append(pred_y)#storing the predicted label for the the current row
    for k in range(len(pred)):
        error+=max(0,-act[k]*pred[k]) #in each iteration we calcuate the perceptron loss and add it to the local variable error
    score.append(error)#append the overall error in the score list 
    global_weights.append(weights) # stores the weights for which prediction was made

1.c

In [13]:
#k-fold cross validation takes two arguments as df = dataset for which cv is to performed and k is the number of folds
#this function will update the scores and the global weights above
def CV(df,k):
    folds = np.array_split(df.sample(frac=1), k)# we use the numpy.array_split() to divide the passed dataset df into k folds
    for i in folds:#iterate over every fold
        test = i # current fold(current small dataset) will be used for testing
        train = pd.DataFrame()#intilise a blank dataframe to later store the (k-1) folds for training
        for j in folds:
            if(j.equals(test)==False):#here we append the current dataset into train above if they dont match
                train = train.append(j,ignore_index=True)
        new_train = train.copy(deep=True)
        y = new_train['class'].to_list()#we take the last column class as a list in y
        new_train.drop(new_train.columns[len(new_train.columns)-1], axis=1, inplace=True)# drop the last column to 
                                                                                        # meet the requirements
        new_train.reset_index(inplace=True)#reset index because in the ssgd function index plays a significant role
        new_train.pop('index')#a index columns is added to the front of the dataset which is removed
        weights = ssgd(new_train,y,100,6)#here we learn the weight for the current train set
        evaluation(test,weights) #this function call will update the score anf global_wights list above for 
                                #the held out test set       

In [14]:
# CV funstion is called on the train_set(80% of the dataset) with k=10(it is recommedend for k=10 for larger datasets) 
#so we can get the score of the k folds , for every fold that we make we train on the other (k-1) folds to get the 
#weights after which we test on the held out fold for the current iteration to get the error and update the score list
CV(train_set,10)

  train = train.append(j,ignore_index=True)
  train = train.append(j,ignore_index=True)
  train = train.append(j,ignore_index=True)
  train = train.append(j,ignore_index=True)
  train = train.append(j,ignore_index=True)
  train = train.append(j,ignore_index=True)
  train = train.append(j,ignore_index=True)
  train = train.append(j,ignore_index=True)
  train = train.append(j,ignore_index=True)
  train = train.append(j,ignore_index=True)


In [15]:
# mean value of the error
print('Mean error on the train set is {}'.format(sum(score)/len(score)))

Mean error on the train set is 19.6


In [16]:
#index of the minimum value element of score list
min_idx = score.index(min(score))

In [17]:
#since the size of score and global_weights lists are same we can get for which weight we get the least error
optimal_weights = global_weights[min_idx]

1.d

In [18]:
# this function finds the f1 score on the held out set test_set(20% of the wohole dataset) and takes two arguments
# test which is the unseen data and the optimal weight of our best model(least error)
def calculate(test,weights):
    tp = 0 # for keeping track of true postive predictions
    fp = 0 # for keeping track of false postive predictions
    fn = 0 # for keeping track of false negative predictions
    for i,j in test.iterrows(): # we iterate through every row of the test set
        x = j.tolist() # take a row of the test_set
        actual_y = int(x[-1]) #take the label
        x = x[:-1] #remove the last element(class label)
        pred_y = predict(weights,x) # get the predicted label for the optimal weight and current row
        if actual_y == 1:
            if pred_y == 1:
                tp+=1  # update the tp value if the actual lable belongs to the positive class and the prediction 
                        #is also positive
            else:
                fp+=1 #update the fp value if the actual lable belongs to the positive class but the prediction is negative
        elif actual_y == -1 & pred_y == 1:
            fn+=1 # update the fn value if the actual lable belongs to the negative class but the prediction is positive
    p = tp/(tp+fp) # precision p is the number of correctly classified positive examples divided by the total number 
                   #of examples that are classified as positive
    r = tp/(tp+fn) # recall r is the number of correctly classified positive examples divided by the total number of 
                    #actual positive examples in the test set
    return(2*p*r/(p+r)) # harmonic mean of precision and recall will give us the accuracy of the model on the 
                        #dataset(held out test_set)            

In [21]:
#here we will get the f1-score , higher the f1-score better the classifier
print('The F1-score is {}'.format(calculate(test_set,optimal_weights)))

The F1-score is 0.7729468599033816
