# NumPy

**SAAD SHAIKH, 20070328**

In [43]:
# Importing the required Libraries

import numpy as np
from collections import Counter
import operator
import matplotlib.pyplot as plt
from math import *
import pandas as pd
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')

In [44]:
# available sampe data files
# classification: class1.csv, class2.csv
# regression: regr1.csv, regr2.csv

class_file_name = "class1.csv" #classification filename
regr_file_name  = "regr1.csv" #regression filename

The sample data is structured in the following structure:
$$
\begin{array}{cccc}
\vec{y} & \vec{x}_1 & \vec{x}_2 & \cdots \\ \hline 
y_1 & x_{11} & x_{12} & \cdots \\
y_2 & x_{21} & x_{22} & \cdots \\
y_3 & x_{31} & x_{32} & \cdots \\
\vdots & \vdots & \vdots & 
\end{array}
$$

where $\vec{y}$ is a column-vector of responses and $\vec{x}_1$, $\vec{x}_2$, $\ldots$ are column-vectors of predictors. For the classification problem $y_i$'s take integer values: 1, 2, 3, etc. For the regression problem $y_i$'s are real numbers.

**Important!** Your code must work well for any data file having this structure.

## Part 1 - KNN classification

### 1.1 KNN classification algorithm

In this section you should write a function ``knn_classify(test, train, k)`` that takes train and test data as numpy ndarrays, and a k-value as an integer, and returns the class-values of the test data.

In [45]:
class_file = pd.read_csv('class1.csv')
cols = ['x1', 'x2', 'y']
df = class_file[cols].sample(frac=1).values

In [46]:
reg_file_name = pd.read_csv('regr1.csv')
cols = ['x1', 'x2', 'y']
df_reg = reg_file_name[cols].sample(frac=1).values

In [47]:
# Defining Function for calculating Euclidean Distance

def euclidean_distance(x1,x2):
    distance = 0.0
    for i in range(len(x1)-1):
        distance += (x1[i]-x2[i])**2
    return sqrt(distance)

# Defining function to split the data into train, validation and test sets.

def data_split(X,train_size=0.6,test_size=0.2):
    val_size = 1-train_size-test_size
    train,val,test = X[:int(len(X)*train_size)],X[int(len(X)*train_size):int((len(X)*(train_size+val_size)))],X[int((len(X)*(train_size+val_size))):]
    return(train,val,test)

#Defining function to classify using KNN algorthim

def knn_classify(test,train,k):
    p = []
    for i in test: 
        D = []
        for idx,j in enumerate(train):    #iteration of data between train data
            dist = euclidean_distance(j, i)    #calculating Euclidean distance
            D.append([dist,j]) 
        distances = sorted(D, key=lambda x: x[0]) #sorting distances
        nb = [i[1] for i in D[:k]]
        labels = sorted(dict(Counter([i[-1] for i in nb])).items(),key=operator.itemgetter(1), reverse=True)[0][0]
        p.append(labels) #.append for the collection of lables
    return p
    

train,validation,test = data_split(X=df,train_size=0.6,test_size=0.4) #split the data as per train-60% and test-40%
y_pred = knn_classify(test,train,k=2) #calculating the predicted values
y_test = [i[-1] for i in test]

accuracy = accuracy_score(y_pred,y_test)
print("Accuracy of the KNN Classifier is : ",accuracy*100) #calculating the accuracy of predicted and actual

Accuracy of the KNN Classifier is :  66.25


### 1.2 Data Analysis

In this section you should read the data. Then split it randomly into train (60%), validation (20%), and test (20%) data. Use the train and validation data to find k-value giving the best classification result. Then use this k-value to classify the test data and report your findings: the k-value and the percentage of correct predictions.

In [48]:
# Splitting the data into train-60%, val-20% and test-20%

train,val,test = data_split(X=df,train_size=0.6,test_size=0.2)

accuracy_list = []
for i in range(1,20):  #iterating between 1 - 20 of k values
    y_pred = knn_classify(val,train,k=i) #Calling Knn_classify to predict the target variable
    y_valid = [i[-1] for i in val]
    accuracy  = accuracy_score(y_pred,y_valid)  #calculating the accuarcy
    accuracy_list.append(accuracy)  # appending all the accuracies
best_k_value = np.argsort(accuracy_list)[-1]+1  #sorting out from the accuarcies
print("The Best K value is:",best_k_value) #printing the values of Best k value

# calculating the test accuracy with the help of best_K_value obtained above

y_test_pred = knn_classify(test,train,k=best_k_value)
y_test = [i[-1] for i in test]
accuracy2 = accuracy_score(y_test_pred,y_test)
print('The accuracy of the tesi is',accuracy2*100)

The Best K value is: 19
The accuracy of the tesi is 62.5


## Part 2 - KNN and linear regression

### 2.1 KNN regression algorithm

In this section you should write a function ``knn_regression(train, test, k)`` that takes train and test data, and a k-value, and returns the regression (fitted) values of the responses of the test data.

In [49]:
#Defining the function to calculate Residual sum of squares

def RSS(y_pred,y_test):
    return(np.mean([(j-i)**2 for i,j in zip(y_pred,y_test)]))

#Defining the Function for KNN regression algorthim
def knn_regression(test,train,k):
    y_pred2 = []
    for i in test: #iterating between the test data
        distances2 = []
        for idx,j in enumerate(train): #iterating between train point
            dist = euclidean_distance(j, i) #calculating euclidean distance
            distances2.append([dist,j])
        distances2 = sorted(distances2, key=lambda x: x[0]) # sorting out by  distances
        neighbors = [i[1] for i in distances2[:k]] # as per given K value choosing neighbors
        lab = sorted(dict(Counter([i[-1] for i in neighbors])).items(),key=operator.itemgetter(1), reverse=True)[0][0]
        y_pred2.append(lab)
    return y_pred2

# Splitting the Data with Train 60% and Test 40%
train2,val2,test2 = data_split(X=df_reg,train_size=0.6,test_size=0.4)

y_pred2 = knn_regression(test,train,k=2) #taking a random K value
y_test2 = [i[-1] for i in test]
print("Residual sum of squares(RSS SCORE) using KNN Regressor : ", RSS(y_pred2,y_test)) #calculating RSS Score

Residual sum of squares(RSS SCORE) using KNN Regressor :  0.075


### 2.2 Linear regression algorithm

In this section you should write a function ``linear_regression(train, test)`` that takes train and test data, and returns linear regression (fitted) values of the responses of the test data. The column-vector of regression values $\hat{\vec y}$ should be computed using this formula:

$$
\hat{\vec y} = X^{(test)} \hat{\vec \beta} 
$$

where 

- $X^{(test)}$ is the test design matrix obtained by stacking together a column of 1's with columns of predictors variables from the test data:

$$
X^{(test)} = \begin{bmatrix} 1 & \vec x^{(test)}_1 & \vec x^{(test)}_2 & \cdots \end{bmatrix} = \begin{bmatrix} 
1 & x^{(test)}_{11} & x^{(test)}_{12} & \cdots \\ 
1 & x^{(test)}_{21} & x^{(test)}_{22} & \cdots \\
\vdots & \vdots & \vdots \\
1 & x^{(test)}_{m1} & x^{(test)}_{m2} & \cdots\end{bmatrix}
$$

- $\hat{\vec \beta}$ is a column vector of least-squares estimates of the regression coefficients:

$$
\hat{\vec \beta} = \big((X^{(train)})^T X^{(train)} \big)^{-1} (X^{(train)})^T \vec y^{(train)}
$$
 
- $X^{(train)}$ is the design matrix for the train data:

$$
X^{(train)} = \begin{bmatrix} 1 & \vec x^{(train)}_1 & \vec x^{(train)}_2 & \cdots \end{bmatrix} = \begin{bmatrix} 
1 & x^{(train)}_{11} & x^{(train)}_{12} & \cdots \\ 
1 & x^{(train)}_{21} & x^{(train)}_{22} & \cdots \\
\vdots & \vdots & \vdots \\
1 & x^{(train)}_{n1} & x^{(train)}_{n2} & \cdots\end{bmatrix}
$$

- $m$ is the number of rows of the test data

- $n$ is the number of rows of the train data

- $\vec y^{(train)}$ is a column-vector of response values of the train data 

In [50]:
#Defining the function for cost function

def cost_f(X, y, P):
    return (np.sum(X.dot(P)-y)**2)/(2*(len(y)))

# Defining Gradient Function for better coefficients

def batch_gradient(X,y,P,learning_rate,iterations):
    cost_hist = list()
    for iteration in range(iterations):
        loss_values = (X.dot(P)-y) # loss clculation
        gradients = ((X.T.dot(loss_values)) / (len(y))) #calculation of the gradients
        P = P - (learning_rate* gradients) #modification of the existing coefficients
        cost_value = cost_f(X, y, P) # calling out cost function for caculating the cost as per X,y and modfied P coefficients
        cost_hist.append(cost_value) # appending the cost value
    return P,cost_hist  # returning the Best coefficents value and Cost value


#Defining the function for  Linear regression 

def Linear_Reg(train, test,iterations=2000, learning_rate =0.005):
    X_train, y_train = train[:,:-1],train[:,-1] #  train data
    X_test, y_test = test[:,:-1],test[:,-1] #test data
    P = np.zeros(X_train.shape[1]) # generating random coefficients
    modif_p,cost = batch_gradient(X_train, y_train, P,learning_rate=learning_rate,iterations=iterations)
    y_pred = X_test.dot(modif_p)  # predicting the test/validation data
    return y_pred,modif_p

#calculations of the predictions with  train-60% and test-40% data 

y_pred,best_p = Linear_Reg(train, test)
y_test = [i[-1] for i in test]

#Metrics choosen is Residual sum of squares
print("RSS (Residual sum of scores) SCORE : ",RSS(y_pred,y_test))

RSS (Residual sum of scores) SCORE :  0.6377750243204184


### 2.3 Data Analysis

In this section you should read the data. Then split it randomly into train (60%), validation (20%), and test (20%) data. Use the train and validation data to find k-value giving the best knn regression result. Then use this k-value to conduct knn regression on the test data and report your findings: the k-value and the [residual sum of squares](https://en.wikipedia.org/wiki/Residual_sum_of_squares): $RSS = \sum_{i=1}^m (\hat{y}_i - y_i)^2$ where $\hat{y}_i$ are predicted values, $y_i$ are observed values, and $m$ is the number of observations in your test data. Then repeat the last step using the linear regression approach. Finally, compare the the two RSS values your have obtained. Which algorithm, knn or linear regression, gives a better result?

In [51]:
#Splitting the dataset according to train-60%,val-20% and test-20%

train,val,test = data_split(X=df_reg,train_size=0.60,test_size=0.2)


residual = []
for i in range(1,20):
    y_pred = knn_regression(val,train,k=i)
    y_values = [i[-1] for i in val]
    r  = RSS(y_pred,y_values) #calculating RSS 
    print(f"RSS for K- {i} : ", RSS(y_pred,y_test)) 
    residual.append(r)# appending the RSS value
best_k_value2 = np.argsort(residual)[0]+1 # taking the postion of loww RSS value
print('-'*20)
print("Best K value is :",best_k_value2) 



RSS for K- 1 :  0.8182400000000001
RSS for K- 2 :  0.8182400000000001
RSS for K- 3 :  0.8182400000000001
RSS for K- 4 :  0.8226275000000001
RSS for K- 5 :  0.7896675
RSS for K- 6 :  0.80395
RSS for K- 7 :  0.8738949999999999
RSS for K- 8 :  0.8533125
RSS for K- 9 :  0.8127675
RSS for K- 10 :  0.8127675
RSS for K- 11 :  0.8009774999999999
RSS for K- 12 :  0.8016174999999999
RSS for K- 13 :  0.748045
RSS for K- 14 :  0.7017325
RSS for K- 15 :  0.6710974999999999
RSS for K- 16 :  0.668635
RSS for K- 17 :  0.668845
RSS for K- 18 :  0.6639225000000001
RSS for K- 19 :  0.6639225000000001
--------------------
Best K value is : 16


In [52]:
 # predicting the test using KNN regression

y_test_predict = knn_regression(test,train,k=best_k_value)
y_test = [i[-1] for i in test]
print("Residual sum of squares(RSS Score) for KNN : ",RSS(y_test_predict,y_test))

Residual sum of squares(RSS Score) for KNN :  0.19098375


In [53]:
learning_rate = [0.3, 0.2, 0.1, 0.01, 0.001 ]

In [54]:
for lr in learning_rate:
    y_hat,best_b = Linear_Reg(train, test, learning_rate = lr)
    y_test = [i[-1] for i in test]
    print("RSS SCORE IS : ",RSS(y_hat,y_test))

RSS SCORE IS :  0.12738081313695132
RSS SCORE IS :  0.1273808131369511
RSS SCORE IS :  0.12738080968898602
RSS SCORE IS :  0.13381289330167426
RSS SCORE IS :  0.3988348272720893


The RSS (Residual sum of squares) value for Linear Regression is almost constant across all learning rate except the last one. After seeing the output, the knn algorithm beats the linear regression algorithm.

---