In [1]:
import pandas as pd
import numpy as np
from scipy import spatial
import seaborn as sns

In [2]:
def safe_div(x,y):
    if y == 0:
        return 0
    return x / y

In [3]:
def euclidean(v1,v2):
    ary = spatial.distance.cdist(v1,v2, metric='minkowski')
    return ary[0,0]

In [4]:
def distances(dataset,sample):
    dist = []
    l = len(dataset)
    for i in range(l):
        dist.append(euclidean(dataset.iloc[[i]],sample))
    return np.asarray(dist)

In [5]:
dataset = pd.read_csv("../input_data/AdmissionDataset/data.csv")

In [6]:
# Adding bias vector to the dataset with values 1

In [7]:
dataset.insert(loc=0, column='intercept', value=np.ones(len(dataset)))

In [8]:
def continious_to_categorical(data):
    median = data.median()
    temp = []
    for v in data:
        if v < median:
            temp.append(0)
        else:
            temp.append(1)
    return pd.Series(temp)

In [9]:
dataset['Chance of Admit '] = continious_to_categorical(dataset['Chance of Admit '])

In [10]:
dataset.head()

Unnamed: 0,intercept,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,1.0,242,317,103,2,2.5,2.0,8.15,0,0
1,1.0,334,319,108,3,3.0,3.5,8.54,1,0
2,1.0,4,322,110,3,3.5,2.5,8.67,1,1
3,1.0,45,326,113,5,4.5,4.0,9.4,1,1
4,1.0,232,319,106,3,3.5,2.5,8.33,1,1


In [11]:
def train_validate_test_split(dataset):
    size = len(dataset)
    tsize = int(size*0.6)
    vsize = int(size*0.8)
    training_data = dataset.iloc[:tsize].reset_index(drop=True)
    validation_data = dataset.iloc[tsize:vsize].reset_index(drop=True)
    testing_data = dataset.iloc[vsize:].reset_index(drop=True)
    return training_data,validation_data,testing_data

In [12]:
def predict(X,weights):
    values = np.dot(X,weights)
    for i in range(len(values)):
        if (1/(1+np.exp(-1 * values[i]))) > 0.5:
            values[i] = 1
        else:
            values[i] = 0
    return values

In [13]:
def meansquareerror(X,weights,y):
    preds = predict(X,weights)
    return ((preds - y)**2).mean()

In [14]:
def logisticRegressionGD(X,y,lr=0.001,iter=60):
    weights = np.random.rand(len(X.columns))
    for i in range(iter):
        preds = predict(X,weights)
        weights[0] = weights[0] - lr *(preds - y).mean()
        for j in range(1,len(X.columns)):
            temp = X.iloc[:, [j]].values
            x = np.reshape(temp,temp.shape[0])
            weights[j] = weights[j] - (lr *((preds - y)* x).mean())
#         print(cost)
    return weights

In [15]:
def confusionmatrix(preds,y,classes):
    n = len(preds)
    noc = len(classes)
    matrix = np.zeros((noc,noc))
    for i in range(n):
        r = classes.index(preds[i])
        c = classes.index(y[i])
        matrix[r][c] += 1
    return matrix

In [16]:
def stats(confusionmatrix,classes): 
    n = len(classes)
    precision = np.zeros(n)
    recall = np.zeros(n)
    f1 = np.zeros(n)
    colsums = confusionmatrix.sum(axis=0)
    rowsums = confusionmatrix.sum(axis=1)
    dval = 0
    for i in range(n):
        precision[i] = confusionmatrix[i,i]/colsums[i]
        recall[i] = confusionmatrix[i,i]/rowsums[i]
        f1[i] = safe_div(2,(safe_div(1,precision[i]))+safe_div(1,recall[i]))
        dval += confusionmatrix[i,i]
    return dval/np.sum(confusionmatrix)

In [17]:
training_data,validation_data,testing_data = train_validate_test_split(dataset)
X = training_data.drop('Chance of Admit ',axis=1)
y = training_data['Chance of Admit ']
# for costf in costfunctions:
weights= logisticRegressionGD(X,y)
print("Training Data Accuracy:")
preds = predict(training_data.drop('Chance of Admit ',axis = 1),weights)
cm = confusionmatrix(preds,training_data['Chance of Admit '],[0,1])
print(stats(cm,[0,1]))
print("\nValidation Data Accuracy")
preds = predict(validation_data.drop('Chance of Admit ',axis = 1),weights)
cm = confusionmatrix(preds,validation_data['Chance of Admit '],[0,1])
print(stats(cm,[0,1]))
print("\nTesting Data Accuracy")
preds = predict(testing_data.drop('Chance of Admit ',axis = 1),weights)
cm = confusionmatrix(preds,testing_data['Chance of Admit '],[0,1])
print(stats(cm,[0,1]))


Training Data Accuracy:
0.5370370370370371

Validation Data Accuracy
0.5222222222222223

Testing Data Accuracy
0.5222222222222223


  # This is added back by InteractiveShellApp.init_path()


In [18]:
def knn_algorithm(training_data,test_data,classes,k):
    ttrain = training_data[['intercept', 'Serial No.', 'GRE Score', 'TOEFL Score',
       'University Rating', 'SOP', 'LOR ', 'CGPA', 'Research']]
    ttest = test_data[['intercept', 'Serial No.', 'GRE Score', 'TOEFL Score',
       'University Rating', 'SOP', 'LOR ', 'CGPA', 'Research']]
    y = training_data['Chance of Admit ']
    pred = []
    for i in range(len(ttest)):
        pred.append(knn(ttrain,ttest.iloc[[i]],y,classes,k))
    return pred

In [19]:
def knn(dataset,sample,y,classes,k):
    dist = distances(dataset,sample)
    indices = dist.argsort()[:3]
    counts = np.zeros(len(classes))
    for i in indices:
        counts[classes.index(y.iloc[i])] += 1
    return classes[np.argmax(counts)]

In [20]:
preds = knn_algorithm(training_data,training_data,[0,1],3)
cm = confusionmatrix(preds,list(training_data['Chance of Admit ']),list(training_data['Chance of Admit '].unique()))
print("Training Data Stats")
stats(cm,training_data['Chance of Admit '].unique())

Training Data Stats


0.9407407407407408

In [21]:
preds = knn_algorithm(training_data,testing_data,[0,1],3)
cm = confusionmatrix(preds,list(testing_data['Chance of Admit ']),list(training_data['Chance of Admit '].unique()))
print("Testing Data Stats")
stats(cm,training_data['Chance of Admit '].unique())

Testing Data Stats


0.7777777777777778

In [22]:
preds = knn_algorithm(training_data,validation_data,[0,1],3)
cm = confusionmatrix(preds,list(validation_data['Chance of Admit ']),list(training_data['Chance of Admit '].unique()))
print("Validation Data Stats")
stats(cm,training_data['Chance of Admit '].unique())

Validation Data Stats


0.8444444444444444