# Let's Train Our Models

In [1]:
import sklearn
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns

from sklearn.model_selection import train_test_split

from sklearn.neighbors import KNeighborsRegressor as KNR
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV

# Get the Data

In [2]:
# Load the two admission CSVs; both paths are relative to the working directory.
# NOTE(review): the frames displayed below start with identical rows, so
# Admission_Predict.csv may overlap Admission_Predict_Ver1.1.csv -- confirm that
# `test_data` is genuinely unseen data before using it to judge the models.
training_data = pd.read_csv('./data_science/Admission_Predict_Ver1.1.csv')
test_data = pd.read_csv('./data_science/Admission_Predict.csv')

In [3]:
# Display the loaded training frame to inspect its columns and values.
training_data

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,1,337,118,4,4.5,4.5,9.65,1,0.92
1,2,324,107,4,4.0,4.5,8.87,1,0.76
2,3,316,104,3,3.0,3.5,8.00,1,0.72
3,4,322,110,3,3.5,2.5,8.67,1,0.80
4,5,314,103,2,2.0,3.0,8.21,0,0.65
...,...,...,...,...,...,...,...,...,...
495,496,332,108,5,4.5,4.0,9.02,1,0.87
496,497,337,117,5,5.0,5.0,9.87,1,0.96
497,498,330,120,5,4.5,5.0,9.56,1,0.93
498,499,312,103,4,4.0,5.0,8.43,0,0.73


In [4]:
# Display the loaded test frame to inspect its columns and values.
test_data

Unnamed: 0,Serial No.,GRE Score,TOEFL Score,University Rating,SOP,LOR,CGPA,Research,Chance of Admit
0,1,337,118,4,4.5,4.5,9.65,1,0.92
1,2,324,107,4,4.0,4.5,8.87,1,0.76
2,3,316,104,3,3.0,3.5,8.00,1,0.72
3,4,322,110,3,3.5,2.5,8.67,1,0.80
4,5,314,103,2,2.0,3.0,8.21,0,0.65
...,...,...,...,...,...,...,...,...,...
395,396,324,110,3,3.5,3.5,9.04,1,0.82
396,397,325,107,3,3.0,3.5,9.11,1,0.84
397,398,330,116,4,5.0,4.5,9.45,1,0.91
398,399,312,103,3,3.5,4.0,8.78,0,0.67


### Split the data

In [5]:
# Separate the target from the features. The column name really does end with a
# trailing space, so it is kept in one variable to avoid typos.
# NOTE(review): identifier-like columns such as 'Serial No.' (visible in the
# displayed frame) stay in X -- confirm they are meant to be used as features.
target_column = 'Chance of Admit '
y = training_data[target_column]
X = training_data.drop(columns=[target_column])

# shuffle=False keeps the original row order, so the hold-out set is the last
# 30% of the rows rather than a random sample.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    shuffle=False,
)

# Train our Models

From the previous notebook we know that KNR (K-Neighbors Regressor) and Lasso were the best-performing models, so these are the two models we will tune and train here...

Let's see what we get...

## Fit The KN Model

In [7]:
# Candidate values for each KNeighborsRegressor hyper-parameter that the
# randomised search will sample from.
n_neighbors = list(np.arange(1, 6))
leaf_size = list(np.arange(20, 200, 40))
weights = ['uniform', 'distance']
p = [1, 2]
algorithm = ['auto', 'ball_tree', 'kd_tree', 'brute']
metric = ['euclidean', 'manhattan', 'chebyshev', 'minkowski']

# Key order is kept as-is so the printed dictionary reads the same.
random_grid = {
    'n_neighbors': n_neighbors,
    'weights': weights,
    'p': p,
    'leaf_size': leaf_size,
    'algorithm': algorithm,
    'metric': metric,
}
print(random_grid)

{'n_neighbors': [1, 2, 3, 4, 5], 'weights': ['uniform', 'distance'], 'p': [1, 2], 'leaf_size': [20, 60, 100, 140, 180], 'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'], 'metric': ['euclidean', 'manhattan', 'chebyshev', 'minkowski']}


In [8]:
# Randomised hyper-parameter search over `random_grid` for the K-neighbours
# regressor: 100 sampled settings, each scored with 5-fold cross-validation.
# NOTE(review): the names `rf` / `rf_random` suggest a random forest, but the
# estimator is KNR; the names are kept because later cells use `rf_random`.
rf = KNR()
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    n_iter=100,
    cv=5,
    random_state=42,  # fixed seed so the same settings are sampled on re-run
    n_jobs=-1,
    verbose=2,
)
rf_random.fit(X_train, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done  29 tasks      | elapsed:   11.7s
[Parallel(n_jobs=-1)]: Done 462 tasks      | elapsed:   12.9s
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed:   13.1s finished


RandomizedSearchCV(cv=5, error_score=nan,
                   estimator=KNeighborsRegressor(algorithm='auto', leaf_size=30,
                                                 metric='minkowski',
                                                 metric_params=None,
                                                 n_jobs=None, n_neighbors=5,
                                                 p=2, weights='uniform'),
                   iid='deprecated', n_iter=100, n_jobs=-1,
                   param_distributions={'algorithm': ['auto', 'ball_tree',
                                                      'kd_tree', 'brute'],
                                        'leaf_size': [20, 60, 100, 140, 180],
                                        'metric': ['euclidean', 'manhattan',
                                                   'chebyshev', 'minkowski'],
                                        'n_neighbors': [1, 2, 3, 4, 5],
                                        'p': [1, 2],
                    

In [9]:
def evaluate(model, test_features, test_labels):
    """Print and return a MAPE-based "accuracy" score for a fitted regressor.

    Parameters
    ----------
    model : object
        Anything exposing ``predict(test_features)``.
    test_features : object
        Passed unchanged to ``model.predict``.
    test_labels : array-like
        True target values, one per prediction, in the same order.

    Returns
    -------
    float
        ``100 - MAPE``, where MAPE is the mean absolute percentage error
        expressed in percent.

    Notes
    -----
    Predictions and labels are flattened to plain float arrays before any
    arithmetic, so they are always compared position by position. Without
    this, a pandas-typed prediction would be aligned with the labels by index
    (all NaN when the indexes differ), and a column-shaped ``(n, 1)``
    prediction would broadcast against ``(n,)`` labels into an ``n x n`` matrix.

    A label equal to 0 makes the percentage error undefined (division by
    zero); such inputs yield ``inf``/``nan`` rather than raising.
    """
    predictions = np.asarray(model.predict(test_features), dtype=float).ravel()
    labels = np.asarray(test_labels, dtype=float).ravel()

    errors = np.abs(predictions - labels)
    mape = 100 * np.mean(errors / labels)
    accuracy = 100 - mape

    print('Model Performance')
    # NOTE(review): the "degrees" unit looks like a leftover from another use
    # of this helper; the text is kept unchanged so recorded output still
    # matches. Confirm the intended unit for the labels passed in.
    print('Average Error: {:0.4f} degrees.'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%.'.format(accuracy))

    return accuracy

In [10]:
# Score the tuned KNR search on the held-out part of the train/test split.
score = evaluate(rf_random, X_test, y_test)

Model Performance
Average Error: 0.0697 degrees.
Accuracy = 89.75%.


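As you can see, the tuned KNR model scores 89.75% on the held-out split. This score is 100 minus the mean absolute percentage error, so on average the predictions are off by about 10% of the true value, which should mean the model can predict with a good degree of confidence.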


## Predict Model -  KNR

In [11]:
# Split the separately loaded `test_data` frame into ground-truth answers and
# the feature columns fed to the model (the target name ends with a space).
testing_answer = test_data['Chance of Admit ']
testing = test_data.drop(columns=['Chance of Admit '])

In [12]:
# Take the feature values of row 100 as a plain Python list (the row is picked
# first, so only that row is converted).
try_this = testing.values[100].tolist()

In [14]:
# Turn the flat feature list into a single-row 2-D array, predict with the
# tuned KNR search, and report the result as a percentage.
single_row = np.array(try_this).reshape(1, -1)
answer = rf_random.predict(single_row)
chance_percent = round(answer[0] * 100, 2)
print("You have a", chance_percent, "% chance of entry")

You have a 71.0 % chance of entry


In [13]:
# True label of row 100, for comparison with the KNR prediction for that row.
testing_answer[100]

0.71

# Fit Lasso

Let's see what we get with Lasso

In [15]:
# NOTE(review): this cell rebuilds the K-neighbours search space from the KNR
# section. The Lasso search below uses its own `parameters` dict and does not
# read `random_grid`, so this looks like a copy-paste leftover -- confirm
# before removing.
n_neighbors = list(np.arange(1, 6))
weights = ['uniform', 'distance']
p = [1, 2]
leaf_size = list(np.arange(20, 200, 40))
algorithm = ['auto', 'ball_tree', 'kd_tree', 'brute']
metric = ['euclidean', 'manhattan', 'chebyshev', 'minkowski']

# Keyword order fixes the key order, so the printed dictionary is unchanged.
random_grid = dict(
    n_neighbors=n_neighbors,
    weights=weights,
    p=p,
    leaf_size=leaf_size,
    algorithm=algorithm,
    metric=metric,
)
print(random_grid)

{'n_neighbors': [1, 2, 3, 4, 5], 'weights': ['uniform', 'distance'], 'p': [1, 2], 'leaf_size': [20, 60, 100, 140, 180], 'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'], 'metric': ['euclidean', 'manhattan', 'chebyshev', 'minkowski']}


In [16]:
# Exhaustive grid search over the Lasso `alpha` values, scored by negative
# mean squared error with 5-fold cross-validation.
lasso = Lasso()
parameters = {
    'alpha': [1e-15, 1e-10, 1e-8, 1e-4, 1e-3, 1e-2, 1, 5, 10, 20],
}
lasso_regressor = GridSearchCV(
    lasso,
    parameters,
    scoring='neg_mean_squared_error',
    cv=5,
)
lasso_regressor.fit(X_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True,
                             max_iter=1000, normalize=False, positive=False,
                             precompute=False, random_state=None,
                             selection='cyclic', tol=0.0001, warm_start=False),
             iid='deprecated', n_jobs=None,
             param_grid={'alpha': [1e-15, 1e-10, 1e-08, 0.0001, 0.001, 0.01, 1,
                                   5, 10, 20]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='neg_mean_squared_error', verbose=0)

In [22]:
# Score the tuned Lasso search on the same held-out split used for KNR.
# Note: this reuses the name `score`, overwriting the KNR result stored earlier.
score = evaluate(lasso_regressor, X_test, y_test)

Model Performance
Average Error: 0.0565 degrees.
Accuracy = 90.86%.


From our accuracy checker, Lasso is better than KNR on this fitted data (90.86% vs. 89.75%), which is odd, as the analysis before showed that KNR was better...

My guess would be that it comes down to the number of iterations...


# Predict model - Lasso

In [18]:
# Rebuild the feature frame and ground-truth answers from `test_data`; this
# repeats the split done in the KNR prediction section, so the cell also works
# on its own.
testing_answer = test_data['Chance of Admit ']
testing = test_data.drop(columns=['Chance of Admit '])

In [19]:
# Take the feature values of row 211 as a plain Python list (the row is picked
# first, so only that row is converted).
try_this = testing.values[211].tolist()

In [20]:
# Turn the flat feature list into a single-row 2-D array, predict with the
# tuned Lasso search, and report the result as a percentage.
single_row = np.array(try_this).reshape(1, -1)
answer = lasso_regressor.predict(single_row)
chance_percent = round(answer[0] * 100, 2)
print("You have a", chance_percent, "% chance of entry")

You have a 84.55 % chance of entry


In [21]:
# True label of row 211, for comparison with the Lasso prediction for that row.
testing_answer[211]

0.82

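On this spot check Lasso is close (84.55% predicted vs. 82% actual) but not as exact as KNR was on its row (71.0% vs. 71%)... I am not sure why there is a discrepancy between the accuracy score and its ability to predict an individual row. Two caveats may explain it: the two models were checked on different rows (100 for KNR, 211 for Lasso), so a single-row comparison is not a reliable ranking; and if `test_data` overlaps the rows used for training, KNR may simply be returning the value of a neighbour it has already seen.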