In [2]:
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

In [3]:
# Step 1: Data Retrieval
df = pd.read_csv("https://raw.githubusercontent.com/ingledarshan/upGrad_Darshan/main/student_records.csv")
df

Unnamed: 0,Name,OverallGrade,Obedient,ResearchScore,ProjectScore,Recommend
0,Henry,A,Y,90,85,Yes
1,John,C,N,85,51,Yes
2,David,F,N,10,17,No
3,Holmes,B,Y,75,71,No
4,Marvin,E,N,20,30,No
5,Simon,A,Y,92,79,Yes
6,Robert,B,Y,60,59,No
7,Trent,C,Y,75,33,No


In [4]:
# Step 2: Data Preparation
# Cleans missing values, errors.
# But our dataset is already clea, so we will mark this step as DONE..

In [5]:
df.columns

Index(['Name', 'OverallGrade', 'Obedient', 'ResearchScore', 'ProjectScore',
       'Recommend'],
      dtype='object')

In [6]:
# Step 3: Feature Extraction and Engineering
# 'Name' is not important for predicting whether a student must be given Grant or not.

feature_names = ['OverallGrade', 'Obedient', 'ResearchScore', 'ProjectScore']
training_features = df[feature_names]

outcome_name = ['Recommend']
outcome_labels = df[outcome_name]

In [7]:
outcome_labels

Unnamed: 0,Recommend
0,Yes
1,Yes
2,No
3,No
4,No
5,Yes
6,No
7,No


In [8]:
training_features

Unnamed: 0,OverallGrade,Obedient,ResearchScore,ProjectScore
0,A,Y,90,85
1,C,N,85,51
2,F,N,10,17
3,B,Y,75,71
4,E,N,20,30
5,A,Y,92,79
6,B,Y,60,59
7,C,Y,75,33


In [9]:
training_features.columns

Index(['OverallGrade', 'Obedient', 'ResearchScore', 'ProjectScore'], dtype='object')

In [10]:
numeric_feature_names = ['ResearchScore', 'ProjectScore']
categorical_feature_names = ['OverallGrade', 'Obedient']

In [11]:
training_features[categorical_feature_names]

Unnamed: 0,OverallGrade,Obedient
0,A,Y
1,C,N
2,F,N
3,B,Y
4,E,N
5,A,Y
6,B,Y
7,C,Y


In [12]:
training_features[numeric_feature_names]

Unnamed: 0,ResearchScore,ProjectScore
0,90,85
1,85,51
2,10,17
3,75,71
4,20,30
5,92,79
6,60,59
7,75,33


In [13]:
training_features[numeric_feature_names].describe()

Unnamed: 0,ResearchScore,ProjectScore
count,8.0,8.0
mean,63.375,53.125
std,31.640559,24.752705
min,10.0,17.0
25%,50.0,32.25
50%,75.0,55.0
75%,86.25,73.0
max,92.0,85.0


In [14]:
researchscorerange = 92-10
researchscorerange

82

In [15]:
projectscorerange = 85-17
projectscorerange

68

In [16]:
training_features[numeric_feature_names]

Unnamed: 0,ResearchScore,ProjectScore
0,90,85
1,85,51
2,10,17
3,75,71
4,20,30
5,92,79
6,60,59
7,75,33


In [17]:
training_features[numeric_feature_names].mean()

ResearchScore    63.375
ProjectScore     53.125
dtype: float64

In [18]:
training_features[numeric_feature_names].std()

ResearchScore    31.640559
ProjectScore     24.752705
dtype: float64

In [19]:
(90 - 63.375) / 31.640559

0.8414832367531813

In [20]:
(85 - 63.375) / 31.640559

0.6834582157666683

In [21]:
np.round(((85 - 53.125) / 24.752705),6)

1.287738

In [22]:
import numpy as np
np.round(((51 - 53.125) / 24.752705),4)

-0.0858

<img src=https://cdn-images-1.medium.com/max/1600/0*PXGPVYIxyI_IEHP7. width="200" height="200">

In [23]:
from sklearn.preprocessing import StandardScaler

ss = StandardScaler()

ss.fit(training_features[numeric_feature_names]) # padh

training_features[numeric_feature_names] = ss.transform(training_features[numeric_feature_names]) # exam de de
training_features[numeric_feature_names]

Unnamed: 0,ResearchScore,ProjectScore
0,0.899583,1.37665
1,0.730648,-0.091777
2,-1.80339,-1.560203
3,0.392776,0.772004
4,-1.465519,-0.998746
5,0.967158,1.117516
6,-0.114032,0.253735
7,0.392776,-0.869179


In [24]:
training_features

Unnamed: 0,OverallGrade,Obedient,ResearchScore,ProjectScore
0,A,Y,0.899583,1.37665
1,C,N,0.730648,-0.091777
2,F,N,-1.80339,-1.560203
3,B,Y,0.392776,0.772004
4,E,N,-1.465519,-0.998746
5,A,Y,0.967158,1.117516
6,B,Y,-0.114032,0.253735
7,C,Y,0.392776,-0.869179


In [25]:
categorical_feature_names

['OverallGrade', 'Obedient']

In [26]:
# Engineering Categorical Features
# One Hot Encoding / Dummification -> get_deummies() of Pandas
# Why? Bcz ML model requires QB~AB in Numerical format and we have it in text format
# https://stackoverflow.com/questions/50176096/removing-redundant-columns-when-using-get-dummies

training_features = pd.get_dummies(training_features, columns=categorical_feature_names)
training_features
# This is our QB

Unnamed: 0,ResearchScore,ProjectScore,OverallGrade_A,OverallGrade_B,OverallGrade_C,OverallGrade_E,OverallGrade_F,Obedient_N,Obedient_Y
0,0.899583,1.37665,1,0,0,0,0,0,1
1,0.730648,-0.091777,0,0,1,0,0,1,0
2,-1.80339,-1.560203,0,0,0,0,1,1,0
3,0.392776,0.772004,0,1,0,0,0,0,1
4,-1.465519,-0.998746,0,0,0,1,0,1,0
5,0.967158,1.117516,1,0,0,0,0,0,1
6,-0.114032,0.253735,0,1,0,0,0,0,1
7,0.392776,-0.869179,0,0,1,0,0,0,1


In [27]:
outcome_labels
# This is our AB

Unnamed: 0,Recommend
0,Yes
1,Yes
2,No
3,No
4,No
5,Yes
6,No
7,No


In [28]:
type(outcome_labels)

pandas.core.frame.DataFrame

In [29]:
outcome_labels['Recommend']

0    Yes
1    Yes
2     No
3     No
4     No
5    Yes
6     No
7     No
Name: Recommend, dtype: object

In [30]:
set(training_features.columns)

# set() -> only keep unique values

{'Obedient_N',
 'Obedient_Y',
 'OverallGrade_A',
 'OverallGrade_B',
 'OverallGrade_C',
 'OverallGrade_E',
 'OverallGrade_F',
 'ProjectScore',
 'ResearchScore'}

In [31]:
set(numeric_feature_names)

{'ProjectScore', 'ResearchScore'}

In [32]:
# get the list of new categorical features

categorical_engineered_features = list(set(training_features.columns) - set(numeric_feature_names))
categorical_engineered_features

['OverallGrade_A',
 'OverallGrade_E',
 'OverallGrade_F',
 'Obedient_Y',
 'Obedient_N',
 'OverallGrade_C',
 'OverallGrade_B']

In [33]:
# Step 4: Modeling
# Logistic Regression

from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()

model = lr.fit(training_features, np.array(outcome_labels['Recommend']))

model

LogisticRegression()

In [34]:
# Step 5: Model Evaluation

pred_labels = model.predict(training_features)
actual_labels = np.array(outcome_labels['Recommend'])

In [35]:
pred_labels

array(['Yes', 'Yes', 'No', 'No', 'No', 'Yes', 'No', 'No'], dtype=object)

In [36]:
outcome_labels

Unnamed: 0,Recommend
0,Yes
1,Yes
2,No
3,No
4,No
5,Yes
6,No
7,No


In [37]:
from sklearn.metrics import accuracy_score, classification_report

In [38]:
accuracy_score(actual_labels, pred_labels)

1.0

In [39]:
print(classification_report(actual_labels, pred_labels))

              precision    recall  f1-score   support

          No       1.00      1.00      1.00         5
         Yes       1.00      1.00      1.00         3

    accuracy                           1.00         8
   macro avg       1.00      1.00      1.00         8
weighted avg       1.00      1.00      1.00         8



In [40]:
# Step 6: Deployment

import joblib
import os
if not os.path.exists("Model"):
    os.mkdir("Model")
if not os.path.exists("Scaler"):
    os.mkdir("Scaler")
    
joblib.dump(model, r'Model/model.pickle')
joblib.dump(ss, r'Scaler/scaler.pickle')

['Scaler/scaler.pickle']

# Client Side

In [41]:
# Step 7: Prediction in Action

import joblib
model_darshan = joblib.load(r'Model/model.pickle')
scaler_darshan = joblib.load(r'Scaler/scaler.pickle')

In [42]:
model_darshan

LogisticRegression()

In [43]:
scaler_darshan

StandardScaler()

In [44]:
df

Unnamed: 0,Name,OverallGrade,Obedient,ResearchScore,ProjectScore,Recommend
0,Henry,A,Y,90,85,Yes
1,John,C,N,85,51,Yes
2,David,F,N,10,17,No
3,Holmes,B,Y,75,71,No
4,Marvin,E,N,20,30,No
5,Simon,A,Y,92,79,Yes
6,Robert,B,Y,60,59,No
7,Trent,C,Y,75,33,No


In [45]:
new_data = pd.DataFrame([
    {'Name' : 'Ninad', 'OverallGrade' : 'F', 'Obedient' : 'N', 'ResearchScore' : 30, 'ProjectScore' : 20},
    {'Name' : 'Thomas', 'OverallGrade' : 'A', 'Obedient' : 'Y', 'ResearchScore' : 78, 'ProjectScore' : 80}
])

In [46]:
new_data

Unnamed: 0,Name,OverallGrade,Obedient,ResearchScore,ProjectScore
0,Ninad,F,N,30,20
1,Thomas,A,Y,78,80


In [47]:
categorical_engineered_features

['OverallGrade_A',
 'OverallGrade_E',
 'OverallGrade_F',
 'Obedient_Y',
 'Obedient_N',
 'OverallGrade_C',
 'OverallGrade_B']

In [48]:
prediction_features = new_data[feature_names]
prediction_features

Unnamed: 0,OverallGrade,Obedient,ResearchScore,ProjectScore
0,F,N,30,20
1,A,Y,78,80


In [49]:
prediction_features[numeric_feature_names]

Unnamed: 0,ResearchScore,ProjectScore
0,30,20
1,78,80


In [50]:
prediction_features[numeric_feature_names] = scaler_darshan.transform(prediction_features[numeric_feature_names])
prediction_features

Unnamed: 0,OverallGrade,Obedient,ResearchScore,ProjectScore
0,F,N,-1.127647,-1.430636
1,A,Y,0.494137,1.160705


In [51]:
prediction_features = pd.get_dummies(prediction_features, columns=categorical_feature_names)
prediction_features

Unnamed: 0,ResearchScore,ProjectScore,OverallGrade_A,OverallGrade_F,Obedient_N,Obedient_Y
0,-1.127647,-1.430636,0,1,1,0
1,0.494137,1.160705,1,0,0,1


In [52]:
training_features

Unnamed: 0,ResearchScore,ProjectScore,OverallGrade_A,OverallGrade_B,OverallGrade_C,OverallGrade_E,OverallGrade_F,Obedient_N,Obedient_Y
0,0.899583,1.37665,1,0,0,0,0,0,1
1,0.730648,-0.091777,0,0,1,0,0,1,0
2,-1.80339,-1.560203,0,0,0,0,1,1,0
3,0.392776,0.772004,0,1,0,0,0,0,1
4,-1.465519,-0.998746,0,0,0,1,0,1,0
5,0.967158,1.117516,1,0,0,0,0,0,1
6,-0.114032,0.253735,0,1,0,0,0,0,1
7,0.392776,-0.869179,0,0,1,0,0,0,1


In [53]:
set(prediction_features.columns)

{'Obedient_N',
 'Obedient_Y',
 'OverallGrade_A',
 'OverallGrade_F',
 'ProjectScore',
 'ResearchScore'}

In [54]:
set(numeric_feature_names)

{'ProjectScore', 'ResearchScore'}

In [55]:
categorical_engineered_features

['OverallGrade_A',
 'OverallGrade_E',
 'OverallGrade_F',
 'Obedient_Y',
 'Obedient_N',
 'OverallGrade_C',
 'OverallGrade_B']

In [56]:
current_categorical_engineered_features = set(prediction_features.columns) - set(numeric_feature_names)
current_categorical_engineered_features

{'Obedient_N', 'Obedient_Y', 'OverallGrade_A', 'OverallGrade_F'}

In [57]:
missing_features = set(categorical_engineered_features) - set(current_categorical_engineered_features)
missing_features

{'OverallGrade_B', 'OverallGrade_C', 'OverallGrade_E'}

In [58]:
prediction_features

Unnamed: 0,ResearchScore,ProjectScore,OverallGrade_A,OverallGrade_F,Obedient_N,Obedient_Y
0,-1.127647,-1.430636,0,1,1,0
1,0.494137,1.160705,1,0,0,1


In [59]:
len(prediction_features)

2

In [60]:
for feature in missing_features:
    prediction_features[feature] = 0 * len(prediction_features)

In [61]:
prediction_features

Unnamed: 0,ResearchScore,ProjectScore,OverallGrade_A,OverallGrade_F,Obedient_N,Obedient_Y,OverallGrade_C,OverallGrade_E,OverallGrade_B
0,-1.127647,-1.430636,0,1,1,0,0,0,0
1,0.494137,1.160705,1,0,0,1,0,0,0


In [62]:
predictions = model_darshan.predict(prediction_features)
predictions

array(['No', 'Yes'], dtype=object)

In [63]:
new_data

Unnamed: 0,Name,OverallGrade,Obedient,ResearchScore,ProjectScore
0,Ninad,F,N,30,20
1,Thomas,A,Y,78,80


In [64]:
new_data['Recommend'] = predictions
new_data

Unnamed: 0,Name,OverallGrade,Obedient,ResearchScore,ProjectScore,Recommend
0,Ninad,F,N,30,20,No
1,Thomas,A,Y,78,80,Yes


# Happy Learning