# Setting Jupyter Notebook Theme

In [1]:
!pip install jupyterthemes



In [2]:
!jt -l

Available Themes: 
   chesterish
   grade3
   gruvboxd
   gruvboxl
   monokai
   oceans16
   onedork
   solarizedd
   solarizedl


In [3]:
!jt -monokai

In [4]:
from jupyterthemes import get_themes
from jupyterthemes.stylefx import set_nb_theme

In [5]:
set_nb_theme('solarizedd')

In [6]:
print("\n\n")






# Import Libraries

In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
%matplotlib inline

# I use scikit-learn library to split data into testing and training dataset (which I plan will use 80% of the dataset as a training dataset and the rest is testing dataset (20%)), reduce data dimension using PCA so that KNN and SVM will classify data very efficiently, and calculate students performance prediction score later on using various Supervised Learning algorithms, which I'll specify below:
import sklearn
from sklearn.model_selection import train_test_split # to split data into train and test data
from sklearn.preprocessing import StandardScaler, normalize # for data standard-scaling and normalization
from sklearn.decomposition import PCA # to reduce data dimension into 2D data
from sklearn.linear_model import LogisticRegression, Perceptron, SGDClassifier # Logistic Regression, Perceptron, Stochastic Gradient Descent
from sklearn.neighbors import KNeighborsClassifier # K-Nearest Neighbours
from sklearn.naive_bayes import GaussianNB # Naive Bayes
from sklearn.neural_network import MLPClassifier # Neural Network
from sklearn.tree import DecisionTreeClassifier # Decision Tree
from sklearn.svm import SVC, LinearSVC # Support Vector Machine, Linear Support Vector Machine
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, BaggingClassifier, AdaBoostClassifier # Random Forest, Gradient Boosting, Bagging, Ada Boost 
from sklearn.metrics import classification_report, confusion_matrix # Confusion Matrix & Classification Report

In [8]:
print("\n")





# Data Overview

In [9]:
student_performance = pd.read_csv("StudentsPerformance.csv")
student_performance

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75
...,...,...,...,...,...,...,...,...
995,female,group E,master's degree,standard,completed,88,99,95
996,male,group C,high school,free/reduced,none,62,55,55
997,female,group C,high school,free/reduced,completed,59,71,65
998,female,group D,some college,standard,completed,68,78,77


In [10]:
student_performance.head(50)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75
5,female,group B,associate's degree,standard,none,71,83,78
6,female,group B,some college,standard,completed,88,95,92
7,male,group B,some college,free/reduced,none,40,43,39
8,male,group D,high school,free/reduced,completed,64,64,67
9,female,group B,high school,free/reduced,none,38,60,50


In [11]:
student_performance.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   gender                       1000 non-null   object
 1   race/ethnicity               1000 non-null   object
 2   parental level of education  1000 non-null   object
 3   lunch                        1000 non-null   object
 4   test preparation course      1000 non-null   object
 5   math score                   1000 non-null   int64 
 6   reading score                1000 non-null   int64 
 7   writing score                1000 non-null   int64 
dtypes: int64(3), object(5)
memory usage: 62.6+ KB


In [12]:
student_performance.describe()

Unnamed: 0,math score,reading score,writing score
count,1000.0,1000.0,1000.0
mean,66.089,69.169,68.054
std,15.16308,14.600192,15.195657
min,0.0,17.0,10.0
25%,57.0,59.0,57.75
50%,66.0,70.0,69.0
75%,77.0,79.0,79.0
max,100.0,100.0,100.0


In [13]:
student_performance.shape

(1000, 8)

from the dataframe shape, we have eight independent variables (x) we need to consider

In [14]:
print("\n")





to see how many categories are there in each column and the quantity of values each categories have

In [15]:
student_performance["gender"].value_counts()

female    518
male      482
Name: gender, dtype: int64

In [16]:
student_performance["race/ethnicity"].value_counts()

group C    319
group D    262
group B    190
group E    140
group A     89
Name: race/ethnicity, dtype: int64

In [17]:
student_performance["parental level of education"].value_counts()

some college          226
associate's degree    222
high school           196
some high school      179
bachelor's degree     118
master's degree        59
Name: parental level of education, dtype: int64

In [18]:
student_performance["lunch"].value_counts()

standard        645
free/reduced    355
Name: lunch, dtype: int64

In [19]:
student_performance["test preparation course"].value_counts()

none         642
completed    358
Name: test preparation course, dtype: int64

In [20]:
student_performance["math score"].value_counts()

65    36
62    35
69    32
59    32
61    27
      ..
24     1
26     1
28     1
33     1
0      1
Name: math score, Length: 81, dtype: int64

In [21]:
student_performance["reading score"].value_counts()

72    34
74    33
64    32
67    30
73    30
      ..
26     1
32     1
40     1
23     1
17     1
Name: reading score, Length: 72, dtype: int64

In [22]:
student_performance["writing score"].value_counts()

74    35
70    33
68    31
73    28
80    27
      ..
23     1
28     1
35     1
15     1
10     1
Name: writing score, Length: 77, dtype: int64

In [23]:
print("\n")





# Data Cleaning

In [24]:
student_performance.isnull().sum() # check if there are any missing values in the dataset

gender                         0
race/ethnicity                 0
parental level of education    0
lunch                          0
test preparation course        0
math score                     0
reading score                  0
writing score                  0
dtype: int64

since there are no missing values in the dataset, we don't have to do any data cleaning

In [25]:
print("\n")





# Feature Engineering

### Gender

In [26]:
student_performance["gender"].value_counts() # to check how many categories are there and the frequency of each category in the Gender column in the dataset 

female    518
male      482
Name: gender, dtype: int64

From the result above, there are two categories in the Gender column: female and male.

Therefore, I'll categorize female to Category 0 and male to Category 1 in the Gender column.

I start the category from number 0 (the smallest number in positive integer) since it'll give a better prediction score later on when using Supervised Learning algorithms (Logistic Regression, Decision Tree, Neural Network, KNN, Naive Bayes, Support Vector Machine, Linear Support Vector Machine, Random Forest, Gradient Boosting, Ada Boost, Bagging) than starting the category from number 1.

In [27]:
student_dataset = [student_performance]

for data in student_dataset:
    data["gender"] = data["gender"].map({"female": 0, "male": 1}).astype(int)

In [28]:
student_performance.head(10) # check whether female and male in Gender column has been categorized or not

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,0,group B,bachelor's degree,standard,none,72,72,74
1,0,group C,some college,standard,completed,69,90,88
2,0,group B,master's degree,standard,none,90,95,93
3,1,group A,associate's degree,free/reduced,none,47,57,44
4,1,group C,some college,standard,none,76,78,75
5,0,group B,associate's degree,standard,none,71,83,78
6,0,group B,some college,standard,completed,88,95,92
7,1,group B,some college,free/reduced,none,40,43,39
8,1,group D,high school,free/reduced,completed,64,64,67
9,0,group B,high school,free/reduced,none,38,60,50


In [29]:
student_performance["gender"].value_counts() # check whether female and male in Gender column has been categorized or not

0    518
1    482
Name: gender, dtype: int64

from the dataframe result frequency of each category in the Gender column above, we can see that values in gender column has been integer, which means female and male in Gender column have been categorized with Category 0 as Female and Category 1 as Male

In [30]:
print("\n")





### Race / Ethnicity

In [31]:
student_performance["race/ethnicity"].value_counts() # to check how many categories are there and the frequency of each category in the Race / Ethnicity column in the dataset 

group C    319
group D    262
group B    190
group E    140
group A     89
Name: race/ethnicity, dtype: int64

From the result above, there are five categories in the Race / Ethnicity column: Group C which has the biggest frequency, group D, group B, group E, and group A which has the smallest frequency.

Therefore, I'll categorize group C to Category 0, group D to Category 1, group B to Category 2, group E to Category 3, and group A to Category 4 in the Race / Ethnicity column.

I start the category by 0 (the smallest number in positive integer) since it'll give a better prediction score later on when using Supervised Learning algorithms (Logistic Regression, Decision Tree, Neural Network, KNN, Naive Bayes, Support Vector Machine, Linear Support Vector Machine, Random Forest, Gradient Boosting, Ada Boost, Bagging) than starting the category by 1.

In [32]:
for data in student_dataset:
    data["race/ethnicity"] = data["race/ethnicity"].map({"group C": 0, "group D": 1, "group B": 2, "group E": 3, "group A": 4}).astype(int)

In [33]:
student_performance.head(10) # check whether group C, D, B, E, and A is categorized or not in race/ethnicity column

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,0,2,bachelor's degree,standard,none,72,72,74
1,0,0,some college,standard,completed,69,90,88
2,0,2,master's degree,standard,none,90,95,93
3,1,4,associate's degree,free/reduced,none,47,57,44
4,1,0,some college,standard,none,76,78,75
5,0,2,associate's degree,standard,none,71,83,78
6,0,2,some college,standard,completed,88,95,92
7,1,2,some college,free/reduced,none,40,43,39
8,1,1,high school,free/reduced,completed,64,64,67
9,0,2,high school,free/reduced,none,38,60,50


In [34]:
student_performance["race/ethnicity"].value_counts() # check whether group C, D, B, E, and A is categorized or not in race/ethnicity column

0    319
1    262
2    190
3    140
4     89
Name: race/ethnicity, dtype: int64

from the dataframe result frequency of each category in the Race / Ethnicity column above, we can see values in integer data type (dtype: int64), which means group C, B, D, E, and A have been categorized in race/ethnicity column

In [35]:
print("\n")





### Parental Level of Education

In [36]:
student_performance["parental level of education"].value_counts()

some college          226
associate's degree    222
high school           196
some high school      179
bachelor's degree     118
master's degree        59
Name: parental level of education, dtype: int64

From the result above, there are six categories in the Parental Level of Education column: some college which has the biggest frequency, associate's degree, high school, some high school, bachelor's degree, and master's degree which has the smallest frequency.

Therefore, I'll categorize some college to Category 0, associate's degree to Category 1, high school to Category 2, some high school to Category 3, bachelor's degree to Category 4, and master's degree to Category 5 in the Parental Level of Education column.

I start the category by 0 (the smallest number in positive integer) since it'll give a better prediction score later on when using Supervised Learning algorithms (Logistic Regression, Decision Tree, Neural Network, KNN, Naive Bayes, Support Vector Machine, Linear Support Vector Machine, Random Forest, Gradient Boosting, Ada Boost, Bagging) than starting the category by 1.

In [37]:
for data in student_dataset:
    data["parental level of education"] = data["parental level of education"].map({"some college": 0, "associate's degree": 1, "high school": 2, "some high school": 3, "bachelor's degree": 4, "master's degree": 5}).astype(int)

In [38]:
student_performance.head(10) # check whether parental level of education is categorized or not

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,0,2,4,standard,none,72,72,74
1,0,0,0,standard,completed,69,90,88
2,0,2,5,standard,none,90,95,93
3,1,4,1,free/reduced,none,47,57,44
4,1,0,0,standard,none,76,78,75
5,0,2,1,standard,none,71,83,78
6,0,2,0,standard,completed,88,95,92
7,1,2,0,free/reduced,none,40,43,39
8,1,1,2,free/reduced,completed,64,64,67
9,0,2,2,free/reduced,none,38,60,50


In [39]:
student_performance["parental level of education"].value_counts() # check whether parental level of education is categorized or not

0    226
1    222
2    196
3    179
4    118
5     59
Name: parental level of education, dtype: int64

from the dataframe result and frequency of each category in the Parental Level of Education column above, we can see values in integer data type (dtype: int64), which means the parental levels of education have been categorized in Parental Level of Education column

In [40]:
print("\n")





### Lunch

In [41]:
student_performance["lunch"].value_counts()

standard        645
free/reduced    355
Name: lunch, dtype: int64

From the result above, there are two categories in the Lunch column: standard which has the biggest frequency and free/reduced which has the smallest frequency.

Therefore, I'll categorize standard to Category 0 and free/reduced to Category 1 in the Lunch column.

I start the category by 0 (the smallest number in positive integer) since it'll give a better prediction score later on when using Supervised Learning algorithms (Logistic Regression, Decision Tree, Neural Network, KNN, Naive Bayes, Support Vector Machine, Linear Support Vector Machine, Random Forest, Gradient Boosting, Ada Boost, Bagging) than starting the category by 1.

In [42]:
for data in student_dataset:
    data["lunch"] = data["lunch"].map({"standard": 0, "free/reduced": 1}).astype(int)

In [43]:
student_performance.head(10) # check whether standard and free/reduced is categorized or not

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,0,2,4,0,none,72,72,74
1,0,0,0,0,completed,69,90,88
2,0,2,5,0,none,90,95,93
3,1,4,1,1,none,47,57,44
4,1,0,0,0,none,76,78,75
5,0,2,1,0,none,71,83,78
6,0,2,0,0,completed,88,95,92
7,1,2,0,1,none,40,43,39
8,1,1,2,1,completed,64,64,67
9,0,2,2,1,none,38,60,50


In [44]:
student_performance["lunch"].value_counts() # check whether standard and free/reduced is categorized or not

0    645
1    355
Name: lunch, dtype: int64

from the dataframe result and frequency of each category in Lunch column above, we can see values in integer data type (dtype: int64), which means string values have been categorized in Lunch column

In [45]:
print("\n")





### Test Preparation Course

In [46]:
student_performance["test preparation course"].value_counts()

none         642
completed    358
Name: test preparation course, dtype: int64

From the result above, there are two categories in the Test Preparation Course column: none which has the biggest frequency and completed which has the smallest frequency.

Therefore, I'll categorize none to Category 0 and completed to Category 1 in the Test Preparation Course column.

I start the category by 0 (the smallest number in positive integer) since it'll give a better prediction score later on when using Supervised Learning algorithms (Logistic Regression, Decision Tree, Neural Network, KNN, Naive Bayes, Support Vector Machine, Linear Support Vector Machine, Random Forest, Gradient Boosting, Ada Boost, Bagging) than starting the category by 1.

In [47]:
student_performance["test preparation course"] = student_performance["test preparation course"].map({"none": 0, "completed": 1}).astype(int)

In [48]:
student_performance.head(10) # check whether test preparation course is categorized or not

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,0,2,4,0,0,72,72,74
1,0,0,0,0,1,69,90,88
2,0,2,5,0,0,90,95,93
3,1,4,1,1,0,47,57,44
4,1,0,0,0,0,76,78,75
5,0,2,1,0,0,71,83,78
6,0,2,0,0,1,88,95,92
7,1,2,0,1,0,40,43,39
8,1,1,2,1,1,64,64,67
9,0,2,2,1,0,38,60,50


In [49]:
student_performance["test preparation course"].value_counts() # check whether test preparation course is categorized or not

0    642
1    358
Name: test preparation course, dtype: int64

from the dataframe result and frequency of each category in Lunch column above, we can see values in integer data type (dtype: int64), which means string values have been categorized in Lunch column

In [50]:
print("\n")





### Exam Scores (Math, Reading, Writing)

In [51]:
student_performance["math score"].value_counts()

65    36
62    35
69    32
59    32
61    27
      ..
24     1
26     1
28     1
33     1
0      1
Name: math score, Length: 81, dtype: int64

In [52]:
student_performance["reading score"].value_counts()

72    34
74    33
64    32
67    30
73    30
      ..
26     1
32     1
40     1
23     1
17     1
Name: reading score, Length: 72, dtype: int64

In [53]:
student_performance["writing score"].value_counts()

74    35
70    33
68    31
73    28
80    27
      ..
23     1
28     1
35     1
15     1
10     1
Name: writing score, Length: 77, dtype: int64

In [54]:
scores = ["math score", "reading score", "writing score"]

#### Grade Categorization

In [55]:
# A: 90 - 100: Category 8
# A-: 85 - 89: Category 7
# B+: 80 - 84: Category 6
# B: 75 - 79: Category 5
# B-: 70 - 74: Category 4
# C: 65 - 69: Category 3
# D: 50 - 64: Category 2
# E: 0 - 49: Category 1
# F: 0: Category 0

In [56]:
# led

In [57]:
student_performance["grades"] = (student_performance["math score"] + student_performance["writing score"] + student_performance["reading score"]) / 3

In [58]:
student_performance.head(10)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,grades
0,0,2,4,0,0,72,72,74,72.666667
1,0,0,0,0,1,69,90,88,82.333333
2,0,2,5,0,0,90,95,93,92.666667
3,1,4,1,1,0,47,57,44,49.333333
4,1,0,0,0,0,76,78,75,76.333333
5,0,2,1,0,0,71,83,78,77.333333
6,0,2,0,0,1,88,95,92,91.666667
7,1,2,0,1,0,40,43,39,40.666667
8,1,1,2,1,1,64,64,67,65.0
9,0,2,2,1,0,38,60,50,49.333333


In [59]:
student_list = [student_performance]
for data in student_list:
    data.loc[data["grades"] == 0, "grades"] = 0
    data.loc[(data["grades"] > 0) & (data["grades"] < 50), "grades"] = 1
    data.loc[(data["grades"] >= 50) & (data["grades"] < 65), "grades"] = 2
    data.loc[(data["grades"] >= 65) & (data["grades"] < 70), "grades"] = 3
    data.loc[(data["grades"] >= 70) & (data["grades"] < 75), "grades"] = 4
    data.loc[(data["grades"] >= 75) & (data["grades"] < 80), "grades"] = 5
    data.loc[(data["grades"] >= 80) & (data["grades"] < 85), "grades"] = 6
    data.loc[(data["grades"] >= 85) & (data["grades"] < 90), "grades"] = 7
    data.loc[(data["grades"] >= 90) & (data["grades"] <= 100), "grades"] = 7

In [60]:
"""
for index in scores:
    student_performance[index + "_RANGE"] = 0
    student_performance.loc[student_performance["math score"] == 0, index] = 0
    student_performance.loc[(student_performance["math score"] > 0) & (student_performance["math score"] <= 49), index] = 1
    student_performance.loc[(student_performance["math score"] >= 50) & (student_performance["math score"] <= 64), index] = 2
    student_performance.loc[(student_performance["math score"] >= 65) & (student_performance["math score"] <= 69), index] = 3
    student_performance.loc[(student_performance["math score"] >= 70) & (student_performance["math score"] <= 74), index] = 4
    student_performance.loc[(student_performance["math score"] >= 75) & (student_performance["math score"] <= 79), index] = 5
    student_performance.loc[(student_performance["math score"] >= 80) & (student_performance["math score"] <= 84), index] = 6
    student_performance.loc[(student_performance["math score"] >= 85) & (student_performance["math score"] <= 89), index] = 7
    student_performance.loc[(student_performance["math score"] >= 90) & (student_performance["math score"] <= 100), index] = 8
"""

'\nfor index in scores:\n    student_performance[index + "_RANGE"] = 0\n    student_performance.loc[student_performance["math score"] == 0, index] = 0\n    student_performance.loc[(student_performance["math score"] > 0) & (student_performance["math score"] <= 49), index] = 1\n    student_performance.loc[(student_performance["math score"] >= 50) & (student_performance["math score"] <= 64), index] = 2\n    student_performance.loc[(student_performance["math score"] >= 65) & (student_performance["math score"] <= 69), index] = 3\n    student_performance.loc[(student_performance["math score"] >= 70) & (student_performance["math score"] <= 74), index] = 4\n    student_performance.loc[(student_performance["math score"] >= 75) & (student_performance["math score"] <= 79), index] = 5\n    student_performance.loc[(student_performance["math score"] >= 80) & (student_performance["math score"] <= 84), index] = 6\n    student_performance.loc[(student_performance["math score"] >= 85) & (student_perform

In [61]:
"""
for index in scores:
    if (student_performance[index] == 0):
        student_performance["grades"] = 0 # Grade F
    elif ((student_performance[index] > 0) & (student_performance[index] <= 49)):
        student_performance["grades"] = 1 # Grade E
    elif ((student_performance[index] >= 50) & (student_performance[index] <= 64)):
        student_performance["grades"] = 2 # Grade D
    elif ((student_performance[index] >= 65) & (student_performance[index] <= 69)):
        student_performance["grades"] = 3 # Grade C
    elif ((student_performance[index] >= 70) & (student_performance[index] <= 74)):
        student_performance["grades"] = 4 # Grade B-
    elif ((student_performance[index] >= 75) & (student_performance[index] <= 79)):
        student_performance["grades"] = 5 # Grade B
    elif ((student_performance[index] >= 80) & (student_performance[index] <= 84)):
        student_performance["grades"] = 6 # Grade B+
    elif ((student_performance[index] >= 85) & (student_performance.all[index] <= 89)):
        student_performance["grades"] = 7 # Grade A-
    elif ((student_performance.all[index] >= 90) & (student_performance.all[index] <= 100)):
        student_performance["grades"] = 8 # Grade A
    """

'\nfor index in scores:\n    if (student_performance[index] == 0):\n        student_performance["grades"] = 0 # Grade F\n    elif ((student_performance[index] > 0) & (student_performance[index] <= 49)):\n        student_performance["grades"] = 1 # Grade E\n    elif ((student_performance[index] >= 50) & (student_performance[index] <= 64)):\n        student_performance["grades"] = 2 # Grade D\n    elif ((student_performance[index] >= 65) & (student_performance[index] <= 69)):\n        student_performance["grades"] = 3 # Grade C\n    elif ((student_performance[index] >= 70) & (student_performance[index] <= 74)):\n        student_performance["grades"] = 4 # Grade B-\n    elif ((student_performance[index] >= 75) & (student_performance[index] <= 79)):\n        student_performance["grades"] = 5 # Grade B\n    elif ((student_performance[index] >= 80) & (student_performance[index] <= 84)):\n        student_performance["grades"] = 6 # Grade B+\n    elif ((student_performance[index] >= 85) & (stu

In [62]:
student_performance["grades"].value_counts() # check whether grade is categorized or not in the dataframe

2.0    296
3.0    142
4.0    135
5.0    126
7.0    116
1.0    103
6.0     82
Name: grades, dtype: int64

In [63]:
student_performance.head(10) # check whether math, reading, and writing score is categorized or not in the new column I make and name as "Grade"

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,grades
0,0,2,4,0,0,72,72,74,4.0
1,0,0,0,0,1,69,90,88,6.0
2,0,2,5,0,0,90,95,93,7.0
3,1,4,1,1,0,47,57,44,1.0
4,1,0,0,0,0,76,78,75,5.0
5,0,2,1,0,0,71,83,78,5.0
6,0,2,0,0,1,88,95,92,7.0
7,1,2,0,1,0,40,43,39,1.0
8,1,1,2,1,1,64,64,67,3.0
9,0,2,2,1,0,38,60,50,1.0


In [64]:
student_performance["math score"].value_counts()

65    36
62    35
69    32
59    32
61    27
      ..
24     1
26     1
28     1
33     1
0      1
Name: math score, Length: 81, dtype: int64

In [65]:
student_performance["reading score"].value_counts()

72    34
74    33
64    32
67    30
73    30
      ..
26     1
32     1
40     1
23     1
17     1
Name: reading score, Length: 72, dtype: int64

In [66]:
student_performance["writing score"].value_counts()

74    35
70    33
68    31
73    28
80    27
      ..
23     1
28     1
35     1
15     1
10     1
Name: writing score, Length: 77, dtype: int64

In [67]:
# student_performance.drop("math score_RANGE", axis = 1, inplace = True)
# student_performance.drop("reading score_RANGE", axis = 1, inplace = True)
# student_performance.drop("writing score_RANGE", axis = 1, inplace = True)

In [68]:
# student_performance.head(10)

from the dataframe result and frequency of each category in Test Preparation Course column above, we can see values in integer data type (dtype: int64), which means scores have been categorized in Test Preparation Course column

In [69]:
print("\n")





### Data Classification

In [70]:
x = student_performance.drop(["grades"], axis = 1)
y = student_performance["grades"]

In [71]:
# I'm going to split the dataset into 80% as a training data and 20% (0,2) as a testing data
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0)

In [72]:
print("\n")





### Data Standard Scaling and Normalization

We use standard scaling as a data preprocessing step applied to independent variables which will help to normalize data (rescale data within a 0 to 1 range) to increase the consistency in data and thus will help the model to predict outputs with a greater accuracy. 

Also, with normalizing data, we can improve the consistency in our data since normalization will make the standard deviation to be 1 and closer to 0.

In [73]:
standardScaler = StandardScaler()
standardScalingXTrain = standardScaler.fit_transform(x_train)
# standardScalingYTrain = standardScaler.fit_transform(y_train)
standardScalingXTest = standardScaler.fit_transform(x_test)

In [74]:
normalizedXTrain = normalize(standardScalingXTrain)
normalizedXTest = normalize(standardScalingXTest)

In [75]:
print("\n\n")






# Logistic Regression

Based on Python Documentation about sklearn.linear_model.LogisticRegression(), there are five solvers we can use in Logistic Regression: newton-cg, saga, sag, liblinear, and lbfgs. So, I try one by one to find the best solver that finds the greatest prediction accuracy score.

In [76]:
logisticRegression = LogisticRegression(solver = "liblinear", max_iter = 100000)
logisticRegression.fit(x_train, y_train)

round(logisticRegression.score(x_train, y_train) * 100, 2) # to print training data accuracy score with Logistic Regression algorithm

50.62

In [77]:
round(logisticRegression.score(x_test, y_test) * 100, 2) # to print testing data accuracy score with Logistic Regression algorithm

47.0

In [78]:
logisticRegression2 = LogisticRegression(solver = "saga", max_iter = 100000)
logisticRegression2.fit(x_train, y_train)

round(logisticRegression2.score(x_train, y_train) * 100, 2) # to print training data accuracy score with Logistic Regression algorithm

47.38

In [79]:
round(logisticRegression2.score(x_test, y_test) * 100, 2) # to print testing data accuracy score with Logistic Regression algorithm

45.5

In [80]:
logisticRegression3 = LogisticRegression(solver = "sag", max_iter = 10000)
logisticRegression3.fit(x_train, y_train)

round(logisticRegression3.score(x_train, y_train) * 100, 2) # to print training data accuracy score with Logistic Regression algorithm

49.5

In [81]:
round(logisticRegression3.score(x_test, y_test) * 100, 2) # to print testing data accuracy score with Logistic Regression algorithm

47.0

In [82]:
logisticRegression4 = LogisticRegression(solver = "newton-cg", max_iter = 1000000)
logisticRegression4.fit(x_train, y_train)

round(logisticRegression4.score(x_train, y_train) * 100, 2) # to print training data accuracy score with Logistic Regression algorithm

99.62

In [83]:
round(logisticRegression4.score(x_test, y_test) * 100, 2) # to print testing data accuracy score with Logistic Regression algorithm

99.5

In [84]:
logisticRegression5 = LogisticRegression(solver = "lbfgs", max_iter = 100000)
logisticRegression5.fit(x_train, y_train)

round(logisticRegression5.score(x_train, y_train) * 100, 2) # to print training data accuracy score with Logistic Regression algorithm

STOP: TOTAL NO. of f AND g EVALUATIONS EXCEEDS LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


97.5

In [85]:
round(logisticRegression5.score(x_test, y_test) * 100, 2) # to print testing data accuracy score with Logistic Regression algorithm

95.0

### Conclusion: Logistic Regression

##### A. Solver: "liblinear"

Training Data Accuracy: 50.62

Testing Data Accuracy: 47.0

##### B. Solver: "saga"

Training Data Accuracy: 47.38

Testing Data Accuracy: 45.5

##### C. Solver: "sag"

Training Data Accuracy: 49.5

Testing Data Accuracy: 47.0

##### D. Solver: "newton-cg"

Training Data Accuracy: 99.62

Testing Data Accuracy: 99.5

##### E. Solver: "lbfgs"

Training Data Accuracy: 97.5

Testing Data Accuracy: 95.0

In [86]:
print("\n")





As you can see from the prediction accuracy scores above, logistic regression with solver "newton-cg"  has the greatest value of training and testing accuracy scores. Thus, we choose logistic regression with "network-cg" to compare with other Supervised Learning algorithms later on.

In [87]:
print("\n")





# K-Nearest Neighbors

In [88]:
KNN = KNeighborsClassifier(n_neighbors = 26)
KNN.fit(x_train, y_train)

round(KNN.score(x_test, y_test) * 100, 2) # to print testing data accuracy score with K-Nearest Neighbors algorithm

94.0

In [89]:
round(KNN.score(x_train, y_train) * 100, 2) # to print training data accuracy score with K-Nearest Neighbors algorithm

96.38

In [90]:
print("\n")





# Naive Bayes

In [91]:
NB = GaussianNB()
NB.fit(x_train, y_train)

round(NB.score(x_train, y_train) * 100, 2) # to print training data accuracy score with Naive Bayes algorithm

86.75

In [92]:
round(NB.score(x_test, y_test) * 100, 2) # to print testing data accuracy score with Naive Bayes algorithm

82.5

In [93]:
print("\n")





# Neural Network

In [94]:
# ADAM has mentioned as the best solver in neural network since ADAM is the best solver which combines the best properties of the AdaGrad and RMSProp algorithms to provide an optimization algorithm that can handle sparse gradients on noisy problems. So, let's try this one..
neuralNetwork = MLPClassifier(hidden_layer_sizes = (2500, ), activation = "relu", solver = "adam", random_state = 1, max_iter = 500)
neuralNetwork.fit(x_train, y_train)

round(neuralNetwork.score(x_train, y_train) * 100, 2) # to print training data accuracy score with Neural Network algorithm

44.5

In [95]:
round(neuralNetwork.score(x_test, y_test) * 100, 2) # to print testing data accuracy score with Neural Network algorithm

41.5

In [96]:
neuralNetwork2 = MLPClassifier(hidden_layer_sizes = (2500, ), activation = "relu", solver = "sgd", random_state = 1, max_iter = 500)
neuralNetwork2.fit(x_train, y_train)

round(neuralNetwork2.score(x_train, y_train) * 100, 2) # to print training data accuracy score with Neural Network algorithm

37.88

In [97]:
round(neuralNetwork2.score(x_test, y_test) * 100, 2) # to print testing data accuracy score with Neural Network algorithm

35.0

In [98]:
neuralNetwork3 = MLPClassifier(hidden_layer_sizes = (2500, ), activation = "relu", solver = "lbfgs", random_state = 1, max_iter = 500)
neuralNetwork3.fit(x_train, y_train)

round(neuralNetwork3.score(x_train, y_train) * 100, 2) # to print training data accuracy score with Neural Network algorithm

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


48.38

In [99]:
round(neuralNetwork3.score(x_test, y_test) * 100, 2) # to print testing data accuracy score with Neural Network algorithm

45.0

### Conclusion: Neural Network

##### Solver: "adam"

Training Data Accuracy: 44.5

Testing Data Accuracy: 41.5

##### Solver: "sgd"

Training Data Accuracy: 37.88

Testing Data Accuracy: 35.0

##### Solver: "lbfgs"

Training Data Accuracy: 48.38

Testing Data Accuracy: 45.0

In [100]:
print("\n")





From the Neural Network prediction accuracy scores above, we can see that Neural Network with solver "lbfgs" has the best prediction accuracy score for both training and testing data. Thus, I take Neural Network with solver "lbfgs" as the best method of all methods in Neural Network.

In [101]:
print("\n")





# Decision Tree

In [102]:
decisionTree = DecisionTreeClassifier()
decisionTree.fit(x_train, y_train)

round(decisionTree.score(x_train, y_train) * 100, 2) # to print training data accuracy score with Decision Tree algorithm

100.0

In [103]:
round(decisionTree.score(x_test, y_test) * 100, 2) # to print testing data accuracy score with Decision Tree algorithm

84.0

In [104]:
decisionTree2 = DecisionTreeClassifier(criterion = "entropy", splitter = "random")
decisionTree2.fit(x_train, y_train)

round(decisionTree2.score(x_train, y_train) * 100, 2) # to print training data accuracy score with Decision Tree algorithm

100.0

In [105]:
round(decisionTree2.score(x_test, y_test) * 100, 2) # to print testing data accuracy score with Decision Tree algorithm

83.5

In [106]:
print("\n")





### Conclusion: Decision Tree

##### criterion: "gini", splitter: "best"

Training Data Accuracy: 100.0

Testing Data Accuracy: 85.0

##### criterion: "entropy", splitter: "random"

Training Data Accuracy: 100.0

Testing Data Accuracy: 84.5

In [107]:
print("\n")





From the Decision Tree prediction accuracy scores above, we can see that Decision Tree with criterion "gini" and splitter "best" has the best prediction accuracy score for the testing data accuracy score, while both Decision Tree with criterion "gini" and splitter "best" and Decision Tree with criterion "entropy" and splitter "random" have equal training data accuracy score. Thus, I take Decision Tree with criterion "gini" and splitter "best" as the best method of all methods in Decision Tree.

In [108]:
print("\n")





# Support Vector Machine

In [109]:
# RBF Kernel is especially useful for very large number of features

supportVectorMachine = SVC(C = 10, kernel = "rbf", degree = 3, coef0 = 1)
supportVectorMachine.fit(x_train, y_train)

round(supportVectorMachine.score(x_train, y_train) * 100, 2) # to print training data accuracy score with Support Vector Machine algorithm

97.38

In [110]:
round(supportVectorMachine.score(x_test, y_test) * 100, 2) # to print testing data accuracy score with Support Vector Machine algorithm

96.0

In [111]:
supportVectorMachine2 = SVC(C = 10, kernel = "linear", degree = 3, coef0 = 1)
supportVectorMachine2.fit(x_train, y_train)

round(supportVectorMachine2.score(x_train, y_train) * 100, 2) # to print training data accuracy score with Support Vector Machine algorithm

100.0

In [112]:
round(supportVectorMachine2.score(x_test, y_test) * 100, 2) # to print testing data accuracy score with Support Vector Machine algorithm

99.5

In [113]:
supportVectorMachine3 = SVC(C = 10, kernel = "sigmoid", degree = 3, coef0 = 1)
supportVectorMachine3.fit(x_train, y_train)

round(supportVectorMachine3.score(x_train, y_train) * 100, 2) # to print training data accuracy score with Support Vector Machine algorithm

20.0

In [114]:
round(supportVectorMachine3.score(x_test, y_test) * 100, 2) # to print testing data accuracy score with Support Vector Machine algorithm

21.0

In [115]:
supportVectorMachine4 = SVC(C = 10, kernel = "poly", degree = 10, coef0 = 1)
supportVectorMachine4.fit(x_train, y_train)

round(supportVectorMachine4.score(x_train, y_train) * 100, 2) # to print training data accuracy score with Support Vector Machine algorithm

100.0

In [116]:
round(supportVectorMachine4.score(x_test, y_test) * 100, 2) # to print testing data accuracy score with Support Vector Machine algorithm

100.0

In [117]:
supportVectorMachine5 = SVC(C = 10, kernel = "rbf", degree = 20, coef0 = 1)
supportVectorMachine5.fit(x_train, y_train)

round(supportVectorMachine5.score(x_train, y_train) * 100, 2) # to print training data accuracy score with Support Vector Machine algorithm

97.38

In [118]:
round(supportVectorMachine5.score(x_test, y_test) * 100, 2) # to print testing data accuracy score with Support Vector Machine algorithm

96.0

In [119]:
supportVectorMachine6 = SVC(C = 100, kernel = "linear", degree = 20, coef0 = 1)
supportVectorMachine6.fit(x_train, y_train)

round(supportVectorMachine6.score(x_train, y_train) * 100, 2) # to print training data accuracy score with Support Vector Machine algorithm

100.0

In [120]:
round(supportVectorMachine6.score(x_test, y_test) * 100, 2) # to print testing data accuracy score with Support Vector Machine algorithm

99.5

### Conclusion: Support Vector Machine

##### kernel "RBF"

Training Data Accuracy: 97.38

Testing Data Accuracy: 96.0

##### kernel "linear"

Training Data Accuracy: 100.0

Testing Data Accuracy: 99.5

##### kernel "sigmoid"

Training Data Accuracy: 20.0

Testing Data Accuracy: 21.0

##### kernel "poly"

Training Data Accuracy: 100.0

Testing Data Accuracy: 100.0

In [121]:
print("\n")





From the Support Vector Machine prediction accuracy scores above, we can see that Support Vector Machine with kernel "poly" has the best prediction accuracy score for both the testing and training data accuracy score. Hence, I take Support Vector Machine with kernel "poly" as the best method of all methods in Support Vector Machine.

In [122]:
print("\n")





# Linear Support Vector Machine

In [123]:
linearSVM = LinearSVC(C = 1, loss = "squared_hinge", max_iter = 10000000) # I add max iter because it said it fails to converge and thus I have to increase the number of iterations
linearSVM.fit(x_train, y_train)

round(linearSVM.score(x_train, y_train) * 100, 2) # to print training data accuracy score with Linear SVC algorithm

57.75

In [124]:
round(linearSVM.score(x_test, y_test) * 100, 2) # to print testing data accuracy score with Linear SVC algorithm

57.5

In [125]:
linearSVM2 = LinearSVC(C = 1, loss = "hinge", max_iter = 100000000)
linearSVM2.fit(x_train, y_train)

round(linearSVM2.score(x_train, y_train) * 100, 2) # to print training data accuracy score with Linear SVC algorithm

44.62

In [126]:
round(linearSVM2.score(x_test, y_test) * 100, 2) # to print testing data accuracy score with Linear SVC algorithm

45.5

### Conclusion: Linear Support Vector Machine

##### loss: "squared_hinge"

Training Data Accuracy: 57.75

Testing Data Accuracy: 57.5

##### loss: "hinge"

Training Data Accuracy: 51.88

Testing Data Accuracy: 53.0

In [127]:
print("\n")





From the Linear Support Vector Machine prediction accuracy scores above, we can see that Linear Support Vector Machine with loss "squared_hinge" has the best prediction accuracy score for both the testing and training data accuracy score. Hence, I take Linear Support Vector Machine with loss "squared_hinge" as the best method of all methods in Linear Support Vector Machine.

In [128]:
print("\n")





# Random Forest

In [129]:
randomForest = RandomForestClassifier(n_estimators = 220)
randomForest.fit(x_train, y_train)

round(randomForest.score(x_train, y_train) * 100, 2) # to print training data accuracy score with Random Forest algorithm

100.0

In [130]:
round(randomForest.score(x_test, y_test) * 100, 2) # to print testing data accuracy score with Random Forest algorithm

87.0

In [131]:
print("\n")





# Gradient Boosting

In [132]:
gradientBoosting = GradientBoostingClassifier(loss = "deviance", n_estimators = 10000, criterion = "friedman_mse", learning_rate = 1)
gradientBoosting.fit(x_train, y_train)

round(gradientBoosting.score(x_train, y_train) * 100, 2) # to print training data accuracy score with Gradient Boosting algorithm

100.0

In [133]:
round(gradientBoosting.score(x_test, y_test) * 100, 2) # to print testing data accuracy score with Gradient Boosting algorithm

87.5

In [134]:
gradientBoosting2 = GradientBoostingClassifier(loss = "deviance", n_estimators = 10000, criterion = "mse", learning_rate = 1)
gradientBoosting2.fit(x_train, y_train)

round(gradientBoosting2.score(x_train, y_train) * 100, 2) # to print training data accuracy score with Gradient Boosting algorithm

100.0

In [135]:
round(gradientBoosting2.score(x_test, y_test) * 100, 2) # to print testing data accuracy score with Gradient Boosting algorithm

86.5

In [136]:
# gradientBoosting3 = GradientBoostingClassifier(loss = "deviance", n_estimators = 100, criterion = "mae")
# gradientBoosting3.fit(x_train, y_train)

# round(gradientBoosting3.score(x_train, y_train) * 100, 2) # to print training data accuracy score with Gradient Boosting algorithm

In [137]:
# round(gradientBoosting3.score(x_test, y_test) * 100, 2) # to print testing data accuracy score with Gradient Boosting algorithm

### Conclusion: Gradient Boosting

##### criterion "friedman_mse"

Training Data Accuracy: 100.0

Testing Data Accuracy: 87.0

##### criterion "mse"

Training Data Accuracy: 100.0

Testing Data Accuracy: 89.0

##### criterion "mae"

At first, I used it, but due to a very long time to generate the output, I perceive this criterion as less efficient than other criterions. Thus, I only use criterion "friedman_mse" and criterion "mse".

In [138]:
print("\n")





From the Gradient Boosting prediction accuracy scores above, we can see that Gradient Boosting with criterion "mse" has the best prediction accuracy score for both the testing and training data accuracy score. Hence, I take the Gradient Boosting with criterion "mse" as the best method of all methods in Gradient Boosting.

In [139]:
print("\n")





# Bagging

In [140]:
bagging = BaggingClassifier(base_estimator = SVC(C = 1, kernel = "poly", degree = 10, coef0 = 1))
bagging.fit(x_train, y_train)

round(bagging.score(x_train, y_train) * 100, 2) # to print training data accuracy score with Bagging algorithm

100.0

In [141]:
round(bagging.score(x_test, y_test) * 100, 2) # to print testing data accuracy score with Bagging algorithm

100.0

In [142]:
bagging2 = BaggingClassifier(base_estimator = DecisionTreeClassifier())
bagging2.fit(x_train, y_train)

round(bagging2.score(x_train, y_train) * 100, 2) # to print training data accuracy score with Bagging algorithm

99.5

In [143]:
round(bagging2.score(x_test, y_test) * 100, 2) # to print testing data accuracy score with Bagging algorithm

86.0

### Conclusion: Bagging

##### base_estimator: Support Vector Machine

Training Data Accuracy: 99.88

Testing Data Accuracy: 99.5

##### base_estimator: Decision Tree Classifier

Training Data Accuracy: 99.5

Testing Data Accuracy: 84.0

In [144]:
print("\n")





From the Bagging prediction accuracy scores above, we can see that Bagging with base_estimator Support Vector Machine has the best prediction accuracy score for both the testing and training data accuracy score. Also, Support Vector Machine with kernel "poly" has a greater prediction accuracy score (100.0 and 100.0) than the Decision Tree with criterion "gini" and splitter "best" (100.0 and 85.5) for both training and testing data accuracy scores. Hence, I take the Bagging with base_estimator Support Vector Machine as the best method of all methods in Bagging.

In [145]:
print("\n")





# Ada Boosting

In [146]:
adaBoost = AdaBoostClassifier(base_estimator = GradientBoostingClassifier(loss = "deviance", n_estimators = 10000, learning_rate = 1, criterion = "mse"), n_estimators = 500, learning_rate = 1)
adaBoost.fit(x_train, y_train)

round(adaBoost.score(x_test, y_test) * 100, 2) # to print testing data accuracy score with Ada Boost algorithm

87.5

In [147]:
round(adaBoost.score(x_train, y_train) * 100, 2) # to print training data accuracy score with Ada Boost algorithm

100.0

In [148]:
print("\n")





# Perceptron

In [149]:
perceptron = Perceptron(fit_intercept = True, max_iter = 1000000, early_stopping = True, verbose = 10)
perceptron.fit(x_train, y_train)

round(perceptron.score(x_test, y_test) * 100, 2) # to print testing data accuracy score with Perceptron algorithm

-- Epoch 1
Norm: 145.92, NNZs: 8, Bias: 28.000000, T: 720, Avg. loss: 630.250000
Total training time: 0.00 seconds.
-- Epoch 2
Norm: 217.74, NNZs: 8, Bias: 56.000000, T: 1440, Avg. loss: 623.398750
Total training time: 0.00 seconds.
-- Epoch 3
Norm: 332.62, NNZs: 8, Bias: 82.000000, T: 2160, Avg. loss: 570.138750
Total training time: 0.01 seconds.
-- Epoch 4
Norm: 415.33, NNZs: 8, Bias: 110.000000, T: 2880, Avg. loss: 592.645000
Total training time: 0.01 seconds.
-- Epoch 5
Norm: 502.90, NNZs: 8, Bias: 136.000000, T: 3600, Avg. loss: 596.550000
Total training time: 0.01 seconds.
-- Epoch 6
Norm: 548.93, NNZs: 8, Bias: 158.000000, T: 4320, Avg. loss: 531.157500
Total training time: 0.01 seconds.
Convergence after 6 epochs took 0.01 seconds
-- Epoch 1
Norm: 169.23, NNZs: 8, Bias: 28.000000, T: 720, Avg. loss: 2249.710000
Total training time: 0.00 seconds.
-- Epoch 2
Norm: 220.39, NNZs: 8, Bias: 56.000000, T: 1440, Avg. loss: 2116.755000
Total training time: 0.00 seconds.
-- Epoch 3
Norm:

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    0.0s finished


13.0

In [150]:
round(perceptron.score(x_train, y_train) * 100, 2) # to print training data accuracy score with Perceptron algorithm

12.88

In [151]:
print("\n")





# Stochastic Gradient Descent (SGD)

In [152]:
sgd = SGDClassifier()
sgd.fit(x_train, y_train)

round(sgd.score(x_test, y_test) * 100, 2) # to print testing data accuracy score with Stochastic Gradient Descent algorithm

13.0

In [153]:
round(sgd.score(x_train, y_train) * 100, 2) # to print training data accuracy score with Stochastic Gradient Descent algorithm

13.5

In [154]:
print("\n\n")






# Conclusion

In [155]:
"""
a. Logistic Regression (solver: "newton-cg")
- Training Data Accuracy: 50.62
- Testing Data Accuracy: 47.0

b. K-Nearest Neighbors
- Training Data Accuracy: 96.38
- Testing Data Accuracy: 94.0

c. Naive Bayes
- Training Data Accuracy: 86.75
- Testing Data Accuracy: 82.5

d. Neural Network (solver: "lbfgs")
- Training Data Accuracy: 48.38
- Testing Data Accuracy: 45.0

e. Decision Tree (criterion: "gini", splitter: "best")
- Training Data Accuracy: 100.0
- Testing Data Accuracy: 85.5

f. Support Vector Machine (kernel: "poly")

"""

'\na. Logistic Regression (solver: "newton-cg")\n- Training Data Accuracy: 50.62\n- Testing Data Accuracy: 47.0\n\nb. K-Nearest Neighbors\n- Training Data Accuracy: 96.38\n- Testing Data Accuracy: 94.0\n\nc. Naive Bayes\n- Training Data Accuracy: 86.75\n- Testing Data Accuracy: 82.5\n\nd. Neural Network (solver: "lbfgs")\n- Training Data Accuracy: 48.38\n- Testing Data Accuracy: 45.0\n\ne. Decision Tree (criterion: "gini", splitter: "best")\n- Training Data Accuracy: 100.0\n- Testing Data Accuracy: 85.5\n\nf. Support Vector Machine (kernel: "poly")\n\n'

From the prediction accuracy scores above, we can see that Support Vector Machine with kernel "poly" has the best prediction accuracy score among all the Supervised Learning Algorithms with 100.0 for the training data and 100.0 for the testing data. Therefore, I choose Support Vector Machine with kernel "poly" to generate student performance prediction output.

In [156]:
print("\n")





# Student Performance Prediction

#### Support Vector Machine with kernel "poly"

In [157]:
studentPerformanceSVM = SVC(C = 10, kernel = "poly", degree = 10, coef0 = 1.0)
studentPerformanceSVM.fit(x_train, y_train)

SVC(C=10, coef0=1.0, degree=10, kernel='poly')

In [158]:
round(studentPerformanceSVM.score(x_test, y_test) * 100, 2)

100.0

In [159]:
round(studentPerformanceSVM.score(x_test, y_test) * 100, 2)

100.0

In [160]:
yPredict = studentPerformanceSVM.predict(x_test) # to generate the prediction output of Student Performance
yPredict

array([3., 5., 1., 3., 4., 4., 4., 1., 7., 1., 2., 2., 6., 6., 2., 1., 2.,
       7., 2., 5., 2., 2., 5., 2., 1., 2., 2., 2., 2., 6., 6., 7., 7., 2.,
       2., 7., 7., 2., 4., 4., 3., 2., 3., 4., 5., 2., 7., 2., 6., 2., 1.,
       2., 1., 2., 1., 4., 2., 2., 5., 2., 3., 5., 2., 4., 3., 2., 4., 2.,
       3., 2., 5., 6., 5., 3., 2., 3., 4., 4., 1., 5., 5., 7., 3., 6., 3.,
       5., 3., 7., 4., 2., 4., 2., 2., 2., 5., 4., 2., 3., 2., 3., 1., 7.,
       2., 1., 5., 5., 3., 2., 2., 7., 3., 5., 1., 2., 3., 4., 6., 5., 6.,
       2., 2., 5., 7., 6., 5., 2., 4., 5., 4., 2., 3., 3., 1., 6., 6., 2.,
       3., 3., 3., 6., 4., 3., 2., 2., 5., 1., 1., 1., 6., 7., 1., 4., 7.,
       4., 4., 6., 2., 2., 2., 2., 7., 2., 2., 7., 3., 5., 5., 4., 2., 2.,
       2., 3., 1., 2., 7., 3., 3., 2., 7., 5., 3., 6., 3., 3., 5., 3., 7.,
       7., 4., 1., 2., 2., 2., 5., 2., 2., 6., 6., 3., 3.])

In [176]:
confusionMatrix = confusion_matrix(y_test, yPredict)
print(confusionMatrix)

[[19  0  0  0  0  0  0]
 [ 0 62  0  0  0  0  0]
 [ 0  0 33  0  0  0  0]
 [ 0  0  0 23  0  0  0]
 [ 0  0  0  0 25  0  0]
 [ 0  0  0  0  0 18  0]
 [ 0  0  0  0  0  0 20]]


In [177]:
classificationReport = classification_report(y_test, yPredict)
print(classificationReport)

              precision    recall  f1-score   support

         1.0       1.00      1.00      1.00        19
         2.0       1.00      1.00      1.00        62
         3.0       1.00      1.00      1.00        33
         4.0       1.00      1.00      1.00        23
         5.0       1.00      1.00      1.00        25
         6.0       1.00      1.00      1.00        18
         7.0       1.00      1.00      1.00        20

    accuracy                           1.00       200
   macro avg       1.00      1.00      1.00       200
weighted avg       1.00      1.00      1.00       200



I will make a new column in Student Performance Dataframe to display the prediction output of Student Performance

In [161]:
student_performance["Student Performance"] = yPredict

ValueError: Length of values (200) does not match length of index (1000)

In [162]:
yPredictDF = pd.DataFrame(yPredict, columns = ["Student Performance"])
yPredictDF

Unnamed: 0,Student Performance
0,3.0
1,5.0
2,1.0
3,3.0
4,4.0
...,...
195,2.0
196,6.0
197,6.0
198,3.0


In [163]:
yPredictDF.head(50)

Unnamed: 0,Student Performance
0,3.0
1,5.0
2,1.0
3,3.0
4,4.0
5,4.0
6,4.0
7,1.0
8,7.0
9,1.0


from Student Performance Dataframe above, 0 - 8 show categories of grades. 0 means F, 1 means E, 2 means D, 3 means C, 4 means B-, 5 means B, 6 means B+, 7 means A-, and 8 means A

In [None]:
# yPredictList = [yPredict]

# student_performance["Student Performance"] = yPredictList

In [None]:
# yPredictList = [yPredict]

# for index in yPredictList:
    # student_performance["Student Performance"] = 

In [None]:
# studentPerformancePrediction = pd.concat([student_performance, pd.DataFrame({"Student Performance": yPredict})], axis = 1)

In [None]:
# studentPerformancePrediction = pd.concat([student_performance, pd.DataFrame({"Student Performance": yPredict})], index = [:200], axis = 1)

In [166]:
studentPerformanceMerge = [student_performance, yPredictDF] # to merge Student Performance Dataframe

In [167]:
studentPerformanceMergeDF = pd.concat(studentPerformanceMerge)
studentPerformanceMergeDF

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,grades,Student Performance
0,0.0,2.0,4.0,0.0,0.0,72.0,72.0,74.0,4.0,
1,0.0,0.0,0.0,0.0,1.0,69.0,90.0,88.0,6.0,
2,0.0,2.0,5.0,0.0,0.0,90.0,95.0,93.0,7.0,
3,1.0,4.0,1.0,1.0,0.0,47.0,57.0,44.0,1.0,
4,1.0,0.0,0.0,0.0,0.0,76.0,78.0,75.0,5.0,
...,...,...,...,...,...,...,...,...,...,...
195,,,,,,,,,,2.0
196,,,,,,,,,,6.0
197,,,,,,,,,,6.0
198,,,,,,,,,,3.0


In [168]:
studentPerformanceMergeDF.isnull().sum()

gender                          200
race/ethnicity                  200
parental level of education     200
lunch                           200
test preparation course         200
math score                      200
reading score                   200
writing score                   200
grades                          200
Student Performance            1000
dtype: int64

In [169]:
studentPerformanceMergeDF.fillna(value = 0, inplace = True)

In [None]:
# studentPerformanceMergeDF["gender"].fillna(value = studentPerformanceMergeDF["gender"].mean(), inplace = True)

In [None]:
# studentPerformanceMergeDF["race/ethnicity"].fillna(value = studentPerformanceMergeDF["race/ethnicity"].mean(), inplace = True)

In [None]:
# studentPerformanceMergeDF["parental level of education"].fillna(value = studentPerformanceMergeDF["parental level of education"].mean(), inplace = True)

In [None]:
# studentPerformanceMergeDF["lunch"].fillna(value = studentPerformanceMergeDF["lunch"].mean(), inplace = True)

In [None]:
# studentPerformanceMergeDF["test preparation course"].fillna(value = studentPerformanceMergeDF["test preparation course"].mean(), inplace = True)

In [None]:
# studentPerformanceMergeDF["math score"].fillna(value = studentPerformanceMergeDF["math score"].mean(), inplace = True)

In [None]:
# studentPerformanceMergeDF["reading score"].fillna(value = studentPerformanceMergeDF["reading score"].mean(), inplace = True)

In [None]:
# studentPerformanceMergeDF["writing score"].fillna(value = studentPerformanceMergeDF["writing score"].mean(), inplace = True)

In [None]:
# studentPerformanceMergeDF["grades"].fillna(value = studentPerformanceMergeDF["grades"].mean(), inplace = True)

In [None]:
# studentPerformanceMergeDF["Student Performance"].fillna(value = 0, inplace = True)

In [170]:
studentPerformanceMergeDF.head(50) # to see if the missing value in the dataset has been filled or not

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,grades,Student Performance
0,0.0,2.0,4.0,0.0,0.0,72.0,72.0,74.0,4.0,0.0
1,0.0,0.0,0.0,0.0,1.0,69.0,90.0,88.0,6.0,0.0
2,0.0,2.0,5.0,0.0,0.0,90.0,95.0,93.0,7.0,0.0
3,1.0,4.0,1.0,1.0,0.0,47.0,57.0,44.0,1.0,0.0
4,1.0,0.0,0.0,0.0,0.0,76.0,78.0,75.0,5.0,0.0
5,0.0,2.0,1.0,0.0,0.0,71.0,83.0,78.0,5.0,0.0
6,0.0,2.0,0.0,0.0,1.0,88.0,95.0,92.0,7.0,0.0
7,1.0,2.0,0.0,1.0,0.0,40.0,43.0,39.0,1.0,0.0
8,1.0,1.0,2.0,1.0,1.0,64.0,64.0,67.0,3.0,0.0
9,0.0,2.0,2.0,1.0,0.0,38.0,60.0,50.0,1.0,0.0


In [171]:
studentPerformanceMergeDF.tail(50)

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score,grades,Student Performance
150,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
151,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
152,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.0
153,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
154,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
155,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0
156,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
157,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
158,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
159,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0


In [172]:
studentPerformanceMergeDF.isnull().sum()

gender                         0
race/ethnicity                 0
parental level of education    0
lunch                          0
test preparation course        0
math score                     0
reading score                  0
writing score                  0
grades                         0
Student Performance            0
dtype: int64

from the dataframe result and NA count method, the missing values in the dataset has been replaced

In [None]:
print("\n")

# Interpreting the Prediction Outputs

In [179]:
# test
test = x_test["gender"]
test

993    0
859    1
298    1
553    1
672    0
      ..
679    1
722    0
215    1
653    0
150    1
Name: gender, Length: 200, dtype: int32

In [181]:
prediction = pd.DataFrame({"Gender": x_test["gender"], "Race/Ethnicity": x_test["race/ethnicity"], "Parental Level of Education": x_test["parental level of education"], "Lunch": x_test["lunch"], "Test Preparation Course": x_test["test preparation course"], "Student Performance": yPredict})

In [182]:
prediction.head(50)

Unnamed: 0,Gender,Race/Ethnicity,Parental Level of Education,Lunch,Test Preparation Course,Student Performance
993,0,1,4,1,0,3.0
859,1,0,1,1,0,5.0
298,1,0,2,1,1,1.0
553,1,1,0,1,0,3.0
672,0,0,0,0,0,4.0
971,1,0,3,0,1,4.0
27,0,0,4,0,0,4.0
231,1,0,1,0,0,1.0
306,1,3,0,0,1,7.0
706,1,1,2,0,0,1.0


In [193]:
prediction["Student Performance"].value_counts()

2.0    62
3.0    33
5.0    25
4.0    23
7.0    20
1.0    19
6.0    18
Name: Student Performance, dtype: int64

In [183]:
print("\n")





### Failed

In [190]:
fail_predicted = prediction[prediction["Student Performance"] == 0.0]

In [191]:
fail_predicted

Unnamed: 0,Gender,Race/Ethnicity,Parental Level of Education,Lunch,Test Preparation Course,Student Performance


In [192]:
fail_predicted.value_counts()

Series([], dtype: int64)

In [194]:
fail_predicted.shape

(0, 6)

The results above means that no students fail.

In [184]:
print("\n")





### E Score

In [195]:
E_predicted = prediction[prediction["Student Performance"] == 1]

In [196]:
E_predicted

Unnamed: 0,Gender,Race/Ethnicity,Parental Level of Education,Lunch,Test Preparation Course,Student Performance
298,1,0,2,1,1,1.0
231,1,0,1,0,0,1.0
706,1,1,2,0,0,1.0
55,0,0,2,1,0,1.0
883,1,1,4,1,0,1.0
331,1,0,1,0,0,1.0
262,0,0,3,1,0,1.0
862,1,1,4,1,1,1.0
601,0,0,2,0,0,1.0
785,0,2,3,0,1,1.0


In [197]:
E_predicted.value_counts()

Gender  Race/Ethnicity  Parental Level of Education  Lunch  Test Preparation Course  Student Performance
1       0               0                            0      0                        1.0                    2
                        1                            0      0                        1.0                    2
        2               2                            1      0                        1.0                    1
                        0                            1      0                        1.0                    1
        1               4                            1      1                        1.0                    1
                                                            0                        1.0                    1
                        2                            0      0                        1.0                    1
        0               2                            1      1                        1.0                    1
0       0      

In [198]:
E_predicted.shape

(19, 6)

In [199]:
print("\n")





### D Score

In [200]:
D_predicted = prediction[prediction["Student Performance"] == 2]

In [201]:
D_predicted

Unnamed: 0,Gender,Race/Ethnicity,Parental Level of Education,Lunch,Test Preparation Course,Student Performance
496,0,0,0,0,0,2.0
558,0,2,1,1,0,2.0
578,0,2,0,1,1,2.0
906,1,2,2,0,0,2.0
14,0,4,5,0,0,2.0
...,...,...,...,...,...,...
834,1,2,0,0,0,2.0
832,1,4,4,0,0,2.0
435,1,0,0,1,1,2.0
769,1,4,0,1,0,2.0


In [202]:
D_predicted.head(50)

Unnamed: 0,Gender,Race/Ethnicity,Parental Level of Education,Lunch,Test Preparation Course,Student Performance
496,0,0,0,0,0,2.0
558,0,2,1,1,0,2.0
578,0,2,0,1,1,2.0
906,1,2,2,0,0,2.0
14,0,4,5,0,0,2.0
31,0,2,0,0,0,2.0
481,0,1,1,1,0,2.0
311,1,2,4,0,0,2.0
788,1,0,1,1,0,2.0
45,1,2,1,0,0,2.0


In [205]:
D_predicted.tail(12)

Unnamed: 0,Gender,Race/Ethnicity,Parental Level of Education,Lunch,Test Preparation Course,Student Performance
986,0,0,1,0,0,2.0
688,1,4,2,1,0,2.0
351,1,3,0,0,0,2.0
418,1,1,0,0,0,2.0
945,0,0,1,0,0,2.0
826,0,0,1,1,1,2.0
648,0,2,2,0,0,2.0
834,1,2,0,0,0,2.0
832,1,4,4,0,0,2.0
435,1,0,0,1,1,2.0


In [206]:
D_predicted.value_counts()

Gender  Race/Ethnicity  Parental Level of Education  Lunch  Test Preparation Course  Student Performance
0       1               1                            1      0                        2.0                    4
        2               2                            0      0                        2.0                    3
        0               1                            0      0                        2.0                    3
                        0                            0      0                        2.0                    2
1       1               0                            0      0                        2.0                    2
        0               2                            0      0                        2.0                    2
0       0               4                            1      0                        2.0                    2
        2               1                            1      0                        2.0                    2
1       0      

In [207]:
D_predicted.shape

(62, 6)

In [208]:
print("\n")





### C Score

In [209]:
C_predicted = prediction[prediction["Student Performance"] == 3]

In [210]:
C_predicted

Unnamed: 0,Gender,Race/Ethnicity,Parental Level of Education,Lunch,Test Preparation Course,Student Performance
993,0,1,4,1,0,3.0
553,1,1,0,1,0,3.0
230,1,1,0,0,0,3.0
635,1,4,2,0,0,3.0
569,1,1,4,1,0,3.0
413,1,2,3,0,1,3.0
252,0,2,3,0,1,3.0
247,0,2,2,0,1,3.0
412,1,1,1,0,0,3.0
386,0,3,4,0,0,3.0


In [211]:
C_predicted.value_counts()

Gender  Race/Ethnicity  Parental Level of Education  Lunch  Test Preparation Course  Student Performance
1       1               4                            1      0                        3.0                    2
                        2                            1      1                        3.0                    2
                        0                            0      0                        3.0                    2
0       0               0                            1      1                        3.0                    1
1       1               1                            0      0                        3.0                    1
        4               2                            1      1                        3.0                    1
                                                     0      0                        3.0                    1
        3               4                            0      1                        3.0                    1
               

In [212]:
C_predicted.shape

(33, 6)

In [213]:
print("\n")





### B- Score

In [214]:
BMinus_predicted = prediction[prediction["Student Performance"] == 4]

In [215]:
BMinus_predicted

Unnamed: 0,Gender,Race/Ethnicity,Parental Level of Education,Lunch,Test Preparation Course,Student Performance
672,0,0,0,0,0,4.0
971,1,0,3,0,1,4.0
27,0,0,4,0,0,4.0
918,0,0,1,0,1,4.0
267,0,1,2,0,0,4.0
698,0,1,1,0,1,4.0
582,0,1,4,1,0,4.0
202,1,0,1,0,0,4.0
264,1,1,2,0,0,4.0
389,1,1,5,0,0,4.0


In [216]:
BMinus_predicted.value_counts()

Gender  Race/Ethnicity  Parental Level of Education  Lunch  Test Preparation Course  Student Performance
1       1               2                            0      0                        4.0                    2
        0               2                            0      0                        4.0                    2
0       0               0                            0      0                        4.0                    1
        3               0                            0      0                        4.0                    1
1       1               5                            0      0                        4.0                    1
                        4                            0      1                        4.0                    1
                        3                            0      1                        4.0                    1
                                                            0                        4.0                    1
        0      

In [217]:
BMinus_predicted.shape

(23, 6)

In [None]:
print("\n")

### B Score

In [218]:
B_predicted = prediction[prediction["Student Performance"] == 5]

In [219]:
B_predicted

Unnamed: 0,Gender,Race/Ethnicity,Parental Level of Education,Lunch,Test Preparation Course,Student Performance
859,1,0,1,1,0,5.0
77,1,4,4,0,1,5.0
310,0,2,1,0,0,5.0
251,0,1,0,1,0,5.0
987,1,3,3,0,1,5.0
643,0,3,2,0,1,5.0
738,1,1,1,0,0,5.0
740,1,1,4,0,0,5.0
654,0,2,3,0,0,5.0
261,1,0,0,0,1,5.0


In [220]:
B_predicted.value_counts()

Gender  Race/Ethnicity  Parental Level of Education  Lunch  Test Preparation Course  Student Performance
0       1               0                            1      1                        5.0                    2
        2               1                            0      0                        5.0                    2
        0               2                            1      1                        5.0                    1
1       0               1                            0      0                        5.0                    1
        3               4                            1      1                        5.0                    1
                        3                            0      1                        5.0                    1
                        0                            0      0                        5.0                    1
        1               4                            0      0                        5.0                    1
               

In [221]:
B_predicted.shape

(25, 6)

In [None]:
print("\n")

### B+ Score

In [222]:
BPlus_predicted = prediction[prediction["Student Performance"] == 6]

In [223]:
BPlus_predicted

Unnamed: 0,Gender,Race/Ethnicity,Parental Level of Education,Lunch,Test Preparation Course,Student Performance
784,1,0,4,0,1,6.0
239,1,0,1,0,0,6.0
1,0,0,0,0,1,6.0
823,0,2,2,1,0,6.0
316,0,1,5,0,1,6.0
299,1,1,1,1,0,6.0
982,1,2,3,0,1,6.0
545,1,3,3,1,1,6.0
489,1,4,1,1,1,6.0
946,1,2,2,0,0,6.0


In [224]:
BPlus_predicted.value_counts()

Gender  Race/Ethnicity  Parental Level of Education  Lunch  Test Preparation Course  Student Performance
1       2               3                            0      1                        6.0                    2
0       0               0                            0      1                        6.0                    1
1       1               0                            0      0                        6.0                    1
        3               3                            1      1                        6.0                    1
                        2                            1      1                        6.0                    1
        2               2                            0      0                        6.0                    1
        1               5                            0      0                        6.0                    1
                        1                            1      0                        6.0                    1
        0      

In [225]:
BPlus_predicted.shape

(18, 6)

In [226]:
print("\n")





### A- Score

In [227]:
AMinus_predicted = prediction[prediction["Student Performance"] == 7]

In [228]:
AMinus_predicted

Unnamed: 0,Gender,Race/Ethnicity,Parental Level of Education,Lunch,Test Preparation Course,Student Performance
306,1,3,0,0,1,7.0
175,0,0,5,0,1,7.0
710,1,0,0,0,1,7.0
614,0,4,1,0,0,7.0
736,1,0,1,0,0,7.0
957,0,1,5,0,0,7.0
819,0,0,3,0,0,7.0
456,0,1,4,0,0,7.0
501,0,2,1,0,1,7.0
873,1,3,1,1,0,7.0


In [229]:
AMinus_predicted.value_counts()

Gender  Race/Ethnicity  Parental Level of Education  Lunch  Test Preparation Course  Student Performance
1       0               1                            0      0                        7.0                    2
0       0               0                            0      0                        7.0                    1
        3               1                            0      1                        7.0                    1
1       3               1                            1      0                        7.0                    1
                        0                            0      1                        7.0                    1
                                                            0                        7.0                    1
        0               0                            0      1                        7.0                    1
0       4               1                            0      0                        7.0                    1
        3      

In [230]:
AMinus_predicted.shape

(20, 6)

In [231]:
print("\n")





### A Score

In [232]:
A_predicted = prediction[prediction["Student Performance"] == 8]

In [233]:
A_predicted

Unnamed: 0,Gender,Race/Ethnicity,Parental Level of Education,Lunch,Test Preparation Course,Student Performance


In [234]:
A_predicted.value_counts()

Series([], dtype: int64)

In [235]:
A_predicted.shape

(0, 6)

the results above means no student get A score.

In [186]:
print("\n\n")






In [None]:
print("\n\n")