## Imports

In [1]:
#dataframe handling
import pandas as pd
import numpy as np

#counting g and w for ADASYN check
from collections import Counter

#for SMOTE oversampling method
from imblearn.over_sampling import SMOTE

#for adasyn oversampling method
from imblearn.over_sampling import ADASYN

#for splitting data
from sklearn.model_selection import train_test_split

#import model RF
from sklearn.ensemble import RandomForestClassifier

#for grid search
from sklearn.model_selection import GridSearchCV

# grid search  accuracy scoring
from sklearn.metrics import accuracy_score

# for checking accuracy
from sklearn import metrics

#for confusion matrics
from sklearn.metrics import confusion_matrix

In [2]:
#Reading the dataframe
df = pd.read_csv("Student Performance Prediction-Binary.csv")
df = df.drop(["Total [100]","Student ID"], axis = 1)
df.head()

Unnamed: 0,Quiz01 [10],Assignment01 [8],Midterm Exam [20],Assignment02 [12],Assignment03 [25],Final Exam [35],Course Grade,Class
0,95,91,70,90,84,64,85,G
1,85,76,65,61,73,64,76,G
2,85,41,73,61,73,61,73,G
3,80,78,80,79,79,57,80,G
4,85,91,78,80,84,67,85,G


In [3]:
# Assigning the features and target variables
X = df.iloc[:,:-1]
Y = df.iloc[:,-1]

In [4]:
#remove warning
pd.options.mode.chained_assignment = None  # default='warn'

#encoding??
for i in range(len(Y)):

    if Y[i] == 'G':

        Y[i] = 1

    else:

        Y[i] = 0

Y = Y.astype('int')

# Oversampling using SMOTE

In [5]:
smote = SMOTE()
X_S,Y_S = smote.fit_resample(X, Y)
counter= Counter (Y_S)
counter

Counter({1: 465, 0: 465})

### Modelling 20% of the data

In [6]:
X_S_20 = X_S.iloc[:,:2]
X_S_20

Unnamed: 0,Quiz01 [10],Assignment01 [8]
0,95,91
1,85,76
2,85,41
3,80,78
4,85,91
...,...,...
925,59,64
926,0,36
927,83,62
928,79,53


In [7]:
# Split dataset into training set and test set
x_train, x_test, y_train, y_test = train_test_split(X_S_20,Y_S, test_size=0.2,random_state=100)

In [8]:
#random forest classifier and defining the max features (mtry) using params
classifier = RandomForestClassifier()
params = {'max_features':[i for i in range(1,3)]}

In [9]:
#first grid using percision
grid = GridSearchCV(classifier, params, cv=3, scoring= "accuracy", return_train_score=True,verbose=1)
grid_search1 = grid.fit(x_train,y_train)

Fitting 3 folds for each of 2 candidates, totalling 6 fits


In [10]:
print(grid.best_score_)
print(grid.best_params_)

0.9260752688172044
{'max_features': 1}


In [11]:
y_pred = grid.predict(x_test)
accuracy_score(y_test,y_pred)

0.9408602150537635

In [12]:
con_matrx = confusion_matrix(y_test, y_pred)
print(con_matrx)

[[84  6]
 [ 5 91]]


### Modelling 50% of the data

In [13]:
X_S_50 = X_S.iloc[:,:4]
X_S_50

Unnamed: 0,Quiz01 [10],Assignment01 [8],Midterm Exam [20],Assignment02 [12]
0,95,91,70,90
1,85,76,65,61
2,85,41,73,61
3,80,78,80,79
4,85,91,78,80
...,...,...,...,...
925,59,64,55,68
926,0,36,56,53
927,83,62,67,24
928,79,53,51,49


In [14]:
# Split dataset into training set and test set
x_train, x_test, y_train, y_test = train_test_split(X_S_50,Y_S, test_size=0.2,random_state=100)

In [15]:
#random forest classifier and defining the max features (mtry) using params
classifier = RandomForestClassifier()
params = {'max_features':[i for i in range(1,5)]}

In [16]:
grid = GridSearchCV(classifier, params, cv=3, scoring= "accuracy", return_train_score=True,verbose=1)
grid_search1 = grid.fit(x_train,y_train)

Fitting 3 folds for each of 4 candidates, totalling 12 fits


In [17]:
print(grid.best_score_)
print(grid.best_params_)

0.989247311827957
{'max_features': 1}


In [18]:
y_pred = grid.predict(x_test)
accuracy_score(y_test,y_pred)

0.9731182795698925

In [19]:
con_matrx = confusion_matrix(y_test, y_pred)
print(con_matrx)

[[88  2]
 [ 3 93]]


# Oversampling using ADASYN

In [20]:
adasyn= ADASYN()
X_A,Y_A= adasyn.fit_resample(X,Y)
counter= Counter (Y_A)
counter

Counter({1: 465, 0: 462})

### Modelling 20% of the data

In [21]:
X_A_20 = X_A.iloc[:,:2]
X_A_20

Unnamed: 0,Quiz01 [10],Assignment01 [8]
0,95,91
1,85,76
2,85,41
3,80,78
4,85,91
...,...,...
922,81,44
923,70,58
924,70,59
925,83,62


In [22]:
# Split dataset into training set and test set
x_train, x_test, y_train, y_test = train_test_split(X_A_20,Y_A, test_size=0.2,random_state=100)

In [23]:
#random forest classifier and defining the max features (mtry) using params
classifier = RandomForestClassifier()
params = {'max_features':[i for i in range(1,3)]}

In [24]:
grid = GridSearchCV(classifier, params, cv=3, scoring= "accuracy", return_train_score=True,verbose=1)
grid_search1 = grid.fit(x_train,y_train)

Fitting 3 folds for each of 2 candidates, totalling 6 fits


In [25]:
print(grid.best_score_)
print(grid.best_params_)

0.9284750337381916
{'max_features': 1}


In [26]:
y_pred = grid.predict(x_test)
accuracy_score(y_test,y_pred)

0.9139784946236559

In [27]:
con_matrx = confusion_matrix(y_test, y_pred)
print(con_matrx)

[[85 11]
 [ 5 85]]


### Modelling 50% of the data

In [28]:
X_A_50 = X_S.iloc[:,:4]
X_S_50

Unnamed: 0,Quiz01 [10],Assignment01 [8],Midterm Exam [20],Assignment02 [12]
0,95,91,70,90
1,85,76,65,61
2,85,41,73,61
3,80,78,80,79
4,85,91,78,80
...,...,...,...,...
925,59,64,55,68
926,0,36,56,53
927,83,62,67,24
928,79,53,51,49


In [29]:
# Split dataset into training set and test set
x_train, x_test, y_train, y_test = train_test_split(X_S_50,Y_S, test_size=0.2,random_state=100)

In [30]:
#random forest classifier and defining the max features (mtry) using params
classifier = RandomForestClassifier()
params = {'max_features':[i for i in range(1,5)] }

In [31]:
grid = GridSearchCV(classifier, params, cv=3, scoring= "accuracy", return_train_score=True,verbose=1)
grid_search1 = grid.fit(x_train,y_train)

Fitting 3 folds for each of 4 candidates, totalling 12 fits


In [32]:
print(grid.best_score_)
print(grid.best_params_)

0.9879032258064516
{'max_features': 1}


In [33]:
y_pred = grid.predict(x_test)
accuracy_score(y_test,y_pred)

0.967741935483871

In [34]:
con_matrx = confusion_matrix(y_test, y_pred)
print(con_matrx)

[[88  2]
 [ 4 92]]
