In [5]:
import io

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, multilabel_confusion_matrix
from sklearn.metrics import mean_squared_error, accuracy_score, precision_score, recall_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PolynomialFeatures

In [7]:
# Read the raw records from the CSV expected in the notebook's working directory,
# then preview the first five rows as the cell's rich output.
df = pd.read_csv(filepath_or_buffer="dataset.csv")
df.head(n=5)

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Nacionality,Mother's qualification,Father's qualification,Mother's occupation,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,1,8,5,2,1,1,1,13,10,6,...,0,0,0,0,0.0,0,10.8,1.4,1.74,Dropout
1,1,6,1,11,1,1,1,1,3,4,...,0,6,6,6,13.666667,0,13.9,-0.3,0.79,Graduate
2,1,1,5,5,1,1,1,22,27,10,...,0,6,0,0,0.0,0,10.8,1.4,1.74,Dropout
3,1,8,2,15,1,1,1,23,27,6,...,0,6,10,5,12.4,0,9.4,-0.8,-3.12,Graduate
4,2,12,1,3,0,1,1,22,28,10,...,0,6,6,6,13.0,0,13.9,-0.3,0.79,Graduate


In [8]:
# Report the frame's shape as (rows, columns). The label is printed first and
# the bare last expression is rendered as the cell's output.
print("Dimension of the dataset:")
df.shape

Demension of the dataset:


(4424, 35)

# Data Description
(TODO: add a brief explanation of each column. We could paraphrase the dataset description on Kaggle, or omit this section, since it does not seem essential to our project.)




|column name | Description |
|--------------------|-------------------|
| | |

# **Data Preprocessing**

In [9]:
# Drop exact duplicate rows. Rebinding `df` (instead of inplace=True) keeps the
# step chainable and avoids mutating an object that an earlier cell displayed.
df = df.drop_duplicates()

In [10]:
# List the distinct values of the label column before encoding it.
target_labels = df['Target'].unique()
print("Target : ")
print(target_labels)

Target : 
['Dropout' 'Graduate' 'Enrolled']


As we can see, "Target" takes three distinct values. We encode them as integers: Dropout = 0, Enrolled = 1, Graduate = 2.

In [11]:
# Encode the string labels in `Target` as integer class ids.
class_mapping = {'Dropout': 0, 'Enrolled': 1, 'Graduate': 2}

# Series.map(dict) turns every value that is not a key into NaN, so re-running
# this cell on an already-encoded column would erase all labels. Looking each
# value up with a pass-through default keeps the cell idempotent.
df['Target'] = df['Target'].map(lambda label: class_mapping.get(label, label))

# Fail loudly if anything other than the three known class ids is left over
# (e.g. an unexpected label or a missing value in the source file).
unexpected_labels = set(df['Target'].unique()) - set(class_mapping.values())
assert not unexpected_labels, f"Unmapped Target labels: {unexpected_labels}"

df.head()

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Nacionality,Mother's qualification,Father's qualification,Mother's occupation,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,1,8,5,2,1,1,1,13,10,6,...,0,0,0,0,0.0,0,10.8,1.4,1.74,0
1,1,6,1,11,1,1,1,1,3,4,...,0,6,6,6,13.666667,0,13.9,-0.3,0.79,2
2,1,1,5,5,1,1,1,22,27,10,...,0,6,0,0,0.0,0,10.8,1.4,1.74,0
3,1,8,2,15,1,1,1,23,27,6,...,0,6,10,5,12.4,0,9.4,-0.8,-3.12,2
4,2,12,1,3,0,1,1,22,28,10,...,0,6,6,6,13.0,0,13.9,-0.3,0.79,2


**Preprocessing one-hot encoding**

According to the data description, the following are numerical variables: Application order, Age at enrollment, Curricular units 1st sem (credited), Curricular units 1st sem (enrolled), Curricular units 1st sem (evaluations), Curricular units 1st sem (approved), Curricular units 1st sem (grade), Curricular units 1st sem (without evaluations), Curricular units 2nd sem (credited), Curricular units 2nd sem (enrolled), Curricular units 2nd sem (evaluations), Curricular units 2nd sem (approved), Curricular units 2nd sem (grade), Curricular units 2nd sem (without evaluations), Unemployment rate, Inflation rate, and GDP.

Every remaining feature column is a categorical variable.

The models below need a numerical representation of the categorical variables, and one-hot encoding provides one: each category becomes its own 0/1 indicator column.

Thus, we need to one-hot encode those categorical variables.


In [12]:
# Columns treated as categorical (everything the prose above does not list as
# numerical). This list is reused later as the set of columns to one-hot encode.
cats = [
    'Marital status',
    'Application mode',
    'Course',
    'Daytime/evening attendance',
    'Previous qualification',
    'Nacionality',
    "Mother's qualification",
    "Father's qualification",
    "Mother's occupation",
    "Father's occupation",
    'Displaced',
    'Educational special needs',
    'Debtor',
    'Tuition fees up to date',
    'Gender',
    'Scholarship holder',
    'International',
]

# Show the distinct codes present in each categorical column, one array per column.
print("Unique elements:")
for column_name in cats:
    print(df[column_name].unique())


Unique elements:
[1 2 4 3 5 6]
[ 8  6  1 12  9 17 15 16 14  4 13  7  3  2  5 18 10 11]
[ 2 11  5 15  3 17 12 10 14 16  6  8 13  9  4  1  7]
[1 0]
[ 1 12 16 14  8  3 15  2  4  9 17 11  6  7 13  5 10]
[ 1 15  3 14 12 18  5 11  8 17  4  9 13 16 10 21  2 20 19  6  7]
[13  1 22 23  3  4 27  2 19 10 25  7  5 24  9 26 18 11 20 21  6  8 17 28
 12 14 16 15 29]
[10  3 27 28  1 14  5  4 24  2 29  9  7 26 18 30 12 15 25 31 16 11 20 33
 13 32  8  6 21 17 34 23 19 22]
[ 6  4 10  8  5  2 16  1  7  3 12  9 20 28 13 29 23 32 30 18 24 19 11 21
 15 27 31 14 22 17 26 25]
[10  4  8 11  6  9  5  2  3 22  7  1 12 39 19 13 29 46 43 34 44 30 41 24
 23 45 35 26 28 36 16 37 31 42 20 15 40 25 21 17 32 38 27 18 14 33]
[1 0]
[0 1]
[0 1]
[1 0]
[1 0]
[0 1]
[0 1]


In [13]:
# Print the names of the columns that contain at least one missing value,
# then replace any missing entries with 0.
columns_with_nan = df.columns[df.isna().any()]
print(columns_with_nan)
df = df.fillna(value=0)

Index([], dtype='object')


In [14]:
# Sanity check after cleaning: shape as plain text, first five rows as a rich table.
frame_shape = df.shape
print(frame_shape)
display(df.head(n=5))

(4424, 35)


Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Nacionality,Mother's qualification,Father's qualification,Mother's occupation,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,1,8,5,2,1,1,1,13,10,6,...,0,0,0,0,0.0,0,10.8,1.4,1.74,0
1,1,6,1,11,1,1,1,1,3,4,...,0,6,6,6,13.666667,0,13.9,-0.3,0.79,2
2,1,1,5,5,1,1,1,22,27,10,...,0,6,0,0,0.0,0,10.8,1.4,1.74,0
3,1,8,2,15,1,1,1,23,27,6,...,0,6,10,5,12.4,0,9.4,-0.8,-3.12,2
4,2,12,1,3,0,1,1,22,28,10,...,0,6,6,6,13.0,0,13.9,-0.3,0.79,2


In [15]:
# One-hot encode the columns listed in `cats`; all other columns are carried over as-is.
# The encoder is handed a copy, so `df` stays in its pre-encoding form.
df_ohe = pd.get_dummies(df.copy(), columns=cats)
df_ohe

Unnamed: 0,Application order,Age at enrollment,Curricular units 1st sem (credited),Curricular units 1st sem (enrolled),Curricular units 1st sem (evaluations),Curricular units 1st sem (approved),Curricular units 1st sem (grade),Curricular units 1st sem (without evaluations),Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),...,Debtor_0,Debtor_1,Tuition fees up to date_0,Tuition fees up to date_1,Gender_0,Gender_1,Scholarship holder_0,Scholarship holder_1,International_0,International_1
0,5,20,0,0,0,0,0.000000,0,0,0,...,1,0,0,1,0,1,1,0,1,0
1,1,19,0,6,6,6,14.000000,0,0,6,...,1,0,1,0,0,1,1,0,1,0
2,5,19,0,6,0,0,0.000000,0,0,6,...,1,0,1,0,0,1,1,0,1,0
3,2,20,0,6,8,6,13.428571,0,0,6,...,1,0,0,1,1,0,1,0,1,0
4,1,45,0,6,9,5,12.333333,0,0,6,...,1,0,0,1,1,0,1,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4419,6,19,0,6,7,5,13.600000,0,0,6,...,1,0,0,1,0,1,1,0,1,0
4420,2,18,0,6,6,6,12.000000,0,0,6,...,0,1,1,0,1,0,1,0,0,1
4421,1,30,0,7,8,7,14.912500,0,0,8,...,1,0,0,1,1,0,0,1,1,0
4422,1,20,0,5,5,5,13.800000,0,0,5,...,1,0,0,1,1,0,0,1,1,0


In [16]:
# Summary statistics for every column of the pre-encoding frame.
feature_summary = df.describe(include='all')
print("Features of the dataset :")
print(feature_summary)

Features of the dataset :
       Marital status  Application mode  Application order       Course  \
count     4424.000000       4424.000000        4424.000000  4424.000000   
mean         1.178571          6.886980           1.727848     9.899186   
std          0.605747          5.298964           1.313793     4.331792   
min          1.000000          1.000000           0.000000     1.000000   
25%          1.000000          1.000000           1.000000     6.000000   
50%          1.000000          8.000000           1.000000    10.000000   
75%          1.000000         12.000000           2.000000    13.000000   
max          6.000000         18.000000           9.000000    17.000000   

       Daytime/evening attendance  Previous qualification  Nacionality  \
count                 4424.000000             4424.000000  4424.000000   
mean                     0.890823                2.531420     1.254521   
std                      0.311897                3.963707     1.748447   
mi

# **Exploratory Data Analysis**

In [None]:
# to do: Leo

# **Polynomial Regression**

In [None]:
# to do: Arya

# **K-Fold Cross Validation**

In [None]:
# to do: 

# **Logistic Regression**

In [18]:
# Hold out 20% of the one-hot-encoded rows for testing (80/20 split).
# The fixed random_state makes the partition reproducible across runs.
data = df_ohe.copy()
train, test = train_test_split(data, random_state=48, test_size=0.2)

# Separate the label column from the feature columns in each partition.
y_train, y_test = train['Target'], test['Target']
X_train = train.drop(columns=['Target'])
X_test = test.drop(columns=['Target'])


In [19]:
# Train a logistic-regression baseline on the training partition and print the
# per-class precision / recall / F1 on the held-out partition.
# LogisticRegression and classification_report are provided by the notebook's
# import cell, so nothing is imported here.
# max_iter is set far above scikit-learn's default of 100; presumably this is
# needed for the solver to converge on these unscaled features (TODO confirm).
lr_model = LogisticRegression(max_iter=10000)
lr_model.fit(X_train, y_train)
print(classification_report(y_test, lr_model.predict(X_test)))

              precision    recall  f1-score   support

           0       0.79      0.81      0.80       283
           1       0.64      0.31      0.42       161
           2       0.78      0.91      0.84       441

    accuracy                           0.77       885
   macro avg       0.74      0.68      0.69       885
weighted avg       0.76      0.77      0.75       885



In [20]:
# Predict the held-out rows and report the fraction classified correctly.
y_pred = lr_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: ", test_accuracy)

Accuracy:  0.7694915254237288


# Feed Forward Neural Network (FNN)

In [21]:
#to do: Srinivatsan