### Inspect Data

In [16]:
import pandas as pd
import numpy as np

dataset = pd.read_csv("data.csv", sep=';')

print("Target : ")
print(dataset['Target'].unique())

print("Dimensions of the dataset : ", dataset.shape)

print('\nNumber of samples for each flower species:')
print(dataset["Target"].value_counts())

Target : 
['Dropout' 'Graduate' 'Enrolled']
Dimensions of the dataset :  (4424, 37)

Number of samples for each flower species:
Target
Graduate    2209
Dropout     1421
Enrolled     794
Name: count, dtype: int64


In [17]:
# check if there is any missing value
missing_values = dataset.isnull().sum()
if missing_values.any():
    print("\nThere are missing values in the dataset.")
else:
    print("\nNo missing values found in the dataset.")


No missing values found in the dataset.


In [18]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
predict_students_dropout_and_academic_success = fetch_ucirepo(id=697) 
  
# data (as pandas dataframes) 
X_2 = predict_students_dropout_and_academic_success.data.features 
y_2 = predict_students_dropout_and_academic_success.data.targets 
  
# metadata 
print(predict_students_dropout_and_academic_success.metadata) 
  
# variable information 
display(predict_students_dropout_and_academic_success.variables) 

{'uci_id': 697, 'name': "Predict Students' Dropout and Academic Success", 'repository_url': 'https://archive.ics.uci.edu/dataset/697/predict+students+dropout+and+academic+success', 'data_url': 'https://archive.ics.uci.edu/static/public/697/data.csv', 'abstract': "A dataset created from a higher education institution (acquired from several disjoint databases) related to students enrolled in different undergraduate degrees, such as agronomy, design, education, nursing, journalism, management, social service, and technologies.\nThe dataset includes information known at the time of student enrollment (academic path, demographics, and social-economic factors) and the students' academic performance at the end of the first and second semesters. \nThe data is used to build classification models to predict students' dropout and academic sucess. The problem is formulated as a three category classification task, in which there is a strong imbalance towards one of the classes.", 'area': 'Social Sc

Unnamed: 0,name,role,type,demographic,description,units,missing_values
0,Marital Status,Feature,Integer,Marital Status,1 – single 2 – married 3 – widower 4 – divorce...,,no
1,Application mode,Feature,Integer,,1 - 1st phase - general contingent 2 - Ordinan...,,no
2,Application order,Feature,Integer,,Application order (between 0 - first choice; a...,,no
3,Course,Feature,Integer,,33 - Biofuel Production Technologies 171 - Ani...,,no
4,Daytime/evening attendance,Feature,Integer,,1 – daytime 0 - evening,,no
5,Previous qualification,Feature,Integer,Education Level,1 - Secondary education 2 - Higher education -...,,no
6,Previous qualification (grade),Feature,Continuous,,Grade of previous qualification (between 0 and...,,no
7,Nacionality,Feature,Integer,Nationality,1 - Portuguese; 2 - German; 6 - Spanish; 11 - ...,,no
8,Mother's qualification,Feature,Integer,Education Level,1 - Secondary Education - 12th Year of Schooli...,,no
9,Father's qualification,Feature,Integer,Education Level,1 - Secondary Education - 12th Year of Schooli...,,no


## Split data to training set and testing set

In [19]:
from sklearn.model_selection import train_test_split
# X = dataset.drop('Target', axis = 1)
X = dataset[["Mother's occupation", "Father's occupation", "Debtor", "Scholarship holder"]]
y = dataset['Target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

# Try Different Models

## Decision Tree Classifier

In [20]:
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
from sklearn.tree import DecisionTreeClassifier 
from sklearn import tree

clf = DecisionTreeClassifier(random_state=1234)
dtree_model = clf.fit(X_train, y_train)
dtree_predictions = clf.predict(X_test)
  
cm = confusion_matrix(y_test, dtree_predictions)
print(cm)

true_positives = np.diag(cm).sum()
total = cm.sum()

# Calculate accuracy
accuracy = true_positives / total

print("Accuracy:", accuracy)



[[ 37   8  89]
 [ 11  10  59]
 [ 21   4 204]]
Accuracy: 0.5665914221218962


## Neural Network

In [21]:
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder

X = dataset.drop('Target', axis = 1)
y = dataset['Target']

# normalize data
scaler = MinMaxScaler(feature_range=(0, 1))
X_rescaled = scaler.fit_transform(X)
X = pd.DataFrame(data = X_rescaled, columns = X.columns)

categories = [['Enrolled', 'Graduate', 'Dropout']]
encoder = OneHotEncoder(categories=categories, sparse_output=False)
y = encoder.fit_transform(y.values.reshape(-1,1))

data_train, data_test, class_train, class_test = train_test_split(X, y, test_size=0.1)
mlp = MLPClassifier(solver = 'sgd', random_state = 42, activation = 'logistic', learning_rate_init = 0.4, batch_size = 100, hidden_layer_sizes = (23, 17, 12), max_iter = 500)
mlp.fit(data_train, class_train)
pred = mlp.predict(data_test)



### Confusion Matrix

In [22]:
from sklearn.metrics import classification_report, confusion_matrix, multilabel_confusion_matrix
from sklearn.metrics import mean_squared_error, accuracy_score, precision_score, recall_score

def report_confusion_matrices(ar_class_test, ar_pred):
    mcm =  multilabel_confusion_matrix(ar_class_test, ar_pred)
    unique_labels = categories[0]
    for i, label in enumerate(unique_labels):
        print(f"Confusion Matrix for label '{label}':")
        cm = mcm[i]
        print(cm)
        tp = cm[1, 1]
        fn = cm[1, 0]
        fp = cm[0, 1]
        tn = cm[0, 0]
        
        print(f"True Positive (TP)  : {tp}")
        print(f"False Negative (FN) : {fn}")
        print(f"False Positive (FP) : {fp}")
        print(f"True Negative (TN)  : {tn}")
        print()
    
print("Accuracy : ", accuracy_score(class_test, pred))
print("Mean Square Error : ", mean_squared_error(class_test, pred))

report_confusion_matrices(class_test, pred)

print("Classification Report : ")
print(classification_report(class_test, pred))

Accuracy :  0.6975169300225733
Mean Square Error :  0.17531978931527462
Confusion Matrix for label 'Enrolled':
[[325  37]
 [ 51  30]]
True Positive (TP)  : 30
False Negative (FN) : 51
False Positive (FP) : 37
True Negative (TN)  : 325

Confusion Matrix for label 'Graduate':
[[180  46]
 [ 37 180]]
True Positive (TP)  : 180
False Negative (FN) : 37
False Positive (FP) : 46
True Negative (TN)  : 180

Confusion Matrix for label 'Dropout':
[[278  20]
 [ 42 103]]
True Positive (TP)  : 103
False Negative (FN) : 42
False Positive (FP) : 20
True Negative (TN)  : 278

Classification Report : 
              precision    recall  f1-score   support

           0       0.45      0.37      0.41        81
           1       0.80      0.83      0.81       217
           2       0.84      0.71      0.77       145

   micro avg       0.75      0.71      0.73       443
   macro avg       0.69      0.64      0.66       443
weighted avg       0.75      0.71      0.72       443
 samples avg       0.70      0

  _warn_prf(average, modifier, msg_start, len(result))


## SVM

In [42]:
from sklearn.svm import SVC
from sklearn import preprocessing

df_svm = dataset.copy()

cats = ['Marital status', 'Application mode', 'Application order', 'Course', 'Daytime/evening attendance\t', 'Previous qualification', 'Nacionality', 'Mother\'s qualification', 'Father\'s qualification', 'Displaced', 'Educational special needs', 'Debtor', 'Tuition fees up to date', 'Gender', 'Scholarship holder', 'International']

df_svm = pd.get_dummies(df_svm, columns=cats)
# df_svm = df_svm.astype(int)

svm_train, svm_test = train_test_split(df_svm, test_size=0.2)

X_svm_train, y_svm_train = svm_train.drop(columns=['Target']), svm_train['Target']
X_svm_test, y_svm_test = svm_test.drop(columns=['Target']), svm_test['Target']

scaler = preprocessing.StandardScaler()
scaler.fit(X_svm_train)
Z_svm_train = scaler.transform(X_svm_train)
Z_svm_test = scaler.transform(X_svm_test)

svm_li = SVC(kernel='linear')
svm_li.fit(Z_svm_train, np.asarray(y_svm_train))

y_pred = svm_li.predict(Z_svm_test)
accuracy = accuracy_score(y_svm_test, y_pred)

print('Linear Kernel')
print(classification_report(y_svm_test, svm_li.predict(Z_svm_test)))

svc_rbf = SVC(kernel='rbf')
svc_rbf.fit(Z_svm_train, np.asarray(y_svm_train))

print('Rbf Kernel')
print(classification_report(y_svm_test, svc_rbf.predict(Z_svm_test)))

Linear Kernel
              precision    recall  f1-score   support

     Dropout       0.82      0.70      0.76       285
    Enrolled       0.51      0.38      0.44       151
    Graduate       0.80      0.94      0.87       449

    accuracy                           0.77       885
   macro avg       0.71      0.68      0.69       885
weighted avg       0.76      0.77      0.76       885

Rbf Kernel
              precision    recall  f1-score   support

     Dropout       0.80      0.69      0.75       285
    Enrolled       0.54      0.32      0.40       151
    Graduate       0.77      0.94      0.84       449

    accuracy                           0.75       885
   macro avg       0.70      0.65      0.67       885
weighted avg       0.74      0.75      0.74       885



## Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

model = GaussianNB()

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

Accuracy: 55.76%


## Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

rf_classifier.fit(X_train, y_train)

y_pred = rf_classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.5485327313769752


## Hyperparameter Tuning

In [None]:
from sklearn.model_selection import GridSearchCV

max_iterations = [500,800,400]
hidden_layer_siz = [(28, 18, 14), (26, 16, 12), (30, 15, 19)]
learning_rates = 0.20 * np.arange(1, 3)
param_grid = dict(learning_rate_init = learning_rates, hidden_layer_sizes = hidden_layer_siz, max_iter = max_iterations)
grid = GridSearchCV(estimator = mlp, param_grid = param_grid)
grid.fit(X,y)



In [None]:
print("Optimal Hyper-parameters : ", grid.best_params_)
print("Optimal Accuracy : ", grid.best_score_)
best_pred = grid.predict(data_test)
print('Optimal MSE:', mean_squared_error(class_test, best_pred))

Optimal Hyper-parameters :  {'hidden_layer_sizes': (28, 18, 14), 'learning_rate_init': 0.4, 'max_iter': 500}
Optimal Accuracy :  0.7136071273359409
Optimal MSE: 0.08577878103837472
