### Inspect Data

In [10]:
import pandas as pd
import numpy as np

dataset = pd.read_csv("data.csv", sep=';')

print("Target : ")
print(dataset['Target'].unique())

print("Dimensions of the dataset : ", dataset.shape)

print('\nNumber of samples for each flower species:')
print(dataset["Target"].value_counts())

Target : 
['Dropout' 'Graduate' 'Enrolled']
Dimensions of the dataset :  (4424, 37)

Number of samples for each flower species:
Target
Graduate    2209
Dropout     1421
Enrolled     794
Name: count, dtype: int64


In [3]:
# check if there is any missing value
missing_values = dataset.isnull().sum()
if missing_values.any():
    print("\nThere are missing values in the dataset.")
else:
    print("\nNo missing values found in the dataset.")


No missing values found in the dataset.


In [12]:
dataset.columns

Index(['Marital status', 'Application mode', 'Application order', 'Course',
       'Daytime/evening attendance\t', 'Previous qualification',
       'Previous qualification (grade)', 'Nacionality',
       'Mother's qualification', 'Father's qualification',
       'Mother's occupation', 'Father's occupation', 'Admission grade',
       'Displaced', 'Educational special needs', 'Debtor',
       'Tuition fees up to date', 'Gender', 'Scholarship holder',
       'Age at enrollment', 'International',
       'Curricular units 1st sem (credited)',
       'Curricular units 1st sem (enrolled)',
       'Curricular units 1st sem (evaluations)',
       'Curricular units 1st sem (approved)',
       'Curricular units 1st sem (grade)',
       'Curricular units 1st sem (without evaluations)',
       'Curricular units 2nd sem (credited)',
       'Curricular units 2nd sem (enrolled)',
       'Curricular units 2nd sem (evaluations)',
       'Curricular units 2nd sem (approved)',
       'Curricular units 2nd

Split data to training set and testing set

In [33]:
from sklearn.model_selection import train_test_split
X = dataset.drop('Target', axis = 1)
y = dataset['Target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

# Try Different Models

## Decision Tree Classifier

In [37]:
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
from sklearn.tree import DecisionTreeClassifier 
from sklearn import tree

clf = DecisionTreeClassifier(random_state=1234)
dtree_model = clf.fit(X_train, y_train)
dtree_predictions = clf.predict(X_test)
  
cm = confusion_matrix(y_test, dtree_predictions)
print(cm)

true_positives = np.diag(cm).sum()
total = cm.sum()

# Calculate accuracy
accuracy = true_positives / total

print("Accuracy:", accuracy)



[[104  20  21]
 [ 16  25  28]
 [ 12  29 188]]
Accuracy: 0.7155756207674944


## Neural Network

In [99]:
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder

X = dataset.drop('Target', axis = 1)
y = dataset['Target']

# normalize data
scaler = MinMaxScaler(feature_range=(0, 1))
X_rescaled = scaler.fit_transform(X)
X = pd.DataFrame(data = X_rescaled, columns = X.columns)

categories = [['Enrolled', 'Graduate', 'Dropout']]
encoder = OneHotEncoder(categories=categories, sparse_output=False)
y = encoder.fit_transform(y.values.reshape(-1,1))

data_train, data_test, class_train, class_test = train_test_split(X, y, test_size=0.1)
mlp = MLPClassifier(solver = 'sgd', random_state = 42, activation = 'logistic', learning_rate_init = 0.4, batch_size = 100, hidden_layer_sizes = (23, 17, 12), max_iter = 500)
mlp.fit(data_train, class_train)
pred = mlp.predict(data_test)



### Confusion Matrix

In [100]:
from sklearn.metrics import classification_report, confusion_matrix, multilabel_confusion_matrix
from sklearn.metrics import mean_squared_error, accuracy_score, precision_score, recall_score

def report_confusion_matrices(ar_class_test, ar_pred):
    mcm =  multilabel_confusion_matrix(ar_class_test, ar_pred)
    unique_labels = categories[0]
    for i, label in enumerate(unique_labels):
        print(f"Confusion Matrix for label '{label}':")
        cm = mcm[i]
        print(cm)
        tp = cm[1, 1]
        fn = cm[1, 0]
        fp = cm[0, 1]
        tn = cm[0, 0]
        
        print(f"True Positive (TP)  : {tp}")
        print(f"False Negative (FN) : {fn}")
        print(f"False Positive (FP) : {fp}")
        print(f"True Negative (TN)  : {tn}")
        print()
    
print("Accuracy : ", accuracy_score(class_test, pred))
print("Mean Square Error : ", mean_squared_error(class_test, pred))

report_confusion_matrices(class_test, pred)

print("Classification Report : ")
print(classification_report(class_test, pred))

Accuracy :  0.7200902934537246
Mean Square Error :  0.16930022573363432
Confusion Matrix for label 'Enrolled':
[[336  30]
 [ 52  25]]
True Positive (TP)  : 25
False Negative (FN) : 52
False Positive (FP) : 30
True Negative (TN)  : 336

Confusion Matrix for label 'Graduate':
[[167  46]
 [ 36 194]]
True Positive (TP)  : 194
False Negative (FN) : 36
False Positive (FP) : 46
True Negative (TN)  : 167

Confusion Matrix for label 'Dropout':
[[282  25]
 [ 36 100]]
True Positive (TP)  : 100
False Negative (FN) : 36
False Positive (FP) : 25
True Negative (TN)  : 282

Classification Report : 
              precision    recall  f1-score   support

           0       0.45      0.32      0.38        77
           1       0.81      0.84      0.83       230
           2       0.80      0.74      0.77       136

   micro avg       0.76      0.72      0.74       443
   macro avg       0.69      0.63      0.66       443
weighted avg       0.74      0.72      0.73       443
 samples avg       0.72      0

  _warn_prf(average, modifier, msg_start, len(result))


#### Hyperparameter Tuning

In [82]:
from sklearn.model_selection import GridSearchCV

max_iterations = [500,800,400]
hidden_layer_siz = [(28, 18, 14), (26, 16, 12), (30, 15, 19)]
learning_rates = 0.20 * np.arange(1, 3)
param_grid = dict(learning_rate_init = learning_rates, hidden_layer_sizes = hidden_layer_siz, max_iter = max_iterations)
grid = GridSearchCV(estimator = mlp, param_grid = param_grid)
grid.fit(X,y)



In [83]:
print("Optimal Hyper-parameters : ", grid.best_params_)
print("Optimal Accuracy : ", grid.best_score_)
best_pred = grid.predict(data_test)
print('Optimal MSE:', mean_squared_error(class_test, best_pred))

Optimal Hyper-parameters :  {'hidden_layer_sizes': (28, 18, 14), 'learning_rate_init': 0.4, 'max_iter': 500}
Optimal Accuracy :  0.7136071273359409
Optimal MSE: 0.07975921745673438


## SVM

In [103]:
from sklearn.svm import SVC
svm_model = SVC(kernel='linear')

svm_model.fit(X_train, y_train)

y_pred = svm_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 79.91%


## Naive Bayes

In [106]:
from sklearn.naive_bayes import GaussianNB

model = GaussianNB()

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

Accuracy: 67.95%


## Random Forest Classifier

In [107]:
from sklearn.ensemble import RandomForestClassifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

rf_classifier.fit(X_train, y_train)

y_pred = rf_classifier.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.7923250564334086
