In [34]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [35]:
# Step 1: Load the semicolon-delimited dataset and give every column a clean name
DATA_DIR = "/content/drive/MyDrive/INST414/Module 6 Assignment"

# The file uses ";" as its field separator, so let the CSV parser split the fields
# directly. Reading it with the default "," separator and re-splitting each line by
# hand fails (or truncates rows) as soon as any value contains a comma.
# dtype=str and keep_default_na=False keep every cell as the exact text found in the
# file; numeric conversion is done in a later step.
raw_df = pd.read_csv(
    f"{DATA_DIR}/data.csv",
    sep=";",
    header=None,
    dtype=str,
    keep_default_na=False,
)

# The first row holds the column names; the remaining rows are the records.
header = pd.Index(raw_df.iloc[0].tolist())
df_split = raw_df.iloc[1:].reset_index(drop=True)

df_split.columns = (
    header
    .str.replace(r"[^\w\s/]", "", regex=True)  # remove non-alphanumeric characters EXCEPT slashes
    .str.replace(r"\s+", " ", regex=True)  # normalize runs of whitespace (incl. tabs) to one space
    .str.strip()  # remove leading/trailing whitespace
)

# Drop background info (lower importance) columns
columns_to_drop = [
    'Application mode',
    'Application order',
    'Course',
]
df_split = df_split.drop(columns=columns_to_drop)

# Persist the cleaned table so it can be inspected / reloaded later
df_split.to_csv(f"{DATA_DIR}/formatted_dataset.csv", index=False)

# Work on a copy so later in-place edits to `df` never touch the cleaned frame
df = df_split.copy()
df.head()

Unnamed: 0,Marital status,Daytime/evening attendance,Previous qualification,Previous qualification grade,Nacionality,Mothers qualification,Fathers qualification,Mothers occupation,Fathers occupation,Admission grade,...,Curricular units 2nd sem credited,Curricular units 2nd sem enrolled,Curricular units 2nd sem evaluations,Curricular units 2nd sem approved,Curricular units 2nd sem grade,Curricular units 2nd sem without evaluations,Unemployment rate,Inflation rate,GDP,Target
0,1,1,1,122.0,1,19,12,5,9,127.3,...,0,0,0,0,0.0,0,10.8,1.4,1.74,Dropout
1,1,1,1,160.0,1,1,3,3,3,142.5,...,0,6,6,6,13.666666666666666,0,13.9,-0.3,0.79,Graduate
2,1,1,1,122.0,1,37,37,9,9,124.8,...,0,6,0,0,0.0,0,10.8,1.4,1.74,Dropout
3,1,1,1,122.0,1,38,37,5,3,119.6,...,0,6,10,5,12.4,0,9.4,-0.8,-3.12,Graduate
4,2,0,1,100.0,1,37,38,9,9,141.5,...,0,6,6,6,13.0,0,13.9,-0.3,0.79,Graduate


In [36]:
# Step 2: Convert all columns that should be numeric
# This ensures that numeric-looking strings (like "3.5") are treated as actual numbers.
def _to_numeric_if_possible(column):
    """Return `column` converted by pd.to_numeric, or unchanged if it cannot be parsed.

    Replaces the deprecated `pd.to_numeric(..., errors='ignore')`: a column that
    contains any non-numeric value (e.g. the text labels in 'Target') is returned
    as-is instead of raising.
    """
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column


df = df.apply(_to_numeric_if_possible)

  df = df.apply(pd.to_numeric, errors='ignore')


In [37]:
# Step 3: Turn every remaining text (object-dtype) feature column into integer codes.
# The target column is skipped here; it is encoded separately afterwards.
# Each fitted encoder is stored under its column name so codes can be decoded later.
label_encoders = {}
object_columns = df.select_dtypes(include=['object']).columns
for col in object_columns:
    if col == 'Target':
        continue  # leave the label column untouched for now
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

In [38]:
# Step 4: Replace the text labels in 'Target' with integer codes.
# `le_target` is kept so predictions can be mapped back to their original labels.
le_target = LabelEncoder()
df['Target'] = le_target.fit_transform(df['Target'])

# Lookup table from each original label to the integer code assigned to it
label_mapping = dict(
    zip(
        le_target.classes_,
        le_target.transform(le_target.classes_),
    )
)

In [39]:
# Step 5: Separate the label column from the feature columns
y = df['Target']               # what the model will learn to predict
X = df.drop('Target', axis=1)  # every remaining column is used as a feature

In [40]:
# Step 6: Hold out 30% of the rows for testing and train on the remaining 70%.
# A fixed random_state makes the shuffle (and therefore the split) reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [41]:
# Step 7: Fit a random forest classifier on the training split.
# random_state is fixed so repeated runs build the same forest.
forest_options = {"random_state": 42}
model = RandomForestClassifier(**forest_options)
model.fit(X_train, y_train)

In [42]:
# Step 8: Score the trained model on the held-out test split
y_pred = model.predict(X_test)

# Overall fraction of test rows predicted correctly
accuracy = accuracy_score(y_test, y_pred)
# Counts of true-vs-predicted class combinations
conf_matrix = confusion_matrix(y_test, y_pred)
# Per-class precision, recall and f1-score, labelled with the original class names
class_report = classification_report(y_test, y_pred, target_names=le_target.classes_)

for heading, result in (
    ("Model Accuracy:", accuracy),
    ("\nConfusion Matrix:\n", conf_matrix),
    ("\nClassification Report:\n", class_report),
):
    print(heading, result)

Model Accuracy: 0.7710843373493976

Confusion Matrix:
 [[350  24  67]
 [ 65  78 102]
 [ 17  29 596]]

Classification Report:
               precision    recall  f1-score   support

     Dropout       0.81      0.79      0.80       441
    Enrolled       0.60      0.32      0.41       245
    Graduate       0.78      0.93      0.85       642

    accuracy                           0.77      1328
   macro avg       0.73      0.68      0.69      1328
weighted avg       0.76      0.77      0.75      1328



In [43]:
# Step 9: Inspect (up to) the first 5 test rows the model got wrong
# Positions within the test split where the prediction differs from the true label
wrong_indices = np.where(y_test.to_numpy() != y_pred)[0]

# One dict per misclassified sample; turned into a DataFrame / CSV at the end
misclassified_data = []

print("\n--- 5 Misclassified Samples ---")
for i, idx in enumerate(wrong_indices[:5]):
    # `idx` is positional, so use .iloc on the pandas objects
    input_features = X_test.iloc[idx].to_dict()
    # Map the integer codes back to the original label text
    true_label = le_target.inverse_transform([y_test.iloc[idx]])[0]
    predicted_label = le_target.inverse_transform([y_pred[idx]])[0]

    print(f"\nSample {i+1}:")
    print("Input features:", input_features)
    print("True label:", true_label)
    print("Predicted label:", predicted_label)

    # Summary fields first, followed by every feature value of the sample
    row = {
        "Sample": i + 1,
        "True Label": true_label,
        "Predicted Label": predicted_label,
        **input_features,
    }
    misclassified_data.append(row)

# Build the table in one go and write it next to the notebook
df_misclassified = pd.DataFrame(misclassified_data)
df_misclassified.to_csv("misclassified_samples.csv", index=False)

print("\nSaved misclassified samples to 'misclassified_samples.csv'")


--- 5 Misclassified Samples ---

Sample 1:
Input features: {'Marital status': 1.0, 'Daytime/evening attendance': 0.0, 'Previous qualification': 15.0, 'Previous qualification grade': 170.0, 'Nacionality': 1.0, 'Mothers qualification': 34.0, 'Fathers qualification': 1.0, 'Mothers occupation': 90.0, 'Fathers occupation': 2.0, 'Admission grade': 101.0, 'Displaced': 0.0, 'Educational special needs': 0.0, 'Debtor': 0.0, 'Tuition fees up to date': 1.0, 'Gender': 0.0, 'Scholarship holder': 0.0, 'Age at enrollment': 32.0, 'International': 0.0, 'Curricular units 1st sem credited': 0.0, 'Curricular units 1st sem enrolled': 6.0, 'Curricular units 1st sem evaluations': 9.0, 'Curricular units 1st sem approved': 4.0, 'Curricular units 1st sem grade': 12.0, 'Curricular units 1st sem without evaluations': 0.0, 'Curricular units 2nd sem credited': 0.0, 'Curricular units 2nd sem enrolled': 6.0, 'Curricular units 2nd sem evaluations': 12.0, 'Curricular units 2nd sem approved': 4.0, 'Curricular units 2nd 

In [44]:
# Validation of output: reload the cleaned CSV and count the rows per target label.
# NOTE: this rebinds `df` to the freshly loaded (un-encoded) data.
formatted_csv = '/content/drive/MyDrive/INST414/Module 6 Assignment/formatted_dataset.csv'
df = pd.read_csv(formatted_csv)
label_counts = df['Target'].value_counts()
print(label_counts)

Target
Graduate    2209
Dropout     1421
Enrolled     794
Name: count, dtype: int64
