In [None]:
import numpy as np
import pandas as pd
from tqdm import tqdm

# Data Visualization
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots


# Data pre-processing
from sklearn.preprocessing import StandardScaler

# Data splitting
from sklearn.model_selection import train_test_split

# Machine learning Models
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier


# Evaluation metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import roc_curve, auc


#Installing dependencies
!pip install -U kaleido

Collecting kaleido
  Downloading kaleido-1.1.0-py3-none-any.whl.metadata (5.6 kB)
Collecting choreographer>=1.0.10 (from kaleido)
  Downloading choreographer-1.1.1-py3-none-any.whl.metadata (6.8 kB)
Collecting logistro>=1.0.8 (from kaleido)
  Downloading logistro-1.1.0-py3-none-any.whl.metadata (2.6 kB)
Collecting pytest-timeout>=2.4.0 (from kaleido)
  Downloading pytest_timeout-2.4.0-py3-none-any.whl.metadata (20 kB)
Downloading kaleido-1.1.0-py3-none-any.whl (66 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.3/66.3 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading choreographer-1.1.1-py3-none-any.whl (52 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m52.3/52.3 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading logistro-1.1.0-py3-none-any.whl (7.9 kB)
Downloading pytest_timeout-2.4.0-py3-none-any.whl (14 kB)
Installing collected packages: logistro, pytest-timeout, choreographer, kaleido
Successfully installed choreogr

In [None]:
# Read dataset from csv file
from google.colab import files
uploaded = files.upload()
data =  pd.read_csv("dataset.csv")

# Correcting typographical error for nationality
data.rename(columns = {'Nacionality':'Nationality'}, inplace = True)
data.head()

Saving dataset.csv to dataset.csv


Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Nationality,Mother's qualification,Father's qualification,Mother's occupation,...,Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
0,1,8,5,2,1,1,1,13,10,6,...,0,0,0,0,0.0,0,10.8,1.4,1.74,Dropout
1,1,6,1,11,1,1,1,1,3,4,...,0,6,6,6,13.666667,0,13.9,-0.3,0.79,Graduate
2,1,1,5,5,1,1,1,22,27,10,...,0,6,0,0,0.0,0,10.8,1.4,1.74,Dropout
3,1,8,2,15,1,1,1,23,27,6,...,0,6,10,5,12.4,0,9.4,-0.8,-3.12,Graduate
4,2,12,1,3,0,1,1,22,28,10,...,0,6,6,6,13.0,0,13.9,-0.3,0.79,Graduate


In [None]:
data.shape

(4424, 35)

In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4424 entries, 0 to 4423
Data columns (total 35 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   Marital status                                  4424 non-null   int64  
 1   Application mode                                4424 non-null   int64  
 2   Application order                               4424 non-null   int64  
 3   Course                                          4424 non-null   int64  
 4   Daytime/evening attendance                      4424 non-null   int64  
 5   Previous qualification                          4424 non-null   int64  
 6   Nationality                                     4424 non-null   int64  
 7   Mother's qualification                          4424 non-null   int64  
 8   Father's qualification                          4424 non-null   int64  
 9   Mother's occupation                      

In [None]:
# descriptive analysis
pd.set_option('display.max_columns', None)
data.describe(include = 'all')

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Nationality,Mother's qualification,Father's qualification,Mother's occupation,Father's occupation,Displaced,Educational special needs,Debtor,Tuition fees up to date,Gender,Scholarship holder,Age at enrollment,International,Curricular units 1st sem (credited),Curricular units 1st sem (enrolled),Curricular units 1st sem (evaluations),Curricular units 1st sem (approved),Curricular units 1st sem (grade),Curricular units 1st sem (without evaluations),Curricular units 2nd sem (credited),Curricular units 2nd sem (enrolled),Curricular units 2nd sem (evaluations),Curricular units 2nd sem (approved),Curricular units 2nd sem (grade),Curricular units 2nd sem (without evaluations),Unemployment rate,Inflation rate,GDP,Target
count,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,4424.0,4424
unique,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,3
top,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Graduate
freq,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2209
mean,1.178571,6.88698,1.727848,9.899186,0.890823,2.53142,1.254521,12.322107,16.455244,7.317812,7.819168,0.548373,0.011528,0.113698,0.880651,0.351718,0.248418,23.265145,0.024864,0.709991,6.27057,8.299051,4.7066,10.640822,0.137658,0.541817,6.232143,8.063291,4.435805,10.230206,0.150316,11.566139,1.228029,0.001969,
std,0.605747,5.298964,1.313793,4.331792,0.311897,3.963707,1.748447,9.026251,11.0448,3.997828,4.856692,0.497711,0.10676,0.31748,0.324235,0.47756,0.432144,7.587816,0.155729,2.360507,2.480178,4.179106,3.094238,4.843663,0.69088,1.918546,2.195951,3.947951,3.014764,5.210808,0.753774,2.66385,1.382711,2.269935,
min,1.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,17.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7.6,-0.8,-4.06,
25%,1.0,1.0,1.0,6.0,1.0,1.0,1.0,2.0,3.0,5.0,5.0,0.0,0.0,0.0,1.0,0.0,0.0,19.0,0.0,0.0,5.0,6.0,3.0,11.0,0.0,0.0,5.0,6.0,2.0,10.75,0.0,9.4,0.3,-1.7,
50%,1.0,8.0,1.0,10.0,1.0,1.0,1.0,13.0,14.0,6.0,8.0,1.0,0.0,0.0,1.0,0.0,0.0,20.0,0.0,0.0,6.0,8.0,5.0,12.285714,0.0,0.0,6.0,8.0,5.0,12.2,0.0,11.1,1.4,0.32,
75%,1.0,12.0,2.0,13.0,1.0,1.0,1.0,22.0,27.0,10.0,10.0,1.0,0.0,0.0,1.0,1.0,0.0,25.0,0.0,0.0,7.0,10.0,6.0,13.4,0.0,0.0,7.0,10.0,6.0,13.333333,0.0,13.9,2.6,1.79,


In [None]:
#Checking for null values
data.isnull().any().sum()

np.int64(0)

In [None]:
# Create a copy for exploratory data analysis
data_viz = data.copy()

In [None]:
data['Target'].unique()

array(['Dropout', 'Graduate', 'Enrolled'], dtype=object)

In [None]:
import plotly.express as px

fig = px.histogram(
    data_viz,
    x='Age at enrollment',
    color='Target',
    opacity=0.75,
    barmode='overlay',
    width=800,
    height=500,
    color_discrete_sequence=px.colors.qualitative.G10
)

fig.update_layout(title='Age distribution of students')

# ✅ Show interactively in Colab/Jupyter
fig.show()

# ✅ Save interactive HTML (no Kaleido required)
fig.write_html("fig.html")






This means that static image generation (e.g. `fig.write_image()`) will not work.

Please upgrade Plotly to version 6.1.1 or greater, or downgrade Kaleido to version 0.2.1.




In [None]:
import plotly.express as px

# Distribution of Target feature
fig = px.pie(
    values=data_viz['Target'].value_counts(),
    names=data_viz['Target'].value_counts().index.to_list()
)

fig.update_traces(
    textposition='inside',
    textinfo='percent+label',
    marker=dict(colors=['teal', 'goldenrod', 'slateblue'])
)

fig.update_layout(
    showlegend=False,
    height=400,
    width=800,
    title='Distribution of Target'
)

# Show interactively inside Colab/Jupyter
fig.show()

# Save as interactive HTML file (no Kaleido needed)
fig.write_html("fig.html")


In [None]:
def get_dictionaries(category_list, dfcolumn_name, target_col, dictionary_list):
  '''This function returns a list of dictionaries for value count of each target label per category'''
  for each_category in category_list:
    a_dictionary = dict(data_viz[data_viz[dfcolumn_name]== each_category][target_col].value_counts())
    dictionary_list.append(a_dictionary)
  return dictionary_list

In [None]:
def make_pie(dictionary_list, trace_list, colors_list, textposition = 'inside'):
  '''This function returns a list of traces for pie charts'''
  for dictionary in dictionary_list:
    trace = go.Pie(values = list(dictionary.values()), labels = list(dictionary.keys()),
           textposition = textposition, textinfo='percent+label',
           marker=dict(colors=colors_list))
    trace_list.append(trace)
  return trace_list

In [None]:
data["Target_binary"] = data["Target"].apply(lambda x: 1 if x == "Dropout" else 0)
X = data.drop(["Target", "Target_binary"], axis=1)
y = data["Target_binary"]

In [None]:
# Change gender from numerical to catgorical
data_viz['Gender'] = data_viz['Gender'].map({1:'Male', 0:'Female'})

In [None]:
import plotly.express as px

# Calculate percentage count for gender
percent_count = round(data_viz['Gender'].value_counts(normalize=True) * 100, 1)

fig = px.bar(
    percent_count,
    orientation='h',
    text_auto=True
)

fig.update_traces(marker=dict(color=['red']))

fig.update_layout(
    height=400,
    width=700,
    title='Gender Percentage Count of Students',
    showlegend=False,
    font=dict(size=14),
    xaxis_title='%count',
    yaxis_title='Gender'
)

# ✅ Show interactively
fig.show()

# ✅ Save interactive HTML (portable, no Kaleido required)
fig.write_html("gender_percent.html")


In [None]:
# Getting dictionaries for genders
genders = data_viz['Gender'].unique()
genders_dictionaries = get_dictionaries(genders, 'Gender', 'Target', [])
genders_dictionaries

[{'Dropout': np.int64(701),
  'Graduate': np.int64(548),
  'Enrolled': np.int64(307)},
 {'Graduate': np.int64(1661),
  'Dropout': np.int64(720),
  'Enrolled': np.int64(487)}]

In [None]:
from plotly.subplots import make_subplots
import plotly.express as px
import plotly.graph_objects as go

# --- Create subplots for Target distributions ---
fig = make_subplots(
    rows=1, cols=2,
    subplot_titles=['Original Target (3 classes)', 'Binary Target (Dropout vs Not Dropout)'],
    specs=[[{'type': 'pie'}, {'type': 'pie'}]]
)

# Original Target (3-class pie chart)
fig.add_trace(
    go.Pie(
        labels=data['Target'].value_counts().index,
        values=data['Target'].value_counts().values,
        marker=dict(colors=['teal', 'goldenrod', 'slateblue']),
        textinfo='percent+label'
    ),
    row=1, col=1
)

# Binary Target (Dropout vs Not Dropout pie chart)
fig.add_trace(
    go.Pie(
        labels=data['Target_binary'].value_counts().index.map({0: "Not Dropout", 1: "Dropout"}),
        values=data['Target_binary'].value_counts().values,
        marker=dict(colors=['indianred', 'seagreen']),
        textinfo='percent+label'
    ),
    row=1, col=2
)

# Update layout
fig.update_layout(
    height=500,
    width=900,
    title='Comparison of Original vs Binary Target Distribution',
    font=dict(size=14)
)

# ✅



In [None]:
# Create binary target: Dropout = 1, others = 0
data["Target_binary"] = data["Target"].apply(lambda x: 1 if x == "Dropout" else 0)

# Features and target
X = data.drop(["Target", "Target_binary"], axis=1)
y = data["Target_binary"]

# 🔍 Show first few rows
print("First 5 rows of data with Target_binary:")
print(data[["Target", "Target_binary"]].head())

# 🔍 Check class distribution
print("\nClass distribution in Target_binary:")
print(y.value_counts())

# 🔍 Show shapes
print("\nShapes:")
print("X:", X.shape)
print("y:", y.shape)


First 5 rows of data with Target_binary:
     Target  Target_binary
0   Dropout              1
1  Graduate              0
2   Dropout              1
3  Graduate              0
4  Graduate              0

Class distribution in Target_binary:
Target_binary
0    3003
1    1421
Name: count, dtype: int64

Shapes:
X: (4424, 34)
y: (4424,)


In [None]:
import plotly.express as px

fig = px.pie(
    values=data["Target_binary"].value_counts(),
    names=data["Target_binary"].value_counts().index.map({0: "Not Dropout", 1: "Dropout"})
)

fig.update_traces(textposition='inside', textinfo='percent+label')
fig.update_layout(title="Distribution of Binary Target")

fig.show()


In [None]:
# Converting the courses back to categorical
data_viz['Course'] = data_viz['Course'].map({1: 'Biofuel Production Technologies',
 2: 'Animation and Multimedia Design', 3: 'Social Service (evening attendance)',
 4: 'Agronomy', 5: 'Communication Design', 6: 'Veterinary Nursing',
 7: 'Informatics Engineering', 8: 'Equiniculture', 9: 'Management',
 10: 'Social Service', 11: 'Tourism', 12: 'Nursing', 13: 'Oral Hygiene',
 14: 'Advertising and Marketing Management', 15: 'Journalism and Communication',
 16: 'Basic Education', 17: 'Management (evening attendance)'})

In [None]:
# Bar chart for courses
percent_count = data_viz['Course'].value_counts()

fig = px.bar(
    percent_count,
    orientation='h',
    text_auto=True
)

# Customize colors
fig.update_traces(marker=dict(color=['red','orangered','darkorange','orange','yellow']))

# Layout settings
fig.update_layout(
    height=500,
    width=1000,
    title='Courses enrolled by students',
    showlegend=False,
    font=dict(size=14),
    xaxis_title='Total',
    yaxis_title='Courses'
)

# ✅ Show interactively (works in Colab/Jupyter)
fig.show()

# ✅ Save as interactive HTML (no Kaleido needed)
fig.write_html("courses_enrolled.html")


In [None]:
import pandas as pd
import plotly.express as px

# 1) Compute counts and percents explicitly
df_counts = (
    data_viz.groupby(['Course', 'Target'])
    .size()
    .reset_index(name='count')
)
df_counts['total_course'] = df_counts.groupby('Course')['count'].transform('sum')
df_counts['percent'] = df_counts['count'] / df_counts['total_course'] * 100

# Quick sanity checks
print("Sample counts:\n", df_counts.head())
print("\nCourses that have only one Target (will show 100%):")
single_target_courses = df_counts[df_counts['count'] == df_counts['total_course']]['Course'].unique()
print(single_target_courses)

# ----- Option A: Stacked 100% per course (segments sum to 100% for each course) -----
fig_stack = px.bar(
    df_counts,
    x='percent',
    y='Course',
    color='Target',
    orientation='h',
    color_discrete_sequence=px.colors.qualitative.Dark2,
    text=df_counts['percent'].round(1).astype(str) + '%'   # show percent inside
)
fig_stack.update_layout(
    barmode='stack',
    title='Percent distribution of Target within each Course (stacked, sums to 100%)',
    xaxis_title='% within course',
    height=1200, width=1000
)
fig_stack.update_traces(textposition='inside', insidetextanchor='middle')

fig_stack.show()
fig_stack.write_html("courses_target_percent_stacked.html")

# ----- Option B: Grouped bars showing percent per course (side-by-side) -----
fig_group = px.bar(
    df_counts,
    x='percent',
    y='Course',
    color='Target',
    orientation='h',
    color_discrete_sequence=px.colors.qualitative.Dark2
)
fig_group.update_layout(
    barmode='group',
    title='Percent distribution of Target within each Course (grouped, side-by-side)',
    xaxis_title='% within course',
    height=1200, width=1000
)
fig_group.update_traces(texttemplate='%{x:.1f}%', textposition='outside')

fig_group.show()
fig_group.write_html("courses_target_percent_grouped.html")



Sample counts:
                                  Course    Target  count  total_course  \
0  Advertising and Marketing Management   Dropout     95           268   
1  Advertising and Marketing Management  Enrolled     48           268   
2  Advertising and Marketing Management  Graduate    125           268   
3                              Agronomy   Dropout     86           210   
4                              Agronomy  Enrolled     37           210   

     percent  
0  35.447761  
1  17.910448  
2  46.641791  
3  40.952381  
4  17.619048  

Courses that have only one Target (will show 100%):
[]


In [None]:

# Converting back to categorical
data_viz['Marital status'] = data_viz['Marital status'].map({1:'Single', 2: 'Married',
                                                             3: 'Widower', 4: 'Divorced',
                                                             5: 'Facto union', 6: 'Legally Separated'})

In [None]:
# Bar chart for Marital status
count = data_viz['Marital status'].value_counts()

fig = px.bar(
    count,
    orientation='h',
    text_auto=True
)

# Customize colors
fig.update_traces(marker=dict(color=['red', 'orange']))

# Layout settings
fig.update_layout(
    height=500,
    width=800,
    title='Total Marital Status of Students',
    showlegend=False,
    font=dict(size=14),
    xaxis_title='Total',
    yaxis_title='Marital status'
)

# ✅ Show interactively in Colab
fig.show()

# ✅ Save as interactive HTML (no Kaleido needed)
fig.write_html("marital_status.html")


In [None]:
import pandas as pd
import plotly.express as px

# Compute counts and percentages explicitly
df_marital = (
    data_viz.groupby(['Marital status', 'Target'])
    .size()
    .reset_index(name='count')
)
df_marital['total'] = df_marital.groupby('Marital status')['count'].transform('sum')
df_marital['percent'] = df_marital['count'] / df_marital['total'] * 100

# Grouped % bar chart
fig = px.bar(
    df_marital,
    x='Marital status',
    y='percent',
    color='Target',
    barmode='group',
    text=df_marital['percent'].round(1).astype(str) + '%',
    color_discrete_sequence=px.colors.qualitative.Dark2,
    width=1200,
    height=500
)

fig.update_layout(
    title='Marital Status by Target (Percent)',
    yaxis_title='% within Marital Status',
    xaxis_title='Marital Status',
    font=dict(size=14)
)

fig.show()
fig.write_html("marital_status_by_target_percent.html")


In [None]:
def sub_plots(chart_to_plot, row, col):
  '''This function generates the list of subplots'''
  cols = chart_to_plot * col
  rows = [cols] * row
  return rows

In [None]:
# Converting columns back to categorical
to_convert = ['Debtor', 'Tuition fees up to date', 'Scholarship holder']
for i in to_convert:
    data_viz[i] = data_viz[i].map({1:'Yes', 0: 'No'})

In [None]:
# Getting dictionaries for students in debt/ not in debt
debt = data_viz['Debtor'].unique()
debt_dictionaries = get_dictionaries(debt, 'Debtor', 'Target', [])
debt_dictionaries

[{'Graduate': np.int64(2108),
  'Dropout': np.int64(1109),
  'Enrolled': np.int64(704)},
 {'Dropout': np.int64(312),
  'Graduate': np.int64(101),
  'Enrolled': np.int64(90)}]

In [None]:
# Getting dictionaries for students with scholarship/ no scholarship
scholarship = data_viz['Scholarship holder'].unique()
scholarship_dictionaries = get_dictionaries(scholarship, 'Scholarship holder', 'Target', [])
scholarship_dictionaries

[{'Graduate': np.int64(1374),
  'Dropout': np.int64(1287),
  'Enrolled': np.int64(664)},
 {'Graduate': np.int64(835),
  'Dropout': np.int64(134),
  'Enrolled': np.int64(130)}]

In [None]:
print(data_viz.columns)

Index(['Marital status', 'Application mode', 'Application order', 'Course',
       'Daytime/evening attendance', 'Previous qualification', 'Nationality',
       'Mother's qualification', 'Father's qualification',
       'Mother's occupation', 'Father's occupation', 'Displaced',
       'Educational special needs', 'Debtor', 'Tuition fees up to date',
       'Gender', 'Scholarship holder', 'Age at enrollment', 'International',
       'Curricular units 1st sem (credited)',
       'Curricular units 1st sem (enrolled)',
       'Curricular units 1st sem (evaluations)',
       'Curricular units 1st sem (approved)',
       'Curricular units 1st sem (grade)',
       'Curricular units 1st sem (without evaluations)',
       'Curricular units 2nd sem (credited)',
       'Curricular units 2nd sem (enrolled)',
       'Curricular units 2nd sem (evaluations)',
       'Curricular units 2nd sem (approved)',
       'Curricular units 2nd sem (grade)',
       'Curricular units 2nd sem (without evaluations)

In [None]:
from plotly.subplots import make_subplots
import plotly.express as px
import plotly.graph_objects as go

# Define columns for financial status
financial_cols = ["Debtor", "Tuition fees up to date", "Scholarship holder"]
titles = ["Debtor", "Tuition fees up to date", "Scholarship holder"]

# Create subplot grid
fig = make_subplots(
    rows=3, cols=1,
    subplot_titles=titles,
    specs=[[{'type':'pie'}], [{'type':'pie'}], [{'type':'pie'}]],
    vertical_spacing=0.15
)

# Add a pie for each financial column
for i, col in enumerate(financial_cols, start=1):
    counts = data_viz[col].value_counts()
    fig.add_trace(
        go.Pie(
            labels=counts.index,
            values=counts.values,
            marker=dict(colors=['teal','goldenrod','slateblue']),
            textinfo='percent+label'
        ),
        row=i, col=1
    )

# Update layout
fig.update_layout(
    title="Students' Financial Status",
    height=1200,
    width=800,
    showlegend=False,
    font=dict(size=14)
)

# ✅ Show in Colab/Jupyter
fig.show()

# ✅ Save as interactive HTML (no Kaleido needed)
fig.write_html("financial_status.html")


In [None]:
# Heat map showing correlation of features
fig = px.imshow(
    data.drop('Target', axis=1).corr().round(2),
    text_auto=True,
    aspect="auto",
    color_continuous_scale='RdBu_r'
)

fig.update_layout(
    title='Correlation Analysis for independent features',
    height=800,
    width=1500
)

# ✅ Show interactively (works in Colab/Jupyter)
fig.show()

# ✅ Save as interactive HTML (no Kaleido needed)
fig.write_html("correlation_heatmap.html")


In [None]:
# Extract columns for PCA
data_forPCA = data[['Curricular units 1st sem (credited)',
          'Curricular units 1st sem (enrolled)',
          'Curricular units 1st sem (evaluations)',
          'Curricular units 1st sem (without evaluations)',
          'Curricular units 1st sem (approved)',
          'Curricular units 1st sem (grade)',
          'Curricular units 2nd sem (credited)',
          'Curricular units 2nd sem (enrolled)',
          'Curricular units 2nd sem (evaluations)',
          'Curricular units 2nd sem (without evaluations)',
          'Curricular units 2nd sem (approved)',
          'Curricular units 2nd sem (grade)']]


# PCA with one component
pca = PCA(n_components=1)

# Fit PCA to data and transform it
pca_result = pca.fit_transform(data_forPCA)

# Create a new DataFrame with the reduced feature
df_pca = pd.DataFrame(data=pca_result, columns=['PCA Feature']).squeeze()
data['Curricular 1st and 2nd sem PCA'] = df_pca

In [None]:
# Dropping features
data.drop(['Nationality', 'Mother\'s occupation', 'Father\'s qualification',
          'Curricular units 1st sem (credited)',
          'Curricular units 1st sem (enrolled)',
          'Curricular units 1st sem (evaluations)',
          'Curricular units 1st sem (without evaluations)',
          'Curricular units 1st sem (approved)',
          'Curricular units 1st sem (grade)',
          'Curricular units 2nd sem (credited)',
          'Curricular units 2nd sem (enrolled)',
          'Curricular units 2nd sem (evaluations)',
          'Curricular units 2nd sem (without evaluations)',
          'Curricular units 2nd sem (approved)',
          'Curricular units 2nd sem (grade)', 'Inflation rate', 'GDP',
           'Unemployment rate'], axis = 1, inplace = True)

In [None]:
# Heat map showing correlation of features
fig = px.imshow(
    data.drop('Target', axis=1).corr().round(2),
    text_auto=True,
    aspect="auto",
    color_continuous_scale='RdBu_r'
)

fig.update_layout(
    title='Correlation Analysis for independent features',
    height=800,
    width=1500
)

# ✅ Show interactively in Colab/Jupyter
fig.show()

# ✅ Save as interactive HTML (no Kaleido needed)
fig.write_html("correlation_heatmap.html")


In [None]:
# Assigning x and y features
y = np.array(data['Target'])
X_features = data.drop('Target', axis = 1)
X_features.tail()

Unnamed: 0,Marital status,Application mode,Application order,Course,Daytime/evening attendance,Previous qualification,Mother's qualification,Father's occupation,Displaced,Educational special needs,Debtor,Tuition fees up to date,Gender,Scholarship holder,Age at enrollment,International,Target_binary,Curricular 1st and 2nd sem PCA
4419,1,1,6,15,1,1,1,5,0,0,0,1,1,0,19,0,0,2.211839
4420,1,1,2,15,1,1,1,10,1,0,1,0,0,0,18,1,1,-0.98731
4421,1,1,1,12,1,1,22,10,1,0,0,1,0,1,30,0,1,3.934767
4422,1,1,1,9,1,1,22,5,1,0,0,1,0,1,20,0,0,0.240643
4423,1,5,1,15,1,1,23,10,1,0,0,1,0,0,22,1,0,1.734947


In [None]:
y[:5]

array(['Dropout', 'Graduate', 'Dropout', 'Graduate', 'Graduate'],
      dtype=object)

In [None]:

# Normalizing data
scaler =  StandardScaler()
X = scaler.fit_transform(X_features)
X

array([[-0.29482875,  0.21006857,  2.49089589, ..., -0.15968211,
         1.45371954, -2.42563525],
       [-0.29482875, -0.16740639, -0.55406775, ..., -0.15968211,
        -0.6878906 ,  0.28887762],
       [-0.29482875, -1.11109377,  2.49089589, ..., -0.15968211,
         1.45371954, -2.16256376],
       ...,
       [-0.29482875, -1.11109377, -0.55406775, ..., -0.15968211,
         1.45371954,  0.45198555],
       [-0.29482875, -1.11109377, -0.55406775, ..., -0.15968211,
        -0.6878906 ,  0.02764264],
       [-0.29482875, -0.35614386, -0.55406775, ...,  6.26244216,
        -0.6878906 ,  0.19929284]])

In [None]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Use the binary target
X = data.drop(["Target", "Target_binary"], axis=1)
y = data["Target_binary"]

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# ✅ Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Logistic Regression
log_reg = LogisticRegression(max_iter=1000, random_state=42)
log_reg.fit(X_train_scaled, y_train)

# Predictions
y_pred = log_reg.predict(X_test_scaled)

print("📌 Logistic Regression Results")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred, zero_division=0))

📌 Logistic Regression Results
Accuracy: 0.847457627118644
Confusion Matrix:
 [[565  36]
 [ 99 185]]
Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.94      0.89       601
           1       0.84      0.65      0.73       284

    accuracy                           0.85       885
   macro avg       0.84      0.80      0.81       885
weighted avg       0.85      0.85      0.84       885



In [None]:
from sklearn.tree import DecisionTreeClassifier

# Use the binary target
X = data.drop(["Target", "Target_binary"], axis=1)
y = data["Target_binary"]

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


# Decision Tree
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)

print("📌 Decision Tree Results")
print("Accuracy:", accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred, zero_division=0))


📌 Decision Tree Results
Accuracy: 0.7966101694915254
[[500 101]
 [ 79 205]]
              precision    recall  f1-score   support

           0       0.86      0.83      0.85       601
           1       0.67      0.72      0.69       284

    accuracy                           0.80       885
   macro avg       0.77      0.78      0.77       885
weighted avg       0.80      0.80      0.80       885



In [None]:
from sklearn.ensemble import RandomForestClassifier

# Use the binary target
X = data.drop(["Target", "Target_binary"], axis=1)
y = data["Target_binary"]

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


# Random Forest
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

print("📌 Random Forest Results")
print("Accuracy:", accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred, zero_division=0))


📌 Random Forest Results
Accuracy: 0.8508474576271187
[[562  39]
 [ 93 191]]
              precision    recall  f1-score   support

           0       0.86      0.94      0.89       601
           1       0.83      0.67      0.74       284

    accuracy                           0.85       885
   macro avg       0.84      0.80      0.82       885
weighted avg       0.85      0.85      0.85       885



In [None]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Use the binary target
X = data.drop(["Target", "Target_binary"], axis=1)
y = data["Target_binary"]

# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# XGBoost
xgb_model = xgb.XGBClassifier(
    n_estimators=200,
    learning_rate=0.1,
    max_depth=6,
    random_state=42,
    use_label_encoder=False,
    eval_metric="logloss"
)
xgb_model.fit(X_train, y_train)
y_pred = xgb_model.predict(X_test)

print("📌 XGBoost Results")
print("Accuracy:", accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred, zero_division=0))




Parameters: { "use_label_encoder" } are not used.




📌 XGBoost Results
Accuracy: 0.8463276836158192
[[551  50]
 [ 86 198]]
              precision    recall  f1-score   support

           0       0.86      0.92      0.89       601
           1       0.80      0.70      0.74       284

    accuracy                           0.85       885
   macro avg       0.83      0.81      0.82       885
weighted avg       0.84      0.85      0.84       885



In [65]:
# Financial Counselling
financial_risk = data[(data["Debtor"] == 1) & (data["Tuition fees up to date"] == 0)]

# Attendance Counselling
attendance_risk = data[data["Daytime/evening attendance"] == 0]

# Personal Counselling
personal_risk = data[(data["Educational special needs"] == 1) | (data["Displaced"] == 1)]

# Career Counselling
career_risk = data[(data["Scholarship holder"] == 0) & (data["Age at enrollment"] > 25)]

# Print counts
print("Students needing Financial Counselling:", len(financial_risk))
print("Students needing Attendance Counselling:", len(attendance_risk))
print("Students needing Personal Counselling:", len(personal_risk))
print("Students needing Career Counselling:", len(career_risk))

# Optional: unique students across all categories
total_unique = pd.concat([financial_risk, attendance_risk, personal_risk, career_risk]).drop_duplicates()
print("Total unique students needing support:", len(total_unique))


Students needing Financial Counselling: 246
Students needing Attendance Counselling: 483
Students needing Personal Counselling: 2448
Students needing Career Counselling: 943
Total unique students needing support: 3327


In [84]:
# Identify groups
financial_risk = data[(data["Debtor"] == 1) & (data["Tuition fees up to date"] == 0)]
# Attendance Counselling (broader rule: attendance shift = evening OR debtor without fees up to date)
attendance_risk = data[(data["Daytime/evening attendance"] == 0) | (data["Tuition fees up to date"] == 0)]

# Personal Counselling (broader: special needs OR displaced OR debtor)
personal_risk = data[(data["Educational special needs"] == 1) | (data["Displaced"] == 1) | (data["Debtor"] == 1)]

# Career Counselling (older OR no scholarship)
career_risk = data[(data["Scholarship holder"] == 0) | (data["Age at enrollment"] > 23)]


# Create labelled subsets
financial_risk = financial_risk.assign(Counselling="Financial Counselling")
attendance_risk = attendance_risk.assign(Counselling="Attendance Counselling")
personal_risk = personal_risk.assign(Counselling="Personal Counselling")
career_risk = career_risk.assign(Counselling="Career Counselling")

# Combine all
counselling_table = pd.concat([financial_risk, attendance_risk, personal_risk, career_risk])

# Keep only relevant columns
counselling_table = counselling_table[["Application order", "Course", "Age at enrollment", "Counselling"]]


# Show the full counselling table in the notebook
counselling_table


Unnamed: 0,Application order,Course,Age at enrollment,Counselling
9,1,10,18,Financial Counselling
35,1,1,37,Financial Counselling
39,1,8,20,Financial Counselling
99,4,12,19,Financial Counselling
135,1,14,19,Financial Counselling
...,...,...,...,...
4418,1,5,20,Career Counselling
4419,6,15,19,Career Counselling
4420,2,15,18,Career Counselling
4421,1,12,30,Career Counselling
