### Loading the Dataset

In [1]:
import pandas as pd
# Read the semicolon-delimited student file into a DataFrame.
df = pd.read_csv("student-mat.csv", delimiter=";")

# Preview the first rows as a quick check that the file parsed into columns.
df.head()

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10
3,GP,F,15,U,GT3,T,4,2,health,services,...,3,2,2,1,1,5,2,15,14,15
4,GP,F,16,U,GT3,T,3,3,other,other,...,4,3,2,1,2,5,4,6,10,10


### Data Exploration

In [28]:
# Print a concise summary of the frame: index range, columns, non-null counts and dtypes.
# NOTE(review): the saved output below was produced at execution count 28 and lists
# post-encoding columns (e.g. 'pass', 'school_MS'; 40 columns in total), whereas the
# df.shape cell further down reports 33 columns. Restart the kernel and run all cells
# to refresh this output so it describes the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 395 entries, 0 to 394
Data columns (total 40 columns):
 #   Column             Non-Null Count  Dtype
---  ------             --------------  -----
 0   age                395 non-null    int64
 1   Medu               395 non-null    int64
 2   Fedu               395 non-null    int64
 3   traveltime         395 non-null    int64
 4   studytime          395 non-null    int64
 5   failures           395 non-null    int64
 6   famrel             395 non-null    int64
 7   freetime           395 non-null    int64
 8   goout              395 non-null    int64
 9   Dalc               395 non-null    int64
 10  Walc               395 non-null    int64
 11  health             395 non-null    int64
 12  absences           395 non-null    int64
 13  pass               395 non-null    int64
 14  school_MS          395 non-null    bool 
 15  sex_M              395 non-null    bool 
 16  address_U          395 non-null    bool 
 17  famsize_LE3     

In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,age,Medu,Fedu,traveltime,studytime,failures,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
count,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0,395.0
mean,16.696203,2.749367,2.521519,1.448101,2.035443,0.334177,3.944304,3.235443,3.108861,1.481013,2.291139,3.55443,5.708861,10.908861,10.713924,10.41519
std,1.276043,1.094735,1.088201,0.697505,0.83924,0.743651,0.896659,0.998862,1.113278,0.890741,1.287897,1.390303,8.003096,3.319195,3.761505,4.581443
min,15.0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,3.0,0.0,0.0
25%,16.0,2.0,2.0,1.0,1.0,0.0,4.0,3.0,2.0,1.0,1.0,3.0,0.0,8.0,9.0,8.0
50%,17.0,3.0,2.0,1.0,2.0,0.0,4.0,3.0,3.0,1.0,2.0,4.0,4.0,11.0,11.0,11.0
75%,18.0,4.0,3.0,2.0,2.0,0.0,5.0,4.0,4.0,2.0,3.0,5.0,8.0,13.0,13.0,14.0
max,22.0,4.0,4.0,4.0,4.0,3.0,5.0,5.0,5.0,5.0,5.0,5.0,75.0,19.0,19.0,20.0


In [5]:
# List the column labels of the loaded dataset.
df.columns

Index(['school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu',
       'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime', 'studytime',
       'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery',
       'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc',
       'Walc', 'health', 'absences', 'G1', 'G2', 'G3'],
      dtype='object')

In [6]:
# (number of rows, number of columns) of the loaded dataset.
df.shape

(395, 33)

In [7]:
# Per-column dtypes; the `object` columns hold non-numeric categories that are
# one-hot encoded later in the notebook.
df.dtypes

school        object
sex           object
age            int64
address       object
famsize       object
Pstatus       object
Medu           int64
Fedu           int64
Mjob          object
Fjob          object
reason        object
guardian      object
traveltime     int64
studytime      int64
failures       int64
schoolsup     object
famsup        object
paid          object
activities    object
nursery       object
higher        object
internet      object
romantic      object
famrel         int64
freetime       int64
goout          int64
Dalc           int64
Walc           int64
health         int64
absences       int64
G1             int64
G2             int64
G3             int64
dtype: object

### Target Variable Creation

In [8]:
# Build the binary target `pass` from the final grade, then remove the grade columns.
PASS_MARK = 10                      # G3 at or above this value is labelled 1 (pass), otherwise 0
GRADE_COLUMNS = ['G1', 'G2', 'G3']  # G3 defines the target; G1/G2 are dropped with it
                                    # (presumably earlier grades that would leak it - confirm)

# Guarded so that re-running this cell after G3 has already been dropped is a
# no-op instead of a KeyError. On a first run G3 must exist, so a wrong input
# file still fails loudly here.
if 'pass' not in df.columns:
    # Vectorised comparison replaces the row-wise lambda; int64 keeps the same 1/0 labels.
    df['pass'] = (df['G3'] >= PASS_MARK).astype('int64')

# Rebind instead of mutating in place; errors='ignore' keeps the cell re-runnable.
df = df.drop(columns=GRADE_COLUMNS, errors='ignore')

### Encoding Categorical Features

In [9]:
# Features such as sex, school and address are categorical (non-numeric);
# pd.get_dummies() one-hot encodes them into numeric indicator columns.
# drop_first=True omits the first level of every categorical feature: the
# remaining indicators already determine it, which avoids the dummy-variable
# trap (perfect multicollinearity).
df = pd.get_dummies(data=df, drop_first=True)

### Splitting Data

In [11]:
import numpy as np
from sklearn.model_selection import train_test_split
# Separate the target column from the feature matrix.
y = df['pass']
x = df.drop(columns='pass')

# Hold out 20% of the rows for evaluation; the fixed seed makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

### Training

In [15]:
from sklearn.ensemble import RandomForestClassifier
# Random forest with 100 trees (n_estimators); the fixed seed makes training reproducible.
model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)

# Fit on the training split only; the test split stays unseen until evaluation.
model.fit(x_train, y_train)

### Prediction

In [16]:
# Predict the pass (1) / fail (0) label for every row of the held-out test set.
y_pred= model.predict(x_test)

### Evaluate Model 

In [18]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
# Compute each metric once, then report them in order.
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)    # rows: true class, columns: predicted class
class_report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("\nConfusion Matrix:\n", conf_matrix)
print("\nClassification Report:\n", class_report)

Accuracy: 0.6582278481012658

Confusion Matrix:
 [[ 5 22]
 [ 5 47]]

Classification Report:
               precision    recall  f1-score   support

           0       0.50      0.19      0.27        27
           1       0.68      0.90      0.78        52

    accuracy                           0.66        79
   macro avg       0.59      0.54      0.52        79
weighted avg       0.62      0.66      0.60        79



### Predicting on a Sample Student from the Test Set

In [27]:
import matplotlib.pyplot as plt
import seaborn as sns
# 1. Take the first row of the held-out test set as the example student.
student = x_test.iloc[0]

# 2. Long-format (feature, value) table of that student's attributes.
student_df = pd.DataFrame({'Feature': student.index, 'Value': student.values})

# The row mixes integer and boolean features; coerce every value to a numeric dtype.
student_df = student_df.assign(
    Value=pd.to_numeric(student_df['Value'], errors='coerce')
)

# 3. Wrap the row in a one-row DataFrame so the model receives named feature columns.
student_input = pd.DataFrame([student])

# 4. Predict, then translate the 1/0 label into a readable verdict.
prediction = model.predict(student_input)[0]
if prediction == 1:
    result = "✅ Pass"
else:
    result = "❌ Fail"

# 5. Render the verdict as Markdown; as the cell's last expression it is displayed.
from IPython.display import Markdown
Markdown(f"### The model predicts: **{result}**")


### The model predicts: **❌ Fail**