### Importing Dependencies

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

### Data Collection and Processing

In [3]:
# Read student_performance.csv (relative path) into a DataFrame.
student_performance = pd.read_csv(filepath_or_buffer="student_performance.csv")
# Plain-text dump of the loaded frame.
print(student_performance)

    hours_studied  attendance_percent  sleep_hours  assignments_completed  \
0               2                  60            5                      2   
1               4                  75            6                      3   
2               6                  80            7                      4   
3               1                  50            4                      1   
4               3                  65            6                      3   
5               5                  78            7                      4   
6               7                  85            8                      5   
7               8                  90            8                      5   
8               9                  95            7                      5   
9              10                  98            9                      6   
10              3                  70            6                      3   
11              6                  82            7                      4   

In [4]:
# Preview the first five rows.
student_performance.head(n=5)

Unnamed: 0,hours_studied,attendance_percent,sleep_hours,assignments_completed,pass_fail
0,2,60,5,2,0
1,4,75,6,3,0
2,6,80,7,4,1
3,1,50,4,1,0
4,3,65,6,3,0


In [5]:
# Preview the last five rows.
student_performance.tail(n=5)

Unnamed: 0,hours_studied,attendance_percent,sleep_hours,assignments_completed,pass_fail
15,8,93,9,6,1
16,5,77,6,4,1
17,6,81,7,4,1
18,2,58,5,2,0
19,9,96,8,5,1


In [6]:
# (rows, columns) of the loaded frame.
student_performance.shape

(20, 5)

In [7]:
# Summarise column names, non-null counts and dtypes.
student_performance.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 5 columns):
 #   Column                 Non-Null Count  Dtype
---  ------                 --------------  -----
 0   hours_studied          20 non-null     int64
 1   attendance_percent     20 non-null     int64
 2   sleep_hours            20 non-null     int64
 3   assignments_completed  20 non-null     int64
 4   pass_fail              20 non-null     int64
dtypes: int64(5)
memory usage: 932.0 bytes


In [4]:
# Count missing values per column.
student_performance.isna().sum()

hours_studied            0
attendance_percent       0
sleep_hours              0
assignments_completed    0
pass_fail                0
dtype: int64

### Separating Features and Target

In [6]:
# Target: the pass_fail column. Features: every remaining column.
y = student_performance["pass_fail"]
x = student_performance.drop(columns=["pass_fail"])

In [13]:
# Plain-text dump of the feature frame (all columns except pass_fail).
print(x) 

    hours_studied  attendance_percent  sleep_hours  assignments_completed
0               2                  60            5                      2
1               4                  75            6                      3
2               6                  80            7                      4
3               1                  50            4                      1
4               3                  65            6                      3
5               5                  78            7                      4
6               7                  85            8                      5
7               8                  90            8                      5
8               9                  95            7                      5
9              10                  98            9                      6
10              3                  70            6                      3
11              6                  82            7                      4
12              4                  76 

In [14]:
# Plain-text dump of the target Series (the pass_fail column).
print(y)

0     0
1     0
2     1
3     0
4     0
5     1
6     1
7     1
8     1
9     1
10    0
11    1
12    0
13    0
14    1
15    1
16    1
17    1
18    0
19    1
Name: pass_fail, dtype: int64


### Splitting into Training and Test Data

In [7]:
# Hold out 40% of the rows for testing; random_state fixes the shuffle.
# NOTE(review): the split is not stratified. The recorded classification
# report shows the test split holding 6 of the 8 class-0 rows, which leaves
# only 2 for training -- consider passing stratify=y.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.4,
    random_state=2,
)

### Model Training

In [8]:
# Baseline random forest: 100 trees, seeded, fit on the training split.
model = RandomForestClassifier(random_state=2, n_estimators=100)
model.fit(X=x_train, y=y_train)

### Model Evaluation

In [9]:
# Predict labels for the held-out rows and show the raw prediction array.
y_pred = model.predict(X=x_test)
print(y_pred)

[0 0 0 0 1 1 0 0]


### Performance Metrics

In [10]:
# Fraction of held-out rows whose predicted label matches the true label.
print("Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))

Accuracy: 1.0


In [8]:
# Per-class precision, recall, f1-score and support on the held-out rows.
print(classification_report(y_true=y_test, y_pred=y_pred))


              precision    recall  f1-score   support

           0       1.00      1.00      1.00         6
           1       1.00      1.00      1.00         2

    accuracy                           1.00         8
   macro avg       1.00      1.00      1.00         8
weighted avg       1.00      1.00      1.00         8



### Hyperparameter Tuning (Pre-Pruning)

In [9]:
# Candidate settings for the forest: ensemble size, tree-growth limits,
# features considered per split, and split criterion.
# (GridSearchCV is imported in the notebook's import cell.)
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [2, 4, 6],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2],
    'max_features': ['sqrt', 'log2'],
    'criterion': ['gini', 'entropy'],
}

# Try every combination with 2-fold cross-validation on the training split,
# scored by accuracy. The estimator is seeded so the search is repeatable.
grid_search = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    cv=2,
    scoring='accuracy',
)
grid_search.fit(x_train, y_train)

# Report the winning combination and its mean cross-validated accuracy.
print("Best Parameters:", grid_search.best_params_)
print("Best Cross-Validation Score:", grid_search.best_score_)


Best Parameters: {'criterion': 'gini', 'max_depth': 2, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 50}
Best Cross-Validation Score: 0.9166666666666667


  _data = np.array(data, dtype=dtype, copy=copy,


### Training the Final Model with Best Parameters

In [10]:
# Build the final forest directly from the grid-search winner instead of
# re-typing the values by hand, so this cell cannot drift from the search.
# random_state is not part of the grid, so it is passed explicitly (same seed
# as the estimator used inside the search).
best_model = RandomForestClassifier(**grid_search.best_params_, random_state=42)

best_model.fit(x_train, y_train)

# Evaluation on the held-out split. Note that this rebinds y_pred, which
# previously held the baseline model's predictions.
y_pred = best_model.predict(x_test)
print("Test Accuracy:", accuracy_score(y_test, y_pred))

Test Accuracy: 1.0


### Making Predictions on New Input Data

In [11]:
# A single hypothetical student, keyed by the training feature names.
input_data = {
    "hours_studied": 6,
    "attendance_percent": 85,
    "sleep_hours": 7,
    "assignments_completed": 2,
}
# One-row frame whose column names match the features the model was fit on.
df = pd.DataFrame([input_data])
# Predict with the tuned final model rather than the untuned baseline `model`.
predictions = best_model.predict(df)
if predictions[0] == 1:
    print("Student is pass")
else:
    print("Student is fail")

Student is pass
