TASK 1: Data Loading and Exploration

In [1]:
import pandas as pd
import numpy as np

np.random.seed(42)

data = {
    "Age": np.random.randint(18, 25, 100),
    "Study_Hours": np.random.uniform(1, 6, 100),
    "Attendance": np.random.uniform(60, 100, 100),
    "Internal_Marks": np.random.randint(30, 50, 100)
}

df = pd.DataFrame(data)

# Apply result logic
df["Result"] = np.where(
    (df["Attendance"] >= 75) &
    (df["Internal_Marks"] >= 40) &
    (df["Study_Hours"] >= 2),
    "Pass",
    "Fail"
)

# Introduce missing values intentionally
df.loc[5, "Study_Hours"] = np.nan
df.loc[20, "Attendance"] = np.nan
df.loc[50, "Internal_Marks"] = np.nan

# Save to CSV
df.to_csv("student_performance.csv", index=False)

df.head()


Unnamed: 0,Age,Study_Hours,Attendance,Internal_Marks,Result
0,24,1.442463,86.885422,32.0,Fail
1,21,1.979914,90.464785,37.0,Fail
2,22,1.226136,69.505502,43.0,Fail
3,24,2.626652,89.128654,47.0,Pass
4,20,2.943386,74.711325,44.0,Fail


In [4]:
#Load the Dataset Using Python
df = pd.read_csv("student_performance.csv")


In [5]:
print("Number of records:", df.shape[0])


Number of records: 100


In [6]:
print("Number of attributes:", df.shape[1])


Number of attributes: 5


In [7]:
print("Class distribution:")
print(df["Result"].value_counts())


Class distribution:
Result
Fail    79
Pass    21
Name: count, dtype: int64


TASK 2: Data Preprocessing


In [9]:
# Handle Missing Values
print("Missing values before handling:")
print(df.isnull().sum())


Missing values before handling:
Age               0
Study_Hours       1
Attendance        1
Internal_Marks    1
Result            0
dtype: int64


In [10]:
#Apply mean imputation
df["Study_Hours"].fillna(df["Study_Hours"].mean(), inplace=True)
df["Attendance"].fillna(df["Attendance"].mean(), inplace=True)
df["Internal_Marks"].fillna(df["Internal_Marks"].mean(), inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Study_Hours"].fillna(df["Study_Hours"].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df["Attendance"].fillna(df["Attendance"].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate obj

In [11]:
print("Missing values after handling:")
print(df.isnull().sum())


Missing values after handling:
Age               0
Study_Hours       0
Attendance        0
Internal_Marks    0
Result            0
dtype: int64


In [13]:
#Apply Min–Max normalization
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

numerical_cols = ["Age", "Study_Hours", "Attendance", "Internal_Marks"]
df[numerical_cols] = scaler.fit_transform(df[numerical_cols])
df.describe()



Unnamed: 0,Age,Study_Hours,Attendance,Internal_Marks
count,100.0,100.0,100.0,100.0
mean,0.535,0.47069,0.52222,0.489102
std,0.318962,0.301195,0.301906,0.312606
min,0.0,0.0,0.0,0.0
25%,0.291667,0.227296,0.246287,0.210526
50%,0.5,0.429812,0.565015,0.481393
75%,0.833333,0.765605,0.75182,0.75
max,1.0,1.0,1.0,1.0


In [14]:
#Convert Categorical Attribute to Numerical
df["Result"] = df["Result"].map({"Pass": 1, "Fail": 0})
df.head()


Unnamed: 0,Age,Study_Hours,Attendance,Internal_Marks,Result
0,1.0,0.084546,0.689162,0.105263,0
1,0.5,0.194077,0.781609,0.368421,0
2,0.666667,0.040459,0.240277,0.684211,0
3,1.0,0.325881,0.7471,0.894737,1
4,0.333333,0.390431,0.374732,0.736842,0


TASK 3: Feature Selection

In [15]:
#Prepare Data for Feature Selection
X = df.drop("Result", axis=1)
y = df["Result"]


In [16]:
#compute Information Gain using entropy
from sklearn.feature_selection import mutual_info_classif

ig_scores = mutual_info_classif(X, y, random_state=42)

ig_df = pd.DataFrame({
    "Feature": X.columns,
    "Information_Gain": ig_scores
})

print(ig_df)


          Feature  Information_Gain
0             Age          0.012434
1     Study_Hours          0.120028
2      Attendance          0.128710
3  Internal_Marks          0.199411


In [17]:
#Identify the Best Attribute
best_feature = ig_df.loc[ig_df["Information_Gain"].idxmax()]
print("Best attribute for splitting:")
print(best_feature)


Best attribute for splitting:
Feature             Internal_Marks
Information_Gain          0.199411
Name: 3, dtype: object


TASK 4: Decision Tree Classification

In [18]:
#Split the Dataset
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)


In [48]:
#Implement Decision Tree Classifier
from sklearn.tree import DecisionTreeClassifier

dt_model = DecisionTreeClassifier(
    criterion="entropy",
    random_state=42
)



In [49]:
#Train the Model
dt_model.fit(X_train, y_train)




TASK 5: Evaluation

In [50]:
#Predict on Test Data
y_pred = dt_model.predict(X_test)


In [51]:
#Compute Evaluation Metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Evaluation Metrics:")
print("Accuracy :", accuracy)
print("Precision:", precision)
print("Recall   :", recall)
print("F1-score :", f1)




Evaluation Metrics:
Accuracy : 0.95
Precision: 1.0
Recall   : 0.75
F1-score : 0.8571428571428571


The Decision Tree classifier achieved an accuracy of 95%.
Precision of 1.0 indicates no false positive predictions, while recall of 0.75 shows a few missed positive cases.
The F1-score of 0.86 confirms a well-balanced and reliable model.