In [5]:
import pandas as pd
import numpy as np

np.random.seed(42)

data = {
    "Age": np.random.randint(18, 25, 100),
    "Study_Hours": np.random.uniform(1, 8, 100).round(1),
    "Attendance": np.random.randint(50, 100, 100),
    "Internal_Marks": np.random.randint(20, 50, 100),
}

data["Result"] = [
    "Pass" if (m >= 30 and a >= 75) else "Fail"
    for m, a in zip(data["Internal_Marks"], data["Attendance"])
]

df = pd.DataFrame(data)
df.to_csv("student_performance.csv", index=False)

df


Unnamed: 0,Age,Study_Hours,Attendance,Internal_Marks,Result
0,24,1.6,53,31,Fail
1,21,2.4,60,46,Fail
2,22,1.3,66,38,Fail
3,24,3.3,87,41,Pass
4,20,3.7,73,42,Fail
...,...,...,...,...,...
95,23,2.7,56,44,Fail
96,21,2.0,52,23,Fail
97,23,4.4,66,32,Fail
98,24,7.9,82,39,Pass


In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.feature_selection import mutual_info_classif


In [7]:
df = pd.read_csv("student_performance.csv")

print("Number of records:", df.shape[0])
print("Number of attributes:", df.shape[1])

print("\nClass Distribution:")
print(df["Result"].value_counts())


Number of records: 100
Number of attributes: 5

Class Distribution:
Result
Fail    70
Pass    30
Name: count, dtype: int64


In [9]:
# Handle missing values
df.fillna(df.mean(numeric_only=True), inplace=True)

# Encode target column
le = LabelEncoder()
df["Result"] = le.fit_transform(df["Result"])

# Normalize numerical attributes
scaler = MinMaxScaler()
num_cols = ["Age", "Study_Hours", "Attendance", "Internal_Marks"]
df[num_cols] = scaler.fit_transform(df[num_cols])

df


Unnamed: 0,Age,Study_Hours,Attendance,Internal_Marks,Result
0,1.000000,0.086957,0.061224,0.379310,0
1,0.500000,0.202899,0.204082,0.896552,0
2,0.666667,0.043478,0.326531,0.620690,0
3,1.000000,0.333333,0.755102,0.724138,1
4,0.333333,0.391304,0.469388,0.758621,0
...,...,...,...,...,...
95,0.833333,0.246377,0.122449,0.827586,0
96,0.500000,0.144928,0.040816,0.103448,0
97,0.833333,0.492754,0.326531,0.413793,0
98,1.000000,1.000000,0.653061,0.655172,1


In [10]:
X = df.drop("Result", axis=1)
y = df["Result"]

info_gain = mutual_info_classif(X, y)

for col, ig in zip(X.columns, info_gain):
    print(f"Information Gain of {col}: {ig}")

best_feature = X.columns[info_gain.argmax()]
print("\nBest attribute for splitting:", best_feature)


Information Gain of Age: 0.02615918304339493
Information Gain of Study_Hours: 0.01361630807574854
Information Gain of Attendance: 0.26693857730978277
Information Gain of Internal_Marks: 0.0985861242073296

Best attribute for splitting: Attendance


In [11]:
X = df.drop("Result", axis=1)
y = df["Result"]

info_gain = mutual_info_classif(X, y)

for col, ig in zip(X.columns, info_gain):
    print(f"Information Gain of {col}: {ig}")

best_feature = X.columns[info_gain.argmax()]
print("\nBest attribute for splitting:", best_feature)


Information Gain of Age: 0.013200223315472215
Information Gain of Study_Hours: 0.01703597174541227
Information Gain of Attendance: 0.22119579231699737
Information Gain of Internal_Marks: 0.11931195393315908

Best attribute for splitting: Attendance


In [12]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

dt = DecisionTreeClassifier(criterion="entropy")
dt.fit(X_train, y_train)


In [13]:
y_pred = dt.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred))
print("Recall:", recall_score(y_test, y_pred))
print("F1-score:", f1_score(y_test, y_pred))


Accuracy: 1.0
Precision: 1.0
Recall: 1.0
F1-score: 1.0
