In [6]:
import pandas as pd

# Load the student dataset and take a first look at its contents.
# NOTE(review): this is an absolute, machine-specific path; consider a path
# relative to a configurable data directory so the notebook runs elsewhere.
df = pd.read_csv(r"D:\Python_rnw\one\mathematics\student_dataset_150_rows.csv")

print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would add a stray "None" line after the report.
df.info()

   study_hours  attendance group_discussion  previous_test_score  \
0           39          82               No                   37   
1           29          50              Yes                   55   
2           15          76              Yes                   63   
3            8          62               No                   36   
4           21          90              Yes                   97   

  final_exam_pass  
0            Pass  
1            Pass  
2            Pass  
3            Pass  
4            Pass  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   study_hours          150 non-null    int64 
 1   attendance           150 non-null    int64 
 2   group_discussion     150 non-null    object
 3   previous_test_score  150 non-null    int64 
 4   final_exam_pass      150 non-null    object
dtypes: int64(3), object

In [16]:
# Empirical probability that a student passes the final exam.
# n is computed here so the cell runs on a fresh kernel (Restart & Run All)
# without depending on any other cell having defined it first.
n = len(df)
pass_count = (df["final_exam_pass"] == "Pass").sum()
p = pass_count / n   # empirical P(Pass)
q = 1 - p            # empirical P(Fail), the complement of p
print(pass_count, p)
# 143, about 0.9533

143 0.9533333333333334


In [17]:
import math

# Probability distribution of X = number of passes in a group of `trials`
# students, using the binomial formula C(n, k) * p^k * q^(n-k) with the
# pass/fail probabilities p and q computed earlier.
trials = 3
probs = [
    {"k": k, "P(X=k)": math.comb(trials, k) * (p**k) * (q**(trials - k))}
    for k in range(trials + 1)   # k = 0, 1, 2, 3 passes
]

# One row per possible value of k.
dist = pd.DataFrame(probs)
print(dist)


   k    P(X=k)
0  0  0.000102
1  1  0.006228
2  2  0.127238
3  3  0.866432


In [18]:
# Mean and variance of X from the closed-form binomial expressions
# (n * p and n * p * q) with n = 3 students per group.
group_size = 3
mean_X = group_size * p
var_X = group_size * p * q

print("Mean:", mean_X)
print("Variance:", var_X)


Mean: 2.8600000000000003
Variance: 0.1334666666666666


In [19]:
# Event masks (one boolean per student):
#   A -> studies more than 10 hours/week
#   B -> attendance above 80%
A = df["study_hours"].gt(10)
B = df["attendance"].gt(80)

n = len(df)

# Sizes of the two events and of their intersection.
A_count = A.sum()
B_count = B.sum()
AB_count = (A & B).sum()

# The remaining Venn regions, counted directly from the masks.
only_A = (A & ~B).sum()      # in A but not in B
only_B = (B & ~A).sum()      # in B but not in A
neither = n - (only_A + only_B + AB_count)

print("Total students:", n)
print("Only A (study>10):", only_A)
print("Only B (attend>80):", only_B)
print("Both A and B:", AB_count)
print("Neither A nor B:", neither)


Total students: 150
Only A (study>10): 64
Only B (attend>80): 15
Both A and B: 49
Neither A nor B: 22


In [None]:
# Contingency table: rows = group_discussion, columns = final_exam_pass.
ct = pd.crosstab(df["group_discussion"], df["final_exam_pass"])

yes_and_pass = ct.loc["Yes", "Pass"]    # students who participate AND pass
pass_total = ct["Pass"].sum()           # all students who pass
yes_total = ct.loc["Yes"].sum()         # all students who participate

# Joint P(Participates AND Passes): 89 / 150 ≈ 0.5933
joint = yes_and_pass / n

# Marginal P(Passes): 143 / 150 ≈ 0.9533
marginal = pass_total / n

# Conditional P(Passes | Participates): 89 / 92 ≈ 0.9674
conditional = yes_and_pass / yes_total

print("Joint:", joint)
print("Marginal:", marginal)
print("Conditional:", conditional)


Joint: 0.5933333333333334
Marginal: 0.9533333333333334
Conditional: 0.967391304347826


In [25]:
# Cross-tabulate discussion participation (rows) against exam outcome (columns).
ct = pd.crosstab(df["group_discussion"], df["final_exam_pass"])

participants = ct.loc["Yes"]   # outcome counts for students who participate

# Marginal P(Pass) and conditional P(Pass | Participates).
p_pass = ct["Pass"].sum() / n
p_pass_given_yes = participants["Pass"] / participants.sum()

print("P(Pass) =", p_pass)
print("P(Pass | Participates) =", p_pass_given_yes)

# Independence would require P(Pass | Participates) == P(Pass); the two sample
# proportions are compared within a small tolerance.
# NOTE(review): this is a descriptive comparison of sample proportions, not a
# significance test — confirm whether a formal test is wanted here.
relation = "independent" if abs(p_pass_given_yes - p_pass) < 1e-6 else "dependent"

print("Relationship:", relation)

# Mutually exclusive events would share no outcomes, i.e. an overlap count of 0.
overlap = participants["Pass"]
print("Overlap count (Yes AND Pass):", overlap)


P(Pass) = 0.9533333333333334
P(Pass | Participates) = 0.967391304347826
Relationship: dependent
Overlap count (Yes AND Pass): 89


In [24]:
# Bayes' theorem: probability that a student passed given high attendance (H).
# Stated inputs:
P_H_given_P = 0.70   # P(H | Pass)
P_H_given_F = 0.40   # P(H | Fail)
P_H = 0.60           # P(H)

# Recover P(Pass) = x from the law of total probability:
#   P(H) = P(H|P) * x + P(H|F) * (1 - x)
#   =>  x = (P(H) - P(H|F)) / (P(H|P) - P(H|F))
# With the inputs above: 0.60 = 0.40 + 0.30x  ->  x = 2/3.
# Solving in code (instead of hard-coding 2/3) keeps P_Pass consistent with
# the inputs if any of them is changed.
likelihood_gap = P_H_given_P - P_H_given_F
if likelihood_gap == 0:
    # Equal likelihoods make P(H) independent of x, so x cannot be solved for.
    raise ValueError("P(H|Pass) equals P(H|Fail); P(Pass) cannot be recovered from P(H)")

P_Pass = (P_H - P_H_given_F) / likelihood_gap
P_Fail = 1 - P_Pass

# Bayes' theorem: P(Pass | H) = P(H | Pass) * P(Pass) / P(H)
P_Pass_given_H = (P_H_given_P * P_Pass) / P_H

print("P(Pass) =", P_Pass)
print("P(Fail) =", P_Fail)
print("P(Pass | High attendance) =", P_Pass_given_H)


P(Pass) = 0.6666666666666666
P(Fail) = 0.33333333333333337
P(Pass | High attendance) = 0.7777777777777777


                                                                Summary

This analysis applied key probability concepts to a dataset of 150 students. It identified examples of different types of events, calculated empirical and theoretical probabilities, and defined a random variable representing how many students pass the final exam out of a group of three. Using the empirical pass probability and treating the three students as independent, a binomial probability distribution was constructed to study the likelihood of 0–3 passes, along with its mean and variance. The region counts of a Venn diagram (only A, only B, both, neither) were computed to show the overlap between students with high study hours and high attendance. A contingency table helped calculate joint, marginal, and conditional probabilities for group discussion participation and exam performance, and to check whether the two events are independent or mutually exclusive. Finally, Bayes’ theorem was applied to stated attendance probabilities to determine the probability that a student passed the exam given that they had high attendance. Overall, the work demonstrates how probability tools can be applied to real student data to understand academic patterns and relationships.