<a href="https://colab.research.google.com/github/sageh9120/MSSP-6070/blob/main/week_12.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Import necessary libraries
import pandas as pd
from scipy import stats
import numpy as np

# Significance threshold shared by every hypothesis test in this notebook
ALPHA = 0.05

# --- Step 1: Data Loading and Preparation ---
print("--- Step 1: Data Loading and Preparation ---")

# Read the student performance CSV; on a missing file, print a hint and
# re-raise so the cell stops instead of continuing without data.
try:
    df = pd.read_csv('Student+Performance+Data.csv')
except FileNotFoundError:
    print("❌ Error: 'Student+Performance+Data.csv' not found. Ensure the file is uploaded to the Colab environment.")
    raise
else:
    print("✅ Data loaded successfully!")

# Make sure a 'total score' column exists: keep it when the file already
# provides one, otherwise derive it as math + reading + writing.
if 'total score' in df.columns:
    print("✅ 'total score' column confirmed.")
else:
    df['total score'] = df['math score'] + df['reading score'] + df['writing score']
    print("✅ 'total score' column calculated.")

# Report the dimensions of the loaded frame, then close the section
print(f"\nDataset size: {df.shape[0]} rows, {df.shape[1]} columns")
print("-------------------------------------------\n")


# --- Step 2: Answer Question 2 - Prep Course Impact on Males ---
# Question: Is there a statistical difference between males who took the preparation exam and others (in total score)?
#
# Approach: keep only rows where 'gender' is 'male', split them on
# 'test preparation course' ('completed' vs 'none'), and compare the
# 'total score' of the two groups with Welch's t-test at level ALPHA.

print("--- Step 2: T-Test for Question 2: Prep Course Impact on Males (Total Score) ---")

# 1. Filter the dataset for male students only
males_df = df[df['gender'] == 'male']

# 2. Define the two groups' data for comparison
# Group A: Males who completed the preparation course
group_completed = males_df[males_df['test preparation course'] == 'completed']['total score']
# Group B: Males who took none of the preparation course
group_none = males_df[males_df['test preparation course'] == 'none']['total score']

# 3. Output Descriptive Statistics (group mean and group size)
print(f"Mean Total Score (Completed Prep): {group_completed.mean():.2f} (n={len(group_completed)})")
print(f"Mean Total Score (No Prep): {group_none.mean():.2f} (n={len(group_none)})")

# 4. Perform an Independent Samples t-test (using Welch's t-test: equal_var=False),
#    which does not assume the two groups share the same variance.
t_stat_q2, p_value_q2 = stats.ttest_ind(group_completed, group_none, equal_var=False)

# 5. Print T-Test Results and Interpretation
print("\n[T-Test Results (Question 2)]")
print(f"T-Statistic: {t_stat_q2:.4f}")
# General ('g') format keeps 4 significant digits and switches to scientific
# notation for tiny values, so a very small p-value is not shown as "0.0000".
print(f"P-Value: {p_value_q2:.4g}")

if np.isnan(p_value_q2):
    # NaN compares False against ALPHA, so without this branch an uncomputable
    # test (e.g. an empty group or missing scores) would be reported as
    # "no significant difference".
    print("Conclusion: The t-test could not be computed (p-value is NaN). Check that both groups contain enough non-missing total scores.")
elif p_value_q2 < ALPHA:
    print(f"Conclusion: Reject the Null Hypothesis ($H_0$). Since p-value < {ALPHA}, there is a **statistically significant difference** in the total scores of males based on whether they completed the test preparation course.")
else:
    # This branch covers p == ALPHA as well, hence ">=" in the message.
    print(f"Conclusion: Fail to Reject the Null Hypothesis ($H_0$). Since p-value >= {ALPHA}, there is **no statistically significant difference** in the total scores of males based on whether they completed the course.")

print("-------------------------------------------\n")


# --- Step 3: Answer Question 3 - Other Differences (Exploratory Analysis: Lunch Type) ---
# Question: Are there any other differences that are of interest? (Selected: Lunch Type vs. Total Score)
#
# Approach: split all students on the 'lunch' column ('standard' vs
# 'free/reduced') and compare 'total score' between the two groups with
# Welch's t-test at level ALPHA.

print("--- Step 3: T-Test for Question 3: Exploratory Analysis (Lunch Type vs. Total Score) ---")

# 1. Define the two groups' data for comparison
# Group A: Standard Lunch
group_standard_lunch = df[df['lunch'] == 'standard']['total score']
# Group B: Free/Reduced Lunch
group_free_reduced_lunch = df[df['lunch'] == 'free/reduced']['total score']

# 2. Output Descriptive Statistics (group mean and group size)
print(f"Mean Total Score (Standard Lunch): {group_standard_lunch.mean():.2f} (n={len(group_standard_lunch)})")
print(f"Mean Total Score (Free/Reduced Lunch): {group_free_reduced_lunch.mean():.2f} (n={len(group_free_reduced_lunch)})")

# 3. Perform an Independent Samples t-test (Welch's variant: equal_var=False),
#    which does not assume the two groups share the same variance.
t_stat_q3, p_value_q3 = stats.ttest_ind(group_standard_lunch, group_free_reduced_lunch, equal_var=False)

# 4. Print T-Test Results and Interpretation
print("\n[T-Test Results (Question 3 - Lunch Type)]")
print(f"T-Statistic: {t_stat_q3:.4f}")
# General ('g') format keeps 4 significant digits and switches to scientific
# notation for tiny values, so a very small p-value is not shown as "0.0000".
print(f"P-Value: {p_value_q3:.4g}")

if np.isnan(p_value_q3):
    # NaN compares False against ALPHA, so without this branch an uncomputable
    # test (e.g. an empty group or missing scores) would be reported as
    # "no significant difference".
    print("Conclusion: The t-test could not be computed (p-value is NaN). Check that both groups contain enough non-missing total scores.")
elif p_value_q3 < ALPHA:
    print(f"Conclusion: Reject the Null Hypothesis ($H_0$). Since p-value < {ALPHA}, there is a **statistically significant difference** in total scores between students who receive standard lunch and those who receive free/reduced lunch.")
else:
    # This branch covers p == ALPHA as well, hence ">=" in the message.
    print(f"Conclusion: Fail to Reject the Null Hypothesis ($H_0$). Since p-value >= {ALPHA}, there is **no statistically significant difference** in total scores based on lunch type.")

print("\n--- Analysis Complete ---")

--- Step 1: Data Loading and Preparation ---
✅ Data loaded successfully!
✅ 'total score' column confirmed.

Dataset size: 1000 rows, 10 columns
-------------------------------------------

--- Step 2: T-Test for Question 2: Prep Course Impact on Males (Total Score) ---
Mean Total Score (Completed Prep): 212.34 (n=174)
Mean Total Score (No Prep): 189.13 (n=308)

[T-Test Results (Question 2)]
T-Statistic: 6.1814
P-Value: 0.0000
Conclusion: Reject the Null Hypothesis ($H_0$). Since p-value < 0.05, there is a **statistically significant difference** in the total scores of males based on whether they completed the test preparation course.
-------------------------------------------

--- Step 3: T-Test for Question 3: Exploratory Analysis (Lunch Type vs. Total Score) ---
Mean Total Score (Standard Lunch): 212.51 (n=645)
Mean Total Score (Free/Reduced Lunch): 186.60 (n=355)

[T-Test Results (Question 3 - Lunch Type)]
T-Statistic: 9.3232
P-Value: 0.0000
Conclusion: Reject the Null Hypothesis (