<a href="https://colab.research.google.com/github/shokhista98/student-score-analysis/blob/main/Student_Score_Statistics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Student Score Statistics



In [1]:
!pip install --upgrade nbformat nbconvert



In [3]:
import pandas as pd
from scipy.stats import shapiro, ttest_ind
import numpy as np # Often useful, though not strictly needed if pandas handles arrays

# --- 1. Null Hypothesis (H0) for the main comparison ---
# H0: There is no significant difference in the mean scores between female and male students.
# H1: There is a significant difference in the mean scores between female and male students.

# --- Load Data from GitHub ---
# Raw GitHub URL of the dataset. The rest of the script reads the 'Gender'
# and 'Score' columns from the resulting DataFrame.
csv_url = 'https://raw.githubusercontent.com/shokhista98/student-score-analysis/refs/heads/main/students_scores.csv'

try:
    df = pd.read_csv(csv_url)
except Exception as e:
    # Deliberately broad: this is the script's top-level boundary, and any
    # failure to fetch or parse the CSV makes the rest of the analysis
    # impossible.
    print(f"Error loading CSV from URL: {e}")
    print("Please ensure your CSV is uploaded to GitHub and the raw URL is correct.")
    # Stop with a non-zero status instead of calling exit(): exit() is a
    # helper injected by the `site` module for interactive use, and calling
    # it without arguments reports success (status 0) even though loading
    # failed.
    raise SystemExit(1) from e


# Quick look at what was loaded: the first rows, the row count, and the
# column/dtype summary.
print("--- Data Head ---", df.head(), sep="\n")
print(f"\nTotal observations: {len(df)}", "\n--- Data Info ---", sep="\n")
df.info()  # prints its summary itself; there is no return value to print
print("\n" + "="*50 + "\n")

# --- 2. Normality Test (Shapiro-Wilk Test) on the 'Score' column for all students ---
# H0 (Normality): The 'Score' data (for all students combined) is normally distributed.
# H1 (Normality): The 'Score' data is not normally distributed.

# Validate the 'Score' column before it is handed to the normality test.
if 'Score' not in df.columns:
    print("Error: 'Score' column not found in the CSV.")
    # Non-zero status: the analysis cannot continue without this column.
    raise SystemExit(1)

if not pd.api.types.is_numeric_dtype(df['Score']):
    # Recoverable: try to convert, turning unparseable entries into NaN.
    print(f"Warning: 'Score' column is not numeric (data type: {df['Score'].dtype}); attempting conversion.")
    df['Score'] = pd.to_numeric(df['Score'], errors='coerce')

# Drop missing scores whether they came from the conversion above or were
# already missing in a numeric column; otherwise NaN would be passed on to
# the Shapiro-Wilk test.
invalid_score_count = int(df['Score'].isna().sum())
if invalid_score_count:
    print(f"Warning: {invalid_score_count} 'Score' value(s) are missing or could not be converted to numeric.")
    df = df.dropna(subset=['Score'])
    print(f"Proceeding with {len(df)} valid scores for normality test.")

if len(df) < 3:  # Shapiro-Wilk needs at least 3 samples
    print("Error: Not enough valid numeric data in 'Score' column for Shapiro-Wilk test (need at least 3).")
    raise SystemExit(1)

print("--- Shapiro-Wilk Normality Test (on all scores) ---")
# Shapiro-Wilk tests the null hypothesis that the sample was drawn from a
# normal distribution; it is run once on every score, both genders combined.
alpha_normality = 0.05
statistic_sw, p_value_sw = shapiro(df['Score'])

print(f"Shapiro-Wilk Test Statistic: {statistic_sw:.4f}")
print(f"P-value: {p_value_sw:.4f}")

# Assemble the interpretation for whichever side of alpha the p-value falls
# on, then print it in one go.
if p_value_sw > alpha_normality:
    normality_report = [
        f"Interpretation (p={p_value_sw:.4f} > alpha={alpha_normality}):",
        "Fail to reject the null hypothesis for normality (H0).",
        "The 'Score' data appears to be normally distributed.",
    ]
else:
    normality_report = [
        f"Interpretation (p={p_value_sw:.4f} <= alpha={alpha_normality}):",
        "Reject the null hypothesis for normality (H0).",
        "The 'Score' data does not appear to be normally distributed.",
    ]
print(*normality_report, sep="\n")
print("\n" + "="*50 + "\n")

# --- 3. Unpaired Student's t-test ---
# This test is performed regardless of the normality test outcome, as per instructions.
# It compares the means of two independent groups.

print("--- Unpaired Student's t-test (Female vs. Male Scores) ---")
# H0 (t-test): There is no significant difference in the mean scores between female and male students.
# H1 (t-test): There is a significant difference in the mean scores between female and male students.

# Ensure 'Gender' column exists
if 'Gender' not in df.columns:
    print("Error: 'Gender' column not found in the CSV.")
    # Non-zero status: a missing column is a failure, not a clean exit.
    raise SystemExit(1)

# Split the scores by gender label; rows with a missing score are ignored.
female_scores = df.loc[df['Gender'] == 'Female', 'Score'].dropna()
male_scores = df.loc[df['Gender'] == 'Male', 'Score'].dropna()

if len(female_scores) < 2 or len(male_scores) < 2:
    print("Error: Not enough data in one or both groups for t-test (need at least 2 per group).")
    print(f"Female scores count: {len(female_scores)}, Male scores count: {len(male_scores)}")
    raise SystemExit(1)

# Called with its defaults, i.e. the two-sample test on independent groups.
t_statistic, p_value_ttest = ttest_ind(female_scores, male_scores)

print(f"Mean score for Females: {female_scores.mean():.2f}")
print(f"Mean score for Males: {male_scores.mean():.2f}")
print(f"T-statistic: {t_statistic:.4f}")
print(f"P-value: {p_value_ttest:.4f}")

alpha_ttest = 0.05
if np.isnan(p_value_ttest):
    # Any comparison with NaN is False, so without this branch an undefined
    # p-value would fall through to "reject H0" and be reported as a
    # significant difference. This can occur, for example, when the scores
    # show no variation at all.
    print("Interpretation: the p-value is undefined (NaN), so the t-test is inconclusive.")
    print("No conclusion about the difference in mean scores can be drawn from this data.")
elif p_value_ttest > alpha_ttest:
    print(f"Interpretation (p={p_value_ttest:.4f} > alpha={alpha_ttest}):")
    print("Fail to reject the null hypothesis (H0) for the t-test.")
    print("There is no statistically significant difference in mean scores between female and male students.")
else:
    print(f"Interpretation (p={p_value_ttest:.4f} <= alpha={alpha_ttest}):")
    print("Reject the null hypothesis (H0) for the t-test.")
    print("There is a statistically significant difference in mean scores between female and male students.")

print("\n--- End of Analysis ---")

--- Data Head ---
   Gender  Score
0  Female   92.6
1  Female   79.0
2  Female   84.8
3  Female   97.4
4  Female   93.7

Total observations: 40

--- Data Info ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40 entries, 0 to 39
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Gender  40 non-null     object 
 1   Score   40 non-null     float64
dtypes: float64(1), object(1)
memory usage: 772.0+ bytes


--- Shapiro-Wilk Normality Test (on all scores) ---
Shapiro-Wilk Test Statistic: 0.9648
P-value: 0.2439
Interpretation (p=0.2439 > alpha=0.05):
Fail to reject the null hypothesis for normality (H0).
The 'Score' data appears to be normally distributed.


--- Unpaired Student's t-test (Female vs. Male Scores) ---
Mean score for Females: 80.68
Mean score for Males: 70.67
T-statistic: 2.6310
P-value: 0.0122
Interpretation (p=0.0122 <= alpha=0.05):
Reject the null hypothesis (H0) for the t-test.
There is a statistically signifi