# 🎓 Student Performance Data Analysis

This notebook explores the relationship between demographic and social factors and student performance using Python, SQL, and data visualization.

In [None]:

# Import packages
import sqlite3
from contextlib import closing
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import ttest_ind

# Load the raw dataset
df = pd.read_csv('../data/raw/StudentsPerformance.csv')

# Normalise column names: trim surrounding whitespace, lower-case,
# and replace spaces with underscores so they are valid snake_case keys
df.columns = [col.strip().lower().replace(" ", "_") for col in df.columns]

# Report the number of missing values in each column
print(df.isnull().sum())

# Per-student mean of the three subject scores
score_columns = ['math_score', 'reading_score', 'writing_score']
df['average_score'] = df[score_columns].mean(axis=1)

# Save the cleaned data. to_csv does not create missing folders, so make
# sure the output directory exists first (e.g. on a fresh checkout).
cleaned_path = Path('../data/cleaned/students_performance_cleaned.csv')
cleaned_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(cleaned_path, index=False)

# Preview the first rows and the frame's structure.
# df.info() writes its report itself and returns None, so it must not be
# wrapped in print() -- doing so emits a stray "None" line.
print(df.head())
df.info()


## 📊 Average Score by Gender

In [None]:
# Mean of average_score for each gender, flattened into a two-column table
gender_means = df.groupby('gender')['average_score'].mean()
print(gender_means.reset_index())

## 🎓 Performance by Parental Level of Education

In [None]:
# Mean of average_score per parental education level, highest mean first
education_means = df.groupby('parental_level_of_education')['average_score'].mean()
print(education_means.sort_values(ascending=False))

## 📘 Impact of Test Preparation Course

In [None]:
# Mean of average_score and number of students for each test-prep status
scores_by_prep = df.groupby('test_preparation_course')['average_score']
print(scores_by_prep.agg(['mean', 'count']))

## 📈 Score Distribution Visualization

In [None]:

# Histogram (20 bins) of the average scores with a KDE curve overlaid.
# An explicit figure/axes pair is used instead of the implicit pyplot state
# so the plot cannot accidentally draw onto a figure left over from another cell.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['average_score'], bins=20, kde=True, ax=ax)
ax.set_title('Distribution of Average Scores')
ax.set_xlabel('Average Score')
ax.set_ylabel('Frequency')

# savefig does not create missing folders, so make sure the target
# directory exists before writing the image.
figure_path = Path('../visualizations/score_distribution.png')
figure_path.parent.mkdir(parents=True, exist_ok=True)
fig.savefig(figure_path)
plt.show()


## 🧠 Statistical Testing: Does Test Prep Help?

In [None]:

# Average scores of students who completed the prep course vs. those who did not
prep = df.loc[df['test_preparation_course'] == 'completed', 'average_score']
no_prep = df.loc[df['test_preparation_course'] == 'none', 'average_score']

# Two-sample t-test on the group means.
# The groups are produced by filtering, so nothing guarantees equal sizes or
# equal variances; equal_var=False selects Welch's t-test, which does not
# rely on the equal-variance assumption of the default Student's t-test.
t_stat, p_val = ttest_ind(prep, no_prep, equal_var=False)
print(f"T-statistic: {t_stat}, P-value: {p_val}")


## 🗃️ SQL: Load Cleaned Data and Query for Insights

In [None]:

# Location of the SQLite database file; sqlite3 creates the file on connect
# but not its parent folder, so make sure the folder exists.
db_path = Path('../sql/students.db')
db_path.parent.mkdir(parents=True, exist_ok=True)

# Example query: mean average score per gender
query = '''
SELECT gender, AVG(average_score) AS avg_score
FROM students_performance
GROUP BY gender;
'''

# closing() guarantees the connection is closed when the block exits, even if
# a statement raises. A bare `with sqlite3.connect(...)` would only manage
# the transaction (commit/rollback) and leave the connection open.
with closing(sqlite3.connect(db_path)) as conn:
    # Load the cleaned data into a table, replacing any previous version
    df.to_sql('students_performance', conn, if_exists='replace', index=False)
    result = pd.read_sql(query, conn)

print(result)


## ✅ Conclusion
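This analysis reveals insights into how factors like test prep, gender, and socioeconomic indicators such as parental level of education relate to academic performance. Because the data are observational, these relationships are associations rather than proven causes; even so, they can inform future educational strategies and policy decisions.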