In [1]:
import os

# Create folders
folders = ["raw_data", "cleaned_data", "source_code", "results"]
for folder in folders:
    os.makedirs(folder, exist_ok=True)

# Verify folder creation
!ls

cleaned_data  raw_data	results  sample_data  source_code


In [2]:
import pandas as pd

# Load raw data. `df` stays bound to the untouched raw frame so the
# cleaning step below can be re-run without re-reading the file.
raw_data_path = "raw_data/StudentsPerformance.csv"
df = pd.read_csv(raw_data_path)

# Report data-quality problems found in the raw file
# (per-column count of missing values, then count of fully duplicated rows).
print("Missing values:\n", df.isnull().sum())
print("Duplicates:", df.duplicated().sum())

# Actually apply the cleaning the checks above are looking for: drop exact
# duplicate rows and rows with any missing value. When the raw file has
# neither, this is a no-op and the saved file matches the raw data.
df_clean = df.drop_duplicates().dropna()

# Save cleaned data (index=False keeps the row index out of the CSV)
cleaned_data_path = "cleaned_data/cleaned_students_performance.csv"
df_clean.to_csv(cleaned_data_path, index=False)

print("Cleaned data saved to:", cleaned_data_path)

Missing values:
 gender                         0
race/ethnicity                 0
parental level of education    0
lunch                          0
test preparation course        0
math score                     0
reading score                  0
writing score                  0
dtype: int64
Duplicates: 0
Cleaned data saved to: cleaned_data/cleaned_students_performance.csv


In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load cleaned data
cleaned_data_path = "cleaned_data/cleaned_students_performance.csv"
df = pd.read_csv(cleaned_data_path)


def _save_and_close(fig, path):
    """Write ``fig`` to ``path`` and release the figure.

    ``bbox_inches='tight'`` grows the saved area to include everything drawn
    on the figure, so rotated tick labels and legends are not cut off at the
    image edge. Closing the figure afterwards frees its memory and keeps it
    from being rendered inline a second time.
    """
    fig.savefig(path, bbox_inches='tight')
    plt.close(fig)


# Visualization 1: Distribution of Math Scores
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(df['math score'], bins=20, kde=True, color='blue', ax=ax)
ax.set_title('Distribution of Math Scores')
ax.set_xlabel('Math Score')
ax.set_ylabel('Frequency')
_save_and_close(fig, 'results/visualization1.png')

# Visualization 2: Average Scores by Gender
avg_scores_by_gender = df.groupby('gender')[['math score', 'reading score', 'writing score']].mean()
fig, ax = plt.subplots(figsize=(10, 6))
avg_scores_by_gender.plot(kind='bar', ax=ax)
ax.set_title('Average Scores by Gender')
ax.set_xlabel('Gender')
ax.set_ylabel('Average Score')
_save_and_close(fig, 'results/visualization2.png')

# Visualization 3: Correlation Heatmap
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(df[['math score', 'reading score', 'writing score']].corr(), annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Heatmap of Scores')
_save_and_close(fig, 'results/visualization3.png')

# Visualization 4: Boxplot of Math Scores by Parental Education
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x='parental level of education', y='math score', data=df, ax=ax)
ax.set_title('Math Scores by Parental Education Level')
# Category labels are long; rotate them so neighbouring labels do not overlap.
ax.tick_params(axis='x', labelrotation=45)
_save_and_close(fig, 'results/visualization4.png')

# Visualization 5: Scatter Plot of Reading vs. Writing Scores
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(x='reading score', y='writing score', hue='gender', data=df, ax=ax)
ax.set_title('Reading vs. Writing Scores by Gender')
_save_and_close(fig, 'results/visualization5.png')

print("Visualizations saved to the results folder.")

Visualizations saved to the results folder.
