In [2]:
# Student Academic Achievement Analysis - Data Loading
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Reaching this line means every import statement above ran without raising.
print("✅ All packages imported successfully!")

# Set style for better plots
# NOTE(review): both calls below change global plotting state. The style is
# applied first and the palette second; presumably swapping them would let the
# style call override the palette -- confirm before reordering.
plt.style.use('default')
sns.set_palette('husl')

# Load the dataset from the first candidate location that can be read.
# The paths are relative, so which one works depends on the directory the
# notebook is launched from; they are tried in the order listed.
# Result: `df` holds the loaded DataFrame, or stays None if every path failed.
file_paths = [
    '../data/StudentPerformanceFactors (1).csv',  # From notebooks folder
    'data/StudentPerformanceFactors (1).csv',     # From root
    '../data/raw/StudentPerformanceFactors (1).csv',  # In case moved to raw
]

df = None
for path in file_paths:
    try:
        df = pd.read_csv(path)
    except Exception as err:
        # Keep trying the remaining paths, but say *why* this one failed so a
        # parse or permission problem is not mistaken for a missing file.
        print(f"❌ Failed to load from {path} ({type(err).__name__}: {err})")
    else:
        print(f"✅ Data loaded successfully from: {path}")
        break

if df is not None:
    # Quick sanity summary of what was loaded.
    print(f"Dataset Shape: {df.shape}")
    print(f"Columns: {list(df.columns)}")
    print(f"Average Exam Score: {df['Exam_Score'].mean():.1f}")
    print("\nFirst few rows:")
    print(df.head())
else:
    print("❌ Could not load data from any path. Please check file location.")

✅ All packages imported successfully!
✅ Data loaded successfully from: ../data/StudentPerformanceFactors (1).csv
Dataset Shape: (6607, 20)
Columns: ['Hours_Studied', 'Attendance', 'Parental_Involvement', 'Access_to_Resources', 'Extracurricular_Activities', 'Sleep_Hours', 'Previous_Scores', 'Motivation_Level', 'Internet_Access', 'Tutoring_Sessions', 'Family_Income', 'Teacher_Quality', 'School_Type', 'Peer_Influence', 'Physical_Activity', 'Learning_Disabilities', 'Parental_Education_Level', 'Distance_from_Home', 'Gender', 'Exam_Score']
Average Exam Score: 67.2

First few rows:
   Hours_Studied  Attendance Parental_Involvement Access_to_Resources  \
0             23          84                  Low                High   
1             19          64                  Low              Medium   
2             24          98               Medium              Medium   
3             29          89                  Low              Medium   
4             19          92               Medium    

# Student Academic Achievement Analysis

## Research Questions:
1. **What factors most strongly predict exam performance?**
2. **How do study habits (hours studied) impact academic achievement?**
3. **What's the relationship between parental involvement and student success?**

This notebook analyzes performance data for 6,607 students to identify the key factors associated with academic success.

In [None]:
# Data Overview and Quality Assessment
# Prints the frame's dimensions and memory footprint, summary statistics of
# the Exam_Score target, a missing-value check, and a per-column listing of
# dtype and number of distinct values. Expects `df` from the loading cell.
print("🎯 STUDENT PERFORMANCE DATASET OVERVIEW")
print("="*50)

# deep=True asks pandas to measure the contents of object (string) columns;
# the default shallow measurement under-reports frames with text columns.
memory_kb = df.memory_usage(deep=True).sum() / 1024

print("📊 Dataset Dimensions:")
print(f"   • Total Students: {df.shape[0]:,}")
print(f"   • Total Features: {df.shape[1]}")
print(f"   • Memory Usage: {memory_kb:.1f} KB")

exam_scores = df['Exam_Score']
print("\n🎯 Target Variable (Exam_Score):")
print(f"   • Range: {exam_scores.min()} - {exam_scores.max()}")
print(f"   • Average: {exam_scores.mean():.1f}")
print(f"   • Std Dev: {exam_scores.std():.1f}")

print("\n🔍 Data Quality Check:")
missing_per_column = df.isnull().sum()
missing_values = missing_per_column.sum()
if missing_values == 0:
    print(f"   • Missing Values: {missing_values} (Perfect!)")
else:
    print(f"   • Missing Values: {missing_values}")
    # A grand total alone does not show which columns need cleaning.
    for col, n_missing in missing_per_column[missing_per_column > 0].items():
        print(f"       - {col}: {n_missing}")

print("\n📋 All Available Features:")
for i, col in enumerate(df.columns, 1):
    dtype = str(df[col].dtype)
    unique_vals = df[col].nunique()
    print(f"   {i:2d}. {col:<25} ({dtype:<8}) - {unique_vals} unique values")

In [None]:
# Research Question 1: What factors most predict exam performance?
# Computes the correlation of every numeric column with Exam_Score, lists the
# strongest factors, and draws a scatter plot for the single strongest one.
# Non-numeric columns are excluded by the dtype selection below.
print("🎯 RESEARCH QUESTION 1: Key Predictive Factors")
print("="*60)

target_col = 'Exam_Score'
max_listed = 10

# Analyze correlations with exam scores
numeric_cols = df.select_dtypes(include=[np.number]).columns
signed_corr = df[numeric_cols].corr()[target_col]
# Rank by magnitude: a strong negative relationship is as predictive as a
# strong positive one, so sorting on the signed value would bury it.
correlations = signed_corr.reindex(
    signed_corr.abs().sort_values(ascending=False).index
)
# Remove the target by label instead of assuming it sits in the first row.
factor_correlations = correlations.drop(target_col)

listed = factor_correlations.head(max_listed)
# Report the real count; there may be fewer numeric factors than max_listed.
print(f"🏆 TOP {len(listed)} FACTORS CORRELATED WITH EXAM SCORES:")
print("-" * 50)
for i, (factor, corr) in enumerate(listed.items(), 1):
    direction = "📈 Positive" if corr > 0 else "📉 Negative"
    strength = "Strong" if abs(corr) > 0.5 else "Moderate" if abs(corr) > 0.3 else "Weak"
    print(f"{i:2d}. {factor:<25} {corr:+.3f} ({direction}, {strength})")

# Focus on top 3 factors
top_3_factors = factor_correlations.head(3).index.tolist()
print("\n🎯 TOP 3 MOST PREDICTIVE FACTORS:")
for i, factor in enumerate(top_3_factors, 1):
    corr_value = correlations[factor]
    print(f"   {i}. {factor}: {corr_value:.3f}")

# Quick visualization of top factor (skipped when no numeric factor exists)
if top_3_factors:
    top_factor = top_3_factors[0]
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.scatter(df[top_factor], df[target_col], alpha=0.6, color='steelblue')
    ax.set_xlabel(f'{top_factor} (Top Predictor)')
    ax.set_ylabel('Exam Score')
    ax.set_title(f'Relationship: {top_factor} vs Exam Performance\n(Correlation: {correlations[top_factor]:.3f})')
    ax.grid(True, alpha=0.3)
    plt.show()