# Student Performance Analysis Project

## 1. Project Setup and Data Preparation
### 1.1 Set up environment

In [None]:
# Install necessary libraries (uncomment if needed)
# !pip install pandas numpy matplotlib seaborn scikit-learn plotly scipy streamlit

# Import libraries
import os

import joblib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import plotly.express as px
from scipy import stats
from scipy.stats import f_oneway, chi2_contingency

# Configure plots for better display in Jupyter
%matplotlib inline
plt.style.use('seaborn-whitegrid')
sns.set_style('whitegrid')

### 1.2 Load and inspect the data

In [None]:
# Load the data (path is relative to the notebook's working directory)
df = pd.read_csv('student_performance_large_dataset.csv')

# View basic information
print("Dataset information:")
# DataFrame.info() prints its report itself and returns None, so it must not be
# wrapped in print() -- that would append a stray "None" line to the output.
df.info()
print("\nSummary statistics:")
print(df.describe())

# Check for missing values
print("\nMissing values in each column:")
print(df.isnull().sum())

### 1.3 Data cleaning

In [None]:
# Handle any missing values by dropping incomplete rows
# (switch to an imputation strategy here if losing rows is not acceptable)
df_cleaned = df.dropna()
print(f"Rows before cleaning: {len(df)}")
print(f"Rows after removing missing values: {len(df_cleaned)}")

# Check for duplicates
duplicates = df_cleaned.duplicated().sum()
print(f"Number of duplicate entries: {duplicates}")
df_cleaned = df_cleaned.drop_duplicates()
print(f"Rows after removing duplicates: {len(df_cleaned)}")

# Convert categorical variables to appropriate types
categorical_cols = ['Gender', 'Preferred_Learning_Style', 'Participation_in_Discussions',
                    'Use_of_Educational_Tech', 'Self_Reported_Stress_Level', 'Final_Grade']
# One astype() call returns a brand-new frame. Assigning column by column into
# the result of dropna()/drop_duplicates() can raise SettingWithCopyWarning
# whenever rows were actually removed.
df_cleaned = df_cleaned.astype({col: 'category' for col in categorical_cols})

# Verify the data types
print("\nUpdated data types:")
print(df_cleaned.dtypes)

# From here on `df` refers to the cleaned data; every later cell reads `df`.
df = df_cleaned

## 2. Exploratory Data Analysis (EDA)
### 2.1 Distribution of key variables

In [None]:
# Numeric columns profiled throughout the EDA section
numerical_cols = [
    'Age',
    'Study_Hours_per_Week',
    'Online_Courses_Completed',
    'Assignment_Completion_Rate (%)',
    'Exam_Score (%)',
    'Attendance_Rate (%)',
    'Time_Spent_on_Social_Media (hours/week)',
    'Sleep_Hours_per_Night',
]

# One histogram with a KDE overlay per numeric column, laid out on a 2 x 4 grid
fig, axes = plt.subplots(2, 4, figsize=(15, 10))
for ax, col in zip(axes.flat, numerical_cols):
    sns.histplot(df[col], kde=True, ax=ax)
    ax.set_title(f'Distribution of {col}')
fig.tight_layout()
fig.savefig('outputs/plots/numerical_distributions.png', dpi=300)
plt.show()

# One count plot per categorical column (defined in the cleaning cell), 2 x 3 grid
fig, axes = plt.subplots(2, 3, figsize=(15, 10))
for ax, col in zip(axes.flat, categorical_cols):
    sns.countplot(data=df, x=col, ax=ax)
    ax.set_title(f'Distribution of {col}')
    # Tilt the category labels so longer names do not overlap
    ax.tick_params(axis='x', rotation=45)
fig.tight_layout()
fig.savefig('outputs/plots/categorical_distributions.png', dpi=300)
plt.show()

### 2.2 Correlation analysis

In [None]:
# Pairwise correlations (DataFrame.corr default method) between the numeric columns
correlation_matrix = df[numerical_cols].corr()

# Annotated heatmap of the correlation matrix
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    linewidths=0.5,
    ax=ax,
)
ax.set_title('Correlation Matrix of Numerical Variables')
fig.savefig('outputs/plots/correlation_matrix.png', dpi=300)
plt.show()

### 2.3 Relationship between study hours and exam score

In [None]:
# Weekly study hours against exam score, coloured by gender
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=df,
    x='Study_Hours_per_Week',
    y='Exam_Score (%)',
    hue='Gender',
    alpha=0.7,
    ax=ax,
)
ax.set_title('Relationship Between Study Hours and Exam Score')
fig.savefig('outputs/plots/study_hours_vs_exam_score.png', dpi=300)
plt.show()

### 2.4 Performance by learning style

In [None]:
# Exam score distribution for each preferred learning style
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=df, x='Preferred_Learning_Style', y='Exam_Score (%)', ax=ax)
ax.set_title('Exam Scores by Learning Style')
fig.savefig('outputs/plots/exam_scores_by_learning_style.png', dpi=300)
plt.show()

### 2.5 Impact of sleep and social media

In [None]:
# Two side-by-side scatter panels sharing the same target and hue:
# (x column, panel title) for the left and right axes respectively.
panels = [
    ('Sleep_Hours_per_Night', 'Sleep Hours vs Exam Score'),
    ('Time_Spent_on_Social_Media (hours/week)', 'Social Media Usage vs Exam Score'),
]

fig, axes = plt.subplots(1, 2, figsize=(15, 6))
for ax, (x_col, panel_title) in zip(axes, panels):
    sns.scatterplot(
        data=df,
        x=x_col,
        y='Exam_Score (%)',
        hue='Self_Reported_Stress_Level',
        ax=ax,
    )
    ax.set_title(panel_title)

fig.tight_layout()
fig.savefig('outputs/plots/sleep_social_media_impact.png', dpi=300)
plt.show()

## 3. Statistical Analysis
### 3.1 Hypothesis testing - Effect of participation in discussions

In [None]:
# Two-sample t-test (scipy's default settings) on exam scores:
# students who report taking part in discussions ('Yes') vs. those who do not ('No').
exam_scores = df['Exam_Score (%)']
participation = df['Participation_in_Discussions']

participants = exam_scores[participation == 'Yes']
non_participants = exam_scores[participation == 'No']

t_stat, p_value = stats.ttest_ind(participants, non_participants)
print(f"T-test results: t-statistic = {t_stat:.4f}, p-value = {p_value:.4f}")
print(f"Mean exam score for participants: {participants.mean():.2f}%")
print(f"Mean exam score for non-participants: {non_participants.mean():.2f}%")

### 3.2 ANOVA - Comparing learning styles

In [None]:
# One-way ANOVA: does the mean exam score differ between learning styles?
exam_scores = df['Exam_Score (%)']
learning_style = df['Preferred_Learning_Style']

# One group of exam scores per learning style label
visual = exam_scores[learning_style == 'Visual']
auditory = exam_scores[learning_style == 'Auditory']
reading_writing = exam_scores[learning_style == 'Reading/Writing']
kinesthetic = exam_scores[learning_style == 'Kinesthetic']

# Perform ANOVA across the four groups
f_stat, p_value = f_oneway(visual, auditory, reading_writing, kinesthetic)
print(f"ANOVA results: F-statistic = {f_stat:.4f}, p-value = {p_value:.4f}")
print(f"Mean exam scores by learning style:")
print(f"Visual: {visual.mean():.2f}%")
print(f"Auditory: {auditory.mean():.2f}%")
print(f"Reading/Writing: {reading_writing.mean():.2f}%")
print(f"Kinesthetic: {kinesthetic.mean():.2f}%")

### 3.3 Chi-squared test - Learning style and final grade

In [None]:
# Chi-squared test of independence between learning style and final grade.
# Rows of the contingency table are learning styles, columns are final grades.
contingency_table = pd.crosstab(
    index=df['Preferred_Learning_Style'],
    columns=df['Final_Grade'],
)
print("Contingency Table:")
print(contingency_table)

# chi2_contingency returns the statistic, p-value, degrees of freedom and the
# table of expected frequencies, in that order.
chi2, p, dof, expected = chi2_contingency(contingency_table)
print(f"Chi-squared test results: chi2 = {chi2:.4f}, p-value = {p:.4f}")

## 4. Predictive Modeling
### 4.1 Data preparation for modeling

In [None]:
# Define features and target. The identifier column, the target itself and
# Final_Grade are excluded from the features.
# NOTE(review): Final_Grade is presumably derived from the exam score (leakage) -- confirm.
X = df.drop(['Student_ID', 'Exam_Score (%)', 'Final_Grade'], axis=1)
y = df['Exam_Score (%)']

# Split data into training and testing sets (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"Training set size: {X_train.shape[0]} samples")
print(f"Test set size: {X_test.shape[0]} samples")

# Identify numerical and categorical columns.
# 'number' matches every numeric dtype rather than only int64/float64, and
# plain object (string) columns are one-hot encoded together with 'category' ones.
numerical_cols = X.select_dtypes(include='number').columns.tolist()
categorical_cols = X.select_dtypes(include=['category', 'object']).columns.tolist()
print(f"Numerical columns: {len(numerical_cols)}")
print(f"Categorical columns: {len(categorical_cols)}")

# ColumnTransformer drops every column that is not listed in a transformer
# (remainder='drop' is the default), so surface anything that falls through.
unassigned_cols = [col for col in X.columns
                   if col not in numerical_cols and col not in categorical_cols]
if unassigned_cols:
    print(f"Warning: columns ignored by the preprocessor: {unassigned_cols}")

# Create preprocessing pipeline: scale numeric features, one-hot encode the rest.
# Output columns follow the transformer order: 'num' first, then 'cat'.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ])

### 4.2 Linear Regression model

In [None]:
# Create pipeline with preprocessing and linear regression.
# Pipeline does not clone its steps, so this pipeline holds the very same
# `preprocessor` object that was built in the data-preparation cell.
lr_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

# Train the model
lr_pipeline.fit(X_train, y_train)

# Make predictions on the held-out test set
y_pred_lr = lr_pipeline.predict(X_test)

# Evaluate the model
mse_lr = mean_squared_error(y_test, y_pred_lr)
r2_lr = r2_score(y_test, y_pred_lr)
print("Linear Regression Results:")
print(f"Mean Squared Error: {mse_lr:.2f}")
print(f"R² Score: {r2_lr:.2f}")

# Save the fitted pipeline (joblib is imported with the other libraries at the
# top of the notebook, so later cells no longer depend on this cell for it)
joblib.dump(lr_pipeline, 'outputs/models/linear_regression_model.pkl')

### 4.3 Random Forest model

In [None]:
# Same preprocessing as the linear model, followed by a 100-tree random forest
# with a fixed seed.
rf_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42)),
])

# Fit on the training split, then predict the held-out test split
rf_pipeline.fit(X_train, y_train)
y_pred_rf = rf_pipeline.predict(X_test)

# Score the predictions against the true test targets
mse_rf, r2_rf = (
    mean_squared_error(y_test, y_pred_rf),
    r2_score(y_test, y_pred_rf),
)
print(f"Random Forest Results:")
print(f"Mean Squared Error: {mse_rf:.2f}")
print(f"R² Score: {r2_rf:.2f}")

# Persist the fitted pipeline to disk
joblib.dump(rf_pipeline, 'outputs/models/random_forest_model.pkl')

### 4.4 Feature importance analysis

In [None]:
# Rebuild the feature names in the SAME order the fitted ColumnTransformer
# emits its output columns: transformers are applied in the order they were
# declared, i.e. the scaled numeric columns ('num') come first and the one-hot
# columns ('cat') second. Listing the categorical names first would pair every
# importance value with the wrong feature.
fitted_preprocessor = rf_pipeline.named_steps['preprocessor']

feature_names = list(numerical_cols)
if categorical_cols:
    # categories_[i] holds the levels learned for the i-th categorical column
    ohe = fitted_preprocessor.named_transformers_['cat']
    for col, categories in zip(categorical_cols, ohe.categories_):
        feature_names.extend(f"{col}_{category}" for category in categories)

# Get feature importances from the random forest model
rf_model = rf_pipeline.named_steps['regressor']
importances = rf_model.feature_importances_

# Create DataFrame for easier manipulation, most important feature first
feature_importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': importances
}).sort_values(by='Importance', ascending=False)

# Plot feature importances
plt.figure(figsize=(12, 8))
sns.barplot(x='Importance', y='Feature', data=feature_importance_df.head(15))
plt.title('Top 15 Feature Importances')
plt.tight_layout()
plt.savefig('outputs/plots/feature_importances.png', dpi=300)
plt.show()

## 5. Advanced Visualizations
### 5.1 Create interactive plots with Plotly

In [None]:
# Interactive scatter plot for study hours vs exam score by learning style;
# marker size encodes the attendance rate.
fig = px.scatter(df, x='Study_Hours_per_Week', y='Exam_Score (%)',
                 color='Preferred_Learning_Style', size='Attendance_Rate (%)',
                 hover_data=['Student_ID', 'Age', 'Gender', 'Final_Grade'],
                 title='Study Hours vs Exam Score by Learning Style')
fig.write_html('outputs/interactive/interactive_scatter.html')
# Display in notebook
fig.show()

# Interactive box plot for exam score by stress level and gender
fig = px.box(df, x='Self_Reported_Stress_Level', y='Exam_Score (%)',
             color='Gender', title='Exam Scores by Stress Level and Gender')
fig.write_html('outputs/interactive/stress_gender_boxplot.html')
# Display in notebook
fig.show()

# Interactive heatmap for average exam score by study hours and sleep:
# study hours are cut into 10 equal-width bins (rows), sleep into 7 (columns).
study_sleep_heatmap = df.pivot_table(
    values='Exam_Score (%)',
    index=pd.cut(df['Study_Hours_per_Week'], bins=10),
    columns=pd.cut(df['Sleep_Hours_per_Night'], bins=7),
    aggfunc='mean',
    # Stated explicitly so the result does not depend on the pandas version's
    # default for categorical groupers.
    observed=False,
)
# pd.cut labels the bins with pandas Interval objects, which Plotly cannot
# serialise to JSON when it uses them as axis ticks; use their string form.
study_sleep_heatmap.index = study_sleep_heatmap.index.astype(str)
study_sleep_heatmap.columns = study_sleep_heatmap.columns.astype(str)

fig = px.imshow(study_sleep_heatmap,
                labels=dict(x="Sleep Hours", y="Study Hours", color="Exam Score (%)"),
                title="Average Exam Score by Study Hours and Sleep Duration")
fig.write_html('outputs/interactive/study_sleep_heatmap.html')
# Display in notebook
fig.show()

## 6. Findings and Report Creation
### 6.1 Compile key findings

In [None]:
# Compile and save key findings

# Bucket weekly study hours so the edges match the labels: '<10 hrs' is [0, 10),
# '10-20 hrs' is [10, 20), ... and '40+ hrs' is everything from 40 upwards.
# (Right-closed bins capped at 50 would leave 0 and anything above 50
# uncategorised and would put exactly 10 hours into '<10 hrs'.)
# The column is stored on df, as before, so it stays available afterwards.
df['Study_Hours_Category'] = pd.cut(
    df['Study_Hours_per_Week'],
    bins=[0, 10, 20, 30, 40, np.inf],
    right=False,
    labels=['<10 hrs', '10-20 hrs', '20-30 hrs', '30-40 hrs', '40+ hrs'],
)
# Mean exam score per bucket, in bucket order; empty buckets and rows without
# a bucket are left out instead of being reported as "nan".
study_hours_avg = df.groupby('Study_Hours_Category', observed=True)['Exam_Score (%)'].mean()

# utf-8 is requested explicitly because the report contains the non-ASCII "R²"
# and the platform default encoding is not guaranteed to handle it.
with open('outputs/key_findings.txt', 'w', encoding='utf-8') as f:
    f.write("# Key Findings from Student Performance Analysis\n\n")

    f.write("## Correlation Analysis\n")
    f.write("- Study Hours and Exam Score correlation: {:.2f}\n".format(
        df['Study_Hours_per_Week'].corr(df['Exam_Score (%)'])))
    f.write("- Attendance and Exam Score correlation: {:.2f}\n".format(
        df['Attendance_Rate (%)'].corr(df['Exam_Score (%)'])))
    f.write("- Social Media and Exam Score correlation: {:.2f}\n\n".format(
        df['Time_Spent_on_Social_Media (hours/week)'].corr(df['Exam_Score (%)'])))

    f.write("## Learning Style Analysis\n")
    for style in df['Preferred_Learning_Style'].unique():
        avg_score = df[df['Preferred_Learning_Style'] == style]['Exam_Score (%)'].mean()
        f.write("- {} learners average score: {:.2f}%\n".format(style, avg_score))
    f.write("\n")

    f.write("## Impact of Participation in Discussions\n")
    participants_avg = df[df['Participation_in_Discussions'] == 'Yes']['Exam_Score (%)'].mean()
    non_participants_avg = df[df['Participation_in_Discussions'] == 'No']['Exam_Score (%)'].mean()
    f.write("- Participants average score: {:.2f}%\n".format(participants_avg))
    f.write("- Non-participants average score: {:.2f}%\n\n".format(non_participants_avg))

    f.write("## Study Hours Analysis\n")
    for category, avg_score in study_hours_avg.items():
        f.write("- {} average score: {:.2f}%\n".format(category, avg_score))
    f.write("\n")

    f.write("## Model Performance\n")
    f.write("- Linear Regression R² Score: {:.2f}\n".format(r2_lr))
    f.write("- Random Forest R² Score: {:.2f}\n".format(r2_rf))
    f.write("\n")

    # Add the top important features; head(5) copes with fewer than five rows,
    # where positional indexing up to 5 would raise IndexError.
    f.write("## Top 5 Important Features\n")
    for row in feature_importance_df.head(5).itertuples(index=False):
        f.write("- {}: {:.4f}\n".format(row.Feature, row.Importance))

# Read back the file to display in the notebook
with open('outputs/key_findings.txt', 'r', encoding='utf-8') as f:
    print(f.read())

## Running the Dashboard