# Student Performance Prediction
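This notebook performs exploratory data analysis on a student dataset and compares two machine learning models (linear regression and random forest) for predicting the final grade `G3`, using the R² score on a held-out test set.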

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

In [None]:
# Load dataset
# The data is read from a CSV file located relative to the current working
# directory; pandas' default comma separator is used.
df = pd.read_csv(filepath_or_buffer='student_data.csv')

In [None]:
# Create 'G3' column
# The final grade is approximated as the mean of the two period grades
# 'G1' and 'G2'. It is only derived when the dataset does not already carry
# a 'G3' column: overwriting a recorded final grade with a synthetic average
# would silently replace the real prediction target.
if 'G3' not in df.columns:
    df['G3'] = (df['G1'] + df['G2']) / 2

In [None]:
# Handle missing values: keep only the rows in which every column is
# populated (any NaN in a row removes that row).
df = df.dropna(axis=0, how='any')

In [None]:
# Exploratory Data Analysis (EDA)

# Distribution of the G3 scores as a horizontal boxplot; the title is set
# directly on the Axes object that seaborn returns.
sns.boxplot(data=df, x='G3').set_title('Boxplot of G3 Scores')
plt.show()

In [None]:
# Correlation heatmap using only numeric columns
# Restrict the frame to numeric dtypes before computing correlations:
# DataFrame.corr() no longer drops non-numeric columns implicitly in
# pandas >= 2.0 and raises if any string/categorical column is present.
numeric_df = df.select_dtypes(include='number')
correlation_matrix = numeric_df.corr()

# annot=True writes each pairwise correlation coefficient into its cell.
sns.heatmap(correlation_matrix, annot=True)
plt.title('Correlation Heatmap')
plt.show()

In [None]:
# Model Comparison
# Split the frame into predictors (everything except the target) and the
# target column 'G3'.
# NOTE(review): if 'G3' was derived from 'G1' and 'G2' earlier in this
# notebook, keeping those two columns as predictors leaks the target and
# inflates R² — confirm whether they should be dropped here as well.
X = df.drop(columns=['G3'])
Y = df['G3']

# Both estimators need a fully numeric feature matrix. One-hot encode any
# string/categorical predictors; this is a no-op when every column is already
# numeric. drop_first avoids perfectly collinear dummy columns for the
# linear model.
X = pd.get_dummies(X, drop_first=True)

# Hold out 20% of the rows for evaluation; the split is seeded so the same
# rows land in the test set on every run.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Candidate models keyed by display name. The random forest is seeded
# explicitly so its R² score is reproducible between runs.
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(random_state=42),
}

# Fit each model on the training split and record its R² on the test split.
results = {}
for name, model in models.items():
    model.fit(X_train, Y_train)
    predictions = model.predict(X_test)
    results[name] = r2_score(Y_test, predictions)

In [None]:
# Plotting comparison of models
# One bar per model: names on the x-axis, test-set R² as the bar height.
plt.bar(list(results), list(results.values()))
plt.gca().set(title='Model Comparison based on R² Score', ylabel='R² Score')
plt.show()

In [None]:
# Automatic selection of the best model
# Pick the (name, score) pair with the highest R² and keep its name; on a
# tie the model that was inserted first wins.
best_model_name = max(results.items(), key=lambda entry: entry[1])[0]
print(f'The best model is: {best_model_name}')

In [None]:
# Reproducibility settings
import random
import numpy as np

# Seed Python's built-in generator and NumPy's global generator with the same
# fixed value. Seeding only influences random draws made after this point, so
# code that ran earlier in the notebook is unaffected by it.
for _seed_rng in (random.seed, np.random.seed):
    _seed_rng(42)

Further analysis can go here...