In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Raw string so the Windows backslashes are taken literally.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs where this exact file exists.
DATA_PATH = r"C:\Users\charl\Documents\Ironhack\Week7\Project\MikeWheeler\data\student-mat.csv"

# The file is parsed with ';' as the field separator.
df = pd.read_csv(DATA_PATH, sep=';')



In [None]:
# Preview the first rows of the loaded frame as a quick sanity check.
df.head()


In [None]:
# (rows, columns) of the raw frame.
df.shape

In [None]:
# Call the method: a bare `df.info` only displays the bound-method repr.
# `info()` prints the column dtypes, non-null counts and memory usage.
df.info()

In [None]:
# Column labels of the raw frame.
df.columns

In [None]:
# Call the method: a bare `df.describe` only displays the bound-method repr.
# `describe()` returns summary statistics for the numeric columns.
df.describe()

In [None]:
# Count the null entries in each column and report them.
missing_per_column = df.isna().sum()
print("\nMissing values:\n", missing_per_column)

In [None]:
# Histogram (with KDE overlay) of the target column G3.
fig, ax = plt.subplots(figsize=(6, 4))
sns.histplot(df['G3'], bins=20, kde=True, ax=ax)
ax.set_title('Distribution of Final Grades (G3)')
ax.set_xlabel('Final Grade')
ax.set_ylabel('Count')
plt.show()

# Headline statistics for the same column.
g3 = df['G3']
print("Mean G3:", g3.mean())
print("Min/Max G3:", g3.min(), g3.max())


In [None]:
# Work on a copy so the raw frame `df` stays untouched by later feature engineering.
df_clean = df.copy()

In [None]:
# Derive the engineered columns in one chained `assign`. Keyword order matters:
# `study_effort` reads `past_grade_avg`, which is created earlier in the same call.
df_clean = df_clean.assign(
    # Mean of the two earlier grade columns G1 and G2.
    past_grade_avg=lambda d: (d['G1'] + d['G2']) / 2,
    # Absences divided by (studytime + 1).
    absence_rate=lambda d: d['absences'] / (d['studytime'] + 1),
    # 1 if the student has any recorded failure, else 0.
    failures_binary=lambda d: (d['failures'] > 0).astype(int),
    # Product of studytime and the averaged earlier grades.
    study_effort=lambda d: d['studytime'] * d['past_grade_avg'],
)


In [None]:
# Columns with object dtype are treated as categorical.
object_columns_frame = df_clean.select_dtypes(include='object')
categorical_cols = object_columns_frame.columns
print("Categorical columns:", categorical_cols.tolist())

# One-hot encode them, dropping the first level of each column.
df_encoded = pd.get_dummies(
    df_clean,
    columns=categorical_cols,
    drop_first=True,
)

print("Shape after encoding:", df_encoded.shape)


In [None]:
from sklearn.model_selection import train_test_split


# Separate the target (G3) from the predictors.
target_col = 'G3'
y = df_encoded[target_col]
X = df_encoded.drop(columns=[target_col])

# Hold out 20% of the rows for testing; fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)


In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Fit a linear regression on the training split (fit() returns the estimator).
lr_model = LinearRegression().fit(X_train, y_train)

# Predict the held-out rows.
y_pred_lr = lr_model.predict(X_test)

# Score the predictions against the held-out targets.
lr_mae = mean_absolute_error(y_test, y_pred_lr)
lr_mse = mean_squared_error(y_test, y_pred_lr)
lr_rmse = np.sqrt(lr_mse)
lr_r2 = r2_score(y_test, y_pred_lr)

print(
    "Linear Regression Performance",
    f"MAE:  {lr_mae:.2f}",
    f"RMSE: {lr_rmse:.2f}",
    f"R²:   {lr_r2:.3f}",
    sep="\n",
)


In [None]:
from sklearn.tree import DecisionTreeRegressor

# Fit a decision tree regressor with a fixed seed on the training split.
dt_model = DecisionTreeRegressor(random_state=42).fit(X_train, y_train)

# Predict the held-out rows.
y_pred_dt = dt_model.predict(X_test)

# Score the predictions against the held-out targets.
dt_mae = mean_absolute_error(y_test, y_pred_dt)
dt_mse = mean_squared_error(y_test, y_pred_dt)
dt_rmse = np.sqrt(dt_mse)
dt_r2 = r2_score(y_test, y_pred_dt)

print(
    "Decision Tree Performance",
    f"MAE:  {dt_mae:.2f}",
    f"RMSE: {dt_rmse:.2f}",
    f"R²:   {dt_r2:.3f}",
    sep="\n",
)


In [None]:
from sklearn.model_selection import cross_val_score

# Both models share the same 5-fold, R²-scored protocol.
# Note that the folds are drawn from the full X, y, not only the training split.
cv_settings = dict(cv=5, scoring='r2')

lr_cv_scores = cross_val_score(lr_model, X, y, **cv_settings)
dt_cv_scores = cross_val_score(dt_model, X, y, **cv_settings)

lr_cv_mean = lr_cv_scores.mean()
dt_cv_mean = dt_cv_scores.mean()

print("Cross-Validation R² Scores")
print(f"Linear Regression CV Mean: {lr_cv_mean:.3f}")
print(f"Decision Tree CV Mean:     {dt_cv_mean:.3f}")


In [None]:
import matplotlib.pyplot as plt

# Scatter of held-out targets against linear-regression predictions.
fig, ax = plt.subplots(figsize=(6, 5))
ax.scatter(y_test, y_pred_lr, alpha=0.7)
# Reference diagonal from (0, 0) to (20, 20): points on it are perfect predictions.
ax.plot([0, 20], [0, 20])
ax.set(
    xlabel="Actual G3",
    ylabel="Predicted G3",
    title="Linear Regression: Actual vs Predicted",
)
plt.show()


In [None]:
from sklearn.ensemble import RandomForestRegressor

# Fit a 200-tree random forest with a fixed seed on the training split.
rf_model = RandomForestRegressor(n_estimators=200, random_state=42)
rf_model.fit(X_train, y_train)

# Predict the held-out rows and score them.
y_pred_rf = rf_model.predict(X_test)

rf_mae = mean_absolute_error(y_test, y_pred_rf)
rf_mse = mean_squared_error(y_test, y_pred_rf)
rf_rmse = np.sqrt(rf_mse)
rf_r2 = r2_score(y_test, y_pred_rf)

print(
    "Random Forest Performance",
    f"MAE:  {rf_mae:.2f}",
    f"RMSE: {rf_rmse:.2f}",
    f"R²:   {rf_r2:.3f}",
    sep="\n",
)


In [None]:
from sklearn.ensemble import GradientBoostingRegressor

# Fit a gradient-boosting regressor (library defaults, fixed seed) on the training split.
gb_model = GradientBoostingRegressor(random_state=42).fit(X_train, y_train)

# Predict the held-out rows and score them.
y_pred_gb = gb_model.predict(X_test)

gb_mae = mean_absolute_error(y_test, y_pred_gb)
gb_mse = mean_squared_error(y_test, y_pred_gb)
gb_rmse = np.sqrt(gb_mse)
gb_r2 = r2_score(y_test, y_pred_gb)

print(
    "Gradient Boosting Performance",
    f"MAE:  {gb_mae:.2f}",
    f"RMSE: {gb_rmse:.2f}",
    f"R²:   {gb_r2:.3f}",
    sep="\n",
)


In [None]:
# One row per ensemble model, holding its held-out metrics.
advanced_comparison = pd.DataFrame(
    [
        {"Model": "Random Forest", "MAE": rf_mae, "RMSE": rf_rmse, "R2": rf_r2},
        {"Model": "Gradient Boosting", "MAE": gb_mae, "RMSE": gb_rmse, "R2": gb_r2},
    ]
)

advanced_comparison


In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter values to try for the random forest (2 x 3 x 2 combinations).
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5],
)

# 5-fold cross-validated search scored by R², using all available cores.
grid_search = GridSearchCV(
    RandomForestRegressor(random_state=42),
    param_grid,
    scoring="r2",
    cv=5,
    n_jobs=-1,
)

# Only the training split is used for the search.
grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)
print("Best CV R² Score:", grid_search.best_score_)


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# The previous cell already fitted `grid_search` with this exact grid, estimator,
# seed and CV settings. Re-running the search here only repeated the same
# expensive fit, so reuse the fitted search instead.
# `best_estimator_` is the forest refitted on the training split with the best
# parameter combination found.
best_rf = grid_search.best_estimator_


In [None]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import numpy as np
import warnings

# `best_rf` normally comes from the grid-search cell. If that cell was skipped,
# fall back to the untuned forest, but warn: otherwise the numbers below would
# be reported as "tuned" results while actually describing `rf_model`.
if 'best_rf' not in globals():
    warnings.warn(
        "best_rf is not defined; falling back to the untuned rf_model. "
        "Run the grid-search cell first to evaluate the tuned model."
    )
    best_rf = rf_model

# Predict the held-out rows with the selected forest.
y_pred_best = best_rf.predict(X_test)

# Score the predictions against the held-out targets.
best_mae = mean_absolute_error(y_test, y_pred_best)
best_rmse = np.sqrt(mean_squared_error(y_test, y_pred_best))
best_r2 = r2_score(y_test, y_pred_best)

# Last expression: display the R² as the cell output.
best_r2




In [None]:
# Pair each importance score from the fitted forest with its feature name,
# then rank from most to least important.
raw_importances = pd.Series(best_rf.feature_importances_, index=X.columns)
importances = raw_importances.sort_values(ascending=False)

# Show the ten highest-ranked features.
importances.head(10)


In [None]:
# Horizontal bar chart of the ten highest-ranked features.
ax = importances.head(10).plot.barh(figsize=(7, 5))
ax.set_title("Top 10 Most Important Features")
# Flip the y-axis so the most important feature is drawn at the top.
ax.invert_yaxis()
plt.show()


In [None]:
# Scatter of held-out targets against the selected forest's predictions.
fig, ax = plt.subplots(figsize=(6, 5))
ax.scatter(y_test, y_pred_best, alpha=0.7)
# Reference diagonal from (0, 0) to (20, 20): points on it are perfect predictions.
ax.plot([0, 20], [0, 20])
ax.set(
    xlabel="Actual G3",
    ylabel="Predicted G3",
    title="Final Model: Actual vs Predicted",
)
plt.show()


In [None]:
# Held-out R² of every model fitted above, keyed by display name.
r2_by_model = {
    "Linear Regression": lr_r2,
    "Decision Tree": dt_r2,
    "Random Forest": rf_r2,
    "Gradient Boosting": gb_r2,
    "Tuned Random Forest": best_r2,
}

final_summary = pd.DataFrame(
    {
        "Model": list(r2_by_model.keys()),
        "R2": list(r2_by_model.values()),
    }
)

final_summary
