In [121]:
# Re-run necessary steps due to kernel reset

import pandas as pd
import numpy as np

# Rebuild the synthetic student dataset; the fixed seed makes every run identical
np.random.seed(42)
n_students = 500


def _int_column(low, high):
    """Draw one integer per student, uniformly from [low, high)."""
    return np.random.randint(low, high, n_students)


def _flag_column(p=None):
    """Draw one 0/1 flag per student; `p` optionally gives [P(0), P(1)]."""
    return np.random.choice([0, 1], n_students, p=p)


def _clipped_normal_column(mean, std, low, high):
    """Draw one normal sample per student and clamp it to [low, high]."""
    return np.random.normal(mean, std, n_students).clip(low, high)


# NOTE: columns are drawn in exactly this order. Reordering them would consume
# the seeded random stream differently and change the generated values.
data = {
    'num_S': _int_column(0, 10),
    'num_A': _int_column(0, 10),
    'num_B': _int_column(0, 10),
    'num_C': _int_column(0, 8),
    'num_D': _int_column(0, 5),
    'num_F': _int_column(0, 3),
    'study_hours_per_week': _clipped_normal_column(12, 4, 2, 25),
    'participated_in_events': _flag_column(),
    'project_count': _int_column(0, 5),
    'internship_experience': _flag_column(p=[0.6, 0.4]),
    'travel_time_minutes': _int_column(10, 120),
    'lives_in_pg_or_hostel': _flag_column(),
    'previous_board_cgpa': _clipped_normal_column(8, 0.8, 5.0, 10.0),
}

df = pd.DataFrame(data)

# Final CGPA (mock formula): a weighted sum of the columns below, accumulated
# term by term in the listed order and then clamped to the 5-10 range.
# num_D and lives_in_pg_or_hostel do not enter the formula.
cgpa_terms = [
    ('num_S', 0.9),
    ('num_A', 0.6),
    ('num_B', 0.5),
    ('num_C', 0.6),
    ('num_F', -0.9),
    ('project_count', 0.03),
    ('internship_experience', 0.1),
    ('study_hours_per_week', 0.02),
    ('participated_in_events', 0.02),
    ('travel_time_minutes', -0.05),
    ('previous_board_cgpa', 0.15),
]
first_column, first_weight = cgpa_terms[0]
weighted_sum = first_weight * df[first_column]
for column, weight in cgpa_terms[1:]:
    weighted_sum = weighted_sum + weight * df[column]
df['final_cgpa'] = weighted_sum.clip(5, 10)

# Save dataset to a CSV file in the Colab environment
df.to_csv("cgpa_prediction_dataset.csv", index=False)

print("Dataset created and saved as 'cgpa_prediction_dataset.csv'")
df.head()

Dataset created and saved as 'cgpa_prediction_dataset.csv'


Unnamed: 0,num_S,num_A,num_B,num_C,num_D,num_F,study_hours_per_week,participated_in_events,project_count,internship_experience,travel_time_minutes,lives_in_pg_or_hostel,previous_board_cgpa,final_cgpa
0,6,8,0,4,3,0,9.963219,1,4,0,88,1,7.46275,9.658677
1,3,0,7,5,4,2,15.110125,0,3,0,81,0,8.090358,5.0
2,7,0,3,3,3,1,17.503465,1,0,1,16,0,7.288441,9.463335
3,4,3,3,3,1,1,16.050184,1,2,1,72,0,7.737214,5.861586
4,6,8,4,7,1,0,4.255539,0,1,1,55,1,8.600357,10.0


In [122]:
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [123]:
# Read the saved dataset back from disk and preview its first rows
dataset_path = "cgpa_prediction_dataset.csv"
df = pd.read_csv(dataset_path)
df.head()

Unnamed: 0,num_S,num_A,num_B,num_C,num_D,num_F,study_hours_per_week,participated_in_events,project_count,internship_experience,travel_time_minutes,lives_in_pg_or_hostel,previous_board_cgpa,final_cgpa
0,6,8,0,4,3,0,9.963219,1,4,0,88,1,7.46275,9.658677
1,3,0,7,5,4,2,15.110125,0,3,0,81,0,8.090358,5.0
2,7,0,3,3,3,1,17.503465,1,0,1,16,0,7.288441,9.463335
3,4,3,3,3,1,1,16.050184,1,2,1,72,0,7.737214,5.861586
4,6,8,4,7,1,0,4.255539,0,1,1,55,1,8.600357,10.0


In [124]:
# Separate the target column from the predictor columns
target_column = "final_cgpa"
X = df.drop(columns=[target_column])
y = df[target_column]

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

print(f"Shape of X_train: {X_train.shape}")
print(f"Shape of X_test: {X_test.shape}")

Shape of X_train: (400, 13)
Shape of X_test: (100, 13)


In [125]:
# Fit both regressors on the training split, then predict the held-out rows.
# `fit` returns the estimator itself, so construction and fitting are chained.

# Linear regression model
lr = LinearRegression().fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

# Random forest with 100 trees, seeded for repeatable results
rf = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

print("Models have been trained.")

Models have been trained.


In [126]:
def evaluate_model(name, y_test, y_pred):
    """Print MAE, RMSE and R² for one set of predictions.

    Args:
        name: Label printed in the report header.
        y_test: True target values.
        y_pred: Predicted values, aligned with `y_test`.

    Returns:
        None. The report is written to stdout, followed by a blank line.
    """
    print(f"--- {name} ---")
    mae = mean_absolute_error(y_test, y_pred)
    print(f"MAE: {mae:.4f}")
    # RMSE is the square root of the mean squared error
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print(f"RMSE: {rmse:.4f}")
    r2 = r2_score(y_test, y_pred)
    print(f"R²: {r2:.4f}")
    print()

# Report both fitted models against the same held-out targets
for model_label, predictions in (
    ("Linear Regression", y_pred_lr),
    ("Random Forest Regressor", y_pred_rf),
):
    evaluate_model(model_label, y_test, predictions)

--- Linear Regression ---
MAE: 0.7903
RMSE: 0.9290
R²: 0.8030

--- Random Forest Regressor ---
MAE: 0.7425
RMSE: 0.9727
R²: 0.7840



In [127]:
# Prediction helper; uses the Linear Regression model `lr` unless another is passed
def predict_cgpa(input_features, model=None):
    """Predict the final CGPA for a single student.

    Args:
        input_features: 1-D list/array-like of feature values, in the same
            order and number as the columns the model was trained on.
        model: Fitted estimator exposing `predict`. When omitted, the
            notebook's `lr` (Linear Regression) model is looked up at call
            time, so a re-trained `lr` is picked up without re-running this cell.

    Returns:
        The model's prediction for that one student (a scalar).

    Raises:
        ValueError: If `input_features` is not 1-D, or its length differs from
            the feature count the model reports via `n_features_in_`.
    """
    if model is None:
        # Resolved here rather than in the signature: a default of `model=lr`
        # would freeze whichever object `lr` was bound to at definition time.
        model = lr

    if np.ndim(input_features) != 1:
        raise ValueError(
            "input_features must be a 1-D sequence holding one student's features"
        )
    values = list(input_features)

    expected_count = getattr(model, "n_features_in_", None)
    if expected_count is not None and len(values) != expected_count:
        raise ValueError(
            f"Expected {expected_count} features, got {len(values)}"
        )

    # If the model recorded column names during fitting, pass a one-row
    # DataFrame with those names so prediction sees the same labelled columns
    # (this also avoids scikit-learn's missing-feature-names warning).
    # Otherwise fall back to a plain 2-D list, as `predict` expects 2-D input.
    feature_names = getattr(model, "feature_names_in_", None)
    if feature_names is not None:
        row = pd.DataFrame([values], columns=list(feature_names))
    else:
        row = [values]

    return model.predict(row)[0]

# Example usage: predict the CGPA of one hypothetical student.
# Features are keyed by column name and then arranged in the training column
# order (X.columns), so the call does not rely on hand-maintained positions.
# NOTE: num_C=8, num_D=8 and previous_board_cgpa=2.5 lie outside the ranges the
# synthetic training data was generated with (num_C 0-7, num_D 0-4, board CGPA
# clipped to 5.0-10.0), so this particular prediction is an extrapolation.
sample_student = {
    'num_S': 5,                     # Grade counts (S, A, B, C, D, F)
    'num_A': 3,
    'num_B': 7,
    'num_C': 8,
    'num_D': 8,
    'num_F': 2,
    'study_hours_per_week': 15,     # Study hours per week
    'participated_in_events': 1,    # Participated in events
    'project_count': 3,             # Project count
    'internship_experience': 1,     # Internship experience
    'travel_time_minutes': 45,      # Travel time
    'lives_in_pg_or_hostel': 0,     # Lives in PG/hostel
    'previous_board_cgpa': 2.5,     # Previous board CGPA
}

# Fail loudly on a misspelled key instead of silently ignoring it
unknown_features = set(sample_student) - set(X.columns)
if unknown_features:
    raise ValueError(f"Unknown feature name(s): {sorted(unknown_features)}")

# A feature missing from the dict raises KeyError here
sample_student_features = [sample_student[column] for column in X.columns]

predicted_cgpa = predict_cgpa(sample_student_features)
print(f"The predicted CGPA for the sample student is: {predicted_cgpa:.2f}")

The predicted CGPA for the sample student is: 8.76


