<a href="https://colab.research.google.com/github/k-rajdatta13/Descon_SmartPath_WinterProject/blob/main/assignment2_task2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# SmartPath Assignment 2 - Q2
# Regression: Predict Final_Score

In [None]:
# import libraries
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error



In [None]:
# 1. Build a synthetic regression dataset with three numeric columns:
#    Hours_Studied and Attendance_Percentage (features), Final_Score (target).
np.random.seed(42)  # seed NumPy's global RNG so every run produces the same data

n_samples = 150  # one row per student

# The draw order below (hours, attendance, noise) is part of the reproducible
# recipe: changing it changes the generated values.

# Hours studied: uniform between 0 and 10 (the unit, per day or per week, is a design choice)
hours_studied = np.random.uniform(0, 10, n_samples)

# Attendance: uniform between 50% and 100%
attendance_percentage = np.random.uniform(50, 100, n_samples)

# Gaussian noise (mean 0, standard deviation 5) so the target is not perfectly linear
noise = np.random.normal(0, 5, n_samples)

# Ground-truth relationship:
#   Final_Score = 3 * Hours_Studied + 0.4 * Attendance_Percentage + noise
# clipped so every score stays within 0 to 100.
final_score = np.clip(
    3 * hours_studied + 0.4 * attendance_percentage + noise,
    0,
    100,
)

# Collect the three arrays into a single DataFrame
df_reg = pd.DataFrame(
    {
        'Hours_Studied': hours_studied,
        'Attendance_Percentage': attendance_percentage,
        'Final_Score': final_score,
    }
)


In [None]:
# Preview the data: the first five rows, then per-column summary statistics.
# sep="\n" puts each table on the line after its heading.
print("First 5 rows of the regression dataset:", df_reg.head(), sep="\n")
print("\nSummary statistics:", df_reg.describe(), sep="\n")

First 5 rows of the regression dataset:
   Hours_Studied  Attendance_Percentage  Final_Score
0       3.745401              95.413294    49.629380
1       9.507143              61.978095    50.054665
2       7.319939              57.244744    55.577436
3       5.986585              74.472638    50.918405
4       1.560186              99.282523    34.267855

Summary statistics:
       Hours_Studied  Attendance_Percentage  Final_Score
count     150.000000             150.000000   150.000000
mean        4.728871              75.876103    44.843980
std         2.965308              14.570590    11.884164
min         0.055221              50.253079    20.417439
25%         2.147813              62.373882    35.417795
50%         4.481112              77.800052    44.039116
75%         7.496618              87.891878    53.883530
max         9.868869              99.502693    70.937852


In [None]:
# 2. Train-test split
# Name the columns once so the features and the target are easy to spot.
feature_columns = ['Hours_Studied', 'Attendance_Percentage']
target_column = 'Final_Score'

X = df_reg[feature_columns]  # input features
y = df_reg[target_column]    # target variable

# Hold out 20% of the rows for testing; random_state fixes the shuffle
# so the same rows land in each split on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# 3. Train linear regression model
# Build the estimator first, then fit it on the training split only;
# the test split stays unseen until evaluation.
lin_reg = LinearRegression()
lin_reg.fit(X=X_train, y=y_train)

In [None]:
# 4. Model coefficients and MSE on test set
# Gather every number first, then emit the whole report in one go.
intercept = lin_reg.intercept_   # bias term
coefficients = lin_reg.coef_     # one weight per feature, in the order of X.columns

y_pred = lin_reg.predict(X_test)          # predicted Final_Score for the test rows
mse = mean_squared_error(y_test, y_pred)  # mean squared error on the test set

report_lines = [
    "\nLinear Regression Model:",
    f"Intercept: {intercept:.3f}",
    f"Coefficient for Hours_Studied: {coefficients[0]:.3f}",
    f"Coefficient for Attendance_Percentage: {coefficients[1]:.3f}",
    f"\nMean Squared Error (MSE) on test set: {mse:.3f}",
]
print("\n".join(report_lines))



Linear Regression Model:
Intercept: 2.321
Coefficient for Hours_Studied: 3.256
Coefficient for Attendance_Percentage: 0.361

Mean Squared Error (MSE) on test set: 25.175


In [None]:
#  5. Simple interpretations

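# The model performs well on this dataset: the test MSE (about 25.2) is close to the
# variance of the injected noise (5^2 = 25), which is the error no model can remove here.
# The learned coefficients (3.256 and 0.361) are also close to the true values used to
# generate the data (3 and 0.4).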

# - The coefficient for Hours_Studied tells how much Final_Score is expected to increase
#   when Hours_Studied increases by 1 unit, keeping Attendance_Percentage constant.

# - The coefficient for Attendance_Percentage tells how much Final_Score changes
#   for a 1% change in attendance, keeping Hours_Studied constant.

# - A lower MSE means better performance. MSE here gives the average squared difference
#   between predicted and actual scores on the test data.

# For SmartPath:
# - This model can be used to estimate a student's Final_Score based on their hours studied
#   and attendance, helping the system identify at-risk students early.

# Ways performance could improve:
# - Add more relevant features (e.g., quiz performance, difficulty level of courses).
# - Use more flexible models (like a Random Forest Regressor) if the relationships are not purely linear, e.g., polynomial relationships.
# - Collect more high-quality data to reduce noise and improve generalization.


In [None]:
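# Confusion-matrix recap (this applies to a pass/fail classifier, not to the regression model above; "pass" is the positive class):

# True Negatives (Top-Left): Students predicted to fail who actually failed.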

# False Positives (Top-Right): Students predicted to pass who actually failed (Dangerous for SmartPath; these students need help but the system thinks they are fine).

# False Negatives (Bottom-Left): Students predicted to fail who actually passed.

# True Positives (Bottom-Right): Students predicted to pass who actually passed.

# SmartPath Application: By minimizing False Positives, SmartPath ensures struggling students get the recommendations they need.



# Data Collection: Defining the student features.


# Feature Engineering: Selecting Hours_Studied and Attendance as key predictors.


# Model Training: Using Supervised Learning (Logistic and Linear Regression).


# Model Evaluation: Using metrics like Accuracy (for classification) and MSE (for regression) to check reliability.