# Assignment 1: Linear Regression from Scratch
## Predicting Employee Attrition

**Objective:** Build a Linear Regression model from scratch (without using sklearn's LinearRegression) to predict employee attrition.

- **Target Variable:** Attrition (encoded as 0 = No, 1 = Yes)
- **Input Variables (8 features):** Age, DailyRate, DistanceFromHome, MonthlyIncome, TotalWorkingYears, YearsAtCompany, YearsInCurrentRole, JobSatisfaction

## 1. Import Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

print("Libraries imported successfully!")

## 2. Load the Dataset

In [None]:
# Load dataset
df = pd.read_csv('ml_dataset.csv')

print(f"Dataset Shape: {df.shape}")
print(f"\nFirst 5 rows:")
df.head()

In [None]:
# Basic info about the dataset
print("Dataset Info:")
print(f"Number of Rows: {df.shape[0]}")
print(f"Number of Columns: {df.shape[1]}")
print(f"\nColumn Names:\n{list(df.columns)}")
print(f"\nData Types:\n{df.dtypes}")

In [None]:
# Check for missing values
print("Missing Values:")
print(df.isnull().sum())
print(f"\nTotal missing values: {df.isnull().sum().sum()}")

## 3. Data Preprocessing

In [None]:
# Encode the target variable: Attrition (Yes=1, No=0)
# Only map while the column still holds the raw labels, so re-running this
# cell does not turn the already-encoded 0/1 values into NaN.
if not pd.api.types.is_numeric_dtype(df['Attrition']):
    encoded_attrition = df['Attrition'].map({'Yes': 1, 'No': 0})
    # Series.map yields NaN for anything missing from the mapping; fail loudly
    # here instead of letting NaN targets reach training.
    if encoded_attrition.isnull().any():
        raise ValueError("Attrition contains missing values or labels other than 'Yes'/'No'")
    df['Attrition'] = encoded_attrition.astype(int)

print("Target Variable Distribution:")
print(df['Attrition'].value_counts())
print(f"\nAttrition Rate: {df['Attrition'].mean()*100:.2f}%")

In [None]:
# Select 8 input features and 1 output variable
input_features = [
    'Age',
    'DailyRate',
    'DistanceFromHome',
    'MonthlyIncome',
    'TotalWorkingYears',
    'YearsAtCompany',
    'YearsInCurrentRole',
    'JobSatisfaction'
]

target = 'Attrition'

print(f"Input Features ({len(input_features)}): {input_features}")
print(f"Target Variable: {target}")

In [None]:
# Create feature matrix X and target vector y
X = df[input_features].values
y = df[target].values

print(f"Feature matrix shape: {X.shape}")
print(f"Target vector shape: {y.shape}")
print(f"\nStatistical Summary of Features:")
df[input_features].describe()

In [None]:
# Feature Scaling (Min-Max Normalization) - from scratch
def min_max_normalize(X, X_min=None, X_max=None):
    """Scale each feature (column) of X with Min-Max normalization.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Raw feature matrix.
    X_min, X_max : array-like of shape (n_features,), optional
        Per-feature minimum and maximum to scale with. When omitted they are
        computed from X itself, which maps every non-constant feature exactly
        onto [0, 1]. Pass the statistics returned for one dataset to scale
        another dataset the same way (values may then fall outside [0, 1]).

    Returns
    -------
    X_normalized : numpy array with the same shape as X
    X_min, X_max : numpy arrays holding the per-feature statistics used
    """
    X = np.asarray(X, dtype=float)
    X_min = X.min(axis=0) if X_min is None else np.asarray(X_min, dtype=float)
    X_max = X.max(axis=0) if X_max is None else np.asarray(X_max, dtype=float)

    # A constant feature has zero range. Divide it by 1 (so it maps to 0)
    # rather than adding an epsilon to every range, which would keep the
    # maximum of every other feature slightly below 1.
    feature_range = X_max - X_min
    feature_range = np.where(feature_range == 0, 1.0, feature_range)

    X_normalized = (X - X_min) / feature_range
    return X_normalized, X_min, X_max

X_normalized, X_min, X_max = min_max_normalize(X)
print("Feature scaling completed (Min-Max Normalization).")
print(f"Normalized feature matrix shape: {X_normalized.shape}")
print(f"\nMin of normalized features: {X_normalized.min(axis=0)}")
print(f"Max of normalized features: {X_normalized.max(axis=0)}")

In [None]:
# Train-Test Split from scratch (80% train, 20% test)
def train_test_split_scratch(X, y, test_size=0.2, random_seed=42):
    """Shuffle the samples and split them into training and testing sets.

    Parameters
    ----------
    X : numpy array of shape (n_samples, n_features)
    y : numpy array of shape (n_samples,)
    test_size : float in [0, 1]
        Fraction of the samples placed in the test set (rounded down).
    random_seed : int or None
        Seed for the shuffle; the same seed always yields the same split.

    Returns
    -------
    X_train, X_test, y_train, y_test

    Raises
    ------
    ValueError
        If X and y have different numbers of samples, or test_size is
        outside [0, 1].
    """
    n_samples = X.shape[0]
    if len(y) != n_samples:
        raise ValueError(
            f"X and y must have the same number of samples, got {n_samples} and {len(y)}"
        )
    if not 0 <= test_size <= 1:
        raise ValueError(f"test_size must be between 0 and 1, got {test_size}")

    n_test = int(n_samples * test_size)

    # A private RandomState produces the same permutation as seeding the
    # global generator would, but leaves the global random state untouched
    # for the rest of the program.
    rng = np.random.RandomState(random_seed)
    indices = rng.permutation(n_samples)

    # The first n_test shuffled indices form the test set, the rest train.
    test_indices = indices[:n_test]
    train_indices = indices[n_test:]

    X_train = X[train_indices]
    X_test = X[test_indices]
    y_train = y[train_indices]
    y_test = y[test_indices]

    return X_train, X_test, y_train, y_test

# Split the raw features first, then fit the scaler on the training rows only,
# so that test-set minima/maxima never influence the training features.
X_train_raw, X_test_raw, y_train, y_test = train_test_split_scratch(X, y, test_size=0.2, random_seed=42)

X_train, X_min, X_max = min_max_normalize(X_train_raw)
# Scale the test rows with the training-set minimum and range; the epsilon
# guards against zero-range features. Test values may fall outside [0, 1].
X_test = (X_test_raw - X_min) / (X_max - X_min + 1e-8)

print(f"Training set: X_train={X_train.shape}, y_train={y_train.shape}")
print(f"Testing set:  X_test={X_test.shape}, y_test={y_test.shape}")

## 4. Exploratory Data Analysis (EDA)

In [None]:
# Correlation heatmap for selected features
plt.figure(figsize=(10, 8))
correlation_matrix = df[input_features + [target]].corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0, fmt='.2f', linewidths=0.5)
plt.title('Correlation Heatmap - Selected Features vs Attrition', fontsize=14)
plt.tight_layout()
plt.show()

In [None]:
# Distribution of target variable
fig, axes = plt.subplots(1, 2, figsize=(12, 5))

# Bar chart
attrition_counts = df['Attrition'].value_counts()
axes[0].bar(['No (0)', 'Yes (1)'], [attrition_counts[0], attrition_counts[1]], color=['#2ecc71', '#e74c3c'])
axes[0].set_title('Attrition Distribution', fontsize=14)
axes[0].set_ylabel('Count')
for i, v in enumerate([attrition_counts[0], attrition_counts[1]]):
    axes[0].text(i, v + 10, str(v), ha='center', fontweight='bold')

# Pie chart
axes[1].pie([attrition_counts[0], attrition_counts[1]], labels=['No', 'Yes'],
            autopct='%1.1f%%', colors=['#2ecc71', '#e74c3c'], startangle=90)
axes[1].set_title('Attrition Percentage', fontsize=14)

plt.tight_layout()
plt.show()

In [None]:
# Feature distributions
fig, axes = plt.subplots(2, 4, figsize=(16, 8))
axes = axes.ravel()

for i, feature in enumerate(input_features):
    axes[i].hist(df[feature], bins=20, color='#3498db', edgecolor='black', alpha=0.7)
    axes[i].set_title(feature, fontsize=11)
    axes[i].set_ylabel('Frequency')

plt.suptitle('Distribution of Input Features', fontsize=14, y=1.02)
plt.tight_layout()
plt.show()

## 5. Linear Regression from Scratch

### Model: $\hat{y} = X \cdot W + b$

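### Cost Function: Half Mean Squared Error
$$J(W, b) = \frac{1}{2n} \sum_{i=1}^{n} (y_i - \hat{y}_i)^2 = \frac{1}{2} MSE$$

The factor $\frac{1}{2}$ cancels the 2 produced by differentiating the square, so the gradients are $\frac{\partial J}{\partial W} = \frac{1}{n} X^T (\hat{y} - y)$ and $\frac{\partial J}{\partial b} = \frac{1}{n} \sum_{i=1}^{n} (\hat{y}_i - y_i)$.

### Gradient Descent Update Rules:
$$W = W - \alpha \cdot \frac{\partial J}{\partial W}$$
$$b = b - \alpha \cdot \frac{\partial J}{\partial b}$$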

In [None]:
class LinearRegressionScratch:
    """
    Linear Regression implemented from scratch using batch Gradient Descent.

    The model is y_hat = X @ weights + bias. Training minimises the half mean
    squared error J = (1 / (2n)) * sum((y - y_hat) ** 2). The 1/2 factor is
    why the gradients used in fit() are (1/n) * X.T @ (y_hat - y) and
    (1/n) * sum(y_hat - y), with no extra factor of 2.
    """

    def __init__(self, learning_rate=0.01, n_iterations=1000):
        self.learning_rate = learning_rate  # step size of each update
        self.n_iterations = n_iterations    # number of gradient descent steps
        self.weights = None                 # set by fit(): shape (n_features,)
        self.bias = None                    # set by fit(): scalar intercept
        self.cost_history = []              # cost recorded at every iteration

    def _compute_cost(self, y_true, y_pred):
        """Compute the cost J = (1 / (2n)) * sum((y_true - y_pred) ** 2).

        Note that this is half of the Mean Squared Error.
        """
        n = len(y_true)
        cost = (1 / (2 * n)) * np.sum((y_true - y_pred) ** 2)
        return cost

    def fit(self, X, y):
        """
        Train the Linear Regression model using Gradient Descent.

        Prints the cost every 100 iterations and the final cost at the end.

        Parameters:
        -----------
        X : numpy array of shape (n_samples, n_features)
        y : numpy array of shape (n_samples,); a column vector of shape
            (n_samples, 1) is flattened.

        Returns:
        --------
        self : the fitted model

        Raises:
        -------
        ValueError : if X is not 2-D, is empty, or X and y disagree on the
            number of samples.
        """
        X = np.asarray(X, dtype=float)
        # Flatten y: a (n_samples, 1) target would otherwise broadcast
        # against the (n_samples,) predictions into an (n, n) matrix.
        y = np.asarray(y, dtype=float).ravel()

        if X.ndim != 2:
            raise ValueError(f"X must be 2-dimensional, got {X.ndim} dimension(s)")
        n_samples, n_features = X.shape
        if n_samples == 0:
            raise ValueError("X must contain at least one sample")
        if y.shape[0] != n_samples:
            raise ValueError(
                f"X and y must have the same number of samples, got {n_samples} and {y.shape[0]}"
            )

        # Initialize weights and bias to zeros
        self.weights = np.zeros(n_features)
        self.bias = 0.0
        self.cost_history = []

        # Gradient Descent
        for i in range(self.n_iterations):
            # Forward pass: compute predictions
            y_pred = np.dot(X, self.weights) + self.bias
            error = y_pred - y

            # Record the cost of the parameters this iteration started with
            cost = self._compute_cost(y, y_pred)
            self.cost_history.append(cost)

            # Compute gradients of the cost
            dw = (1 / n_samples) * np.dot(X.T, error)  # gradient w.r.t weights
            db = (1 / n_samples) * np.sum(error)       # gradient w.r.t bias

            # Update weights and bias
            self.weights -= self.learning_rate * dw
            self.bias -= self.learning_rate * db

            # Print progress every 100 iterations
            if (i + 1) % 100 == 0:
                print(f"Iteration {i+1}/{self.n_iterations} - Cost: {cost:.6f}")

        # Evaluate the cost of the final parameters. This also keeps the
        # summary well-defined when n_iterations is 0 and the history is empty.
        final_cost = self._compute_cost(y, self.predict(X))
        print(f"\nTraining Complete!")
        print(f"Final Cost: {final_cost:.6f}")
        return self

    def predict(self, X):
        """Return the predictions X @ weights + bias for the rows of X."""
        return np.dot(X, self.weights) + self.bias

    def get_params(self):
        """Return the learned parameters and the hyperparameters as a dict."""
        return {
            'weights': self.weights,
            'bias': self.bias,
            'learning_rate': self.learning_rate,
            'n_iterations': self.n_iterations
        }

print("LinearRegressionScratch class defined successfully!")

## 6. Train the Model

In [None]:
# Create and train the model
model = LinearRegressionScratch(learning_rate=0.1, n_iterations=1000)
model.fit(X_train, y_train)

In [None]:
# Plot the cost function over iterations
# The recorded cost is (1 / (2n)) * sum of squared errors, i.e. half the MSE,
# so the axis is labelled accordingly.
plt.figure(figsize=(10, 6))
plt.plot(range(1, len(model.cost_history) + 1), model.cost_history, color='#e74c3c', linewidth=2)
plt.xlabel('Iteration', fontsize=12)
plt.ylabel('Cost (MSE / 2)', fontsize=12)
plt.title('Cost Function Convergence During Training', fontsize=14)
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

In [None]:
# Display model parameters
params = model.get_params()
print("Model Parameters:")
print(f"Bias (Intercept): {params['bias']:.6f}")
print(f"\nWeights (Coefficients):")
for feature, weight in zip(input_features, params['weights']):
    print(f"  {feature:25s}: {weight:+.6f}")

## 7. Model Evaluation

In [None]:
# Evaluation Metrics from scratch
def mean_squared_error(y_true, y_pred):
    """Return the average of the squared differences between y_true and y_pred."""
    errors = y_true - y_pred
    return np.mean(errors * errors)

def root_mean_squared_error(y_true, y_pred):
    """Return the square root of the mean squared error (RMSE)."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

def mean_absolute_error(y_true, y_pred):
    """Return the average absolute difference between y_true and y_pred."""
    residuals = y_true - y_pred
    return np.mean(np.absolute(residuals))

def r_squared(y_true, y_pred):
    """Calculate R-squared (Coefficient of Determination).

    R² = 1 - SS_res / SS_tot, where SS_res is the residual sum of squares
    and SS_tot is the total sum of squares of y_true around its mean.

    When y_true is constant SS_tot is zero and the ratio is undefined. In
    that case 1.0 is returned for a perfect prediction and 0.0 otherwise
    (the same convention scikit-learn's r2_score uses by default), instead
    of NaN or -inf.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    ss_res = np.sum((y_true - y_pred) ** 2)           # Residual Sum of Squares
    ss_tot = np.sum((y_true - np.mean(y_true)) ** 2)  # Total Sum of Squares

    if ss_tot == 0:
        return 1.0 if ss_res == 0 else 0.0
    return 1 - (ss_res / ss_tot)

print("Evaluation metric functions defined!")

In [None]:
# Make predictions on training and testing sets
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

# Training Set Metrics
print("=" * 50)
print("       TRAINING SET PERFORMANCE")
print("=" * 50)
print(f"Mean Squared Error (MSE):     {mean_squared_error(y_train, y_train_pred):.6f}")
print(f"Root Mean Squared Error (RMSE): {root_mean_squared_error(y_train, y_train_pred):.6f}")
print(f"Mean Absolute Error (MAE):    {mean_absolute_error(y_train, y_train_pred):.6f}")
print(f"R-squared (R²):              {r_squared(y_train, y_train_pred):.6f}")

print()

# Testing Set Metrics
print("=" * 50)
print("       TESTING SET PERFORMANCE")
print("=" * 50)
print(f"Mean Squared Error (MSE):     {mean_squared_error(y_test, y_test_pred):.6f}")
print(f"Root Mean Squared Error (RMSE): {root_mean_squared_error(y_test, y_test_pred):.6f}")
print(f"Mean Absolute Error (MAE):    {mean_absolute_error(y_test, y_test_pred):.6f}")
print(f"R-squared (R²):              {r_squared(y_test, y_test_pred):.6f}")

## 8. Visualizations

In [None]:
# Actual vs Predicted Values
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# Training set
axes[0].scatter(y_train, y_train_pred, alpha=0.5, color='#3498db', edgecolors='black', linewidth=0.5)
axes[0].plot([y_train.min(), y_train.max()], [y_train.min(), y_train.max()], 'r--', linewidth=2)
axes[0].set_xlabel('Actual Values', fontsize=12)
axes[0].set_ylabel('Predicted Values', fontsize=12)
axes[0].set_title(f'Training Set: Actual vs Predicted\nR² = {r_squared(y_train, y_train_pred):.4f}', fontsize=13)
axes[0].grid(True, alpha=0.3)

# Testing set
axes[1].scatter(y_test, y_test_pred, alpha=0.5, color='#e74c3c', edgecolors='black', linewidth=0.5)
axes[1].plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', linewidth=2)
axes[1].set_xlabel('Actual Values', fontsize=12)
axes[1].set_ylabel('Predicted Values', fontsize=12)
axes[1].set_title(f'Testing Set: Actual vs Predicted\nR² = {r_squared(y_test, y_test_pred):.4f}', fontsize=13)
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Residual Analysis
residuals_test = y_test - y_test_pred

fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# Residual plot
axes[0].scatter(y_test_pred, residuals_test, alpha=0.5, color='#9b59b6', edgecolors='black', linewidth=0.5)
axes[0].axhline(y=0, color='red', linestyle='--', linewidth=2)
axes[0].set_xlabel('Predicted Values', fontsize=12)
axes[0].set_ylabel('Residuals', fontsize=12)
axes[0].set_title('Residual Plot (Test Set)', fontsize=13)
axes[0].grid(True, alpha=0.3)

# Residual distribution
axes[1].hist(residuals_test, bins=30, color='#9b59b6', edgecolor='black', alpha=0.7)
axes[1].axvline(x=0, color='red', linestyle='--', linewidth=2)
axes[1].set_xlabel('Residuals', fontsize=12)
axes[1].set_ylabel('Frequency', fontsize=12)
axes[1].set_title('Distribution of Residuals (Test Set)', fontsize=13)
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Feature Importance (based on absolute weight values)
feature_importance = pd.DataFrame({
    'Feature': input_features,
    'Weight': model.weights,
    'Absolute Weight': np.abs(model.weights)
}).sort_values('Absolute Weight', ascending=True)

plt.figure(figsize=(10, 6))
colors = ['#e74c3c' if w < 0 else '#2ecc71' for w in feature_importance['Weight']]
plt.barh(feature_importance['Feature'], feature_importance['Absolute Weight'], color=colors, edgecolor='black')
plt.xlabel('Absolute Weight', fontsize=12)
plt.title('Feature Importance (Based on Model Weights)', fontsize=14)
plt.grid(True, alpha=0.3, axis='x')
plt.tight_layout()
plt.show()

print("\nFeature Importance Table:")
print(feature_importance.to_string(index=False))

## 9. Classification Analysis

Since Attrition is a binary variable (0 or 1), we can threshold the linear regression output at 0.5 to make binary predictions and evaluate classification performance.

In [None]:
# Convert continuous predictions to binary (threshold = 0.5)
y_test_binary = (y_test_pred >= 0.5).astype(int)
y_train_binary = (y_train_pred >= 0.5).astype(int)

# Accuracy from scratch
def accuracy_score(y_true, y_pred):
    """Return the fraction of predictions that equal the true labels."""
    matches = y_true == y_pred
    return np.mean(matches)

# Confusion Matrix from scratch
def confusion_matrix_scratch(y_true, y_pred):
    """Build the 2x2 confusion matrix for binary labels (0 = negative, 1 = positive).

    Rows are the actual class and columns the predicted class:
    [[TN, FP],
     [FN, TP]]
    """
    actual_pos = y_true == 1
    actual_neg = y_true == 0
    predicted_pos = y_pred == 1
    predicted_neg = y_pred == 0

    true_neg = np.sum(actual_neg & predicted_neg)
    false_pos = np.sum(actual_neg & predicted_pos)
    false_neg = np.sum(actual_pos & predicted_neg)
    true_pos = np.sum(actual_pos & predicted_pos)

    return np.array([[true_neg, false_pos], [false_neg, true_pos]])

# Calculate metrics
train_accuracy = accuracy_score(y_train, y_train_binary)
test_accuracy = accuracy_score(y_test, y_test_binary)
cm = confusion_matrix_scratch(y_test, y_test_binary)

print("=" * 50)
print("   CLASSIFICATION PERFORMANCE (Threshold=0.5)")
print("=" * 50)
print(f"Training Accuracy: {train_accuracy*100:.2f}%")
print(f"Testing Accuracy:  {test_accuracy*100:.2f}%")
print(f"\nConfusion Matrix (Test Set):")
print(f"  TN={cm[0,0]:4d}  FP={cm[0,1]:4d}")
print(f"  FN={cm[1,0]:4d}  TP={cm[1,1]:4d}")

# Precision, Recall, F1
precision = cm[1,1] / (cm[1,1] + cm[0,1]) if (cm[1,1] + cm[0,1]) > 0 else 0
recall = cm[1,1] / (cm[1,1] + cm[1,0]) if (cm[1,1] + cm[1,0]) > 0 else 0
f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

print(f"\nPrecision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1-Score:  {f1:.4f}")

In [None]:
# Confusion Matrix Heatmap
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['No (0)', 'Yes (1)'],
            yticklabels=['No (0)', 'Yes (1)'])
plt.xlabel('Predicted', fontsize=12)
plt.ylabel('Actual', fontsize=12)
plt.title('Confusion Matrix (Test Set)', fontsize=14)
plt.tight_layout()
plt.show()

## 10. Conclusion

### Summary:
- Built a **Linear Regression model from scratch** using Gradient Descent (no sklearn)
- **Target Variable:** Attrition (encoded: Yes=1, No=0)
- **8 Input Features:** Age, DailyRate, DistanceFromHome, MonthlyIncome, TotalWorkingYears, YearsAtCompany, YearsInCurrentRole, JobSatisfaction
- Applied **Min-Max Normalization** for feature scaling
- Split data into **80% training / 20% testing**
- Evaluated using **MSE, RMSE, MAE, R²** and classification metrics (accuracy, precision, recall, F1)

### Key Observations:
- Linear Regression is not ideal for binary classification (Attrition is Yes/No), but it provides useful insights into feature relationships
- The model weights indicate which features have positive or negative associations with attrition
- For better classification performance, Logistic Regression or other classifiers would be more appropriate

In [None]:
print("\n" + "="*60)
print("   Assignment 1 - Linear Regression from Scratch")
print("   Status: COMPLETE")
print("="*60)