In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error, r2_score

# 1. Load Dataset
def load_data(file_path):
    """
    Read a CSV file into a pandas DataFrame.
    Args:
        file_path (str): Location of the CSV file to read.
    Returns:
        DataFrame: The file contents as parsed by ``pd.read_csv``.
    """
    dataset = pd.read_csv(file_path)
    return dataset

# 2. Preprocessing Functions
def preprocess_data(df, categorical_columns, numerical_columns, target_column):
    """
    Build a feature matrix and target vector from a raw DataFrame.

    Categorical columns are replaced by their pandas category codes and
    numerical columns are standardized (mean subtracted, divided by the
    standard deviation reported by ``Series.std()``). The input DataFrame is
    not modified; all transformations happen on an internal copy.
    Args:
        df (DataFrame): Input dataset.
        categorical_columns (list): List of categorical columns to encode.
        numerical_columns (list): List of numerical columns to scale.
        target_column (str): Target column for prediction.
    Returns:
        np.ndarray: Feature matrix X (every column except the target).
        np.ndarray: Target vector y.
    """
    # Work on a copy so the caller's DataFrame is left untouched.
    df = df.copy()

    # Encode categorical columns as integer category codes
    for col in categorical_columns:
        df[col] = df[col].astype('category').cat.codes

    # Scale numerical columns
    for col in numerical_columns:
        centered = df[col] - df[col].mean()
        spread = df[col].std()
        # A constant column (std == 0) or a single-row column (std is NaN)
        # cannot be scaled; keep it centered instead of producing NaN/inf.
        if pd.notna(spread) and spread != 0:
            centered = centered / spread
        df[col] = centered

    # Split features and target
    X = df.drop(target_column, axis=1).values
    y = df[target_column].values

    return X, y

# 3. Add Intercept to Feature Matrix
def add_intercept(X):
    """
    Prepend a column of ones to the feature matrix so the model can learn
    an intercept term.
    Args:
        X (np.ndarray): Original feature matrix.
    Returns:
        np.ndarray: Feature matrix whose first column is all ones.
    """
    n_rows = X.shape[0]
    ones_column = np.ones((n_rows, 1))
    return np.concatenate((ones_column, X), axis=1)

# 4. Train Model Using Normal Equation
def train_linear_regression(X, y):
    """
    Fit ordinary least squares coefficients for a linear regression model.

    The coefficients minimize ``||X @ beta - y||``. For a full-rank design
    matrix this is the same solution as the normal equation
    ``inv(X.T @ X) @ X.T @ y``. The problem is solved with
    ``np.linalg.lstsq`` so that rank-deficient or nearly collinear feature
    matrices still yield a finite (minimum-norm) solution instead of raising
    ``LinAlgError`` or amplifying round-off through an explicit inverse.
    Args:
        X (np.ndarray): Feature matrix with intercept.
        y (np.ndarray): Target vector.
    Returns:
        np.ndarray: Coefficients (parameters) of the regression model.
    """
    # rcond=None selects NumPy's machine-precision based singular value cutoff.
    beta, _residuals, _rank, _singular_values = np.linalg.lstsq(X, y, rcond=None)
    return beta

# 5. Predict Using the Model
def predict(X, beta):
    """
    Compute model outputs as the matrix product of features and coefficients.
    Args:
        X (np.ndarray): Feature matrix with intercept.
        beta (np.ndarray): Coefficients of the regression model.
    Returns:
        np.ndarray: Predicted target values (``X @ beta``).
    """
    predictions = X @ beta
    return predictions

# 6. Evaluate the Model
def evaluate_model(y_true, y_pred):
    """
    Print the RMSE and R2 score of a set of predictions.

    Nothing is returned; both metrics are written to standard output with
    four decimal places.
    Args:
        y_true (np.ndarray): True target values.
        y_pred (np.ndarray): Predicted target values.
    """
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)

    rmse_line = f"Root Mean Squared Error (RMSE): {rmse:.4f}"
    r2_line = f"R-squared (R2): {r2:.4f}"
    print(rmse_line)
    print(r2_line)

# 7. Main Function
def main():
    """
    Run the end-to-end pipeline: load the CSV, preprocess it, fit the linear
    regression on the first 80% of rows and report metrics on the rest.
    """
    # Pipeline configuration
    csv_path = "./data/combined_output1.csv"
    cat_cols = ["state", "state_name", "disease"]
    num_cols = ["incidence_per_capita"]
    target = "cases"

    # Load, encode/scale, then prepend the intercept column
    frame = load_data(csv_path)
    features, targets = preprocess_data(frame, cat_cols, num_cols, target)
    features = add_intercept(features)

    # Positional 80/20 split: rows keep their file order (no shuffling)
    cut = int(0.8 * len(features))
    X_train, y_train = features[:cut], targets[:cut]
    X_test, y_test = features[cut:], targets[cut:]

    # Fit the model and show the learned coefficients
    beta = train_linear_regression(X_train, y_train)
    print("Trained Coefficients (Beta):", beta)

    # Score predictions on the held-out rows
    evaluate_model(y_test, predict(X_test, beta))

# Run the pipeline only when this code is executed as the main module
# (nothing runs if the definitions above are merely imported).
if __name__ == "__main__":
    main()


Trained Coefficients (Beta): [-5.92798538e+01  8.12870957e-04 -4.13705840e+00  4.15185161e+00
 -2.58259761e+00  2.28732093e+02]
Root Mean Squared Error (RMSE): 50.9378
R-squared (R2): -0.0927


In [2]:
import pandas as pd
import numpy as np

# 1. Load Dataset
def load_data(file_path):
    """
    Parse a CSV file and hand back its contents as a DataFrame.
    Args:
        file_path (str): Path of the CSV file to load.
    Returns:
        DataFrame: Result of ``pd.read_csv`` on the given path.
    """
    loaded = pd.read_csv(file_path)
    return loaded

# 2. Custom Data Preprocessing
def custom_label_encoder(df, columns):
    """
    Replace the values of categorical columns with integer labels.

    For each column, the distinct values returned by ``unique()`` are
    numbered 0, 1, 2, ... in that order. The columns are overwritten on the
    DataFrame that was passed in, which is also the one returned.
    Args:
        df (DataFrame): Input dataset (modified in place).
        columns (list): List of categorical columns to encode.
    Returns:
        DataFrame: The same dataset with the listed columns encoded.
        dict: Per-column mapping from original value to integer label.
    """
    encoders = {}
    for name in columns:
        distinct = df[name].unique()
        # Pair every distinct value with its position in the unique() result
        mapping = dict(zip(distinct, range(len(distinct))))
        df[name] = df[name].map(mapping)
        encoders[name] = mapping
    return df, encoders

def custom_standard_scaler(df, columns):
    """
    Standardize numerical columns manually (zero mean, unit variance).

    Each listed column is centered on its mean and divided by the standard
    deviation reported by ``Series.std()``. Columns whose standard deviation
    is zero (constant column) or undefined (e.g. a single row) are only
    centered, so they become zeros instead of NaN/inf. The columns are
    overwritten on the DataFrame that was passed in, which is also returned.
    Args:
        df (DataFrame): Input dataset (modified in place).
        columns (list): List of numerical columns to scale.
    Returns:
        DataFrame: Dataset with scaled numerical columns.
    """
    for col in columns:
        centered = df[col] - df[col].mean()
        std = df[col].std()
        # Dividing by a zero or NaN std would poison the column with NaN/inf.
        if pd.notna(std) and std != 0:
            centered = centered / std
        df[col] = centered
    return df

# 3. Manual Train-Test Split
def custom_train_test_split(df, target_column, test_size=0.2):
    """
    Shuffle the dataset and cut it into training and testing arrays.

    Rows are shuffled with a fixed seed (``random_state=42``), so repeated
    calls on the same data give the same split. The first
    ``int(len(df) * (1 - test_size))`` shuffled rows form the training set
    and the remaining rows form the test set.
    Args:
        df (DataFrame): Input dataset.
        target_column (str): Target column for prediction.
        test_size (float): Proportion of test data.
    Returns:
        Tuple: Train-test split (X_train, X_test, y_train, y_test).
    """
    shuffled = df.sample(frac=1, random_state=42).reset_index(drop=True)
    n_train = int(len(shuffled) * (1 - test_size))

    feature_parts = []
    target_parts = []
    # Training rows first, then test rows
    for part in (shuffled.iloc[:n_train], shuffled.iloc[n_train:]):
        feature_parts.append(part.drop(target_column, axis=1).values)
        target_parts.append(part[target_column].values)

    X_train, X_test = feature_parts
    y_train, y_test = target_parts
    return X_train, X_test, y_train, y_test

# 4. Manual Linear Regression using Normal Equation
def train_linear_regression(X_train, y_train, alpha=0.01):
    """
    Train a Linear Regression model using the regularized normal equation.

    A column of ones is prepended to the features and the system
    ``(X_b.T @ X_b + alpha * I) w = X_b.T @ y`` is solved, where the entry of
    ``I`` belonging to the bias term is zeroed so the bias is not penalized.
    The system is solved directly with ``np.linalg.solve`` rather than by
    forming an explicit inverse; if the matrix is singular (possible when
    ``alpha`` is 0 and features are linearly dependent) a least-squares
    solution of the same system is returned instead of raising.
    Args:
        X_train (ndarray): Training features (without bias column).
        y_train (ndarray): Training target.
        alpha (float): Regularization parameter.
    Returns:
        ndarray: Weights of the model; the first entry is the bias term.
    """
    X_b = np.c_[np.ones((X_train.shape[0], 1)), X_train]
    penalty = alpha * np.eye(X_b.shape[1])
    penalty[0, 0] = 0  # Don't regularize the bias term
    gram = X_b.T @ X_b + penalty
    rhs = X_b.T @ y_train
    try:
        return np.linalg.solve(gram, rhs)
    except np.linalg.LinAlgError:
        # Singular system: fall back to the minimum-norm least-squares solution.
        return np.linalg.lstsq(gram, rhs, rcond=None)[0]


def predict(X, weights):
    """
    Apply trained Linear Regression weights to a feature matrix.

    A leading column of ones is added to ``X`` so that ``weights[0]`` acts
    as the bias term.
    Args:
        X (ndarray): Features (without bias column).
        weights (ndarray): Model weights, bias first.
    Returns:
        ndarray: Predicted values.
    """
    bias_column = np.ones((X.shape[0], 1))
    design = np.column_stack((bias_column, X))
    return np.dot(design, weights)

# 5. Evaluate Model
def evaluate_model(y_true, y_pred):
    """
    Evaluate the performance of the model.

    Inputs are converted to float arrays, so plain lists are accepted as
    well as ndarrays. When ``y_true`` is constant the usual R2 formula
    divides by zero; in that case R2 is reported as 1.0 for perfect
    predictions and 0.0 otherwise (the finite convention scikit-learn's
    ``r2_score`` uses by default).
    Args:
        y_true (array-like): True target values.
        y_pred (array-like): Predicted target values.
    Returns:
        dict: Dictionary containing evaluation metrics (MAE, MSE, RMSE, R2).
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    residuals = y_true - y_pred
    squared = residuals ** 2

    mae = np.mean(np.abs(residuals))
    mse = np.mean(squared)
    rmse = np.sqrt(mse)

    ss_res = np.sum(squared)
    ss_tot = np.sum((y_true - np.mean(y_true)) ** 2)
    if ss_tot == 0:
        # Constant target: avoid 0/0 or x/0 and return a finite score.
        r2 = 1.0 if ss_res == 0 else 0.0
    else:
        r2 = 1 - (ss_res / ss_tot)
    return {"MAE": mae, "MSE": mse, "RMSE": rmse, "R2": r2}

# 6. Main Function
def main():
    """
    Run the manual Linear Regression pipeline for disease outbreak
    prediction: load, encode, scale, split, train, predict and report.
    """
    # Pipeline configuration
    csv_path = "./data/combined_output1.csv"
    cat_cols = ["state", "state_name", "disease"]
    # The target ("cases") is scaled along with the feature, so the metrics
    # printed below are expressed in standardized units.
    num_cols = ["cases", "incidence_per_capita"]
    target_column = "cases"  # Predicting number of cases

    # Load the data, then encode and scale it
    df = load_data(csv_path)
    df, _encoders = custom_label_encoder(df, cat_cols)
    df = custom_standard_scaler(df, num_cols)

    # Shuffled train/test split followed by model fitting
    X_train, X_test, y_train, y_test = custom_train_test_split(df, target_column)
    weights = train_linear_regression(X_train, y_train)
    print(f"Trained Weights: {weights}")

    # Score predictions on the test rows and print each metric
    metrics = evaluate_model(y_test, predict(X_test, weights))
    print("\nModel Evaluation:")
    for metric, value in metrics.items():
        print(f"{metric}: {value:.4f}")

# Run the pipeline only when this code is executed as the main module
# (nothing runs if the definitions above are merely imported).
if __name__ == "__main__":
    main()

Trained Weights: [-7.72715242e-01  4.16719330e-06 -2.03719073e-04 -2.03719008e-04
 -9.81744159e-03  8.23740675e-01]

Model Evaluation:
MAE: 0.2224
MSE: 0.3138
RMSE: 0.5601
R2: 0.7378
