In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import root_mean_squared_error
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.model_selection import train_test_split
import random

In [3]:
class GradientDescentRegressor(BaseEstimator, RegressorMixin):
    def __init__(self, lr=0.01, epochs=1000, lambda_reg=0, lasso=False):
        self.lr = lr
        self.epochs = epochs
        self.lambda_reg = lambda_reg
        self.lasso = lasso
        self.coef_ = None
        self.intercept_ = 0

    def fit(self, X, y, c):
        n_samples, n_features = X.shape
        self.coef_ = np.zeros(n_features)
        self.intercept_ = 0

        for epoch in range(self.epochs):
            # Predictions
            y_hat = np.dot(X, self.coef_) + self.intercept_

            # Gradient
            error = y - y_hat
            grad = -2 * ((1 - c) * error + c * np.maximum(0, error))
            
            # Weights/Bias
            coef_grad = np.dot(X.T, grad) / n_samples
            intercept_grad = np.sum(grad) / n_samples

            # Lasso / Ridge
            if self.lasso:
                coef_grad += self.lambda_reg * np.sign(self.coef_)
            else:  # Ridge
                coef_grad += self.lambda_reg * self.coef_

            # Update weights
            self.coef_ -= self.lr * coef_grad
            self.intercept_ -= self.lr * intercept_grad

            # Compute loss every 100 epochs for tracking
            #if epoch % 100 == 0:
                #loss = cMSE(y_hat, y, c)
                #print(f"Epoch {epoch}: cMSE Loss = {loss:.4f}")

        return self
    
    def predict(self, X):
        return np.dot(X, self.coef_) + self.intercept_

In [4]:
def cMSE(y_hat, y, c):
    err = y - y_hat
    loss = (1 - c) * err**2 + c * np.maximum(0, err)**2
    return np.sum(loss) / len(y)

In [5]:
def process_data():
    train_path = './Datasets/train_data.csv'
    test_path = "./Datasets/test_data.csv"
    
    df_train = pd.read_csv(train_path)
    df_test = pd.read_csv(test_path)
    
    df_test_filled = df_test.fillna(0.0)
    df_test_filled.drop('Id', axis=1, inplace=True)
    
    df_uncensored = df_train[df_train['Censored'] == 0]
    df_train_clean = df_uncensored.dropna(subset=['SurvivalTime']).copy()
    #df_train_clean = df_train_clean.dropna()
    
    df_train_clean.loc[:, 'GeneticRisk'] = df_train_clean['GeneticRisk'].fillna(df_train_clean['GeneticRisk'].median())
    df_train_clean.loc[:, 'ComorbidityIndex'] = df_train_clean['ComorbidityIndex'].fillna(df_train_clean['ComorbidityIndex'].median())
    df_train_clean.loc[:, 'TreatmentResponse'] = df_train_clean['TreatmentResponse'].fillna(df_train_clean['TreatmentResponse'].median())
    
    df_test_filled.loc[:, 'GeneticRisk'] = df_test_filled['GeneticRisk'].fillna(df_test_filled['GeneticRisk'].median())
    df_test_filled.loc[:, 'ComorbidityIndex'] = df_test_filled['ComorbidityIndex'].fillna(df_test_filled['ComorbidityIndex'].median())
    df_test_filled.loc[:, 'TreatmentResponse'] = df_test_filled['TreatmentResponse'].fillna(df_test_filled['TreatmentResponse'].median())
    
    X = df_train_clean.drop(columns=['SurvivalTime', 'Censored', 'Gender', 'GeneticRisk', 'Id'])
    
    df_test_filled = df_test_filled.drop(columns=['Gender', 'GeneticRisk'])
    y = df_train_clean['SurvivalTime']
    c = df_train_clean['Censored'].values
    
    # Split the data into train and test sets --------> FEATURES, TARGETS
    X_train, X_test, y_train, y_test, c_train, c_test = train_test_split(X, y, c, test_size=0.2, random_state= random.randint(1, 100))
    return X_train, X_test, y_train, y_test, c_train, c_test, df_test_filled

In [6]:
# Create a pipeline with StandardScaler and LinearRegression
def create_pipeline(lambda_value, is_ridge):    
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('regressor', GradientDescentRegressor(lr=0.01, epochs=1000, lambda_reg=lambda_value , lasso=is_ridge))  # Ridge regularization
    ])
    
    return pipeline

In [7]:
def cross_val_cMSE(pipeline, X, y, c, cv=5):
    scores = []
    fold_size = len(X) // cv

    for i in range(cv):
        X_val = X[i * fold_size:(i + 1) * fold_size]
        y_val = y[i * fold_size:(i + 1) * fold_size]
        c_val = c[i * fold_size:(i + 1) * fold_size]

        X_train = np.concatenate((X[:i * fold_size], X[(i + 1) * fold_size:]))
        y_train = np.concatenate((y[:i * fold_size], y[(i + 1) * fold_size:]))
        c_train = np.concatenate((c[:i * fold_size], c[(i + 1) * fold_size:]))

        pipeline.fit(X_train, y_train, regressor__c=c_train)
        y_val_pred = pipeline.predict(X_val)
        scores.append(cMSE(y_val_pred, y_val, c_val))

    return scores

In [8]:
def train_and_predict(lambda_value, is_ridge):

    #Fazer um split novo
    X_train, X_test, y_train, y_test, c_train, c_test, df_test_filled = process_data()
    
    #Fazer um pipeline novo
    pipeline = create_pipeline(lambda_value, is_ridge)
    
    cv_scores = cross_val_cMSE(pipeline, X_train.to_numpy(), y_train.to_numpy(), c_train, cv=5)
    cv_score = np.sqrt(np.mean(cv_scores))
    
    # Treinar o modelo novo
    pipeline.fit(X_train, y_train, regressor__c=c_train)
    
    #Fazer predicts com o modelo novo
    y_pred = pipeline.predict(X_test)
    y_submission = pipeline.predict(df_test_filled)
    
    error = root_mean_squared_error(y_pred, y_test)    
    test_cMSE = cMSE(y_pred, y_test.values, c_test)
    
    return cv_score, error, test_cMSE, y_submission


In [10]:
def run_iterations():
    iterations = 100
    learning_rate = 0.01
    lambda_values = [1, 0.1, 0.01]
    is_ridge_values = [False]
    results = []
    counter = 0
    min_test_cMSE = 1000000
    
    for is_ridge in is_ridge_values:
        for lambda_value in lambda_values:
            for i in range(iterations):
                cv_score, error, test_cMSE, y_submission = train_and_predict(lambda_value, is_ridge)
                
                if test_cMSE < min_test_cMSE:
                    min_test_cMSE = test_cMSE
                    y_submission_df = pd.DataFrame(y_submission, columns=['Target'])
                    y_submission_df['id'] = range(0, len(y_submission_df))
                    y_submission_df = y_submission_df[['id'] + [col for col in y_submission_df.columns if col != 'id']]
                    y_submission_df.to_csv('cMSE-baseline-submission-00.csv', index=False)   
                
                regressor_type = "Ridge" if is_ridge else "Lasso"
    
                result = (
                    f"Cross-validated RMSE: {cv_score:.4f}\n"
                    f"Mean Squared Error (MSE): {error:.6f}\n"
                    f"Test cMSE Loss: {test_cMSE:.4f}\n"
                    f"Learning Rate: {learning_rate} | Lambda: {lambda_value} | Regressor: {regressor_type}"
                )
    
                results.append((test_cMSE, result, lambda_value, regressor_type, y_submission))
                counter += 1
    
    print(f"Total iterations: {counter} \n")
    
    best_results = find_best_results(results)
    
    print("-----------------------")
    print("RIDGE")
    print("\n-- Lambda = 1 --")
    print(best_results["ridge_lambda_1"]['result'])
    print("\n-- Lambda = 0.1 --")
    print(best_results["ridge_lambda_01"]['result'])
    print("\n-- Lambda = 0.01 --")
    print(best_results["ridge_lambda_001"]['result'])
    
    print("\n-----------------------")
    print("LASSO")
    print("\n-- Lambda = 1 --")
    print(best_results["lasso_lambda_1"]['result'])
    print("\n-- Lambda = 0.1 --")
    print(best_results["lasso_lambda_01"]['result'])
    print("\n-- Lambda = 0.01 --")
    print(best_results["lasso_lambda_001"]['result'])
    
    for key, value in best_results.items():
        if value["predictions"] is not None:
            predictions_df = pd.DataFrame(value["predictions"], columns=['Target'])
            predictions_df['id'] = range(len(predictions_df))
            predictions_df = predictions_df[['id'] + [col for col in predictions_df.columns if col != 'id']]
            predictions_df.to_csv(f'{key.replace(" ", "_")}_submission.csv', index=False)
    
run_iterations()     


Total iterations: 300 

-----------------------
RIDGE

-- Lambda = 1 --
None

-- Lambda = 0.1 --
None

-- Lambda = 0.01 --
None

-----------------------
LASSO

-- Lambda = 1 --
Cross-validated RMSE: 2.0224
Mean Squared Error (MSE): 1.201050
Test cMSE Loss: 1.4425
Learning Rate: 0.01 | Lambda: 1 | Regressor: Lasso

-- Lambda = 0.1 --
Cross-validated RMSE: 2.0193
Mean Squared Error (MSE): 1.236705
Test cMSE Loss: 1.5294
Learning Rate: 0.01 | Lambda: 0.1 | Regressor: Lasso

-- Lambda = 0.01 --
Cross-validated RMSE: 2.0179
Mean Squared Error (MSE): 1.122289
Test cMSE Loss: 1.2595
Learning Rate: 0.01 | Lambda: 0.01 | Regressor: Lasso


In [9]:
def find_best_results(results):
    best_ridge_lambda_1 = None
    best_ridge_lambda_01 = None
    best_ridge_lambda_001 = None
    best_lasso_lambda_1 = None
    best_lasso_lambda_01 = None
    best_lasso_lambda_001 = None

    for test_cMSE, result, lambda_value, regressor_type, predictions in results:
        if regressor_type == "Ridge":
            if lambda_value == 1:
                if best_ridge_lambda_1 is None or test_cMSE < best_ridge_lambda_1[0]:
                    best_ridge_lambda_1 = (test_cMSE, result, predictions)
            elif lambda_value == 0.1:
                if best_ridge_lambda_01 is None or test_cMSE < best_ridge_lambda_01[0]:
                    best_ridge_lambda_01 = (test_cMSE, result, predictions)
            elif lambda_value == 0.01:
                if best_ridge_lambda_001 is None or test_cMSE < best_ridge_lambda_001[0]:
                    best_ridge_lambda_001 = (test_cMSE, result, predictions)
        elif regressor_type == "Lasso":
            if lambda_value == 1:
                if best_lasso_lambda_1 is None or test_cMSE < best_lasso_lambda_1[0]:
                    best_lasso_lambda_1 = (test_cMSE, result, predictions)
            elif lambda_value == 0.1:
                if best_lasso_lambda_01 is None or test_cMSE < best_lasso_lambda_01[0]:
                    best_lasso_lambda_01 = (test_cMSE, result, predictions)
            elif lambda_value == 0.01:
                if best_lasso_lambda_001 is None or test_cMSE < best_lasso_lambda_001[0]:
                    best_lasso_lambda_001 = (test_cMSE, result, predictions)

    return {
        "ridge_lambda_1": {"result": best_ridge_lambda_1[1] if best_ridge_lambda_1 else None,
                                       "predictions": best_ridge_lambda_1[2] if best_ridge_lambda_1 else None},
        "ridge_lambda_01": {"result": best_ridge_lambda_01[1] if best_ridge_lambda_01 else None,
                                         "predictions": best_ridge_lambda_01[2] if best_ridge_lambda_01 else None},
        "ridge_lambda_001": {"result": best_ridge_lambda_001[1] if best_ridge_lambda_001 else None,
                                          "predictions": best_ridge_lambda_001[2] if best_ridge_lambda_001 else None},
        "lasso_lambda_1": {"result": best_lasso_lambda_1[1] if best_lasso_lambda_1 else None,
                                       "predictions": best_lasso_lambda_1[2] if best_lasso_lambda_1 else None},
        "lasso_lambda_01": {"result": best_lasso_lambda_01[1] if best_lasso_lambda_01 else None,
                                         "predictions": best_lasso_lambda_01[2] if best_lasso_lambda_01 else None},
        "lasso_lambda_001": {"result": best_lasso_lambda_001[1] if best_lasso_lambda_001 else None,
                                          "predictions": best_lasso_lambda_001[2] if best_lasso_lambda_001 else None}
    }
