In [48]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.impute._iterative import IterativeImputer
from sklearn.manifold import Isomap
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.metrics import root_mean_squared_error
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.model_selection import train_test_split
import random
import missingno as msno
import warnings

warnings.filterwarnings("ignore")

In [68]:
def cross_val_cMSE(pipeline, X, y, c, cv=5):
    scores = []
    fold_size = len(X) // cv

    for i in range(cv):
        X_val = X[i * fold_size:(i + 1) * fold_size]
        y_val = y[i * fold_size:(i + 1) * fold_size]
        c_val = c[i * fold_size:(i + 1) * fold_size]

        X_train = np.concatenate((X[:i * fold_size], X[(i + 1) * fold_size:]))
        y_train = np.concatenate((y[:i * fold_size], y[(i + 1) * fold_size:]))

        pipeline.fit(X_train, y_train)
        y_val_pred = pipeline.predict(X_val)
        scores.append(cMSE(y_val_pred, y_val, c_val))

    return scores

In [9]:
def cMSE(y_hat, y, c):
    err = y - y_hat
    loss = (1 - c) * err ** 2 + c * np.maximum(0, err) ** 2
    return np.sum(loss) / len(y)

In [10]:
class FrozenTransformer(BaseEstimator):
    def __init__(self, fitted_transformer):
        self.fitted_transformer = fitted_transformer

    def __getattr__(self, name):
        # `fitted_transformer`'s attributes are now accessible
        return getattr(self.fitted_transformer, name)

    def __sklearn_clone__(self):
        return self

    def fit(self, X, y=None):
        # Fitting does not change the state of the estimator
        return self

    def transform(self, X, y=None):
        # transform only transforms the data
        return self.fitted_transformer.transform(X)

    def fit_transform(self, X, y=None):
        # fit_transform only transforms the data
        return self.fitted_transformer.transform(X)

In [61]:
class GradientDescentRegressor(BaseEstimator, RegressorMixin):
    def __init__(self, lr=0.01, epochs=1000, lambda_reg=0, lasso=False):
        self.lr = lr
        self.epochs = epochs
        self.lambda_reg = lambda_reg
        self.lasso = lasso
        self.coef_ = None
        self.intercept_ = 0

    def fit(self, X, y, c):
        n_samples, n_features = X.shape
        self.coef_ = np.zeros(n_features)
        self.intercept_ = 0

        for epoch in range(self.epochs):
            # Predictions
            y_hat = np.dot(X, self.coef_) + self.intercept_

            # Gradient
            error = y - y_hat
            grad = -2 * ((1 - c) * error + c * np.maximum(0, error))
            
            # Weights/Bias
            coef_grad = np.dot(X.T, grad) / n_samples
            intercept_grad = np.sum(grad) / n_samples

            # Lasso / Ridge
            if self.lasso:
                coef_grad += self.lambda_reg * np.sign(self.coef_)
            else:  # Ridge
                coef_grad += self.lambda_reg * self.coef_

            # Update weights
            self.coef_ -= self.lr * coef_grad
            self.intercept_ -= self.lr * intercept_grad

            # Compute loss every 100 epochs for tracking
            #if epoch % 100 == 0:
                #loss = cMSE(y_hat, y, c)
                #print(f"Epoch {epoch}: cMSE Loss = {loss:.4f}")

        return self
    
    def predict(self, X):
        return np.dot(X, self.coef_) + self.intercept_

In [11]:
def apply_imputation(df, column, strategy='mean'):
    if strategy == 'mean':
        imputer = SimpleImputer(strategy='mean')
    elif strategy == 'median':
        imputer = SimpleImputer(strategy='median')
    elif strategy == 'most_frequent':
        imputer = SimpleImputer(strategy='most_frequent')
    elif strategy == 'knn':
        imputer = KNNImputer(n_neighbors=5)
    elif strategy == 'iterative':
        imputer = IterativeImputer(max_iter=10, random_state=60)

    #print(df[column])
    df[column] = imputer.fit_transform(df[[column]])
    #print(df[column])
    return df

### Data with Labels:

This is the subset of the dataset **where the target variable(s) are not missing**.
Neste caso, data with labels são todas as rows em que o SurvivalTime não é NaN.

### Data without Labels:

This is the subset of the dataset **where the target variable(s) are missing**.
Neste caso, data without labels são todas as rows em que o SurvivalTime é NaN.

In [44]:
def process_data_with_labels(strategy):
    train_path = './Datasets/train_data.csv'
    test_path = "./Datasets/test_data.csv"

    df_train_clean = pd.read_csv(train_path)
    df_test_filled = pd.read_csv(test_path)

    df_train_clean = df_train_clean.dropna(subset=['SurvivalTime']).copy()
    #print("Antes da imputation")
    #msno.bar(df_train_clean)

    #agora temos que aplicar a imputation às features com missing values
    df_train_clean = apply_imputation(df_train_clean, 'GeneticRisk', strategy)
    df_train_clean = apply_imputation(df_train_clean, 'ComorbidityIndex', strategy)
    df_train_clean = apply_imputation(df_train_clean, 'TreatmentResponse', strategy)

    df_test_filled = apply_imputation(df_test_filled, 'GeneticRisk', strategy)
    df_test_filled = apply_imputation(df_test_filled, 'ComorbidityIndex', strategy)
    df_test_filled = apply_imputation(df_test_filled, 'TreatmentResponse', strategy)

    #print("Depois da imputation")
    #msno.bar(df_train_clean)

    df_test_filled.drop('Id', axis=1, inplace=True)
    df_test_filled = df_test_filled.drop(columns=['Gender', 'GeneticRisk'])

    df_train_clean = df_train_clean[df_train_clean['Censored'] == 0]
    X = df_train_clean.drop(columns=['SurvivalTime', 'Censored', 'Gender', 'GeneticRisk', 'Id'])

    y = df_train_clean['SurvivalTime']
    c = df_train_clean['Censored'].values

    X_train, X_test, y_train, y_test, c_train, c_test = train_test_split(X, y, c, test_size=0.2,
                                                                         random_state=random.randint(0, 100))

    return X_train, X_test, y_train, y_test, c_train, c_test, df_test_filled

#process_data_with_labels()

In [192]:
#usar o df_train_clean com imputation (data with labels)
#usar o df_test_filled com imputation (data without labels)


def process_data_without_labels():
    train_path = './Datasets/train_data.csv'
    test_path = "./Datasets/test_data.csv"

    df_train_clean = pd.read_csv(train_path)
    df_test_filled = pd.read_csv(test_path)

    #aqui só queremos manter as rows em que SurvivalTime está missing (NaN)
    df_train_clean = df_train_clean[df_train_clean['SurvivalTime'].isna()]

    #agora temos que aplicar a imputation aos dados que estao missing
    strategy = 'knn'
    df_train_clean = apply_imputation(df_train_clean, 'GeneticRisk', strategy)
    df_train_clean = apply_imputation(df_train_clean, 'ComorbidityIndex', strategy)
    df_train_clean = apply_imputation(df_train_clean, 'TreatmentResponse', strategy)

    msno.bar(df_train_clean)

    df_test_filled = apply_imputation(df_test_filled, 'GeneticRisk', strategy)
    df_test_filled = apply_imputation(df_test_filled, 'ComorbidityIndex', strategy)
    df_test_filled = apply_imputation(df_test_filled, 'TreatmentResponse', strategy)

    df_test_filled.drop('Id', axis=1, inplace=True)
    df_test_filled = df_test_filled.drop(columns=['Gender', 'GeneticRisk'])

    df_train_clean = df_train_clean[df_train_clean['Censored'] == 0]
    X = df_train_clean.drop(columns=['SurvivalTime', 'Censored', 'Gender', 'GeneticRisk', 'Id'])

    y = df_train_clean['SurvivalTime']
    c = df_train_clean['Censored'].values

    # Split the data into train and test sets --------> FEATURES, TARGETS
    X_train, X_test, y_train, y_test, c_train, c_test = train_test_split(X, y, c, test_size=0.2,
                                                                         random_state=42)

    return X_train, X_test, y_train, y_test, c_train, c_test, df_test_filled


process_data_without_labels()

UnboundLocalError: cannot access local variable 'imputer' where it is not associated with a value

In [66]:
def create_pipeline(lambda_value, is_ridge):    
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('regressor', LinearRegression())
    ])
    
    return pipeline

In [69]:
#Then, use the imputed data with labels to train a Linear Regression model. Compare with the baseline and with the model trained in Task 3.1.

def run_model():
    model = create_pipeline(0.01, False)
    
    strategy = 'iterative'

    X_train, X_test, y_train, y_test, c_train, c_test, df_test_filled = process_data_with_labels(strategy)

    # Cross-validated RMSE
    cv_scores = cross_val_cMSE(model, X_train, y_train, c_train, cv=len(X_train) - 1)
    cv_score = np.sqrt(np.mean(cv_scores))

    # Baseline model training
    model.fit(X_train, y_train)

    # Predictions
    y_pred = model.predict(X_test)
    y_submission = model.predict(df_test_filled)

    # Preparing Submission File
    y_submission_df = pd.DataFrame(y_submission, columns=['Target'])
    y_submission_df['id'] = range(0, len(y_submission_df))
    y_submission_df = y_submission_df[['id'] + [col for col in y_submission_df.columns if col != 'id']]
    #y_submission_df.to_csv('cMSE-baseline-submission-00.csv', index=False)

    error = root_mean_squared_error(y_pred, y_test)
    test_cMSE = cMSE(y_pred, y_test.values, c_test)

    return cv_score, error, test_cMSE, y_submission_df

run_model()

(1.6690125084971619,
 2.280279729759547,
 5.199675645952273,
     id    Target
 0    0  6.349904
 1    1  3.173198
 2    2  4.895616
 3    3  6.151944
 4    4  3.160752
 ..  ..       ...
 95  95  4.911129
 96  96  4.030461
 97  97  3.844447
 98  98  4.920070
 99  99  6.349215
 
 [100 rows x 2 columns])

In [41]:
def find_best_imputation_strategy():
    strategies = ['mean', 'median', 'most_frequent', 'knn', 'iterative']
    iterations = 10
    results = []
    min_test_cMSE = 1000000
    counter = 0

    for strategy in strategies:
        for i in range(iterations):
            counter += 1
            
            model = Pipeline([
                ('scaler', StandardScaler()),
                ('regressor', LinearRegression())
            ])

            X_train, X_test, y_train, y_test, c_train, c_test, df_test_filled = process_data_with_labels(strategy)

            # Cross-validated RMSE
            cv_scores = cross_val_cMSE(model, X_train, y_train, c_train, cv=len(X_train) - 1)
            cv_score = np.sqrt(np.mean(cv_scores))

            # Baseline model training
            model.fit(X_train, y_train)

            # Predictions
            y_pred = model.predict(X_test)
            y_submission = model.predict(df_test_filled)

            # Preparing Submission File
            y_submission_df = pd.DataFrame(y_submission, columns=['Target'])
            y_submission_df['id'] = range(0, len(y_submission_df))
            y_submission_df = y_submission_df[['id'] + [col for col in y_submission_df.columns if col != 'id']]
            #y_submission_df.to_csv('cMSE-baseline-submission-00.csv', index=False)

            error = root_mean_squared_error(y_pred, y_test)
            test_cMSE = cMSE(y_pred, y_test.values, c_test)

            if test_cMSE < min_test_cMSE:
                min_test_cMSE = test_cMSE
                y_submission_df = pd.DataFrame(y_submission, columns=['Target'])
                y_submission_df['id'] = range(0, len(y_submission_df))
                y_submission_df = y_submission_df[['id'] + [col for col in y_submission_df.columns if col != 'id']]
                y_submission_df.to_csv('cMSE-task4-submission-00.csv', index=False)

            result = (
                f"Imputation Strategy: {strategy}\n"
                f"Cross-validated RMSE: {cv_score:.4f}\n"
                f"Mean Squared Error (MSE): {error:.6f}\n"
                f"Test cMSE Loss: {test_cMSE:.4f}\n"
            )

            results.append((test_cMSE, result, y_submission))

    best_result = find_best_result(results)
    print("Best result out of", counter, "iterations")
    print(best_result[1])
    
find_best_imputation_strategy()

ValueError: The feature names should match those that were passed during fit.
Feature names seen at fit time, yet now missing:
- GeneticRisk


In [72]:
def run_iterations():
    iterations = 10
    results = []
    min_test_cMSE = 1000000

    for i in range(iterations):
        cv_score, error, test_cMSE, y_submission = run_model()

        if test_cMSE < min_test_cMSE:
            min_test_cMSE = test_cMSE
            y_submission_df = pd.DataFrame(y_submission, columns=['Target'])
            y_submission_df['id'] = range(0, len(y_submission_df))
            y_submission_df = y_submission_df[['id'] + [col for col in y_submission_df.columns if col != 'id']]
            y_submission_df.to_csv('cMSE-task4-submission-00.csv', index=False)

        result = (
            f"Cross-validated RMSE: {cv_score:.4f}\n"
            f"Mean Squared Error (MSE): {error:.6f}\n"
            f"Test cMSE Loss: {test_cMSE:.4f}\n"
        )

        results.append((test_cMSE, result, y_submission))

    best_result = find_best_result(results)
    print(best_result[1])
    best_result[2].to_csv('cMSE-task4-submission-00.csv', index=False)

run_iterations()

Cross-validated RMSE: 1.9003
Mean Squared Error (MSE): 1.533002
Test cMSE Loss: 2.3501



In [18]:
def find_best_result(results):
    best_model_run = None

    for test_cMSE, result, predictions in results:
        if best_model_run is None or test_cMSE < best_model_run[0]:
            best_model_run = (test_cMSE, result, predictions)

    return best_model_run

In [None]:
# Melhor imputation da task 3.1: 
# Testar com labels na data e sem labels - TODO
# Treinar modelo de regressão linear com os dados imputados com labels - TODO
# Comparar com o baseline model e com o modelo treinado na task 3.1. - TODO