In [10]:
# Loading required libraries and fixing the seed
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, classification_report

SEED = 1
np.random.seed(SEED)

In [11]:
# Loading dataset & preprocessing the data
train_data = pd.read_csv('train.csv')
labels_data = pd.read_csv('labels.csv')
test_data = pd.read_csv('test.csv')

# Removing useless features
X_train = train_data.drop(columns=['Unnamed: 0', 'Age_Group'])
y_train = labels_data['Diabetes_binary']
X_test = test_data.drop(columns=['Unnamed: 0', 'Age_Group'])

# Encoding non-numerical features
def feature_encoding(X):
    non_numerical_columns_names = X.select_dtypes(exclude=['number']).columns
    for column in non_numerical_columns_names:
        le = LabelEncoder()
        X[column] = le.fit_transform(X[column])
    return X

X_train = feature_encoding(X_train)
X_test = feature_encoding(X_test)


# Normalizing features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [12]:
# Implementing functions used for the GridSearch

def custom_f1_threshold(estimator, X, y, threshold=0.5):
    """
    Compute the F1-score with non-default threshold values
    """
    probs = estimator.predict_proba(X)[:, 1]
    preds = (probs >= threshold).astype(int)
    return f1_score(y, preds)


def find_best_params_for_thresholds(model, X_train, y_train, params, thresholds):
    """
    Perform GridSearch with non-default threshold values
    """
    # Implementing Cross validation
    strat_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
    
    results = []

    for threshold in thresholds:
        print("\nThreshold:", threshold)
        # grid_search = GridSearchCV(model, param_grid=params, scoring=lambda est, X, y: custom_f1_threshold(est, X, y, threshold), cv=strat_kfold)
        grid_search = GridSearchCV(model, param_grid=params, scoring=lambda est, X, y: custom_f1_threshold(est, X, y, threshold), cv=strat_kfold)
        grid_search.fit(X_train, y_train)

        best_params = grid_search.best_params_
        best_score = grid_search.best_score_

        print("Best parameters:", best_params)
        print("Best F1-Score:", best_score)

        results.append({'threshold': threshold, 'best_params': best_params, 'best_score': best_score})

    return results

In [13]:
# Performing the GridSearch
param_grid = {
    'C': [0.005, 0.0095, 0.01, 0.02],
    'penalty': ['elasticnet'],
    'solver': ['saga'],
    'l1_ratio': [0.45, 0.5, 0.55],
    'class_weight': ['balanced'],
}

cls = LogisticRegression(random_state=SEED)
threshold = np.linspace(0.595, 0.61, 10)
threshold = [0.600204081632653]
results = find_best_params_for_thresholds(cls, X_train, y_train, param_grid, threshold)


Threshold: 0.600204081632653
Best parameters: {'C': 0.0095, 'class_weight': 'balanced', 'l1_ratio': 0.55, 'penalty': 'elasticnet', 'solver': 'saga'}
Best F1-Score: 0.4570503085453055


In [14]:
# Extracting the best combinaison
best_f1_score = 0
best_index = 0
for i in range(len(results)):
    best = results[i]['best_score']
    if best > best_f1_score:
        best_f1_score = best
        best_index = i

best_results = results[best_index]
print("\n\n ++++++++++++++++++++++++++++++++++++++++ \n Best Score is", best_f1_score)
print("Threshold is", best_results['threshold'])
print(best_results)

threshold = best_results['threshold']
best_params = best_results['best_params']



 ++++++++++++++++++++++++++++++++++++++++ 
 Best Score is 0.4570503085453055
Threshold is 0.600204081632653
{'threshold': 0.600204081632653, 'best_params': {'C': 0.0095, 'class_weight': 'balanced', 'l1_ratio': 0.55, 'penalty': 'elasticnet', 'solver': 'saga'}, 'best_score': 0.4570503085453055}


In [15]:
# Training the model with the best hyperparameters
model = LogisticRegression(**best_params, random_state=SEED)
model.fit(X_train, y_train)

# Predict probabilities with the custom threshold
y_train_probs = model.predict_proba(X_train)[:, 1]
y_train_pred = (y_train_probs >= threshold).astype(int)

In [16]:
# Evaluation on training set
f1 = f1_score(y_train, y_train_pred)
report = classification_report(y_train, y_train_pred, output_dict=True)

print(f"F1-Score on training set for threshold {threshold:.2f}: {f1:.2f}")
print("Classification Report:")
print(classification_report(y_train, y_train_pred))

F1-Score on training set for threshold 0.60: 0.46
Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.81      0.87    174595
           1       0.35      0.64      0.46     28349

    accuracy                           0.79    202944
   macro avg       0.64      0.73      0.66    202944
weighted avg       0.85      0.79      0.81    202944


In [17]:
# Predicting probabilities on the test set
y_test_probs = model.predict_proba(X_test)[:, 1]
y_test_pred = (y_test_probs >= threshold).astype(int)

In [18]:
# Saving predictions in a .csv file for the Kaggle submission

y_test_pred = pd.DataFrame(y_test_pred, columns=['Diabetes_binary'], index=test_data['Unnamed: 0'])
y_test_pred.index.name = 'index'
#y_test_pred.to_csv("test_predictionsLogReg2.csv", index=True)

print(f"Predictions saved to 'test_predictionsLogReg2.csv'")

Predictions saved to 'test_predictionsLogReg2.csv'
