In [10]:
import os

import joblib
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [11]:
# Load the raw dataset.
data = pd.read_csv('dataset/dataset.csv')

# Forward-fill missing values: every gap takes the value of the nearest preceding row.
# `DataFrame.ffill()` replaces `fillna(method='ffill', inplace=True)`, which is
# deprecated in pandas >= 2.1, and avoids mutating the freshly loaded frame in place.
# NOTE(review): presumably each row is an independent record, in which case
# forward-filling copies values between unrelated rows (and leaves NaNs in the
# first row untouched) — confirm this is the intended imputation.
data = data.ffill()

# Encode categorical features as integer codes. One fitted encoder is kept per
# column so the same mapping can be reapplied or inverted later.
label_encoders = {}
categorical_cols = ['gender', 'country', 'cancer_stage', 'family_history', 'smoking_status',
                    'hypertension', 'asthma', 'cirrhosis', 'other_cancer', 'treatment_type', 'survived']

for col in categorical_cols:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le

# Convert dates to whole seconds since the Unix epoch.
# Subtracting the epoch and floor-dividing by one second does not depend on the
# datetime resolution or on the platform's default integer width, unlike
# `.astype(int) // 10**9`, which assumes nanosecond resolution and a 64-bit int.
unix_epoch = pd.Timestamp('1970-01-01')
one_second = pd.Timedelta(seconds=1)
for date_col in ['diagnosis_date', 'end_treatment_date']:
    data[date_col] = (pd.to_datetime(data[date_col]) - unix_epoch) // one_second




In [12]:
# Scale numerical features.
# The scaler is fitted on the training rows only, so statistics of the held-out
# rows never influence the transform (fitting on the full frame leaks test-set
# information into training and into the saved scaler).
# The row selection must use the same test_size / random_state as the train/test
# split performed on this frame afterwards: for a fixed number of rows those two
# arguments fully determine which rows end up in the training set.
scaler = StandardScaler()
numerical_cols = ['age', 'bmi', 'cholesterol_level', 'diagnosis_date', 'end_treatment_date']

train_rows, holdout_rows = train_test_split(data.index, test_size=0.2, random_state=42)
scaler.fit(data.loc[train_rows, numerical_cols])

# Apply the train-fitted transform to every row so the later split sees scaled values.
data[numerical_cols] = scaler.transform(data[numerical_cols])


In [13]:
# Separate the target from the predictors (the id column carries no signal),
# then hold out 20% of the rows for testing with a fixed seed.
y = data['survived']
X = data.drop(columns=['id', 'survived'])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [15]:
# Balance the training classes with SMOTE (no other resampling step is used).
# Only the training split is resampled; the test split is left as it is.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# Show how many samples each class holds once resampling is done.
resampled_class_counts = pd.Series(y_resampled).value_counts()
print("\n Class Distribution AFTER SMOTE:")
print(resampled_class_counts)



In [16]:
# Fine-tune Logistic Regression with grid search.
# Only genuine hyperparameters are searched. `max_iter` is merely the solver's
# iteration cap: once a fit converges, a larger cap yields the same model, so
# searching over it only multiplied the number of fits by three. It is fixed at
# the largest value previously tried instead.
# NOTE(review): the data passed in was resampled before cross-validation, so
# validation folds may contain synthetic samples derived from training-fold rows;
# resampling inside an imblearn Pipeline would avoid that — confirm if CV scores matter.
param_grid = {
    'solver': ['lbfgs', 'saga'],
    'C': [0.1, 1, 10],  # Inverse of regularization strength (smaller = stronger penalty)
}

grid_search = GridSearchCV(
    LogisticRegression(class_weight='balanced', max_iter=3000, random_state=42),
    param_grid,
    cv=3,
)
grid_search.fit(X_resampled, y_resampled)

# best_estimator_ is the winning configuration refit on all of the data given to fit().
best_model = grid_search.best_estimator_
print("\n Best Model Parameters:")
print(best_model)




In [17]:
# Predict on the test set with the best estimator selected by the grid search.
# X_test comes straight from the train/test split and was not resampled.
y_pred = best_model.predict(X_test)

In [18]:
# Performance report: overall accuracy first, then the per-class
# precision / recall / F1 table for the held-out test split.
test_accuracy = accuracy_score(y_test, y_pred)
per_class_report = classification_report(y_test, y_pred)

print("\n Logistic Regression Results:")
print(f"Accuracy: {test_accuracy}")
print(per_class_report)



In [19]:
# Save the model together with the fitted scaler and label encoders, which are
# needed to preprocess new data the same way before calling the model.
# joblib.dump does not create missing directories, so make sure the target exists.
os.makedirs('models', exist_ok=True)
joblib.dump(best_model, 'models/best_model.pkl')
joblib.dump(scaler, 'models/scaler.pkl')
joblib.dump(label_encoders, 'models/label_encoders.pkl')

