In [1]:
import pandas as pd

# Read the raw dataset into `df`; the path is relative to the notebook's
# working directory.
dataset_path = '../data/ovarian_cancer_dataset.csv'
df = pd.read_csv(dataset_path)

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# --- Configuration -----------------------------------------------------------
# Label column, and the feature columns that the preprocessor selects.
target_col='Cancer'
categorical_cols=['Histologic types of cancer', 'FIGO stage of tumor']
numeric_cols=['age', 'CA-125 (U/mL)', 'AST (U/L)', 'ALT (U/L)', 'Albumin (g/dL)',
              'Sodium (mmol/L)', 'Creatinine (mg/dL)', 'White blood cell (/uL)',
              'Lymph (/uL)', 'CA19-9 (U/mL)', 'Fibrinogen (mg/dL)']

# Share of rows held out for validation, and the seed that fixes the split.
validation_fraction=0.3
random_state=12345

# --- Train / validation split ------------------------------------------------
X = df.drop(columns=[target_col])
y = df[target_col]
# stratify=y keeps the class proportions of the target the same in the
# training and validation partitions, so the validation metrics are not
# distorted by an unlucky draw of the minority class.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=validation_fraction,
                                                    random_state=random_state,
                                                    stratify=y)

# --- Feature preprocessing ---------------------------------------------------
# Numeric columns are forwarded unchanged; categorical columns are one-hot
# encoded with the first level dropped. Columns of X that are not listed here
# are discarded (ColumnTransformer's default remainder='drop').
# handle_unknown='ignore' encodes a category that was not present when the
# encoder was fitted as all zeros instead of raising at predict time.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numeric_cols),
        ('cat', OneHotEncoder(drop='first', sparse_output=False,
                              handle_unknown='ignore'), categorical_cols)
    ])

In [None]:
from sklearn.ensemble import GradientBoostingClassifier #sklearn.ensemble vs sasviya.ml.tree
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import RFECV

# subsample=0.5 fits every boosting stage on a random half of the training
# rows, so the estimator is seeded with the notebook-wide `random_state` to
# make the fitted model (and every metric below) reproducible on re-run.
gb = GradientBoostingClassifier(min_samples_leaf=5,
                                n_estimators=100,
                                max_depth=4,
                                subsample=0.5,
                                random_state=random_state)

# Chain the shared column preprocessor with the gradient-boosting classifier.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', gb)
])

# Fit the preprocessor and the classifier on the training partition.
pipeline.fit(X_train, y_train)

In [None]:
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import f1_score

# Predict the validation rows with the currently fitted pipeline and report
# the fraction of labels predicted correctly.
predictions = pipeline.predict(X_val)
accuracy = accuracy_score(y_true=y_val, y_pred=predictions)
accuracy

In [None]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score, f1_score

# Confusion matrix of the validation predictions (rows: true class,
# columns: predicted class).
print("Confusion Matrix:")
print(confusion_matrix(y_val, predictions))


# average=None yields one score per class instead of a single aggregate.
print("Precision:",precision_score(y_val, predictions, average=None))
print("Recall:",recall_score(y_val, predictions, average=None))
print("F1 score:",f1_score(y_val, predictions, average=None))

# The report is a multi-line table; printing it on its own line keeps its
# header row aligned with the value columns beneath it.
print("\nClassification Report:")
print(classification_report(y_val, predictions))

In [None]:
from sklearn.linear_model import LogisticRegression #sklearn vs sasviya.ml.linear_model
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import RFECV

# Logistic regression fitted with the L-BFGS solver; verbose=True makes the
# solver print its progress while fitting.
# NOTE(review): the numeric columns reach the model unscaled through the
# 'passthrough' transformer - consider scaling if convergence warnings appear.
logmodel = LogisticRegression(
    solver='lbfgs',
    tol=1e-4,
    max_iter=1000,
    verbose=True,
)

# `pipeline` is rebound here: from this cell on it refers to the logistic
# regression pipeline, not the gradient-boosting one fitted earlier.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', logmodel),
])

# Fit the preprocessor and the classifier on the training partition.
pipeline.fit(X_train, y_train)

In [None]:
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import f1_score

# Re-predict the validation rows with the pipeline fitted in the previous
# cell and report the fraction of labels predicted correctly.
predictions = pipeline.predict(X_val)
accuracy = accuracy_score(y_true=y_val, y_pred=predictions)
accuracy

In [None]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score, f1_score

# Confusion matrix of the validation predictions (rows: true class,
# columns: predicted class).
print("Confusion Matrix:")
print(confusion_matrix(y_val, predictions))


# average=None yields one score per class instead of a single aggregate.
print("Precision:",precision_score(y_val, predictions, average=None))
print("Recall:",recall_score(y_val, predictions, average=None))
print("F1 score:",f1_score(y_val, predictions, average=None))

# The report is a multi-line table; printing it on its own line keeps its
# header row aligned with the value columns beneath it.
print("\nClassification Report:")
print(classification_report(y_val, predictions))