In [215]:
import warnings
warnings.filterwarnings("ignore")


In [217]:
import pandas as pd
import sqlite3

# Load the dataset
file_path = "C:/Users/SAI VIGNESH CHINTALA/Desktop/ML1/breast-cancer.csv"
breast_cancer_data = pd.read_csv(file_path)

# Create a new SQLite database (or connect to an existing one)
conn = sqlite3.connect('C:/Users/SAI VIGNESH CHINTALA/Desktop/ML1/breast_cancer.db')
cursor = conn.cursor()

# Define the SQL schema for Patient Information table
create_patient_info_table = """
CREATE TABLE IF NOT EXISTS PatientInformation (
    id INTEGER PRIMARY KEY,
    diagnosis TEXT
);
"""

# Define the SQL schema for Feature Statistics table
create_feature_stats_table = """
CREATE TABLE IF NOT EXISTS FeatureStatistics (
    id INTEGER,
    radius_mean REAL,
    texture_mean REAL,
    perimeter_mean REAL,
    area_mean REAL,
    smoothness_mean REAL,
    compactness_mean REAL,
    concavity_mean REAL,
    concave_points_mean REAL,
    symmetry_mean REAL,
    fractal_dimension_mean REAL,
    radius_se REAL,
    texture_se REAL,
    perimeter_se REAL,
    area_se REAL,
    smoothness_se REAL,
    compactness_se REAL,
    concavity_se REAL,
    concave_points_se REAL,
    symmetry_se REAL,
    fractal_dimension_se REAL,
    radius_worst REAL,
    texture_worst REAL,
    perimeter_worst REAL,
    area_worst REAL,
    smoothness_worst REAL,
    compactness_worst REAL,
    concavity_worst REAL,
    concave_points_worst REAL,
    symmetry_worst REAL,
    fractal_dimension_worst REAL,
    FOREIGN KEY (id) REFERENCES PatientInformation(id)
);
"""

# Execute the table creation queries
cursor.execute(create_patient_info_table)
cursor.execute(create_feature_stats_table)

# Commit changes
conn.commit()

# Insert data into the tables from the breast cancer dataset
patient_info_data = breast_cancer_data[['id', 'diagnosis']]
feature_stats_data = breast_cancer_data.drop(columns=['diagnosis'])

# Insert data into PatientInformation table
patient_info_data.to_sql('PatientInformation', conn, if_exists='replace', index=False)

# Insert data into FeatureStatistics table
feature_stats_data.to_sql('FeatureStatistics', conn, if_exists='replace', index=False)

# Add an index to improve query performance
cursor.execute("CREATE INDEX IF NOT EXISTS idx_patient_id ON FeatureStatistics(id);")
conn.commit()

# Close the database connection
conn.close()


In [219]:
import sqlite3
import pandas as pd

# Connect to the SQLite database
db_path = 'C:/Users/SAI VIGNESH CHINTALA/Desktop/ML1/breast_cancer.db'
conn = sqlite3.connect(db_path)

# SQL query to join PatientInformation and FeatureStatistics tables
query = """
SELECT 
    PatientInformation.id, 
    PatientInformation.diagnosis, 
    FeatureStatistics.radius_mean, 
    FeatureStatistics.texture_mean, 
    FeatureStatistics.perimeter_mean, 
    FeatureStatistics.area_mean, 
    FeatureStatistics.smoothness_mean
FROM 
    PatientInformation
JOIN 
    FeatureStatistics
ON 
    PatientInformation.id = FeatureStatistics.id
"""

# Execute the query and load the result into a Pandas DataFrame
joined_data = pd.read_sql_query(query, conn)

# Close the database connection
conn.close()

# Display the first few rows of the DataFrame
print(joined_data.head())


         id diagnosis  radius_mean  texture_mean  perimeter_mean  area_mean  \
0    842302         M        17.99         10.38          122.80     1001.0   
1    842517         M        20.57         17.77          132.90     1326.0   
2  84300903         M        19.69         21.25          130.00     1203.0   
3  84348301         M        11.42         20.38           77.58      386.1   
4  84358402         M        20.29         14.34          135.10     1297.0   

   smoothness_mean  
0          0.11840  
1          0.08474  
2          0.10960  
3          0.14250  
4          0.10030  


In [221]:
import pandas as pd
import numpy as np

# Step 1: Rename Columns
joined_data.columns = joined_data.columns.str.lower().str.replace(" ", "_")

# Step 2: Remove Irrelevant Columns (if applicable)
# Assuming no irrelevant columns in the current joined_data

# Step 3: Handle Missing Data
# Check for missing data
missing_data = joined_data.isnull().sum()
print("Missing Data Per Column:")
print(missing_data)

# If there are missing values, drop rows with missing data
joined_data = joined_data.dropna()

# Step 4: Standardize Strings (if applicable)
if 'diagnosis' in joined_data.columns:
    joined_data['diagnosis'] = joined_data['diagnosis'].str.strip().str.upper()

# Step 5: Correct Column Data Types
# Ensure numeric columns are numeric
for col in joined_data.select_dtypes(include=['object']).columns:
    try:
        joined_data[col] = pd.to_numeric(joined_data[col])
    except ValueError:
        pass  # Keep as object if conversion fails

# Step 6: Identify and Remove Outliers
def remove_outliers(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]

for col in joined_data.select_dtypes(include=['float64', 'int64']).columns:
    joined_data = remove_outliers(joined_data, col)

# Step 7: Explore Data
print("Data Summary:")
print(joined_data.describe())

# Calculate the Correlation Matrix (Exclude Non-Numeric Columns)
numeric_columns = joined_data.select_dtypes(include=['float64', 'int64']).columns
correlation_matrix = joined_data[numeric_columns].corr()

print("Correlation Matrix:")
print(correlation_matrix)

# Step 8: Save the Cleaned Data
cleaned_file_path = "C:/Users/SAI VIGNESH CHINTALA/Desktop/ML1/cleaned_breast_cancer_data.csv"
joined_data.to_csv(cleaned_file_path, index=False)
print(f"Cleaned data saved to {cleaned_file_path}")


Missing Data Per Column:
id                 0
diagnosis          0
radius_mean        0
texture_mean       0
perimeter_mean     0
area_mean          0
smoothness_mean    0
dtype: int64
Data Summary:
                 id  radius_mean  texture_mean  perimeter_mean    area_mean  \
count  4.380000e+02   438.000000    438.000000      438.000000   438.000000   
mean   2.481861e+06    13.328801     18.714498       86.399018   569.692694   
std    3.288493e+06     2.651980      3.951228       18.133148   233.678794   
min    8.670000e+03     6.981000      9.710000       43.790000   143.500000   
25%    8.645432e+05    11.512500     15.870000       73.885000   403.850000   
50%    8.980115e+05    12.955000     18.325000       83.465000   515.200000   
75%    9.222968e+05    14.867500     21.365000       96.442500   680.050000   
max    9.113816e+06    19.790000     29.810000      132.400000  1192.000000   

       smoothness_mean  
count       438.000000  
mean          0.095059  
std           

In [223]:
import warnings
warnings.filterwarnings('ignore')

In [225]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from ydata_profiling import ProfileReport

# Step 1: Map diagnosis to binary values
if 'diagnosis' in joined_data.columns:
    joined_data['diagnosis'] = joined_data['diagnosis'].map({'M': 1, 'B': 0})
else:
    raise ValueError("Column 'diagnosis' is missing in the DataFrame.")

# Debug: Check DataFrame shape and head
print(f"DataFrame shape after loading: {joined_data.shape}")
print(joined_data.head())

# Step 2: Handle missing or invalid data
joined_data = joined_data.dropna(subset=['diagnosis'])  # Drop rows where diagnosis is NaN
selected_features = ['radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean', 'smoothness_mean']
selected_features = [feature for feature in selected_features if feature in joined_data.columns]

# Debug: Check selected features
print(f"Selected features: {selected_features}")

# Fill missing values in selected features
joined_data[selected_features] = joined_data[selected_features].fillna(joined_data[selected_features].median())

# Debug: Check for missing values
print("Missing values:\n", joined_data[selected_features].isnull().sum())

# Step 3: Generate ydata-profiling report
if not joined_data.empty:
    profile = ProfileReport(joined_data, title="Breast Cancer Dataset Profiling Report", explorative=True)
    profile.to_file("breast_cancer_profile_report.html")
    print("Y-Profile Report saved as 'breast_cancer_profile_report.html'")
else:
    print("DataFrame is empty. Skipping profiling.")

# Step 4: Correlation Matrix
plt.figure(figsize=(12, 8))
correlation_matrix = joined_data.corr()

# Debug: Check correlation matrix
print("Correlation matrix:\n", correlation_matrix)

if not correlation_matrix.empty:
    sns.heatmap(correlation_matrix, annot=False, cmap='coolwarm')
    plt.title("Feature Correlation Matrix")
    plt.show()
else:
    print("Correlation matrix is empty. Skipping heatmap.")

# Step 5: Boxplots for diagnosis vs features
for feature in selected_features:
    if feature in joined_data.columns:
        plt.figure(figsize=(6, 4))
        sns.boxplot(
            x=joined_data['diagnosis'], 
            y=joined_data[feature], 
            palette="Set2"
        )
        plt.title(f"Y-Profile for {feature} (Diagnosis vs {feature})")
        plt.xlabel("Diagnosis (0=Benign, 1=Malignant)")
        plt.ylabel(feature)
        plt.show()
    else:
        print(f"Feature {feature} is not in the DataFrame. Skipping plot.")


DataFrame shape after loading: (438, 7)
        id  diagnosis  radius_mean  texture_mean  perimeter_mean  area_mean  \
0   842302          1        17.99         10.38          122.80     1001.0   
5   843786          1        12.45         15.70           82.57      477.1   
6   844359          1        18.25         19.98          119.60     1040.0   
8   844981          1        13.00         21.82           87.50      519.8   
10  845636          1        16.02         23.24          102.70      797.8   

    smoothness_mean  
0           0.11840  
5           0.12780  
6           0.09463  
8           0.12730  
10          0.08206  
Selected features: ['radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean', 'smoothness_mean']
Missing values:
 radius_mean        0
texture_mean       0
perimeter_mean     0
area_mean          0
smoothness_mean    0
dtype: int64


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Y-Profile Report saved as 'breast_cancer_profile_report.html'
Correlation matrix:
                        id  diagnosis  radius_mean  texture_mean  \
id               1.000000  -0.024431     0.080356      0.077484   
diagnosis       -0.024431   1.000000     0.700787      0.360001   
radius_mean      0.080356   0.700787     1.000000      0.251722   
texture_mean     0.077484   0.360001     0.251722      1.000000   
perimeter_mean   0.077166   0.719516     0.997126      0.258162   
area_mean        0.076154   0.720253     0.992648      0.263184   
smoothness_mean -0.054963   0.312833     0.075392     -0.062251   

                 perimeter_mean  area_mean  smoothness_mean  
id                     0.077166   0.076154        -0.054963  
diagnosis              0.719516   0.720253         0.312833  
radius_mean            0.997126   0.992648         0.075392  
texture_mean           0.258162   0.263184        -0.062251  
perimeter_mean         1.000000   0.990693         0.115987  
area_mea

In [209]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from ydata_profiling import ProfileReport

# Assuming `joined_data` is already loaded

# Step 1: Map diagnosis to binary values
if 'diagnosis' in joined_data.columns:
    joined_data['diagnosis'] = joined_data['diagnosis'].map({'M': 1, 'B': 0})
else:
    raise ValueError("Column 'diagnosis' is missing in the DataFrame.")

# Step 2: Handle missing or invalid data
joined_data = joined_data.dropna(subset=['diagnosis'])  # Drop rows where diagnosis is NaN
selected_features = ['radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean', 'smoothness_mean']
selected_features = [feature for feature in selected_features if feature in joined_data.columns]

# Fill missing values in selected features
joined_data[selected_features] = joined_data[selected_features].fillna(joined_data[selected_features].median())

# Step 3: Generate ydata-profiling report
profile = ProfileReport(joined_data, title="Breast Cancer Dataset Profiling Report", explorative=True)
profile.to_file("breast_cancer_profile_report.html")
print("Y-Profile Report saved as 'breast_cancer_profile_report.html'")

# Step 4: Correlation Matrix
plt.figure(figsize=(12, 8))
correlation_matrix = joined_data.corr()
sns.heatmap(correlation_matrix, annot=False, cmap='coolwarm')
plt.title("Feature Correlation Matrix")
plt.show()

# Step 5: Boxplots for diagnosis vs features
for feature in selected_features:
    plt.figure(figsize=(6, 4))
    sns.boxplot(
        x=joined_data['diagnosis'], 
        y=joined_data[feature], 
        palette="Set2"
    )
    plt.title(f"Y-Profile for {feature} (Diagnosis vs {feature})")
    plt.xlabel("Diagnosis (0=Benign, 1=Malignant)")
    plt.ylabel(feature)
    plt.show()


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

Y-Profile Report saved as 'breast_cancer_profile_report.html'


In [210]:
print(f"DataFrame Shape: {joined_data.shape}")
print(joined_data.head())


DataFrame Shape: (438, 7)
        id  diagnosis  radius_mean  texture_mean  perimeter_mean  area_mean  \
0   842302          1        17.99         10.38          122.80     1001.0   
5   843786          1        12.45         15.70           82.57      477.1   
6   844359          1        18.25         19.98          119.60     1040.0   
8   844981          1        13.00         21.82           87.50      519.8   
10  845636          1        16.02         23.24          102.70      797.8   

    smoothness_mean  
0           0.11840  
5           0.12780  
6           0.09463  
8           0.12730  
10          0.08206  


In [213]:
print(f"DataFrame Shape After Cleaning: {joined_data.shape}")


DataFrame Shape After Cleaning: (438, 7)


In [None]:
import os
import mlflow
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, FunctionTransformer, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, make_scorer, classification_report

# Ensure MLFlow Tracking
MLFLOW_TRACKING_URI = "https://dagshub.com/saivignesh-03/Machinelearning.mlflow"
os.environ['MLFLOW_TRACKING_USERNAME'] = 'saivignesh-03'
os.environ['MLFLOW_TRACKING_PASSWORD'] = '9c78979cb39e1c46900c7f95953a7fcb54a30dee'
mlflow.set_tracking_uri(uri=MLFLOW_TRACKING_URI)
mlflow.set_experiment("Machinelearning")

# Define preprocessing and model pipeline
numeric_features = X.select_dtypes(include=['float64', 'int64']).columns
categorical_features = X.select_dtypes(include=['object', 'category']).columns

# Numeric Transformer
log_transformer = FunctionTransformer(np.log1p, validate=True)
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),  # Handle missing numeric values
    ('scaler', StandardScaler()),
    ('minmax', MinMaxScaler()),
    ('log', log_transformer)
])

# Categorical Transformer
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),  # Handle missing categorical values
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessors
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ],
    remainder='passthrough'
)

# Logistic Regression Model
logistic = LogisticRegression(max_iter=1000, random_state=42)

# Combine preprocessing and model into a pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', logistic)
])

# Cross-validation setup
kf_3fold = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
kf_10fold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Hyperparameter tuning
param_grid = {
    'classifier__C': [0.01, 0.1, 1.0, 10.0],
    'classifier__solver': ['liblinear', 'lbfgs']
}
grid_search = GridSearchCV(estimator=pipeline, param_grid=param_grid, scoring=make_scorer(f1_score), cv=kf_3fold, n_jobs=-1)

# Log results in MLFlow
with mlflow.start_run(run_name="Experiment #1: Logistic Regression with Preprocessing"):
    grid_search.fit(X, y)

    # Best model and parameters
    best_model = grid_search.best_estimator_
    best_params = grid_search.best_params_

    # Cross-validation results
    f1_scores_3fold = cross_val_score(best_model, X, y, cv=kf_3fold, scoring=make_scorer(f1_score))
    f1_scores_10fold = cross_val_score(best_model, X, y, cv=kf_10fold, scoring=make_scorer(f1_score))

    y_pred = best_model.predict(X)
    tn, fp, fn, tp = confusion_matrix(y, y_pred).ravel()

    # Log parameters and metrics
    mlflow.log_params(best_params)
    mlflow.log_param("preprocessing", "StandardScaler + MinMaxScaler + LogTransformation + OneHotEncoding")
    mlflow.log_metric("f1_mean_3fold", np.mean(f1_scores_3fold))
    mlflow.log_metric("f1_mean_10fold", np.mean(f1_scores_10fold))
    mlflow.log_metric("true_positives", tp)
    mlflow.log_metric("true_negatives", tn)
    mlflow.log_metric("false_positives", fp)
    mlflow.log_metric("false_negatives", fn)

    # Log classification report
    classification_report_str = classification_report(y, y_pred)
    mlflow.log_text(classification_report_str, "classification_report.txt")

    # Log model
    mlflow.sklearn.log_model(best_model, artifact_path="logistic_regression_model", registered_model_name="LogisticRegression-Experiment-1")

# Print results
print("\nExperiment #1 Completed:")
print(f"Best Parameters: {best_params}")
print(f"3-Fold F1 Score: Mean={np.mean(f1_scores_3fold):.4f}")
print(f"10-Fold F1 Score: Mean={np.mean(f1_scores_10fold):.4f}")
print(f"Confusion Matrix: TP={tp}, TN={tn}, FP={fp}, FN={fn}")


In [None]:
import os
import mlflow
import mlflow.sklearn
import numpy as np
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, FunctionTransformer
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, make_scorer, classification_report

# Set up MLFlow tracking
MLFLOW_TRACKING_URI = "https://dagshub.com/saivignesh-03/Machinelearning.mlflow"
os.environ['MLFLOW_TRACKING_USERNAME'] = 'saivignesh-03'
os.environ['MLFLOW_TRACKING_PASSWORD'] = '9c78979cb39e1c46900c7f95953a7fcb54a30dee'
mlflow.set_tracking_uri(uri=MLFLOW_TRACKING_URI)
mlflow.set_experiment("Machinelearning")

# Assuming X and y are already loaded and preprocessed
# Ensure X (features) and y (target) are ready for model training

# Preprocessing for numeric features
numeric_features = X.select_dtypes(include=['float64', 'int64']).columns
log_transformer = FunctionTransformer(np.log1p, validate=True)
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('minmax', MinMaxScaler()),
    ('log', log_transformer)
])

# Combine preprocessors
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features)
    ],
    remainder='passthrough'
)

# List of classifiers to test
classifiers = {
    "LogisticRegression": LogisticRegression(max_iter=1000, random_state=42),
    "RidgeClassifier": RidgeClassifier(random_state=42),
    "RandomForestClassifier": RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1),
    "XGBClassifier": XGBClassifier(n_estimators=100, use_label_encoder=False, eval_metric="logloss", random_state=42)
}

# Cross-validation setup
kf_10fold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Iterate through classifiers and log results in MLFlow
for clf_name, clf in classifiers.items():
    with mlflow.start_run(run_name=f"Experiment #2: {clf_name}"):
        # Create pipeline
        pipeline = Pipeline(steps=[
            ('preprocessor', preprocessor),
            ('classifier', clf)
        ])

        # Perform 10-fold cross-validation
        f1_scores = cross_val_score(pipeline, X, y, cv=kf_10fold, scoring=make_scorer(f1_score))
        acc_scores = cross_val_score(pipeline, X, y, cv=kf_10fold, scoring=make_scorer(accuracy_score))

        # Train the model on the full training data
        pipeline.fit(X, y)
        y_pred = pipeline.predict(X)

        # Compute confusion matrix
        tn, fp, fn, tp = confusion_matrix(y, y_pred).ravel()

        # Log parameters
        mlflow.log_param("classifier", clf_name)
        mlflow.log_param("preprocessing", "StandardScaler + MinMaxScaler + LogTransformation")

        # Log metrics
        mlflow.log_metric("f1_mean_10fold", np.mean(f1_scores))
        mlflow.log_metric("f1_std_10fold", np.std(f1_scores))
        mlflow.log_metric("accuracy_mean_10fold", np.mean(acc_scores))
        mlflow.log_metric("accuracy_std_10fold", np.std(acc_scores))
        mlflow.log_metric("f1_score_training", f1_score(y, y_pred))
        mlflow.log_metric("accuracy_training", accuracy_score(y, y_pred))
        mlflow.log_metric("true_positives", tp)
        mlflow.log_metric("true_negatives", tn)
        mlflow.log_metric("false_positives", fp)
        mlflow.log_metric("false_negatives", fn)

        # Log model to MLflow
        mlflow.sklearn.log_model(pipeline, artifact_path=f"{clf_name}_model", registered_model_name=f"{clf_name}-Experiment-2")

        # Print results
        print(f"\nExperiment #2 Completed for {clf_name}:")
        print(f"10-Fold F1 Score: Mean={np.mean(f1_scores):.4f}, Std={np.std(f1_scores):.4f}")
        print(f"10-Fold Accuracy Score: Mean={np.mean(acc_scores):.4f}, Std={np.std(acc_scores):.4f}")
        print(f"Confusion Matrix: TP={tp}, TN={tn}, FP={fp}, FN={fn}")
        print("\nClassification Report:")
        print(classification_report(y, y_pred))


In [None]:
import os
import mlflow
import mlflow.sklearn
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, FunctionTransformer, PolynomialFeatures
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, make_scorer, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import VarianceThreshold

# Set up MLFlow tracking
MLFLOW_TRACKING_URI = "https://dagshub.com/saivignesh-03/Machinelearning.mlflow"
os.environ['MLFLOW_TRACKING_USERNAME'] = 'saivignesh-03'
os.environ['MLFLOW_TRACKING_PASSWORD'] = '9c78979cb39e1c46900c7f95953a7fcb54a30dee'
mlflow.set_tracking_uri(uri=MLFLOW_TRACKING_URI)
mlflow.set_experiment("Machinelearning")

# Assuming X and y are already loaded and preprocessed
# Feature Engineering: Generate interaction terms and polynomial features
numeric_features = X.select_dtypes(include=['float64', 'int64']).columns

# Preprocessing pipeline
log_transformer = FunctionTransformer(np.log1p, validate=True)
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('minmax', MinMaxScaler()),
    ('log', log_transformer),
    ('poly', PolynomialFeatures(degree=2, interaction_only=True, include_bias=False))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features)
    ],
    remainder='passthrough'
)

# Feature selection: Remove low-variance features
feature_selector = VarianceThreshold(threshold=0.01)

# Classifier
classifier = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)

# Create pipeline
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('feature_selection', feature_selector),
    ('classifier', classifier)
])

# Cross-validation setup
kf_10fold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Perform cross-validation and log results in MLFlow
with mlflow.start_run(run_name="Experiment #3: Feature Engineering with RandomForest"):
    # Perform 10-fold cross-validation
    f1_scores = cross_val_score(pipeline, X, y, cv=kf_10fold, scoring=make_scorer(f1_score))
    acc_scores = cross_val_score(pipeline, X, y, cv=kf_10fold, scoring=make_scorer(accuracy_score))

    # Train the model on the entire training data
    pipeline.fit(X, y)
    y_pred = pipeline.predict(X)

    # Compute confusion matrix
    tn, fp, fn, tp = confusion_matrix(y, y_pred).ravel()

    # Log parameters
    mlflow.log_param("preprocessing", "StandardScaler + MinMaxScaler + LogTransformation + PolynomialFeatures")
    mlflow.log_param("feature_selection", "VarianceThreshold")
    mlflow.log_param("classifier", "RandomForestClassifier")

    # Log metrics
    mlflow.log_metric("f1_mean_10fold", np.mean(f1_scores))
    mlflow.log_metric("f1_std_10fold", np.std(f1_scores))
    mlflow.log_metric("accuracy_mean_10fold", np.mean(acc_scores))
    mlflow.log_metric("accuracy_std_10fold", np.std(acc_scores))
    mlflow.log_metric("f1_score_training", f1_score(y, y_pred))
    mlflow.log_metric("accuracy_training", accuracy_score(y, y_pred))
    mlflow.log_metric("true_positives", tp)
    mlflow.log_metric("true_negatives", tn)
    mlflow.log_metric("false_positives", fp)
    mlflow.log_metric("false_negatives", fn)

    # Log model to MLFlow
    mlflow.sklearn.log_model(pipeline, artifact_path="feature_engineering_model", registered_model_name="FeatureEngineering-Experiment-3")

    # Print results
    print("\nExperiment #3 Completed:")
    print(f"10-Fold F1 Score: Mean={np.mean(f1_scores):.4f}, Std={np.std(f1_scores):.4f}")
    print(f"10-Fold Accuracy Score: Mean={np.mean(acc_scores):.4f}, Std={np.std(acc_scores):.4f}")
    print(f"Confusion Matrix: TP={tp}, TN={tn}, FP={fp}, FN={fn}")
    print("\nClassification Report:")
    print(classification_report(y, y_pred))


In [None]:
import os
import mlflow
import mlflow.sklearn
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score, classification_report
import warnings

warnings.filterwarnings('ignore')

# Set up MLFlow tracking
MLFLOW_TRACKING_URI = "https://dagshub.com/saivignesh-03/Machinelearning.mlflow"
os.environ['MLFLOW_TRACKING_USERNAME'] = 'saivignesh-03'
os.environ['MLFLOW_TRACKING_PASSWORD'] = '9c78979cb39e1c46900c7f95953a7fcb54a30dee'
mlflow.set_tracking_uri(uri=MLFLOW_TRACKING_URI)
mlflow.set_experiment("Machinelearning")

# Load the dataset
data = pd.read_csv('breast-cancer.csv')
X = data.drop(columns=['id', 'diagnosis'])
y = data['diagnosis'].map({'M': 1, 'B': 0})  # Convert diagnosis to binary

# Validate dataset before processing
if X.empty or y.empty:
    raise ValueError("Dataset is empty. Please check your data.")

# Step 1: Remove constant and quasi-constant features
variance_selector = VarianceThreshold(threshold=0.01)
X_var = pd.DataFrame(
    variance_selector.fit_transform(X),
    columns=X.columns[variance_selector.get_support()],
    index=X.index
)

# Step 2: Remove highly correlated features
correlation_matrix = X_var.corr().abs()
upper_tri = correlation_matrix.where(np.triu(np.ones(correlation_matrix.shape), k=1).astype(bool))
to_drop = [column for column in upper_tri.columns if any(upper_tri[column] > 0.95)]
X_uncorr = X_var.drop(columns=to_drop)

# Step 3: Select features based on importance
rf_selector = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1, class_weight='balanced')
rf_selector.fit(X_uncorr, y)

importance_df = pd.DataFrame({
    'feature': X_uncorr.columns,
    'importance': rf_selector.feature_importances_
})
importance_df = importance_df.sort_values('importance', ascending=False)

# Select top features based on cumulative importance
cumulative_importance = importance_df['importance'].cumsum()
n_features = (cumulative_importance <= 0.95).sum()
top_features = importance_df['feature'].head(max(n_features, 10)).tolist()

X_selected = X_uncorr[top_features]

# Create final pipeline
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1, class_weight='balanced'))
])

# Cross-validation setup
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Perform cross-validation for selected features
y_pred = cross_val_predict(pipeline, X_selected, y, cv=kf)

# Calculate metrics
f1 = f1_score(y, y_pred)
accuracy = accuracy_score(y, y_pred)
tn, fp, fn, tp = confusion_matrix(y, y_pred).ravel()

# Log results with MLFlow
with mlflow.start_run(run_name="Experiment 4: Optimized Feature Selection"):
    mlflow.log_param("original_features", X.shape[1])
    mlflow.log_param("selected_features", X_selected.shape[1])
    mlflow.log_param("variance_threshold", 0.01)
    mlflow.log_param("correlation_threshold", 0.95)
    mlflow.log_param("importance_threshold", 0.95)
    
    mlflow.log_metric("f1_score", f1)
    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_metric("true_positives", tp)
    mlflow.log_metric("true_negatives", tn)
    mlflow.log_metric("false_positives", fp)
    mlflow.log_metric("false_negatives", fn)
    
    importance_df.to_csv('feature_importance.csv', index=False)
    mlflow.log_artifact('feature_importance.csv')
    
    mlflow.sklearn.log_model(
        sk_model=pipeline,
        artifact_path="feature_selection_model_optimized",
        registered_model_name="FeatureSelection-Optimized-Experiment-4"
    )

# Print comprehensive results
print("\nExperiment #4 Completed:")
print("-" * 40)
print(f"10-Fold F1 Score: Mean={f1:.4f}")
print(f"10-Fold Accuracy Score: Mean={accuracy:.4f}")
print(f"Confusion Matrix: TP={tp}, TN={tn}, FP={fp}, FN={fn}")

# Classification Report
classification_report_str = classification_report(y, y_pred, target_names=['0 (Benign)', '1 (Malignant)'])
print("\nClassification Report:")
print(classification_report_str)


In [None]:
import os
import mlflow
import mlflow.sklearn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, classification_report
import warnings

warnings.filterwarnings('ignore')

# Set up MLFlow tracking
MLFLOW_TRACKING_URI = "https://dagshub.com/saivignesh-03/Machinelearning.mlflow"
os.environ['MLFLOW_TRACKING_USERNAME'] = 'saivignesh-03'
os.environ['MLFLOW_TRACKING_PASSWORD'] = '9c78979cb39e1c46900c7f95953a7fcb54a30dee'
mlflow.set_tracking_uri(uri=MLFLOW_TRACKING_URI)
mlflow.set_experiment("Machinelearning")

# Load the dataset
data = pd.read_csv('breast-cancer.csv')
X = data.drop(columns=['id', 'diagnosis'])
y = data['diagnosis'].map({'M': 1, 'B': 0})  # Convert diagnosis to binary

# Standardize features before PCA
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply PCA
pca = PCA()
X_pca = pca.fit_transform(X_scaled)

# Scree Plot
plt.figure(figsize=(10, 6))
explained_variance_ratio = pca.explained_variance_ratio_
plt.plot(range(1, len(explained_variance_ratio) + 1), explained_variance_ratio, marker='o', linestyle='--')
plt.title('Scree Plot')
plt.xlabel('Principal Component')
plt.ylabel('Explained Variance Ratio')
plt.grid()
plt.tight_layout()
plt.savefig('scree_plot.png')
plt.show()

# Select number of components to explain 95% variance
cumulative_variance_ratio = np.cumsum(explained_variance_ratio)
n_components = np.argmax(cumulative_variance_ratio >= 0.95) + 1
print(f"Number of components selected to explain 95% variance: {n_components}")

# PCA with selected components
pca = PCA(n_components=n_components)
X_pca_selected = pca.fit_transform(X_scaled)

# Final Pipeline with PCA and Classifier
pipeline = Pipeline([
    ('scaler', StandardScaler()),  # Ensure standardization in the pipeline
    ('pca', PCA(n_components=n_components)),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1, class_weight='balanced'))
])

# Cross-validation
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
y_pred = cross_val_predict(pipeline, X, y, cv=kf)

# Metrics
f1 = f1_score(y, y_pred)
accuracy = accuracy_score(y, y_pred)
tn, fp, fn, tp = confusion_matrix(y, y_pred).ravel()

# Log results in MLFlow
with mlflow.start_run(run_name="Experiment 5: PCA Dimensionality Reduction"):
    mlflow.log_param("original_features", X.shape[1])
    mlflow.log_param("selected_components", n_components)
    mlflow.log_metric("f1_score", f1)
    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_metric("true_positives", tp)
    mlflow.log_metric("true_negatives", tn)
    mlflow.log_metric("false_positives", fp)
    mlflow.log_metric("false_negatives", fn)

    # Save and log the scree plot
    mlflow.log_artifact("scree_plot.png")

    # Log the PCA model
    mlflow.sklearn.log_model(
        sk_model=pipeline,
        artifact_path="pca_dimensionality_reduction_model",
        registered_model_name="PCA-DimensionalityReduction-Experiment-5"
    )

# Print results
print("\nExperiment #5 Completed:")
print("-" * 40)
print(f"Number of Components: {n_components}")
print(f"10-Fold F1 Score: {f1:.4f}")
print(f"10-Fold Accuracy Score: {accuracy:.4f}")
print(f"Confusion Matrix: TP={tp}, TN={tn}, FP={fp}, FN={fn}")

# Classification Report
classification_report_str = classification_report(y, y_pred, target_names=['0 (Benign)', '1 (Malignant)'])
print("\nClassification Report:")
print(classification_report_str)


In [None]:
import os
import mlflow
import mlflow.sklearn
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, classification_report
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import warnings
import matplotlib.pyplot as plt

warnings.filterwarnings("ignore")

# Set up MLFlow tracking
MLFLOW_TRACKING_URI = "https://dagshub.com/saivignesh-03/Machinelearning.mlflow"
os.environ["MLFLOW_TRACKING_USERNAME"] = "saivignesh-03"
os.environ["MLFLOW_TRACKING_PASSWORD"] = "9c78979cb39e1c46900c7f95953a7fcb54a30dee"
mlflow.set_tracking_uri(uri=MLFLOW_TRACKING_URI)
mlflow.set_experiment("Machinelearning")

# Load the dataset
data = pd.read_csv("breast-cancer.csv")
X = data.drop(columns=["id", "diagnosis"])
y = data["diagnosis"].map({"M": 1, "B": 0})  # Convert diagnosis to binary

# Standardization
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Base Models for Stacking
base_models = [
    ("random_forest", RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1, class_weight="balanced")),
    ("gradient_boosting", GradientBoostingClassifier(n_estimators=100, random_state=42)),
]

# Meta Model
meta_model = LogisticRegression(max_iter=1000, random_state=42)

# Stacking Classifier
stacking_clf = StackingClassifier(estimators=base_models, final_estimator=meta_model, cv=5, n_jobs=-1)

# Pipeline with stacking
pipeline = Pipeline([
    ("scaler", StandardScaler()),
    ("stacking", stacking_clf),
])

# Cross-validation setup
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
y_pred = cross_val_predict(pipeline, X_scaled, y, cv=kf)

# Metrics
f1 = f1_score(y, y_pred)
accuracy = accuracy_score(y, y_pred)
tn, fp, fn, tp = confusion_matrix(y, y_pred).ravel()

# Classification Report
classification_report_str = classification_report(y, y_pred, target_names=["0 (Benign)", "1 (Malignant)"])

# Plot confusion matrix
conf_matrix = confusion_matrix(y, y_pred)
plt.figure(figsize=(6, 6))
plt.matshow(conf_matrix, cmap="coolwarm", fignum=1)
plt.title("Confusion Matrix", pad=20)
plt.colorbar()
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.savefig("confusion_matrix_experiment_6.png")
plt.close()

# Log results with MLFlow
with mlflow.start_run(run_name="Experiment 6: Stacked Ensemble Model"):
    mlflow.log_param("base_models", [model[0] for model in base_models])
    mlflow.log_param("meta_model", "LogisticRegression")
    mlflow.log_param("random_forest_n_estimators", 100)
    mlflow.log_param("gradient_boosting_n_estimators", 100)
    mlflow.log_metric("f1_score", f1)
    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_metric("true_positives", tp)
    mlflow.log_metric("true_negatives", tn)
    mlflow.log_metric("false_positives", fp)
    mlflow.log_metric("false_negatives", fn)

    # Log confusion matrix plot
    mlflow.log_artifact("confusion_matrix_experiment_6.png")

    # Log model
    mlflow.sklearn.log_model(
        sk_model=pipeline,
        artifact_path="stacked_ensemble_model",
        registered_model_name="Stacked-Ensemble-Experiment-6"
    )

# Print results
print("\nExperiment #6 Completed:")
print("-" * 40)
print(f"10-Fold F1 Score: {f1:.4f}")
print(f"10-Fold Accuracy Score: {accuracy:.4f}")
print(f"Confusion Matrix: TP={tp}, TN={tn}, FP={fp}, FN={fn}")
print("\nClassification Report:")
print(classification_report_str)


In [None]:
import os
import mlflow
import mlflow.sklearn
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import warnings

warnings.filterwarnings("ignore")

# Set up MLFlow tracking
MLFLOW_TRACKING_URI = "https://dagshub.com/saivignesh-03/Machinelearning.mlflow"
os.environ["MLFLOW_TRACKING_USERNAME"] = "saivignesh-03"
os.environ["MLFLOW_TRACKING_PASSWORD"] = "9c78979cb39e1c46900c7f95953a7fcb54a30dee"
mlflow.set_tracking_uri(uri=MLFLOW_TRACKING_URI)
mlflow.set_experiment("Machinelearning")

# Load the dataset
data = pd.read_csv("breast-cancer.csv")
X = data.drop(columns=["id", "diagnosis"])
y = data["diagnosis"].map({"M": 1, "B": 0})  # Convert diagnosis to binary

# Preprocessing
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Define individual classifiers
logistic = LogisticRegression(random_state=42, max_iter=1000)
random_forest = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
knn = KNeighborsClassifier()

# Create the Voting Classifier
voting_classifier_model = VotingClassifier(
    estimators=[
        ("logistic", logistic),
        ("random_forest", random_forest),
        ("knn", knn)
    ],
    voting="soft"  # Use soft voting for probabilities
)

# Cross-validation setup
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Perform cross-validation
y_pred = cross_val_predict(voting_classifier_model, X_scaled, y, cv=kf, method="predict")

# Metrics
f1 = f1_score(y, y_pred)
accuracy = accuracy_score(y, y_pred)
tn, fp, fn, tp = confusion_matrix(y, y_pred).ravel()

# Classification Report
classification_report_str = classification_report(y, y_pred, target_names=["0 (Benign)", "1 (Malignant)"])

# Confusion Matrix
conf_matrix = confusion_matrix(y, y_pred)
plt.figure(figsize=(6, 6))
plt.matshow(conf_matrix, cmap="coolwarm", fignum=1)
plt.title("Confusion Matrix", pad=20)
plt.colorbar()
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.savefig("confusion_matrix_experiment_7_voting.png")
plt.close()

# Train the model on the entire dataset
voting_classifier_model.fit(X_scaled, y)

# Log results with MLFlow
with mlflow.start_run(run_name="Experiment 7: Voting Classifier"):
    mlflow.log_param("voting", "soft")
    mlflow.log_param("models_used", ["LogisticRegression", "RandomForestClassifier", "KNeighborsClassifier"])
    mlflow.log_metric("f1_score", f1)
    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_metric("true_positives", tp)
    mlflow.log_metric("true_negatives", tn)
    mlflow.log_metric("false_positives", fp)
    mlflow.log_metric("false_negatives", fn)
    
    # Log confusion matrix plot
    mlflow.log_artifact("confusion_matrix_experiment_7_voting.png")
    
    # Log the voting classifier
    mlflow.sklearn.log_model(
        sk_model=voting_classifier_model,
        artifact_path="voting_classifier_model",
        registered_model_name="VotingClassifier-Experiment-7"
    )

# Print results
print("\nExperiment #7 Completed:")
print("-" * 40)
print(f"10-Fold F1 Score: {f1:.4f}")
print(f"10-Fold Accuracy Score: {accuracy:.4f}")
print(f"Confusion Matrix: TP={tp}, TN={tn}, FP={fp}, FN={fn}")
print("\nClassification Report:")
print(classification_report_str)


In [None]:
import mlflow
import pandas as pd
import matplotlib.pyplot as plt

# Ensure inline plots in Jupyter Notebook
%matplotlib inline

# Set experiment name
experiment_name = "Machinelearning"  # Replace with your experiment name
experiment = mlflow.get_experiment_by_name(experiment_name)

if not experiment:
    print(f"Experiment '{experiment_name}' not found. Please check the name.")
else:
    experiment_id = experiment.experiment_id

    # Retrieve all runs for the experiment
    runs = mlflow.search_runs(experiment_ids=[experiment_id])

    # Debug: Print all columns to check available metrics and parameters
    print("Columns in the runs DataFrame:")
    print(runs.columns)

    # Select metrics of interest based on available metrics in your MLflow
    metrics_of_interest = ['f1_score', 'accuracy']  # Add or adjust as needed
    if not all([f'metrics.{m}' in runs.columns for m in metrics_of_interest]):
        print(f"Some metrics in {metrics_of_interest} are not found in the runs.")
    else:
        # Select and rename columns
        comparison_df = runs[['run_id', 'tags.mlflow.runName'] + [f'metrics.{m}' for m in metrics_of_interest]]
        comparison_df.rename(columns={
            'tags.mlflow.runName': 'run_name',
            'metrics.f1_score': 'f1_score',
            'metrics.accuracy': 'accuracy'
        }, inplace=True)

        # Drop rows without required metrics
        comparison_df.dropna(subset=['f1_score', 'accuracy'], inplace=True)

        if comparison_df.empty:
            print("No valid runs found with the required metrics.")
        else:
            # Print the comparison DataFrame
            print("Comparison of Experiments:")
            print(comparison_df[['run_name', 'f1_score', 'accuracy']])

            # Plot F1-score Comparison
            comparison_df.sort_values(by='f1_score', ascending=False, inplace=True)
            plt.figure(figsize=(12, 7))
            plt.barh(comparison_df['run_name'], comparison_df['f1_score'], color='skyblue')
            plt.xlabel('F1-score', fontsize=12)
            plt.ylabel('Experiments', fontsize=12)
            plt.title('F1-score Comparison of Experiments', fontsize=14)
            plt.gca().invert_yaxis()  # Show the best at the top
            plt.grid(axis='x', linestyle='--', alpha=0.7)
            plt.tight_layout()
            plt.savefig('f1_score_comparison.png')
            plt.show()

            # Plot Accuracy Comparison
            comparison_df.sort_values(by='accuracy', ascending=False, inplace=True)
            plt.figure(figsize=(12, 7))
            plt.barh(comparison_df['run_name'], comparison_df['accuracy'], color='lightgreen')
            plt.xlabel('Accuracy', fontsize=12)
            plt.ylabel('Experiments', fontsize=12)
            plt.title('Accuracy Comparison of Experiments', fontsize=14)
            plt.gca().invert_yaxis()
            plt.grid(axis='x', linestyle='--', alpha=0.7)
            plt.tight_layout()
            plt.savefig('accuracy_comparison.png')
            plt.show()


In [None]:
import pandas as pd

# Data for comparison
data = {
    'run_name': [
        "Experiment 7: Voting Classifier",
        "Experiment 6: Stacked Ensemble Model",
        "Experiment 5: PCA Dimensionality Reduction",
        "Experiment 4: Optimized Feature Selection"
    ],
    'f1_score': [0.961353, 0.949640, 0.940898, 0.926366],
    'accuracy': [0.971880, 0.963093, 0.956063, 0.945518]
}

# Create a DataFrame
df = pd.DataFrame(data)

# Find the best model based on F1 score and Accuracy
best_model = df.loc[df['f1_score'].idxmax()]
print("Best Model Based on F1 Score:")
print(best_model)


In [None]:
import joblib

# Load the trained model from MLFlow or use the existing instance
# If `voting_classifier_model` is the already trained model instance:
joblib.dump(voting_classifier_model, "final_model.joblib")

print("Model saved as 'final_model.joblib'")


In [None]:
print(voting_classifier_model)


In [None]:
import joblib

# Load the saved model
loaded_model = joblib.load("final_model.joblib")

print("Model loaded successfully!")


In [None]:
import numpy as np
import joblib

# Load the trained Voting Classifier model
voting_clf = joblib.load("final_model.joblib")

# Load the original scaler used during training
scaler = joblib.load("scaler.joblib")  # Ensure the scaler was saved during training

# Example malignant sample data (ensure 30 features match the training features)
sample_data = np.array([[
    25.0, 30.0, 150.0, 1200.0, 0.160, 0.320, 0.450, 0.250, 0.350, 0.120,  # Mean features
    0.300, 0.400, 0.500, 0.700, 0.080, 0.200, 0.300, 0.400, 0.600, 0.090,  # Standard error features
    20.0, 35.0, 170.0, 1400.0, 0.190, 0.350, 0.480, 0.270, 0.400, 0.150   # Worst features
]])  # Ensure the dimensions match the original feature set (30 features)

# Scale the sample data using the loaded scaler
sample_data_scaled = scaler.transform(sample_data)

# Predict using the loaded Voting Classifier
prediction = voting_clf.predict(sample_data_scaled)

# Output prediction
if prediction[0] == 1:
    print("Prediction: Malignant (1)")
else:
    print("Prediction: Benign (0)")


In [None]:
joblib.dump(scaler, "scaler.joblib")


In [None]:
import numpy as np
import joblib

# Load the trained Voting Classifier model
voting_clf = joblib.load("final_model.joblib")

# Load the original scaler used during training
scaler = joblib.load("scaler.joblib")  # Ensure the scaler was saved during training

# Example malignant sample data (ensure 30 features match the training features)
sample_data = np.array([[
    25.0, 30.0, 150.0, 1200.0, 0.160, 0.320, 0.450, 0.250, 0.350, 0.120,  # Mean features
    0.300, 0.400, 0.500, 0.700, 0.080, 0.200, 0.300, 0.400, 0.600, 0.090,  # Standard error features
    20.0, 35.0, 170.0, 1400.0, 0.190, 0.350, 0.480, 0.270, 0.400, 0.150   # Worst features
]])  # Ensure the dimensions match the original feature set (30 features)

# Scale the sample data using the loaded scaler
sample_data_scaled = scaler.transform(sample_data)

# Predict using the loaded Voting Classifier
prediction = voting_clf.predict(sample_data_scaled)

# Output prediction
if prediction[0] == 1:
    print("Prediction: Malignant (1)")
else:
    print("Prediction: Benign (0)")


In [None]:
import numpy as np

# Example malignant sample data with 30 features
sample_data = np.array([[
    25.0, 30.0, 150.0, 1200.0, 0.160, 0.320, 0.450, 0.250, 0.350, 0.120,  # Mean features
    0.300, 0.400, 0.500, 0.700, 0.080, 0.200, 0.300, 0.400, 0.600, 0.090,  # Standard error features
    20.0, 35.0, 170.0, 1400.0, 0.190, 0.350, 0.480, 0.270, 0.400, 0.150   # Worst features
]])

print(sample_data.shape)  # Ensure the dimensions are correct (1, 30)


In [None]:
import numpy as np
import joblib

# Load the trained Voting Classifier model
voting_clf = joblib.load("final_model.joblib")

# Load the original scaler used during training
scaler = joblib.load("scaler.joblib")  # Ensure the scaler was saved during training

# Example malignant sample data (ensure 30 features match the training features)
sample_data = np.array([[
    28.0, 35.0, 180.0, 1500.0, 0.200, 0.400, 0.550, 0.300, 0.450, 0.150,  # Mean features
    0.350, 0.450, 0.600, 0.800, 0.100, 0.250, 0.350, 0.500, 0.700, 0.120,  # Standard error features
    25.0, 40.0, 200.0, 1800.0, 0.250, 0.450, 0.600, 0.400, 0.550, 0.200   # Worst features
]])  # Ensure the dimensions match the original feature set (30 features)

# Scale the sample data using the loaded scaler
sample_data_scaled = scaler.transform(sample_data)

# Predict using the loaded Voting Classifier
prediction = voting_clf.predict(sample_data_scaled)

# Output prediction
if prediction[0] == 1:
    print("Prediction: Malignant (1)")
else:
    print("Prediction: Benign (0)")


In [None]:
pip install fastapi uvicorn pydantic joblib


In [None]:
import os
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel, Field
import numpy as np
import joblib

# Initialize FastAPI app
app = FastAPI()

# Load the trained model and scaler
try:
    voting_clf = joblib.load("final_model.joblib")
    scaler = joblib.load("scaler.joblib")
except FileNotFoundError as e:
    raise RuntimeError(f"Required model or scaler file is missing: {e}")

# Define feature names
feature_names = [
    "radius_mean", "texture_mean", "perimeter_mean", "area_mean", "smoothness_mean",
    "compactness_mean", "concavity_mean", "concave_points_mean", "symmetry_mean", "fractal_dimension_mean",
    "radius_se", "texture_se", "perimeter_se", "area_se", "smoothness_se",
    "compactness_se", "concavity_se", "concave_points_se", "symmetry_se", "fractal_dimension_se",
    "radius_worst", "texture_worst", "perimeter_worst", "area_worst", "smoothness_worst",
    "compactness_worst", "concavity_worst", "concave_points_worst", "symmetry_worst", "fractal_dimension_worst"
]

# Define the input schema
class BreastCancerPredictionInput(BaseModel):
    inputs: dict[str, float] = Field(
        ...,
        description="Dictionary of feature names and their corresponding values.",
        example={name: 0.0 for name in feature_names},
    )

# Root endpoint
@app.get("/")
async def root():
    return {"message": "Welcome to the Breast Cancer Prediction API!"}

# Endpoint to retrieve feature names
@app.get("/features/")
async def get_features():
    """Returns the feature names expected for the prediction."""
    return {"feature_names": feature_names}

# Prediction endpoint
@app.post("/predict/")
async def predict(input_data: BreastCancerPredictionInput):
    input_features = input_data.inputs

    # Validate missing features
    missing_features = [feature for feature in feature_names if feature not in input_features]
    if missing_features:
        raise HTTPException(
            status_code=400,
            detail=f"Missing features: {missing_features}",
        )

    # Validate extra features
    extra_features = [feature for feature in input_features if feature not in feature_names]
    if extra_features:
        raise HTTPException(
            status_code=400,
            detail=f"Unexpected features: {extra_features}",
        )

    try:
        # Extract feature values in the correct order
        feature_values = [input_features[feature] for feature in feature_names]
        sample_data = np.array([feature_values])

        # Scale the input
        sample_data_scaled = scaler.transform(sample_data)

        # Predict using the model
        prediction = voting_clf.predict(sample_data_scaled)

        return {"prediction": "Malignant (1)" if prediction[0] == 1 else "Benign (0)"}
    except Exception as e:
        raise HTTPException(
            status_code=500,
            detail=f"Prediction failed: {str(e)}"
        )


if __name__ == "__main__":
    import uvicorn
    import nest_asyncio  # Required for running in Jupyter

    # Apply nest_asyncio to allow uvicorn to run in Jupyter's event loop
    nest_asyncio.apply()

    # Start the FastAPI server
    print("Starting FastAPI server... Access the API docs at http://127.0.0.1:8000/docs")
    uvicorn.run(app, host="0.0.0.0", port=8000)


In [None]:
pip install nest_asyncio


In [None]:
pip install streamlit requests


In [None]:
import os
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel, Field
import numpy as np
import joblib

# Initialize FastAPI app
app = FastAPI()

# Load the trained model and scaler
try:
    voting_clf = joblib.load("final_model.joblib")
    scaler = joblib.load("scaler.joblib")
except FileNotFoundError as e:
    raise RuntimeError(f"Required model or scaler file is missing: {e}")

# Define feature names
feature_names = [
    "radius_mean", "texture_mean", "perimeter_mean", "area_mean", "smoothness_mean",
    "compactness_mean", "concavity_mean", "concave_points_mean", "symmetry_mean", "fractal_dimension_mean",
    "radius_se", "texture_se", "perimeter_se", "area_se", "smoothness_se",
    "compactness_se", "concavity_se", "concave_points_se", "symmetry_se", "fractal_dimension_se",
    "radius_worst", "texture_worst", "perimeter_worst", "area_worst", "smoothness_worst",
    "compactness_worst", "concavity_worst", "concave_points_worst", "symmetry_worst", "fractal_dimension_worst"
]

# Define default values
default_values = {
    "radius_mean": 28.0,
    "texture_mean": 35.0,
    "perimeter_mean": 180.0,
    "area_mean": 1500.0,
    "smoothness_mean": 0.200,
    "compactness_mean": 0.400,
    "concavity_mean": 0.550,
    "concave_points_mean": 0.300,
    "symmetry_mean": 0.450,
    "fractal_dimension_mean": 0.150,
    "radius_se": 0.350,
    "texture_se": 0.450,
    "perimeter_se": 0.600,
    "area_se": 0.800,
    "smoothness_se": 0.100,
    "compactness_se": 0.250,
    "concavity_se": 0.350,
    "concave_points_se": 0.500,
    "symmetry_se": 0.700,
    "fractal_dimension_se": 0.120,
    "radius_worst": 25.0,
    "texture_worst": 40.0,
    "perimeter_worst": 200.0,
    "area_worst": 1800.0,
    "smoothness_worst": 0.250,
    "compactness_worst": 0.450,
    "concavity_worst": 0.600,
    "concave_points_worst": 0.400,
    "symmetry_worst": 0.550,
    "fractal_dimension_worst": 0.200
}

# Define the input schema
class BreastCancerPredictionInput(BaseModel):
    inputs: dict[str, float] = Field(
        ...,
        description="Dictionary of feature names and their corresponding values.",
        example=default_values,  # Use default values here
    )

# Root endpoint
@app.get("/")
async def root():
    return {"message": "Welcome to the Breast Cancer Prediction API!"}

# Endpoint to retrieve feature names
@app.get("/features/")
async def get_features():
    """Returns the feature names expected for the prediction."""
    return {"feature_names": feature_names}

# Prediction endpoint
@app.post("/predict/")
async def predict(input_data: BreastCancerPredictionInput):
    input_features = input_data.inputs

    # Validate missing features
    missing_features = [feature for feature in feature_names if feature not in input_features]
    if missing_features:
        raise HTTPException(
            status_code=400,
            detail=f"Missing features: {missing_features}",
        )

    # Validate extra features
    extra_features = [feature for feature in input_features if feature not in feature_names]
    if extra_features:
        raise HTTPException(
            status_code=400,
            detail=f"Unexpected features: {extra_features}",
        )

    try:
        # Extract feature values in the correct order
        feature_values = [input_features[feature] for feature in feature_names]
        sample_data = np.array([feature_values])

        # Scale the input
        sample_data_scaled = scaler.transform(sample_data)

        # Predict using the model
        prediction = voting_clf.predict(sample_data_scaled)

        return {"prediction": "Malignant (1)" if prediction[0] == 1 else "Benign (0)"}
    except Exception as e:
        raise HTTPException(
            status_code=500,
            detail=f"Prediction failed: {str(e)}"
        )


if __name__ == "__main__":
    import uvicorn
    import nest_asyncio  # Required for running in Jupyter

    # Apply nest_asyncio to allow uvicorn to run in Jupyter's event loop
    nest_asyncio.apply()

    # Start the FastAPI server
    print("Starting FastAPI server... Access the API docs at http://127.0.0.1:8000/docs")
    uvicorn.run(app, host="0.0.0.0", port=8000)
