In [2]:
import pandas as pd
import numpy as np
import sqlite3
import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import mlflow
import mlflow.sklearn

# File path for the dataset
file_path = '/Users/rajasaikatukuri/Downloads/pythonproject/breast-cancer.csv'
data = pd.read_csv(file_path)

# Data Normalization Process
# Establish SQLite connection
conn = sqlite3.connect("breast_cancer_normalized.db")
cursor = conn.cursor()

# Start from empty tables so this cell can be re-run against an existing
# database file without duplicating rows. Measurements is dropped first
# because it references Patients.
cursor.execute("DROP TABLE IF EXISTS Measurements;")
cursor.execute("DROP TABLE IF EXISTS Patients;")

# Define the SQL schema for normalization
# Table 1: Patients (one row per patient)
cursor.execute("""
CREATE TABLE Patients (
    PatientID INTEGER PRIMARY KEY,
    Diagnosis TEXT
);
""")

# Table 2: Measurements (one row per patient and measurement type)
cursor.execute("""
CREATE TABLE Measurements (
    PatientID INTEGER,
    MeasurementType TEXT,
    Value REAL,
    FOREIGN KEY (PatientID) REFERENCES Patients(PatientID)
);
""")

# Insert data into Patients table.
# if_exists='append' writes into the table declared above; 'replace' would
# drop it and let pandas create a new one without the PRIMARY KEY.
# Because PatientID is a primary key, a duplicated `id` in the CSV raises
# sqlite3.IntegrityError here instead of being stored silently.
patients_data = data[['id', 'diagnosis']].rename(columns={'id': 'PatientID', 'diagnosis': 'Diagnosis'})
patients_data.to_sql('Patients', conn, if_exists='append', index=False)

# Transform measurements into a long format and insert into Measurements table
measurements = data.drop(columns=['id', 'diagnosis'])
measurements['PatientID'] = data['id']
measurements_long = measurements.melt(
    id_vars=['PatientID'],
    var_name="MeasurementType",
    value_name="Value"
)
# Same reasoning as above: append keeps the declared schema and foreign key.
measurements_long.to_sql('Measurements', conn, if_exists='append', index=False)
conn.commit()


# Read the normalised tables back as one long frame
# (one row per patient and measurement type).
query = """
SELECT Patients.PatientID, Patients.Diagnosis, Measurements.MeasurementType, Measurements.Value
FROM Patients
JOIN Measurements ON Patients.PatientID = Measurements.PatientID
"""
data_normalized = pd.read_sql_query(query, conn)

# One diagnosis label per patient, taken from the first joined row.
diagnosis_by_patient = data_normalized.groupby('PatientID')['Diagnosis'].first()

# Back to wide format: one row per patient, one column per measurement type.
data_pivoted = data_normalized.pivot_table(
    values='Value',
    index='PatientID',
    columns='MeasurementType',
    aggfunc='mean',
)

# Attach the target encoded as 1 (malignant, 'M') / 0 (benign, 'B');
# assignment aligns on the PatientID index.
data_pivoted['Diagnosis'] = diagnosis_by_patient.map({'M': 1, 'B': 0})

# Per-column count of missing values, kept for inspection.
missing_values = data_pivoted.isnull().sum()

# Impute every remaining gap with the mean of its column.
column_means = data_pivoted.mean()
data_pivoted = data_pivoted.fillna(column_means)

# Replace the space in the "concave points" column names with an underscore.
data_pivoted = data_pivoted.rename(columns={
    'concave points_mean': 'concave_points_mean',
    'concave points_se': 'concave_points_se',
    'concave points_worst': 'concave_points_worst',
})
print("Before",data_pivoted.columns)
# Split the data
X = data_pivoted.drop(columns=['Diagnosis'])
y = data_pivoted['Diagnosis']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


def add_engineered_features(features):
    """Return a copy of ``features`` with the three derived columns appended.

    Added columns, in this order:
      * ``area_to_radius_mean`` - ``area_mean`` divided by ``radius_mean``
      * ``radius_mean_squared`` - ``radius_mean`` squared
      * ``log_area_mean``       - ``log(1 + area_mean)``

    The input frame is not modified. Building a new frame with ``assign``
    avoids writing into the frames returned by ``train_test_split``, which
    can trigger pandas' SettingWithCopyWarning.
    """
    return features.assign(
        area_to_radius_mean=features['area_mean'] / features['radius_mean'],
        radius_mean_squared=features['radius_mean'] ** 2,
        log_area_mean=np.log1p(features['area_mean']),
    )


# Add meaningful features. Each derived value depends only on its own row,
# so running the same function on train and test shares no information
# between the two sets.
X_train = add_engineered_features(X_train)
X_test = add_engineered_features(X_test)
print("after feature engineering",X_train.columns)
X_train = X_train.astype(float)
X_test = X_test.astype(float)

import dagshub
dagshub.init(repo_owner='rajasaikatukuri', repo_name='pythonproject', mlflow=True)
# MLFlow setup
mlflow.set_tracking_uri("https://dagshub.com/rajasaikatukuri/pythonproject.mlflow")
mlflow.set_experiment("Logistic Regression Best Model")

# Define the pipeline: standardise the features, then fit logistic regression.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', LogisticRegression(max_iter=5000, random_state=42))
])

# Train and evaluate using cross-validation (3 folds, F1 on the training split).
# Scaling happens inside the pipeline, so it is re-fitted within each fold.
cv_scores = cross_val_score(pipeline, X_train, y_train, cv=3, scoring='f1')
mean_cv_f1 = np.mean(cv_scores)
std_cv_f1 = np.std(cv_scores)

# Single source of truth for where the fitted pipeline is written; the
# messages printed below are derived from it so they cannot drift apart.
joblib_file = "/Users/rajasaikatukuri/Downloads/pythonproject/logistic_regression_best.joblib"

# Log metrics to MLFlow
with mlflow.start_run(run_name="Logistic Regression Best Model"):
    pipeline.fit(X_train, y_train)

    # Evaluate on test set
    y_pred = pipeline.predict(X_test)
    test_f1 = f1_score(y_test, y_pred)

    # labels=[0, 1] forces a 2x2 matrix even when the test split or the
    # predictions contain a single class, so the 4-way unpacking cannot fail.
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred, labels=[0, 1]).ravel()

    # Log metrics (same metric names as before, logged in the same order)
    run_metrics = {
        "f1_score_mean": mean_cv_f1,
        "f1_score_std": std_cv_f1,
        "test_f1_score": test_f1,
        "True_Positives": tp,
        "True_Negatives": tn,
        "False_Positives": fp,
        "False_Negatives": fn,
    }
    for metric_name, metric_value in run_metrics.items():
        mlflow.log_metric(metric_name, metric_value)

    # Save the model
    joblib.dump(pipeline, joblib_file)
    print(f"Model saved as {joblib_file}")

    # Log the model to MLFlow, using the first test row as the input example
    input_example = pd.DataFrame(X_test.iloc[0:1])
    mlflow.sklearn.log_model(
        sk_model=pipeline,
        artifact_path="model",
        input_example=input_example.to_dict(orient='records')[0],
        registered_model_name="Logistic_Regression_Best_Model"
    )

# Confirm the model file is ready for download. The message previously named
# a different file (logistic_regression_pipeline.joblib) than the one written.
print(f"Joblib model file ready: {joblib_file}")


Patients Table:
   PatientID Diagnosis
0     842302         M
1     842517         M
2   84300903         M
3   84348301         M
4   84358402         M

Measurements Table:
   PatientID MeasurementType  Value
0     842302     radius_mean  17.99
1     842517     radius_mean  20.57
2   84300903     radius_mean  19.69
3   84348301     radius_mean  11.42
4   84358402     radius_mean  20.29

Distinct Measurement Types:
            MeasurementType
0               radius_mean
1              texture_mean
2            perimeter_mean
3                 area_mean
4           smoothness_mean
5          compactness_mean
6            concavity_mean
7       concave points_mean
8             symmetry_mean
9    fractal_dimension_mean
10                radius_se
11               texture_se
12             perimeter_se
13                  area_se
14            smoothness_se
15           compactness_se
16             concavity_se
17        concave points_se
18              symmetry_se
19     fractal_dimen

2024/12/20 17:28:02 INFO mlflow.tracking.fluent: Experiment with name 'Logistic Regression Best Model' does not exist. Creating a new experiment.


Model saved as /Users/rajasaikatukuri/Downloads/pythonproject/logistic_regression_best.joblib


Successfully registered model 'Logistic_Regression_Best_Model'.
2024/12/20 17:28:11 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: Logistic_Regression_Best_Model, version 1
Created version '1' of model 'Logistic_Regression_Best_Model'.


🏃 View run Logistic Regression Best Model at: https://dagshub.com/rajasaikatukuri/pythonproject.mlflow/#/experiments/9/runs/4a46294dc30b48cfaddd90bc2cd605f9
🧪 View experiment at: https://dagshub.com/rajasaikatukuri/pythonproject.mlflow/#/experiments/9
Joblib model file ready: logistic_regression_pipeline.joblib


In [3]:
from fastapi import FastAPI
from pydantic import BaseModel
import joblib
import numpy as np

# Deserialize the estimator used by the prediction endpoint.
# NOTE(review): this file name differs from the
# logistic_regression_best.joblib file written by the training cell; confirm
# that final_model.pkl is the intended artifact.
model_path = "/Users/rajasaikatukuri/Downloads/pythonproject/final_model.pkl"
model = joblib.load(model_path)




In [4]:


# Initialize the FastAPI app.
# The route decorators below register their handlers on this object, and the
# `uvicorn main:app` command looks it up by the name `app`.
app = FastAPI()

# Define the input schema using Pydantic with all dataset features.
# Request body for POST /predict: 30 float fields, all required (no defaults).
# NOTE(review): the field names use a `mean_*` / `*_error` / `worst_*` scheme,
# while the training cell works with `*_mean` / `*_se` / `*_worst` columns
# (e.g. `mean_radius` here vs `radius_mean` there). The declaration order
# below also differs from the training frame's column order, so consumers
# must not rely on position alone.
class InputFeatures(BaseModel):
    # --- "mean" group -------------------------------------------------------
    mean_radius: float
    mean_texture: float
    mean_perimeter: float
    mean_area: float
    mean_smoothness: float
    mean_compactness: float
    mean_concavity: float
    mean_concave_points: float
    mean_symmetry: float
    mean_fractal_dimension: float
    # --- "error" group (presumably the `*_se` columns of the CSV - confirm) --
    radius_error: float
    texture_error: float
    perimeter_error: float
    area_error: float
    smoothness_error: float
    compactness_error: float
    concavity_error: float
    concave_points_error: float
    symmetry_error: float
    fractal_dimension_error: float
    # --- "worst" group ------------------------------------------------------
    worst_radius: float
    worst_texture: float
    worst_perimeter: float
    worst_area: float
    worst_smoothness: float
    worst_compactness: float
    worst_concavity: float
    worst_concave_points: float
    worst_symmetry: float
    worst_fractal_dimension: float

# Health check served at the API root
@app.get("/")
def read_root():
    """Return a fixed status message showing that the service is reachable."""
    status = {"message": "Breast Cancer Prediction API is running!"}
    return status

def _csv_column_name(api_field):
    """Translate an API field name into the CSV-style training column name.

    ``mean_<x>`` -> ``<x>_mean``, ``worst_<x>`` -> ``<x>_worst`` and
    ``<x>_error`` -> ``<x>_se``; any other name is returned unchanged.
    """
    if api_field.startswith("mean_"):
        return api_field[len("mean_"):] + "_mean"
    if api_field.startswith("worst_"):
        return api_field[len("worst_"):] + "_worst"
    if api_field.endswith("_error"):
        return api_field[:-len("_error")] + "_se"
    return api_field


# Prediction endpoint
@app.post("/predict")
def predict(input_data: InputFeatures):
    """Predict the diagnosis class for one set of measurements.

    Parameters
    ----------
    input_data : InputFeatures
        The 30 raw measurements sent in the request body.

    Returns
    -------
    dict
        ``{"prediction": <int>}`` holding the class returned by
        ``model.predict`` for the single input row.

    Notes
    -----
    The three engineered features computed in the training cell are
    re-created here. The training frame's columns come from ``pivot_table``
    and use CSV-style names, so their order differs from the field order of
    ``InputFeatures``. When the loaded model exposes ``feature_names_in_``,
    the request columns are therefore matched to it by name, accepting either
    the API names or the CSV-style names. Only when the model carries no
    usable names is the row passed positionally, as before.
    """
    # Function-scope imports: the serving cell's import block does not bring
    # pandas in, so this keeps the endpoint usable when run as a module.
    import warnings

    import pandas as pd

    # Convert input features into a DataFrame for transformations
    input_df = pd.DataFrame([input_data.dict()])

    # Add meaningful ratios
    input_df['area_to_radius_mean'] = input_df['mean_area'] / input_df['mean_radius']

    # Polynomial features
    input_df['radius_mean_squared'] = input_df['mean_radius'] ** 2

    # Log transformations
    input_df['log_area_mean'] = np.log1p(input_df['mean_area'])

    expected = getattr(model, "feature_names_in_", None)
    if expected is None:
        # Model fitted without column names: positional order is all there is.
        features = input_df.values
    else:
        expected = list(expected)
        csv_named_df = input_df.rename(columns=_csv_column_name)
        if set(expected) <= set(input_df.columns):
            # Model was fitted with the API naming scheme.
            features = input_df[expected]
        elif set(expected) <= set(csv_named_df.columns):
            # Model was fitted with CSV-style names (as in the training cell).
            features = csv_named_df[expected]
        else:
            # Unknown naming scheme: keep the previous positional behaviour,
            # but make the risk visible instead of failing silently.
            warnings.warn(
                "Model feature names match neither the API fields nor the "
                "CSV-style column names; falling back to positional order."
            )
            features = input_df.values

    # Perform prediction
    prediction = model.predict(features)

    return {"prediction": int(prediction[0])}


In [6]:
!pip install uvicorn




In [9]:
# Serve the API on port 8000 with auto-reload.
# NOTE(review): `main:app` makes uvicorn import `app` from a `main` module,
# not from this notebook's kernel; presumably the API cells are also saved as
# main.py in the working directory - confirm.
# This cell blocks until the server is interrupted.
!uvicorn main:app --reload --port 8000


[32mINFO[0m:     Will watch for changes in these directories: ['/Users/rajasaikatukuri/Downloads/pythonproject']
[32mINFO[0m:     Uvicorn running on [1mhttp://127.0.0.1:8000[0m (Press CTRL+C to quit)
[32mINFO[0m:     Started reloader process [[36m[1m14666[0m] using [36m[1mStatReload[0m
[32mINFO[0m:     Started server process [[36m14668[0m]
[32mINFO[0m:     Waiting for application startup.
[32mINFO[0m:     Application startup complete.
[32mINFO[0m:     127.0.0.1:65500 - "[1mGET / HTTP/1.1[0m" [32m200 OK[0m
[32mINFO[0m:     127.0.0.1:65500 - "[1mGET /favicon.ico HTTP/1.1[0m" [31m404 Not Found[0m
^C
[32mINFO[0m:     Shutting down
[32mINFO[0m:     Waiting for application shutdown.
[32mINFO[0m:     Application shutdown complete.
[32mINFO[0m:     Finished server process [[36m14668[0m]
[32mINFO[0m:     Stopping reloader process [[36m[1m14666[0m]
