In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import joblib

In [None]:
# Read the CSV into the `data` DataFrame; the path is relative to the
# notebook's working directory.
data = pd.read_csv(
    "../data/breast-cancer.csv",
)

In [None]:
# EDA and feature engineering
print("--------------first 5 rows--------------")
print(data.head())
print("--------------Data type of the features--------------")
# info() writes its report to stdout itself and returns None, so wrapping it
# in print() would only append a stray "None".
data.info()
print("--------------statistics of the data--------------")
print(data.describe())
print("--------------creates a heatmap to represent null values--------------")
# Called as a plain statement so the figure is drawn without printing the
# Axes object's repr.
sns.heatmap(data.isnull())

# 'Unnamed: 32' contains only NaN and 'id' is not needed as a feature, so both
# are dropped. Reassigning (instead of inplace=True) together with
# errors='ignore' keeps this cell safe to re-run after the columns are gone.
data = data.drop(columns=['Unnamed: 32', 'id'], errors='ignore')

# diagnosis is an object column; encode it as a single integer indicator.
# drop_first=True keeps only 'diagnosis_M', which later cells use as the target.
# dtype=int is explicit because pandas >= 2.0 would otherwise produce bool.
# The guard skips the encoding when the cell is re-run on already-encoded data.
if 'diagnosis' in data.columns:
    data = pd.get_dummies(data, columns=['diagnosis'], drop_first=True, dtype=int)
print("----------final version after EDA------------")
print(data)

In [None]:
# Separate the feature matrix from the target: X is every column except
# 'diagnosis_M', y is the 'diagnosis_M' column itself. `data` is left untouched.
y = data["diagnosis_M"]
X = data.drop(columns=["diagnosis_M"])

In [None]:
# NORMALIZATION - models trained with gradient descent tend to give the best
# results when the features are on a similar scale.
# Scaling example: age (0-100) and salary ($0 - $100000) have very different scales.
#   fit       - learns the mean and standard deviation of each feature from the dataset
#   transform - applies x' = (x - mu) / sigma to each feature, so every feature ends up
#               with mean around 0 and standard deviation around 1; the scales become
#               comparable while the relationships between values stay the same
#   e.g. age (18 -> -1.31), salary (30000 -> -1.16)
# NOTE(review): X_scaled is computed from statistics of all rows. If it is split
# into train/test afterwards, test-set statistics leak into training; fitting the
# scaler on the training split only avoids this.
from sklearn.preprocessing import StandardScaler

# fit() returns the fitted scaler itself, so `scaler` is ready for reuse.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [None]:
from sklearn.model_selection import train_test_split

# Split the *unscaled* features so the held-out rows never influence the
# scaling statistics. The row assignment depends only on the number of rows
# and random_state, so the same rows land in train/test as when splitting
# a pre-scaled copy of X.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Re-fit the existing scaler on the training rows only, then apply those same
# training statistics to the test rows (no fitting on test data).
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

In [None]:
from pathlib import Path

from sklearn.linear_model import LogisticRegression

# Train a logistic-regression classifier with scikit-learn's default settings.
model = LogisticRegression()
model.fit(X_train, y_train)

# Persist the fitted model. The target directory is created first so that
# joblib.dump does not fail on a fresh checkout where ../models is missing.
model_path = Path("../models/breast_cancer_prediction_model.joblib")
model_path.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(model, model_path)

# Reload from disk so the predictions below come from the saved artifact.
# NOTE(review): the model expects inputs scaled the same way as X_train;
# consider persisting the fitted scaler alongside it for later inference.
model = joblib.load(model_path)

y_pred = model.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Per-class precision, recall and f1-score for the test predictions.
report = classification_report(y_test, y_pred)
# Overall fraction of test rows predicted correctly.
accuracy = accuracy_score(y_test, y_pred)

print(report)
print(accuracy)