In [26]:
import pandas as pd
import sklearn
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
import pickle as pk

In [27]:
def data_cleaner(path="../data/data.csv"):
    """Load the raw CSV and remove the columns that are not used downstream.

    Parameters
    ----------
    path : str or path-like, default "../data/data.csv"
        Location of the comma-separated input file. The default keeps the
        behaviour of calling ``data_cleaner()`` with no arguments unchanged.

    Returns
    -------
    pandas.DataFrame
        The file's contents without the ``id`` and ``Unnamed: 32`` columns.

    Raises
    ------
    KeyError
        If either of the two dropped columns is absent from the file.
    """
    data = pd.read_csv(path, sep=",")

    # Return a new frame instead of mutating in place, so re-running the cell
    # is idempotent.
    return data.drop(columns=["id", "Unnamed: 32"])

In [28]:
def create_model(data):
    """Train and evaluate a logistic-regression classifier on ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``diagnosis`` column whose values are ``"M"`` or
        ``"B"``; every other column is used as a feature.

    Returns
    -------
    model : LogisticRegression
        Classifier fitted on the scaled training split.
    scaler : StandardScaler
        Scaler fitted on the training split only.
    colonnes : list of tuple
        One ``(display_label, column_name, min, mean, max)`` tuple per feature
        column, with the statistics computed on the raw (unscaled) values of
        the full dataset.

    Notes
    -----
    Prints the accuracy score and classification report for the held-out
    20 % test split.
    """

    # Data transform: "M" -> 1, "B" -> 0
    X = data.drop("diagnosis", axis=1)
    y = data["diagnosis"].map({"M": 1, "B": 0})

    # Display metadata per feature, e.g. "radius_mean" -> "Radius (mean)".
    colonnes = [
        (
            colonne.replace("_mean", " (mean)")
            .replace("_se", " (se)")
            .replace("_worst", " (worst)")
            .capitalize(),
            colonne,
            X[colonne].min(),
            X[colonne].mean(),
            X[colonne].max(),
        )
        for colonne in X.columns
    ]

    # Data split -- done BEFORE scaling so that test rows never influence the
    # scaler's mean/variance (avoids train/test leakage).
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Data scaling: fit on the training rows only, then apply to both splits.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Model creation
    lr = LogisticRegression()
    model = lr.fit(X_train, y_train)

    # Model Evaluation
    y_pred = model.predict(X_test)
    print(f"Accuracy Score: {accuracy_score(y_test, y_pred)}")
    print(f"Classification report : \n {classification_report(y_test, y_pred)}")

    return model, scaler, colonnes
    

In [29]:
def main():
    """Run the full pipeline and pickle the resulting artifacts.

    Loads the cleaned dataset, trains the model, then writes the model, the
    scaler and the feature labels to ``model.pkl``, ``scaler.pkl`` and
    ``../data/labels.pkl`` respectively.
    """
    cleaned = data_cleaner()
    model, scaler, labels = create_model(cleaned)

    # (destination, object) pairs, written in this order.
    artifacts = (
        ("model.pkl", model),
        ("scaler.pkl", scaler),
        ("../data/labels.pkl", labels),
    )
    for destination, artifact in artifacts:
        with open(destination, "wb") as handle:
            pk.dump(artifact, handle)
    

In [30]:
# Entry point: runs the pipeline when this code is executed directly (as a
# script or as a notebook cell) rather than imported as a module.
if __name__ == "__main__":
    main()

Accuracy Score: 0.9736842105263158
Classification report : 
               precision    recall  f1-score   support

           0       0.97      0.99      0.98        71
           1       0.98      0.95      0.96        43

    accuracy                           0.97       114
   macro avg       0.97      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114

