In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE, SelectKBest, f_classif
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

In [3]:
# Read the breast-cancer CSV into `df` and preview its first rows
with open('breast-cancer.csv', mode='r') as csv_handle:
    df = pd.read_csv(csv_handle)
df.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [4]:
# Check the Dataset Shape
# Displays the (rows, columns) tuple of the frame loaded from the CSV.
df.shape

(569, 32)

In [5]:
# Get the information about the Data
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 32 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       569 non-null    int64  
 1   diagnosis                569 non-null    object 
 2   radius_mean              569 non-null    float64
 3   texture_mean             569 non-null    float64
 4   perimeter_mean           569 non-null    float64
 5   area_mean                569 non-null    float64
 6   smoothness_mean          569 non-null    float64
 7   compactness_mean         569 non-null    float64
 8   concavity_mean           569 non-null    float64
 9   concave points_mean      569 non-null    float64
 10  symmetry_mean            569 non-null    float64
 11  fractal_dimension_mean   569 non-null    float64
 12  radius_se                569 non-null    float64
 13  texture_se               569 non-null    float64
 14  perimeter_se             5

In [6]:
# Remove the 'id' column from the frame
df = df.drop(columns=['id'])

In [7]:
# Preview five randomly chosen rows
df.sample(n=5)

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
125,B,13.85,17.21,88.44,588.7,0.08785,0.06136,0.0142,0.01141,0.1614,...,15.49,23.58,100.3,725.9,0.1157,0.135,0.08115,0.05104,0.2364,0.07182
529,B,12.07,13.44,77.83,445.2,0.11,0.09009,0.03781,0.02798,0.1657,...,13.45,15.77,86.92,549.9,0.1521,0.1632,0.1622,0.07393,0.2781,0.08052
284,B,12.89,15.7,84.08,516.6,0.07818,0.0958,0.1115,0.0339,0.1432,...,13.9,19.69,92.12,595.6,0.09926,0.2317,0.3344,0.1017,0.1999,0.07127
476,B,14.2,20.53,92.41,618.4,0.08931,0.1108,0.05063,0.03058,0.1506,...,16.45,27.26,112.1,828.5,0.1153,0.3429,0.2512,0.1339,0.2534,0.07858
466,B,13.14,20.74,85.98,536.9,0.08675,0.1089,0.1085,0.0351,0.1562,...,14.8,25.46,100.9,689.1,0.1351,0.3549,0.4504,0.1181,0.2563,0.08174


In [8]:
# Shape check after the 'id' column was removed
df.shape

(569, 31)

In [9]:
# Encode the 'diagnosis' column as integers: 'B' -> 0, 'M' -> 1
diagnosis_codes = {'B': 0, 'M': 1}
# Only map while the column still holds the raw letter labels. Re-running this cell
# on already-encoded 0/1 values would otherwise turn every entry into NaN.
if not pd.api.types.is_numeric_dtype(df['diagnosis']):
    diagnosis_encoded = df['diagnosis'].map(diagnosis_codes)
    # .map() yields NaN for labels missing from the mapping; fail loudly instead of
    # passing a partially-missing target on to the models.
    if diagnosis_encoded.isna().any():
        unexpected = sorted(set(df['diagnosis'].dropna().unique()) - set(diagnosis_codes))
        raise ValueError(f"Unexpected or missing diagnosis labels: {unexpected}")
    df['diagnosis'] = diagnosis_encoded.astype(int)
df.sample(5)

Unnamed: 0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
273,0,9.742,15.67,61.5,289.9,0.09037,0.04689,0.01103,0.01407,0.2081,...,10.75,20.88,68.09,355.2,0.1467,0.0937,0.04043,0.05159,0.2841,0.08175
296,0,10.91,12.35,69.14,363.7,0.08518,0.04721,0.01236,0.01369,0.1449,...,11.37,14.82,72.42,392.2,0.09312,0.07506,0.02884,0.03194,0.2143,0.06643
285,0,12.58,18.4,79.83,489.0,0.08393,0.04216,0.00186,0.002924,0.1697,...,13.5,23.08,85.56,564.1,0.1038,0.06624,0.005579,0.008772,0.2505,0.06431
479,1,16.25,19.51,109.8,815.8,0.1026,0.1893,0.2236,0.09194,0.2151,...,17.39,23.05,122.1,939.7,0.1377,0.4462,0.5897,0.1775,0.3318,0.09136
94,1,15.06,19.83,100.3,705.6,0.1039,0.1553,0.17,0.08815,0.1855,...,18.23,24.23,123.5,1025.0,0.1551,0.4203,0.5203,0.2115,0.2834,0.08234


In [10]:
# Separate the target column (Y) from the remaining feature columns (X)
Y = df['diagnosis']
X = df.drop(columns=['diagnosis'])

In [11]:
# Count missing entries in each feature column
X.isna().sum()

radius_mean                0
texture_mean               0
perimeter_mean             0
area_mean                  0
smoothness_mean            0
compactness_mean           0
concavity_mean             0
concave points_mean        0
symmetry_mean              0
fractal_dimension_mean     0
radius_se                  0
texture_se                 0
perimeter_se               0
area_se                    0
smoothness_se              0
compactness_se             0
concavity_se               0
concave points_se          0
symmetry_se                0
fractal_dimension_se       0
radius_worst               0
texture_worst              0
perimeter_worst            0
area_worst                 0
smoothness_worst           0
compactness_worst          0
concavity_worst            0
concave points_worst       0
symmetry_worst             0
fractal_dimension_worst    0
dtype: int64

In [12]:
# Count missing entries in the target column
Y.isna().sum()

0

In [13]:
# There are no null values here. If there were, SimpleImputer could replace them with the column means:
# imputer = SimpleImputer(strategy='mean')
# X_imputed = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)

In [14]:
# Standardize the features
# NOTE(review): the scaler is fitted on the full feature matrix X, i.e. before any
# train/test split, so rows that later end up in the test set contribute to the
# fitted statistics. Fitting on the training rows only would avoid that leakage.
scaler = StandardScaler()
# Wrap the scaled array in a DataFrame again so the original column names are kept
x_scaled = pd.DataFrame(scaler.fit_transform(X), columns =X.columns)

In [15]:
# Split the data into train and test, then standardize with training statistics only.
# Fitting the scaler before the split would let the held-out rows influence the
# statistics used for scaling (data leakage) and bias the reported test scores.
# The split depends only on the number of rows and random_state, so splitting the
# raw X selects exactly the same rows as splitting a pre-scaled copy would.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Fit on the training rows, then apply the same transformation to both splits.
scaler = StandardScaler().fit(X_train_raw)
X_train = pd.DataFrame(scaler.transform(X_train_raw), columns=X.columns, index=X_train_raw.index)
X_test = pd.DataFrame(scaler.transform(X_test_raw), columns=X.columns, index=X_test_raw.index)

In [16]:
# Feature Selection Methods
# 1. Filter Method - SelectKBest with ANOVA
selector_anova = SelectKBest(score_func=f_classif, k=10)
# Score the features on the training split only and keep the 10 best.
X_train_anova = selector_anova.fit_transform(X_train, y_train)
# Re-use the already fitted selector on the test split. Calling fit_transform here
# would re-score the features using the test labels, which leaks the target and can
# select different columns than the ones chosen for the training data.
X_test_anova = selector_anova.transform(X_test)

In [17]:
# 2. Wrapper Method - Recursive Feature Elimination (RFE) with RandomForestClassifier
# RFE refits the forest repeatedly, dropping the least important feature each round
# until 10 remain. The forest is seeded so the selection is reproducible, and the
# selector only ever sees the training split.
selector_rfe = RFE(
    estimator=RandomForestClassifier(random_state=42),
    n_features_to_select=10,
)
selector_rfe.fit(X_train, y_train)
# Importances of the final forest: one value per retained feature, in column order.
feature_importances_rfe = selector_rfe.estimator_.feature_importances_
# support_ is a boolean mask over the input columns marking the retained features.
selected_features_rfe = X_train.columns[selector_rfe.support_]
X_train_rfe = X_train[selected_features_rfe]
X_test_rfe = X_test[selected_features_rfe]

In [18]:
# 3. Embedded Method - Feature Importance from RandomForestClassifier
# Seed the forest so the importance ranking (and therefore the selected columns)
# is the same on every run.
selector_rf_importance = RandomForestClassifier(random_state=42)
selector_rf_importance.fit(X_train, y_train)
# Label the importances with the columns of the frame the model was fitted on.
feature_importance = pd.Series(selector_rf_importance.feature_importances_, index=X_train.columns)
# Keep the 10 features with the highest importance.
selected_features_rf = feature_importance.nlargest(10).index
X_train_rf_importance = X_train[selected_features_rf]
X_test_rf_importance = X_test[selected_features_rf]

In [19]:
# Candidate classifiers, keyed by the display name used when reporting results.
# The random forest gets a fixed seed so its metrics (and the model that is saved
# later) are reproducible across reruns.
classifiers = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Support Vector Machine': SVC()
}

In [20]:
# Fit each candidate on the training split, keep the fitted estimator, and record
# its accuracy and classification report on the test split.
results = {}
trained_models = {}
for model_name, estimator in classifiers.items():
    estimator.fit(X_train, y_train)
    trained_models[model_name] = estimator
    y_pred = estimator.predict(X_test)
    results[model_name] = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'Classification Report': classification_report(y_test, y_pred),
    }

In [21]:
# Print the stored metrics for every model, separated by a dashed rule
divider = "-" * 50
for name in results:
    result = results[name]
    print(f"Results for {name}:")
    print(f"Accuracy: {result['Accuracy']:.4f}")
    print("Classification Report:\n", result['Classification Report'])
    print(divider)

Results for Logistic Regression:
Accuracy: 0.9737
Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.99      0.98        71
           1       0.98      0.95      0.96        43

    accuracy                           0.97       114
   macro avg       0.97      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114

--------------------------------------------------
Results for Random Forest:
Accuracy: 0.9649
Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.99      0.97        71
           1       0.98      0.93      0.95        43

    accuracy                           0.96       114
   macro avg       0.97      0.96      0.96       114
weighted avg       0.97      0.96      0.96       114

--------------------------------------------------
Results for Support Vector Machine:
Accuracy: 0.9737
Classification Report:
               precision  

In [22]:
import pickle

# Serialize the dict of fitted models to disk
with open('trained_models.pkl', mode='wb') as model_file:
    pickle.dump(trained_models, model_file)