***Step 1: Load and Inspect the Data***

In [None]:
# Import necessary libraries
import pandas as pd

# Read the breast-cancer CSV into a DataFrame
DATA_PATH = '/content/breastcancer.csv'
data = pd.read_csv(DATA_PATH)

# Preview the first rows to see which columns are available and how they look
data.head()


Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


***Step 2: Data Exploration***

In [None]:
# Count the missing values in every column
missing_values = data.isnull().sum()
print("Missing values in each column:\n", missing_values)

# Class balance of the target variable
target_distribution = data['diagnosis'].value_counts()
print("\nDistribution of the target variable:\n", target_distribution)

# Summary statistics to understand the dataset better.
# Kept as the final expression on purpose: a notebook only renders the value of
# a cell's last expression, so a bare describe() call in the middle of the cell
# is computed and then silently discarded.
data.describe()

Missing values in each column:
 id                         0
diagnosis                  0
radius_mean                0
texture_mean               0
perimeter_mean             0
area_mean                  0
smoothness_mean            0
compactness_mean           0
concavity_mean             0
concave points_mean        0
symmetry_mean              0
fractal_dimension_mean     0
radius_se                  0
texture_se                 0
perimeter_se               0
area_se                    0
smoothness_se              0
compactness_se             0
concavity_se               0
concave points_se          0
symmetry_se                0
fractal_dimension_se       0
radius_worst               0
texture_worst              0
perimeter_worst            0
area_worst                 0
smoothness_worst           0
compactness_worst          0
concavity_worst            0
concave points_worst       0
symmetry_worst             0
fractal_dimension_worst    0
dtype: int64

Distribution of the target

***Step 3: Data Preprocessing Steps***

In [None]:
# Encode the target variable 'diagnosis' as integers: 'M' -> 1, 'B' -> 0.
# The column is overwritten in place, so this cell must be safe to run twice:
# pushing already-encoded 0/1 values through the mapping again would turn every
# label into NaN. Skip the mapping when the column is already encoded, and fail
# loudly on labels the mapping does not know instead of producing NaN targets.
diagnosis_codes = {'M': 1, 'B': 0}
if not data['diagnosis'].isin(list(diagnosis_codes.values())).all():
    unexpected_labels = set(data['diagnosis'].unique()) - set(diagnosis_codes)
    if unexpected_labels:
        raise ValueError(
            f"Unexpected diagnosis labels: {sorted(unexpected_labels, key=str)}"
        )
    data['diagnosis'] = data['diagnosis'].map(diagnosis_codes)

In [None]:
# Target vector: the encoded 'diagnosis' column
y = data['diagnosis']

# Feature matrix: everything except 'id' (not useful for prediction) and the
# 'diagnosis' target itself
X = data.drop(columns=['id', 'diagnosis'])

In [None]:
# Standardize the feature variables
from sklearn.preprocessing import StandardScaler

# Fit the scaler on X, then apply it to the same rows (equivalent to a single
# fit_transform call). `scaler` is kept around because later cells pickle it
# and reuse it to transform new inputs.
# NOTE(review): the scaler is fit on every row of X, including rows that the
# later train/test split holds out for testing, so test-set statistics leak
# into preprocessing. Consider fitting on the training rows only.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

***Step 4: Model Training and Evaluation***

In [None]:
# Train-test split (80% train / 20% test, fixed seed so the split is repeatable)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Importing models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report


def _fit_and_report(name, estimator):
    """Fit `estimator` on the training split and report its test performance.

    Prints the accuracy and the full classification report for the test split.

    Args:
        name: Human-readable model name used as the prefix of the accuracy line.
        estimator: Unfitted scikit-learn classifier; it is fitted in place.

    Returns:
        The predictions of the fitted estimator for `X_test`.
    """
    estimator.fit(X_train, y_train)
    predictions = estimator.predict(X_test)
    print(f"{name} Accuracy: {accuracy_score(y_test, predictions)}")
    print(classification_report(y_test, predictions))
    return predictions


# Logistic Regression Model
log_reg = LogisticRegression()
y_pred_log = _fit_and_report("Logistic Regression", log_reg)

# Decision Tree Model
# Seeded: tree construction is randomized, so without a fixed random_state the
# reported accuracy can change from one run of the notebook to the next.
tree = DecisionTreeClassifier(random_state=42)
y_pred_tree = _fit_and_report("Decision Tree", tree)

# Support Vector Machine Model
svc = SVC()
y_pred_svc = _fit_and_report("SVM", svc)

Logistic Regression Accuracy: 0.9736842105263158
              precision    recall  f1-score   support

           0       0.97      0.99      0.98        71
           1       0.98      0.95      0.96        43

    accuracy                           0.97       114
   macro avg       0.97      0.97      0.97       114
weighted avg       0.97      0.97      0.97       114

Decision Tree Accuracy: 0.9385964912280702
              precision    recall  f1-score   support

           0       0.94      0.96      0.95        71
           1       0.93      0.91      0.92        43

    accuracy                           0.94       114
   macro avg       0.94      0.93      0.93       114
weighted avg       0.94      0.94      0.94       114

SVM Accuracy: 0.9736842105263158
              precision    recall  f1-score   support

           0       0.97      0.99      0.98        71
           1       0.98      0.95      0.96        43

    accuracy                           0.97       114
   

***Step 5: Save the Best Model***

In [None]:
import pickle

# Pick the model to persist. The original cell pickled a variable named `model`
# that no cell in this notebook assigns, so it failed with a NameError on a
# fresh kernel. Select the estimator with the highest test accuracy instead;
# on a tie, max() keeps the first candidate listed.
candidates = {
    'Logistic Regression': (log_reg, y_pred_log),
    'Decision Tree': (tree, y_pred_tree),
    'SVM': (svc, y_pred_svc),
}
best_name = max(candidates, key=lambda name: accuracy_score(y_test, candidates[name][1]))
model = candidates[best_name][0]
print(f"Best model on the test split: {best_name}")

# Save the trained model to a file
model_filename = 'breast_cancer_model.pkl'
with open(model_filename, 'wb') as file:
    pickle.dump(model, file)

# Save the scaler to a file so new inputs can be standardized the same way
scaler_filename = 'scaler.pkl'
with open(scaler_filename, 'wb') as file:
    pickle.dump(scaler, file)

# Report where the artifacts were written (paths are relative to the working directory)
print(f"Model saved as {model_filename}")
print(f"Scaler saved as {scaler_filename}")

Model saved as breast_cancer_model.pkl
Scaler saved as scaler.pkl


***Step 6: Prediction Function***

In [None]:
import pickle

import numpy as np

# Load the saved model for prediction.
# The save step writes the model under a relative file name, so this absolute
# path only resolves when the notebook's working directory is /content.
# pickle.load can execute arbitrary code: only load files this notebook wrote.
model_path = '/content/breast_cancer_model.pkl'
with open(model_path, 'rb') as model_file:
    loaded_model = pickle.load(model_file)


# Function to make predictions
def predict_breast_cancer(input_data):
    """Classify a single sample as 'Malignant' or 'Benign'.

    Args:
        input_data: Sequence of numeric feature values for one sample, one
            value per feature column the scaler was fit on, in the same order.

    Returns:
        'Malignant' if the model predicts class 1, otherwise 'Benign'.

    Raises:
        ValueError: If the number of values does not match the number of
            features the scaler was fit on.
    """
    # Shape the sample as a single-row 2-D array, as the scaler/model expect
    input_array = np.asarray(input_data, dtype=float).reshape(1, -1)

    # The scaler was fit on a DataFrame; when it recorded the column names,
    # pass them back so transform() does not warn about missing feature names.
    feature_names = getattr(scaler, 'feature_names_in_', None)
    if feature_names is not None:
        input_array = pd.DataFrame(input_array, columns=feature_names)

    # Standardize the input data using the same scaler used during training
    input_scaled = scaler.transform(input_array)

    # predict() returns an array with one label per row; compare the single
    # element rather than the whole array.
    prediction = loaded_model.predict(input_scaled)
    return 'Malignant' if prediction[0] == 1 else 'Benign'


# Example prediction
sample_input = [17.99, 10.38, 122.8, 1001, 0.1184, 0.2776, 0.3001, 0.1471, 0.2419, 0.07871,
                1.095, 0.9053, 8.589, 153.4, 0.006399, 0.04904, 0.05373, 0.01587, 0.03003,
                0.006193, 25.38, 17.33, 184.6, 2019, 0.1622, 0.6656, 0.7119, 0.2654, 0.4601, 0.1189]
print(f"Prediction for the sample input: {predict_breast_cancer(sample_input)}")


Prediction for the sample input: Benign


