In [None]:
import joblib
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [None]:
# Read the WDBC table from the UCI repository.
# Column labels are supplied explicitly through `names`, so pandas treats the
# first row of the file as data. The layout is: an id, the diagnosis label,
# then the same ten measurements reported as mean, standard error and worst.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data"

_id_and_label_columns = ["id_number", "diagnosis"]
_mean_columns = [
    "radius_mean", "texture_mean", "perimeter_mean", "area_mean", "smoothness_mean",
    "compactness_mean", "concavity_mean", "concave_points_mean", "symmetry_mean",
    "fractal_dimension_mean",
]
_se_columns = [
    "radius_se", "texture_se", "perimeter_se", "area_se", "smoothness_se",
    "compactness_se", "concavity_se", "concave_points_se", "symmetry_se",
    "fractal_dimension_se",
]
_worst_columns = [
    "radius_worst", "texture_worst", "perimeter_worst", "area_worst", "smoothness_worst",
    "compactness_worst", "concavity_worst", "concave_points_worst", "symmetry_worst",
    "fractal_dimension_worst",
]
names = _id_and_label_columns + _mean_columns + _se_columns + _worst_columns

data = pd.read_csv(url, names=names)

In [None]:
# Preprocessing
# Drop the id_number column as it's not relevant for prediction.
# The frame is rebound rather than mutated with inplace=True, and
# errors="ignore" makes a second run of this cell a no-op instead of a
# KeyError once the column is already gone.
data = data.drop(columns=["id_number"], errors="ignore")

In [None]:
# Encode diagnosis (M: Malignant, B: Benign) to numerical values (0: Benign, 1: Malignant).
DIAGNOSIS_CODES = {"M": 1, "B": 0}

# Values that are not keys of the mapping are passed through unchanged, so
# re-running this cell on an already-encoded column leaves the 0/1 values
# intact instead of turning them into NaN (which a plain dict .map() would do).
_encoded_diagnosis = data["diagnosis"].map(
    lambda label: DIAGNOSIS_CODES.get(label, label)
)

# Fail loudly on anything that is neither a known letter nor an existing code,
# rather than letting a bad label travel silently into the target vector.
_unexpected_labels = set(_encoded_diagnosis) - set(DIAGNOSIS_CODES.values())
if _unexpected_labels:
    raise ValueError(
        "Unexpected diagnosis labels: "
        + ", ".join(sorted(str(label) for label in _unexpected_labels))
    )

data["diagnosis"] = _encoded_diagnosis

In [None]:
# Split the frame into the target vector (y) and the feature matrix (X):
# y is the diagnosis column, X is every remaining column.
target_column = "diagnosis"
y = data[target_column]
X = data.drop(columns=[target_column])

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the partition
# repeatable between runs.
# NOTE(review): the split is not stratified on y — confirm that is intended.
split_options = {"test_size": 0.2, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [None]:
# Feature scaling.
# The scaler is fitted on the training split only and then applied to both
# splits, so no statistics from the test rows enter the transformation.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled, X_test_scaled = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
# Feature selection.
# Keep the 10 features ranked highest by f_classif, with the ranking computed
# from the scaled training split and the same column subset applied to the
# scaled test split.
selector = SelectKBest(score_func=f_classif, k=10)
selector.fit(X_train_scaled, y_train)
X_train_selected, X_test_selected = (
    selector.transform(X_train_scaled),
    selector.transform(X_test_scaled),
)

In [None]:
# Build the Random Forest classifier with a fixed seed, then train it on the
# selected training features.
forest_options = {"random_state": 42}
rf_model = RandomForestClassifier(**forest_options)
rf_model.fit(X=X_train_selected, y=y_train)

In [None]:
# Predict labels for both splits with the fitted model (training first, then test).
y_pred_train, y_pred_test = (
    rf_model.predict(features)
    for features in (X_train_selected, X_test_selected)
)

In [None]:
# Model evaluation: share of correctly predicted labels on each split.
train_accuracy, test_accuracy = (
    accuracy_score(y_true=y_train, y_pred=y_pred_train),
    accuracy_score(y_true=y_test, y_pred=y_pred_test),
)

In [None]:
# Report both accuracies, one per line, training first.
for split_label, split_accuracy in (
    ("Training Accuracy:", train_accuracy),
    ("Testing Accuracy:", test_accuracy),
):
    print(split_label, split_accuracy)

In [3]:
# Classification report for the held-out test split, printed under a heading
# that is preceded by a blank line.
report_text = classification_report(y_test, y_pred_test)
print("\nClassification Report:", report_text, sep="\n")

Training Accuracy: 0.9978021978021978
Testing Accuracy: 0.956140350877193

Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.97      0.97        71
           1       0.95      0.93      0.94        43

    accuracy                           0.96       114
   macro avg       0.96      0.95      0.95       114
weighted avg       0.96      0.96      0.96       114



In [5]:
# Persist the trained classifier to disk (joblib is imported in the notebook's
# top import cell so that all dependencies are declared in one place).
# NOTE(review): only rf_model is written here. It was fitted on
# X_train_selected, i.e. on features that had already passed through `scaler`
# and `selector`; neither of those fitted objects is saved by this cell, so
# whoever loads this file must reproduce that preprocessing — confirm this is
# intended.
joblib.dump(rf_model, "model_orignal.sav")

['model_orignal.sav']