# Breast Cancer Prediction


In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import RobustScaler  # Scaling with robustness to outliers

In [None]:
# Load the dataset
data_path = "/content/BreastCancer.csv"  # Update with your CSV file's path
df = pd.read_csv(data_path)

In [None]:
# Inspecting the data
print("Columns in the dataset:", df.columns)

Columns in the dataset: Index(['id', 'diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean',
       'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
       'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
       'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
       'fractal_dimension_se', 'radius_worst', 'texture_worst',
       'perimeter_worst', 'area_worst', 'smoothness_worst',
       'compactness_worst', 'concavity_worst', 'concave points_worst',
       'symmetry_worst', 'fractal_dimension_worst', 'Unnamed: 32'],
      dtype='object')


In [None]:
# Ensure the target column is properly named and available
target_column = "diagnosis"
if target_column in df.columns:
    print(f"'{target_column}' column found. Proceeding with data preprocessing.")
else:
    raise KeyError(f"'{target_column}' column not found. Available columns are: {df.columns}")

# Drop rows with missing values in target or features
df.dropna(subset=[target_column], inplace=True)

'diagnosis' column found. Proceeding with data preprocessing.


In [None]:
# Prepare features (X) and target (y)
X = df.drop(columns=[target_column, "Unnamed: 32"], errors="ignore")
y = df[target_column].map({"M": 1, "B": 0})

In [None]:
# Confirm data integrity after preprocessing
print("Number of rows before dropping NaNs:", len(df))
print("Number of rows after dropping NaNs:", len(df))
print("Shape of X (features):", X.shape)
print("Shape of y (target):", y.shape)

Number of rows before dropping NaNs: 569
Number of rows after dropping NaNs: 569
Shape of X (features): (569, 31)
Shape of y (target): (569,)


In [None]:
# Split data into train-test splits
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the features using RobustScaler
scaler = RobustScaler()
X_train = scaler.fit_transform(X_train)  # Fit scaler on training data
X_test = scaler.transform(X_test)  # Apply scaling to testing data

In [None]:
# Train a Random Forest Classifier
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Make predictions
y_pred = rf_model.predict(X_test)

# Evaluate the model performance
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

In [None]:
# Print performance metrics
print("Model Performance Metrics after robust scaling:")
print(f"Accuracy: {accuracy:.2f}")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

Model Performance Metrics after robust scaling:
Accuracy: 0.96
Precision: 0.98
Recall: 0.93
F1 Score: 0.95


In [5]:
import pandas as pd
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

In [6]:
# Load the data
data = pd.read_csv('/content/BreastCancer.csv')

# Preprocess the data
data.drop(['id', 'Unnamed: 32'], axis=1, inplace=True)
le = LabelEncoder()
data['diagnosis'] = le.fit_transform(data['diagnosis'])  # Encode 'Benign' as 0, 'Malignant' as 1

In [7]:
# Prepare features and target
X = data.iloc[:, 1:31].values  # Feature columns (30 features)
y = data['diagnosis'].values  # Target column

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Train the SVM Model
svm_model = SVC(probability=True, kernel='rbf', C=1, gamma='auto')
svm_model.fit(X_train_scaled, y_train)

In [9]:
def predict_cancer():
    print("\nPlease enter the following details:")
    try:
        # Collect user input
        mean_radius = float(input("Mean Radius value: "))
        mean_texture = float(input("Mean Texture value: "))
        mean_perimeter = float(input("Mean Perimeter value: "))
        mean_area = float(input("Mean Area value: "))
        mean_smoothness = float(input("Mean Smoothness value: "))
        mean_compactness = float(input("Mean Compactness value: "))
        mean_concavity = float(input("Mean Concavity value: "))
        mean_concave_points = float(input("Mean Concave Points value: "))
        symmetry = float(input("Mean Symmetry value: "))
        fractal_dimension = float(input("Fractal Dimension value: "))

        # Create a full input array (fill missing features with zeros)
        user_input = [
            mean_radius, mean_texture, mean_perimeter, mean_area,
            mean_smoothness, mean_compactness, mean_concavity,
            mean_concave_points, symmetry, fractal_dimension
        ]
        # Fill the remaining 20 features with zeros
        user_input_full = user_input + [0] * 20

        # Reshape and scale user input
        user_input_scaled = scaler.transform([user_input_full])

        # Make prediction
        prediction = svm_model.predict(user_input_scaled)

        if prediction[0] == 1:  # Malignant case
            print("\nPrediction: Malignant (Cancer Detected)")
            print("Advice: You must consult a doctor and take immediate action.")
        else:  # Benign case
            print("\nPrediction: Benign (No Cancer Detected)")
            print("Advice: Routine health check-ups are sufficient.")

    except Exception as e:
        print(f"Error: {e}. Please ensure to input valid numbers.")


In [11]:
# Call prediction function
predict_cancer()


Please enter the following details:
Mean Radius value: 12.5
Mean Texture value: 20.0
Mean Perimeter value: 90.0
Mean Area value: 600.0
Mean Smoothness value: 0.12
Mean Compactness value: 0.2
Mean Concavity value: 0.3
Mean Concave Points value: 0.15
Mean Symmetry value: 0.1
Fractal Dimension value: 0.06

Prediction: Malignant (Cancer Detected)
Advice: You must consult a doctor and take immediate action.
