<a href="https://colab.research.google.com/github/sridevi2207/Github1/blob/main/Random_forest_classification_and_regression_for_breast_cancer_diagnosis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Clone the project repository into ./miniproject1 (IPython shell escape).
# NOTE(review): later cells read 'data.csv' from the current working directory,
# not from inside the cloned folder — confirm where that file actually comes from.
!git clone https://github.com/sridevi2207/miniproject1.git

Cloning into 'miniproject1'...


In [None]:
# Importing necessary libraries
import pandas as pd
import matplotlib.pyplot as plt


In [None]:
# Load the raw breast-cancer table from the CSV in the working directory
df = pd.read_csv(filepath_or_buffer='data.csv')


In [None]:
# Dropping unnecessary column.
# errors='ignore' keeps this cell re-runnable: once the column is gone, a second
# run is a no-op instead of raising KeyError. `axis` is not needed when
# `columns=` is given.
df = df.drop(columns='Unnamed: 32', errors='ignore')


In [None]:
# Mapping 'diagnosis' column: M -> 1 (Malignant), B -> 0 (Benign).
# The identity entries (1 -> 1, 0 -> 0) make the cell idempotent: Series.map
# turns every value missing from the dict into NaN, so without them a second
# run of this cell would replace the already-encoded target with all NaN.
df['diagnosis'] = df['diagnosis'].map({'M': 1, 'B': 0, 1: 1, 0: 0})



In [None]:
# Dropping 'id' column as it's not useful for prediction.
# errors='ignore' makes a repeated run a no-op instead of a KeyError;
# `axis` is not needed when `columns=` is given.
df = df.drop(columns='id', errors='ignore')

In [None]:
# Split the feature columns into three positional groups of ten.
# Column 0 is skipped by the first slice; the slices are purely positional,
# so they depend on the column order of df at this point.
mean_features = df.columns[1:11].tolist()
se_features = df.columns[11:21].tolist()
worst_features = df.columns[21:31].tolist()


In [None]:
# Adding 'diagnosis' column to each list for correlation analysis.
# Guarded with a membership check so that re-running this cell does not append
# 'diagnosis' a second time (which would select the column twice in
# df[<list>] and duplicate it in the correlation matrix).
for feature_group in (mean_features, se_features, worst_features):
    if 'diagnosis' not in feature_group:
        feature_group.append('diagnosis')


In [None]:
# Pairwise correlation between the columns listed in worst_features
worst_subset = df[worst_features]
corr = worst_subset.corr()
# Bare expression so the notebook renders the matrix as a table
corr


Unnamed: 0,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,diagnosis
radius_worst,1.0,0.359921,0.993708,0.984015,0.216574,0.47582,0.573975,0.787424,0.243529,0.093492,0.776454
texture_worst,0.359921,1.0,0.365098,0.345842,0.225429,0.360832,0.368366,0.359755,0.233027,0.219122,0.456903
perimeter_worst,0.993708,0.365098,1.0,0.977578,0.236775,0.529408,0.618344,0.816322,0.269493,0.138957,0.782914
area_worst,0.984015,0.345842,0.977578,1.0,0.209145,0.438296,0.543331,0.747419,0.209146,0.079647,0.733825
smoothness_worst,0.216574,0.225429,0.236775,0.209145,1.0,0.568187,0.518523,0.547691,0.493838,0.617624,0.421465
compactness_worst,0.47582,0.360832,0.529408,0.438296,0.568187,1.0,0.892261,0.80108,0.614441,0.810455,0.590998
concavity_worst,0.573975,0.368366,0.618344,0.543331,0.518523,0.892261,1.0,0.855434,0.53252,0.686511,0.65961
concave points_worst,0.787424,0.359755,0.816322,0.747419,0.547691,0.80108,0.855434,1.0,0.502528,0.511114,0.793566
symmetry_worst,0.243529,0.233027,0.269493,0.209146,0.493838,0.614441,0.53252,0.502528,1.0,0.537848,0.416294
fractal_dimension_worst,0.093492,0.219122,0.138957,0.079647,0.617624,0.810455,0.686511,0.511114,0.537848,1.0,0.323872


In [None]:
# Hand-picked feature subset used as model inputs (12 columns)
predicted_vars = [
    # "mean" measurements
    'radius_mean',
    'area_mean',
    'compactness_mean',
    'concavity_mean',
    'concave points_mean',
    # "se" measurements
    'radius_se',
    'perimeter_se',
    'area_se',
    # "worst" measurements
    'radius_worst',
    'area_worst',
    'compactness_worst',
    'concavity_worst',
]


In [None]:
# Importing necessary libraries for model evaluation
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, precision_score, recall_score, accuracy_score, f1_score
from sklearn.model_selection import cross_val_score


In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
split_frames = train_test_split(
    df,
    test_size=0.2,
    random_state=1,
)
train, test = split_frames


In [None]:
# Build the input matrices (selected features) and target vectors ('diagnosis')
# for both partitions
train_x, train_y = train[predicted_vars], train['diagnosis']
test_x, test_y = test[predicted_vars], test['diagnosis']


In [None]:
# Importing RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier

# Initializing and training the model.
# random_state is fixed so the forest's internal randomness is repeatable and
# the metrics reported below can be reproduced on a fresh Restart & Run All.
model = RandomForestClassifier(random_state=1)
model.fit(train_x, train_y)


In [None]:
# Predict the encoded diagnosis (1 = Malignant, 0 = Benign) for every held-out row
predictions = model.predict(X=test_x)


In [None]:
# Evaluating model using confusion matrix.
# The matrix must be printed explicitly: only the LAST expression of a cell is
# auto-displayed, so a bare call in the middle of the cell is silently discarded.
# Rows are the actual classes, columns the predicted classes, both ordered
# [0 = Benign, 1 = Malignant].
conf_matrix = confusion_matrix(test_y, predictions)
print("Confusion matrix:")
print(conf_matrix)

# Calculating evaluation metrics
accuracy = accuracy_score(test_y, predictions)
print("The accuracy is %0.2f" % accuracy)


The accuracy is 0.95


In [None]:
# Precision / recall / F1 for the positive class (1 = Malignant)
precision = precision_score(test_y, predictions)
print("The precision is %0.2f" % precision)

recall = recall_score(test_y, predictions)
print("The recall is %0.2f" % recall)

f1 = f1_score(test_y, predictions)
print("The f1 score is %0.2f" % f1)

# Cross-validation to evaluate model stability.
# Use the same selected features the classifier above was trained and tested
# on; cross-validating on every column would score a different model than the
# one whose accuracy/precision/recall are reported here.
X = df[predicted_vars]


The precision is 0.97
The recall is 0.88
The f1 score is 0.93


In [None]:
# Target vector for cross-validation
y = df['diagnosis']

# 5-fold cross-validation of the estimator; one score per fold
scores = cross_val_score(estimator=model, X=X, y=y, cv=5)
print(scores)
print("Average Score:", scores.mean())


[0.93859649 0.94736842 0.98245614 0.98245614 0.98230088]
Average Score: 0.9666356155876417


In [None]:
#Checking with Sample datasets
# Importing numpy library
import numpy as np


In [None]:
# Five hand-written example rows, each with 12 values (one per selected feature)
sample_rows = [
    [14.2, 590.0, 0.09, 0.135, 0.06, 0.45, 1.2, 3.0, 16.0, 790.0, 0.12, 0.28],
    [13.1, 520.0, 0.08, 0.12, 0.055, 0.4, 1.0, 2.5, 15.0, 680.0, 0.11, 0.25],
    [15.6, 670.0, 0.1, 0.14, 0.065, 0.5, 1.3, 3.5, 17.5, 870.0, 0.13, 0.3],
    [12.8, 480.0, 0.085, 0.115, 0.05, 0.35, 0.9, 2.0, 14.0, 600.0, 0.1, 0.22],
    [16.3, 720.0, 0.11, 0.145, 0.07, 0.55, 1.4, 4.0, 18.0, 910.0, 0.14, 0.32],
]
sample_data = np.array(sample_rows)


In [None]:
# Converting numpy array into a pandas DataFrame with feature names
sample_df = pd.DataFrame(sample_data, columns=predicted_vars)

# Making predictions using the trained Random Forest model.
# Stored under its own name so the test-set `predictions` used by the metric
# cells above is not overwritten (re-running those cells would otherwise compare
# test_y against these 5 sample predictions).
sample_predictions = model.predict(sample_df)

# Displaying the prediction results (samples are numbered from 1)
for i, pred in enumerate(sample_predictions, start=1):
    # If pred == 1 -> Malignant, else -> Benign
    result = "Malignant" if pred == 1 else "Benign"
    print(f"Sample {i}: {result}")


Sample 1: Benign
Sample 2: Benign
Sample 3: Malignant
Sample 4: Benign
Sample 5: Malignant


In [None]:
#Predicting Severity Values for Each Data Entry

# Importing required libraries
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from google.colab import files


In [None]:
# 1-2. Re-read the raw CSV and, in the same chain, encode the diagnosis label
#      numerically (M -> 1, B -> 0). assign() replaces the existing column in place.
df = pd.read_csv('data.csv').assign(
    diagnosis=lambda frame: frame['diagnosis'].map({'M': 1, 'B': 0})
)


In [None]:
# The dataset was already read and 'diagnosis' encoded (M -> 1, B -> 0) in the
# previous cell; reading and mapping it a second time here was redundant.
# Instead, show the class counts as a sanity check: a NaN entry would mean the
# CSV contained a label other than 'M' or 'B'.
df['diagnosis'].value_counts(dropna=False)


In [None]:
# 3. Columns that feed the hand-made severity score
severity_features = [
    'radius_mean', 'area_mean', 'perimeter_mean',
    'concavity_mean', 'concave points_mean',
    'area_worst', 'radius_worst', 'perimeter_worst',
]

# 4. Min-max scale each selected column (range [0, 1])
severity_inputs = df[severity_features]
scaler = MinMaxScaler()
scaled_features = scaler.fit_transform(severity_inputs)

# 5. Severity score per row = mean of that row's scaled feature values
severity_scores = scaled_features.mean(axis=1)


In [None]:
# 6. Express the score as a percentage with two decimals
df['Severity (%)'] = (severity_scores * 100).round(2)

# 7. Add a flat 10 percentage points to every malignant row (diagnosis == 1)
is_malignant = df['diagnosis'] == 1
df.loc[is_malignant, 'Severity (%)'] += 10


In [None]:
# 8. Cap the severity at 100% (the boost above can push rows past it)
capped_severity = df['Severity (%)'].clip(upper=100)
df['Severity (%)'] = capped_severity

# 9. Write the augmented table to a new CSV, without the index column
output_csv = 'updated_data_1.csv'
df.to_csv(output_csv, index=False)

# 10. Trigger a browser download of that file (Colab helper)
files.download(output_csv)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
#Prediction of Severity Score Based on Input Data


import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score


In [None]:
# 1. Load the table that already contains the 'Severity (%)' column
df = pd.read_csv('updated_data_1.csv')

# 2. Inputs: the same columns the severity score was computed from;
#    target: the severity percentage itself
features = [
    'radius_mean', 'area_mean', 'perimeter_mean',
    'concavity_mean', 'concave points_mean',
    'area_worst', 'radius_worst', 'perimeter_worst',
]
X, y = df[features], df['Severity (%)']

# 3. 80/20 train/test split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 4. Fit a 100-tree random-forest regressor (seeded for repeatability).
#    Note: `model` now refers to this regressor, no longer the classifier.
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)



In [None]:
# 5. Score the regressor on the held-out rows
y_pred = model.predict(X_test)

# Mean absolute error (in severity percentage points) and R²
mae, r2 = mean_absolute_error(y_test, y_pred), r2_score(y_test, y_pred)

print(f" Mean Absolute Error: {mae:.2f}")
print(f" R² Score: {r2:.2f}")




 Mean Absolute Error: 1.27
 R² Score: 0.99


In [None]:
# Predict severity for some sample data
import pandas as pd
# Use the same column names as the training features (same order)
feature_names = [
    'radius_mean',
    'area_mean',
    'perimeter_mean',
    'concavity_mean',
    'concave points_mean',
    'area_worst',
    'radius_worst',
    'perimeter_worst',
]


In [None]:
# One example row, values listed in feature_names order
sample_row = [17.99, 1001.0, 122.8, 0.3001, 0.1471, 2019.0, 28.11, 184.6]
new_data_df = pd.DataFrame([sample_row], columns=feature_names)

# Predict from a named-column DataFrame (rather than a bare array) and take the
# single prediction out of the returned array
predicted_severity = model.predict(new_data_df)[0]
print(f" Predicted Severity: {predicted_severity:.2f}%")


 Predicted Severity: 63.27%
