<a href="https://colab.research.google.com/github/pugalCse01/Classification-with-Logistic-Regression./blob/main/Breast_Cancer_Prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import necessary libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score, recall_score

In [None]:
# Read the dataset from the CSV file next to the notebook and preview it.
df = pd.read_csv("data.csv")
df.head(5)

In [None]:
# Print the frame summary: column names, non-null counts and dtypes.
df.info()

# Data Cleaning and preprocessing
**Our data is clean: there are no missing values and the features have the correct types, so data cleaning is unnecessary. The dataset is also small, so removing outliers would reduce its size further and make model training less effective.**

In [None]:
# (number of rows, number of columns) of the loaded frame.
df.shape

In [None]:
# Number of missing values in each column.
df.isna().sum()

# Drop the unnecessary columns

In [None]:
# Remove the 'Unnamed: 32' and 'id' columns before analysis.
# Rebinding `df` (instead of inplace=True) together with errors='ignore'
# keeps this cell safe to re-run: a second execution no longer raises a
# KeyError because the columns are already gone.
df = df.drop(columns=['Unnamed: 32', 'id'], errors='ignore')

# Data Visualizations

In [None]:
# Bar chart of how many samples fall into each diagnosis class.
ax = sns.countplot(data=df, x='diagnosis', palette='Set2')
ax.set_title("Count of Diagnosis (B = Benign, M = Malignant)")
ax.set_xlabel("Diagnosis")
ax.set_ylabel("Count")
plt.show()

In [None]:

# Share of each diagnosis class, in percent.
diagnosis_counts = df['diagnosis'].value_counts(normalize=True) * 100
print(diagnosis_counts)

# Derive slice labels and colours from the index of `diagnosis_counts`
# instead of hard-coding them by position: value_counts() orders classes by
# frequency, so positional labels would silently swap if the class balance
# changed. The column holds the codes 'B' / 'M' (it is never encoded to 0/1),
# so the labels show those codes.
label_names = {'B': 'Benign (B)', 'M': 'Malignant (M)'}
slice_colors = {'B': 'lightblue', 'M': 'salmon'}
pie_labels = [label_names.get(code, str(code)) for code in diagnosis_counts.index]
pie_colors = [slice_colors.get(code, 'lightgray') for code in diagnosis_counts.index]

diagnosis_counts.plot(kind='pie', labels=pie_labels, autopct='%1.1f%%', colors=pie_colors)
plt.title("Diagnosis Distribution (%)")
plt.ylabel("")  # hide the Series name that pandas would print as the y-label
plt.show()

In [None]:
# Pairwise scatter plots of every "*_mean" feature, coloured by diagnosis.
mean_cols = df.filter(like='_mean').columns.tolist()
df_mean = df[[*mean_cols, 'diagnosis']]
scatter_style = {'alpha': 0.5, 's': 25}
sns.pairplot(data=df_mean, hue='diagnosis', corner=True, plot_kws=scatter_style)
plt.suptitle("Scatter Matrix of Mean Features (Colored by Diagnosis)", y=1.02)
plt.show()

In [None]:
# Histogram (with KDE overlay) of radius_mean, split by diagnosis.
ax = sns.histplot(data=df, x='radius_mean', hue='diagnosis', bins=30, kde=True, palette='Set1')
ax.set_title("Distribution of Radius Mean by Diagnosis")
plt.show()

In [None]:

# Density histograms of four mean features, one panel per feature in a 2x2 grid.
features_to_plot = ['radius_mean', 'texture_mean', 'area_mean', 'smoothness_mean']
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(15, 12))
for ax, feature in zip(axes.flat, features_to_plot):
    sns.histplot(
        data=df,
        x=feature,
        hue='diagnosis',
        kde=True,
        palette="coolwarm",
        element="step",
        stat="density",
        ax=ax,
    )
    ax.set_title(f'Distribution of {feature} by Diagnosis')

fig.tight_layout()
plt.show()

In [None]:
import plotly.express as px

# Pairwise correlations between the numeric columns, rounded to two decimals
# so the cell annotations stay readable.
numeric_df = df.select_dtypes(include='number')
corr_matrix = numeric_df.corr().round(2)

# Interactive heatmap of the correlation matrix.
heatmap_options = dict(
    text_auto=True,
    color_continuous_scale='RdBu_r',
    aspect='auto',
    title="Interactive Correlation Heatmap",
)
fig = px.imshow(corr_matrix, **heatmap_options)
fig.update_layout(width=1000, height=800)
fig.show()

# Drop the unnecessary columns which are not useful for training
**To identify the irrelevant columns, we use the correlation matrix.**

In [None]:
# Feature columns excluded from training.
drop_columns = [
    'perimeter_mean', 'area_mean',
    'concavity_mean', 'concave points_mean',
    'radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst',
    'smoothness_worst', 'compactness_worst', 'concavity_worst',
    'concave points_worst', 'symmetry_worst', 'fractal_dimension_worst',
    'perimeter_se', 'area_se'
]

# errors='ignore' makes the cell idempotent: `df` is rebound to the reduced
# frame, so re-running the cell would otherwise raise a KeyError for columns
# that were already removed on the first run.
df = df.drop(columns=drop_columns, errors='ignore')

In [None]:
# Preview the first rows of the reduced frame.
df.head()

In [None]:
# Separate the target column from the predictor columns.
y = df['diagnosis']
X = df.drop(columns=['diagnosis'])


# Split the dataset into train and test

In [None]:
# Hold out 40% of the rows for testing. stratify=y keeps the benign/malignant
# ratio the same in both partitions, so the test metrics are not skewed by an
# unlucky draw on this small, imbalanced dataset. random_state fixes the
# shuffle for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=42, stratify=y
)

# Fit the Logistic Regression model on the training data

In [None]:
from sklearn.pipeline import make_pipeline

# Create and train the model.
# Logistic regression is regularised by default, so features on very
# different scales are penalised unevenly. Standardising inside a pipeline
# fixes that, and the scaler is fitted on the training data only, so no
# test-set statistics leak into training. The pipeline exposes the same
# predict / predict_proba / classes_ interface as the bare estimator; the
# fitted LogisticRegression itself is available as `model[-1]`.
model = make_pipeline(StandardScaler(), LogisticRegression(solver='liblinear'))
model.fit(X_train, y_train)

# Predict on unseen data using the trained Logistic Regression model

In [None]:
# Predict class labels for the held-out test features.
y_pred = model.predict(X_test)

In [None]:
# Show a short preview and the predicted class counts instead of dumping the
# entire prediction array into the notebook output.
print(y_pred[:10])
print(pd.Series(y_pred).value_counts())

# Evaluate the model
**We evaluate the model using accuracy, precision, recall and the ROC curve. Our model reaches an accuracy of about 92% on the test set.**

In [None]:
# Fraction of test samples whose predicted label equals the true label.
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(test_accuracy)

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

# Fix the row/column order explicitly so it is guaranteed to line up with the
# display labels below ('B' -> Benign, 'M' -> Malignant).
class_codes = ['B', 'M']
cm = confusion_matrix(y_test, y_pred, labels=class_codes)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Benign', 'Malignant'])

# Draw on an explicit Axes: calling plt.figure() first and then disp.plot()
# without `ax` leaves an empty extra figure behind and the requested figsize
# is never applied, because disp.plot() creates its own figure.
fig, ax = plt.subplots(figsize=(5, 4))
disp.plot(ax=ax, cmap=plt.cm.Blues, values_format='d')
ax.set_title('Confusion Matrix')
ax.grid(False)
plt.show()

In [None]:
# Precision and recall with malignant ('M') treated as the positive class.
positive_label = 'M'
precision = precision_score(y_test, y_pred, pos_label=positive_label)
recall = recall_score(y_test, y_pred, pos_label=positive_label)
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score

# Encode the target as 1 = malignant ('M'), 0 = otherwise for the ROC utilities.
y_test_binary = (y_test == 'M').astype(int)

# Select the probability column by class label rather than by position, so
# the scores are guaranteed to be P(malignant) whatever order the fitted
# model stores its classes in.
malignant_col = list(model.classes_).index('M')
y_pred_probs = model.predict_proba(X_test)[:, malignant_col]

fpr, tpr, thresholds = roc_curve(y_test_binary, y_pred_probs)
roc_auc = roc_auc_score(y_test_binary, y_pred_probs)

fig, ax = plt.subplots(figsize=(6, 5))
ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (AUC = {roc_auc:.2f})')
# Diagonal reference line: the ROC of a classifier that guesses at random.
ax.plot([0, 1], [0, 1], color='navy', lw=1, linestyle='--')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve')
ax.legend(loc="lower right")
ax.grid(True)
plt.show()


# Threshold Tuning

In [None]:
# Sweep the decision threshold from 0 to 1 in steps of 0.05 and report
# precision and recall for the malignant class at each value.
# np.linspace gives exactly 21 evenly spaced points including both ends,
# avoiding the floating-point drift of np.arange with a fractional step.
thresholds = np.linspace(0.0, 1.0, 21)
print("Threshold | Precision | Recall")
for t in thresholds:
    # A sample is called malignant when its predicted probability exceeds t.
    y_pred_class = (y_pred_probs > t).astype(int)
    # At high thresholds nothing may be predicted positive, which makes
    # precision 0/0. zero_division=0 reports 0 for that case without raising
    # an UndefinedMetricWarning on every such iteration.
    precision = precision_score(y_test_binary, y_pred_class, zero_division=0)
    recall = recall_score(y_test_binary, y_pred_class, zero_division=0)
    print(f"{t:.2f}       | {precision:.2f}      | {recall:.2f}")



# Sigmoid Function

**A logistic regression model is designed to output values between 0 and 1, which can be interpreted as probabilities. This is achieved by using a special mathematical function called the sigmoid function, also known as the logistic function.**

### 📘 Sigmoid Function Formula

The sigmoid function is defined as:

$$
\sigma(z) = \frac{1}{1 + e^{-z}}
$$
