<a href="https://colab.research.google.com/github/sakib762/Machine-Learning-Experiment/blob/main/Breast_Cancer_ML_07.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# About Dataset

**Description:**

Breast cancer is the most common cancer among women worldwide. It accounts for 25% of all cancer cases and affected over 2.1 million people in 2015 alone. It starts when cells in the breast begin to grow out of control. These cells usually form tumors that can be seen via X-ray or felt as lumps in the breast area.

The key challenge in its detection is how to classify tumors as malignant (cancerous) or benign (non-cancerous). We ask you to complete the analysis of classifying these tumors using machine learning (with SVMs) and the Breast Cancer Wisconsin (Diagnostic) Dataset. Note that this notebook trains a logistic regression classifier rather than an SVM.



# Importing Dependencies

In [None]:
#importing dependency
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Data Collection

In [None]:
# Attach Google Drive to the Colab runtime so files under /content/drive are readable
from google.colab import drive

drive.mount(mountpoint='/content/drive')

In [None]:
# Load the dataset CSV from the mounted Drive folder.
# The location is kept in one constant so it is easy to point at another copy.
DATA_PATH = "/content/drive/MyDrive/database/ML Project Database/data.csv"
df = pd.read_csv(DATA_PATH)

# Preview only the first rows rather than rendering the entire frame
df.head()

# Data Analysis

In [None]:
# Print a concise summary of the DataFrame: column names, non-null counts and dtypes
df.info()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns
df.describe()

In [None]:
# Count the missing entries in each column
df.isna().sum()

In [None]:
# Drop the columns that are not used as features: 'id' and 'Unnamed: 32'.
# errors='ignore' keeps this cell safe to re-run: without it a second execution
# raises KeyError because the columns have already been removed from df.
df = df.drop(columns=['Unnamed: 32', 'id'], errors='ignore')

In [None]:
# How many samples fall into each diagnosis class
df.loc[:, 'diagnosis'].value_counts()

In [None]:
# Mean of every remaining column, computed separately for each diagnosis class
df.groupby(by='diagnosis').mean()

In [None]:
# Encode the diagnosis labels as integers: 'M' -> 0, 'B' -> 1.
# The dtype guard makes the cell idempotent: mapping an already-encoded column
# through the dict again would turn every value into NaN, because 0 and 1 are
# not keys of the mapping.
DIAGNOSIS_CODES = {'M': 0, 'B': 1}
if not pd.api.types.is_numeric_dtype(df['diagnosis']):
    df['diagnosis'] = df['diagnosis'].map(DIAGNOSIS_CODES)

# Data Visualization

In [None]:
# Pairplot of a handful of features, coloured by diagnosis, to visualise how the
# features relate to one another. plt and sns come from the notebook's import cell.
PAIRPLOT_FEATURES = ['radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean', 'smoothness_mean']
sns.pairplot(df, hue='diagnosis', vars=PAIRPLOT_FEATURES)
plt.show()

In [None]:

# Heatmap of the pairwise correlations between all columns, annotated as percentages
corr_matrix = df.corr()
fig, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(corr_matrix, annot=True, fmt=".0%", ax=ax)
plt.show()

In [None]:

# Bar chart with the number of samples per diagnosis class
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(x='diagnosis', data=df, ax=ax)
ax.set_title('Distribution of Diagnosis')
plt.show()

In [None]:

# One box plot per selected feature, split by diagnosis class
features = ['radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean']
for feature in features:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.boxplot(x='diagnosis', y=feature, data=df, ax=ax)
    ax.set_title(f'Box Plot of {feature} vs. Diagnosis')
    plt.show()


# Data Splitting

In [None]:
# Separate the target (Y) from the feature matrix (X)
Y = df['diagnosis']
X = df.drop(columns=['diagnosis'])

In [None]:
# Hold out 20% of the samples for testing; the fixed seed keeps the split reproducible
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=2,
)
print(*(part.shape for part in (X_train, X_test, Y_train, Y_test)))

# Model Training

In [None]:
# Train a logistic-regression classifier on standardised features.
# The raw features are not scaled before fitting, which commonly prevents the
# solver from converging within its default iteration budget. Putting the
# scaler inside a pipeline means it is fitted on the training split only, and
# `model.predict` still accepts unscaled inputs exactly as before.
model = make_pipeline(StandardScaler(), LogisticRegression())
model.fit(X_train, Y_train)

# Model Evaluation

In [None]:
# Accuracy of the fitted model on the data it was trained on
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(
    y_true=Y_train,
    y_pred=X_train_prediction,
)
print("Accuracy on training data : ", training_data_accuracy)

In [None]:
# Accuracy of the fitted model on the held-out test split
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(
    y_true=Y_test,
    y_pred=X_test_prediction,
)
print("Accuracy on test data : ", test_data_accuracy)


# Building a Predictive System

In [None]:
# Predict the diagnosis for one hand-entered sample.
# The 30 values must be listed in the same order as the columns of X.
input_data = (
    20.57, 17.77, 132.9, 1326, 0.08474, 0.07864, 0.0869, 0.07017, 0.1812, 0.05667,
    0.5435, 0.7339, 3.398, 74.08, 0.005225, 0.01308, 0.0186, 0.0134, 0.01389, 0.003532,
    24.99, 23.41, 158.8, 1956, 0.1238, 0.1866, 0.2416, 0.186, 0.275, 0.08902,
)

# Shape the sample as a single row: (1, n_features)
input_data_as_numpy_array = np.asarray(input_data)
input_data_reshaped = input_data_as_numpy_array.reshape(1, -1)

# Attach the training column names. The model was fitted on a DataFrame, so a
# bare ndarray would lose the feature names; building the frame also raises
# immediately if the number of values does not match the number of features.
input_frame = pd.DataFrame(input_data_reshaped, columns=X.columns)

prediction = model.predict(input_frame)
print(prediction)

# Translate the numeric code back to a label (diagnosis was encoded as M -> 0, B -> 1)
predicted_label = 'Malignant' if prediction[0] == 0 else 'Benign'
print(f'Predicted diagnosis: {predicted_label}')
