# BREAST CANCER WISCONSIN DATA SET


## Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')


## Data Loading

In [None]:
# Read the raw CSV into the DataFrame used throughout the notebook.
cancer_df = pd.read_csv(filepath_or_buffer='../input/breast-cancer/data.csv')

## Data Information

In [None]:
# Preview the first five rows.
cancer_df.head(5)

In [None]:
# Preview the last five rows.
cancer_df.tail(5)

In [None]:
# Print pandas' structural summary of the frame (columns, dtypes, non-null counts).
cancer_df.info()

In [None]:
# (rows, columns) of the raw data, before any columns are dropped.
cancer_df.shape

In [None]:
# Summary statistics, transposed so each feature is a row.
cancer_df.describe().transpose()

In [None]:
# Per-column dtypes of the raw frame.
cancer_df.dtypes

In [None]:
# List the distinct labels present in the target column.
cancer_df['diagnosis'].unique()

## Preparatory Cleaning

In [None]:
# Discard the two columns that are not used as features: 'Unnamed: 32' and 'id'.
cancer_df = cancer_df.drop(columns=['Unnamed: 32', 'id'])

In [None]:
# Encode the target numerically: 'M' -> 1, 'B' -> 0.
cancer_df['diagnosis'] = cancer_df['diagnosis'].replace({'M': 1, 'B': 0})

In [None]:
# Spot-check 10 random rows after cleaning (unseeded, so the rows differ per run).
cancer_df.sample(n=10)

In [None]:
# Count missing values per column.
cancer_df.isna().sum()

In [None]:
# Flag values outside the 1.5 * IQR fences, column by column.
Q1, Q3 = cancer_df.quantile(0.25), cancer_df.quantile(0.75)
IQR = Q3 - Q1

lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR

# A cell is an outlier when it lies strictly below the lower or above the upper fence.
outliers = cancer_df.lt(lower_fence) | cancer_df.gt(upper_fence)
print(outliers.sum())

## Exploratory Data Analysis

In [None]:
# Class balance of the target column.
cancer_df['diagnosis'].value_counts()

In [None]:
# Pie chart of the class balance. The wedges are labelled directly so the chart
# does not depend on matplotlib's implicit colour order to be readable.
diagnosis_counts = cancer_df['diagnosis'].value_counts()

# The target was encoded earlier as M -> 1, B -> 0; fall back to the raw value
# for any label that is not in this mapping.
diagnosis_names = {0: 'Benign (B)', 1: 'Malignant (M)'}
wedge_labels = [diagnosis_names.get(code, str(code)) for code in diagnosis_counts.index]

plt.pie(diagnosis_counts, labels=wedge_labels, autopct='%1.1f%%')
print("\033[1;35;47m" + 'Diagnosis (M-Orange , B-Blue)' +   "\033[0m" )
plt.show()


In [None]:
# Annotated heatmap of the pairwise correlations between all remaining columns.
plt.figure(figsize=(20, 20))
full_corr = cancer_df.corr()
sns.heatmap(data=full_corr, annot=True)

In [None]:
# The same correlation matrix, shown as a table.
cancer_df.corr(method='pearson')

#### A high correlation with `diagnosis` indicates that the variable is strongly associated with malignant tumors

In [None]:
# Histogram of the encoded target column.
plt.hist(x=cancer_df.diagnosis, color='c')
plt.title(label='Diagnosis (M=1 , B=0)')
plt.show()

In [None]:
# Pairwise scatter plots of the *_mean features, coloured by diagnosis.
col1 = [
    'diagnosis',
    'radius_mean',
    'texture_mean',
    'perimeter_mean',
    'area_mean',
    'smoothness_mean',
    'compactness_mean',
    'concavity_mean',
    'concave points_mean',
    'symmetry_mean',
    'fractal_dimension_mean',
]

mean_features = cancer_df[col1]
sns.pairplot(data=mean_features, hue='diagnosis', palette='rocket')

Here we can see that `radius_mean`, `perimeter_mean`, and `area_mean` are highly linearly related, which indicates multicollinearity.
The same holds for `concavity_mean`, `concave points_mean`, and `compactness_mean`.
Multicollinearity may undermine the significance of the other variables, so we should keep only ONE column from each of these groups.

# 1. DID NOT DROP WORST COLUMNS

In [None]:
# Drop the mean and standard-error columns for "perimeter" and "area".
col2 = ['perimeter_mean', 'perimeter_se', 'area_mean', 'area_se']
cancer_df = cancer_df.drop(columns=col2)

# Drop the mean and standard-error columns for "concavity" and "concave points".
col2 = ['concavity_mean', 'concavity_se', 'concave points_mean', 'concave points_se']
cancer_df = cancer_df.drop(columns=col2)

# Show which columns are left.
cancer_df.columns

In [None]:
# Lower-triangle correlation heatmap of the columns that remain.
plt.figure(figsize=(20, 20), dpi=150)
corr = cancer_df.corr()
# True on and above the diagonal, so those cells are hidden by the heatmap mask.
mask = np.triu(np.ones_like(corr, dtype=bool))

heatmap_style = dict(cmap='BuPu', annot=True, linewidths=0.5, fmt=".2f")
sns.heatmap(corr, mask=mask, **heatmap_style)
plt.title(label='Correlation Matrix', fontsize=20, weight="semibold", color='m')
plt.show()

# Building Model

In [None]:
# Features are every column except the target; the target is the encoded diagnosis.
X = cancer_df.drop(columns=['diagnosis'])
y = cancer_df.diagnosis

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the rows for testing; the seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=40,
)

In [None]:
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()

# Learn the scaling parameters from the training data only, then apply that
# same fitted scaler to the test data. Calling fit_transform on the test set
# would refit the scaler on test data, leaking test-set statistics and scaling
# the two sets inconsistently.
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

**1. Logistic Regression**

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Fit a default logistic regression on the scaled training data.
lr = LogisticRegression()
model1 = lr.fit(X_train, y_train)

# Score it on the held-out test set.
prediction1 = model1.predict(X_test)
lr_accuracy = accuracy_score(y_test, prediction1)
lr_report = classification_report(y_test, prediction1)

print(lr_accuracy)
print(lr_report)

**2. Random Forest**

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Seed the forest so its score is reproducible across runs. Without a seed,
# run-to-run randomness would be mixed into the comparison between the
# different feature subsets evaluated in this notebook.
rfc = RandomForestClassifier(random_state=40)
model2 = rfc.fit(X_train, y_train)
prediction2 = model2.predict(X_test)

print(accuracy_score(y_test, prediction2))
print(classification_report(y_test, prediction2))

**3. SVM**

In [None]:
# SVC is not imported anywhere else in the notebook; without this import the
# cell raises NameError on a fresh kernel.
from sklearn.svm import SVC

# Fit a support vector classifier with default settings and score it on the test set.
SVM = SVC()
SVM.fit(X_train, y_train)
predictions = SVM.predict(X_test)
print(accuracy_score(y_test, predictions))
print(classification_report(y_test, predictions))

# 2. DROPPED A FEW WORST COLUMNS

In [None]:
# Remove five of the "worst" features before refitting the models.
col2 = [
    'radius_worst',
    'texture_worst',
    'perimeter_worst',
    'area_worst',
    'concave points_worst',
]
cancer_df = cancer_df.drop(columns=col2)

# Show which columns are left.
cancer_df.columns

In [None]:
# Lower-triangle correlation heatmap of the columns that remain.
plt.figure(figsize=(20, 20), dpi=150)
corr = cancer_df.corr()
# True on and above the diagonal, so those cells are hidden by the heatmap mask.
mask = np.triu(np.ones_like(corr, dtype=bool))

heatmap_style = dict(cmap='BuPu', annot=True, linewidths=0.5, fmt=".2f")
sns.heatmap(corr, mask=mask, **heatmap_style)
plt.title(label='Correlation Matrix', fontsize=20, weight="semibold", color='m')
plt.show()

# Building Model

In [None]:
# Features are every column except the target; the target is the encoded diagnosis.
X = cancer_df.drop(columns=['diagnosis'])
y = cancer_df.diagnosis

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the rows for testing; the seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=40,
)

In [None]:
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()

# Learn the scaling parameters from the training data only, then apply that
# same fitted scaler to the test data. Calling fit_transform on the test set
# would refit the scaler on test data, leaking test-set statistics and scaling
# the two sets inconsistently.
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

**1. Logistic Regression**

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Fit a default logistic regression on the scaled training data.
lr = LogisticRegression()
model1 = lr.fit(X_train, y_train)

# Score it on the held-out test set.
prediction1 = model1.predict(X_test)
lr_accuracy = accuracy_score(y_test, prediction1)
lr_report = classification_report(y_test, prediction1)

print(lr_accuracy)
print(lr_report)

**2. Random Forest**

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Seed the forest so its score is reproducible across runs. Without a seed,
# run-to-run randomness would be mixed into the comparison between the
# different feature subsets evaluated in this notebook.
rfc = RandomForestClassifier(random_state=40)
model2 = rfc.fit(X_train, y_train)
prediction2 = model2.predict(X_test)

print(accuracy_score(y_test, prediction2))
print(classification_report(y_test, prediction2))

**3. SVM**

In [None]:
# SVC is not imported anywhere else in the notebook; without this import the
# cell raises NameError on a fresh kernel.
from sklearn.svm import SVC

# Fit a support vector classifier with default settings and score it on the test set.
SVM = SVC()
SVM.fit(X_train, y_train)
predictions = SVM.predict(X_test)
print(accuracy_score(y_test, predictions))
print(classification_report(y_test, predictions))

# 3. DROPPING ALL WORST COLUMNS

In [None]:
# Remove the five "worst" features listed below before refitting the models.
col2 = [
    'smoothness_worst',
    'compactness_worst',
    'concavity_worst',
    'symmetry_worst',
    'fractal_dimension_worst',
]
cancer_df = cancer_df.drop(columns=col2)

# Show which columns are left.
cancer_df.columns

In [None]:
# Lower-triangle correlation heatmap of the columns that remain.
plt.figure(figsize=(20, 20), dpi=150)
corr = cancer_df.corr()
# True on and above the diagonal, so those cells are hidden by the heatmap mask.
mask = np.triu(np.ones_like(corr, dtype=bool))

heatmap_style = dict(cmap='BuPu', annot=True, linewidths=0.5, fmt=".2f")
sns.heatmap(corr, mask=mask, **heatmap_style)
plt.title(label='Correlation Matrix', fontsize=20, weight="semibold", color='m')
plt.show()

# Building Model

In [None]:
# Features are every column except the target; the target is the encoded diagnosis.
X = cancer_df.drop(columns=['diagnosis'])
y = cancer_df.diagnosis

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the rows for testing; the seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=40,
)

In [None]:
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()

# Learn the scaling parameters from the training data only, then apply that
# same fitted scaler to the test data. Calling fit_transform on the test set
# would refit the scaler on test data, leaking test-set statistics and scaling
# the two sets inconsistently.
X_train = ss.fit_transform(X_train)
X_test = ss.transform(X_test)

**1. Logistic Regression**

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Fit a default logistic regression on the scaled training data.
lr = LogisticRegression()
model1 = lr.fit(X_train, y_train)

# Score it on the held-out test set.
prediction1 = model1.predict(X_test)
lr_accuracy = accuracy_score(y_test, prediction1)
lr_report = classification_report(y_test, prediction1)

print(lr_accuracy)
print(lr_report)

**2. Random Forest**

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Seed the forest so its score is reproducible across runs. Without a seed,
# run-to-run randomness would be mixed into the comparison between the
# different feature subsets evaluated in this notebook.
rfc = RandomForestClassifier(random_state=40)
model2 = rfc.fit(X_train, y_train)
prediction2 = model2.predict(X_test)

print(accuracy_score(y_test, prediction2))
print(classification_report(y_test, prediction2))

**3. SVM**

In [None]:
# SVC is not imported anywhere else in the notebook; without this import the
# cell raises NameError on a fresh kernel.
from sklearn.svm import SVC

# Fit a support vector classifier with default settings and score it on the test set.
SVM = SVC()
SVM.fit(X_train, y_train)
predictions = SVM.predict(X_test)
print(accuracy_score(y_test, predictions))
print(classification_report(y_test, predictions))