# CASE STUDY: BREAST CANCER CLASSIFICATION


# STEP #1: PROBLEM STATEMENT


- Predicting if the cancer diagnosis is benign or malignant based on several observations/features 
- 30 features are used, examples:
        - radius (mean of distances from center to points on the perimeter)
        - texture (standard deviation of gray-scale values)
        - perimeter
        - area
        - smoothness (local variation in radius lengths)
        - compactness (perimeter^2 / area - 1.0)
        - concavity (severity of concave portions of the contour)
        - concave points (number of concave portions of the contour)
        - symmetry 
        - fractal dimension ("coastline approximation" - 1)

- The two classes are linearly separable using all 30 input features
- Number of Instances: 569
- Class Distribution: 212 Malignant, 357 Benign
- Target class:
         - Malignant
         - Benign


https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+(Diagnostic)



# STEP #2: IMPORTING DATA

In [None]:
# import libraries 
import pandas as pd # Import Pandas for data manipulation using dataframes
import numpy as np # Import Numpy for data statistical analysis 
import matplotlib.pyplot as plt # Import matplotlib for data visualisation
import seaborn as sns # Statistical data visualization
# %matplotlib inline

In [None]:
# Import the breast cancer dataset loader from the scikit-learn library
from sklearn.datasets import load_breast_cancer
# Load the dataset object; later cells read its 'data', 'target',
# 'feature_names', 'target_names' and 'DESCR' entries.
cancer = load_breast_cancer()

In [None]:
# Display the whole object returned by load_breast_cancer()
cancer

In [None]:
# List the keys available on the dataset object
cancer.keys()

In [None]:
# Print the entry stored under 'DESCR' (presumably the dataset's text description)
print(cancer['DESCR'])

In [None]:
# Print the class names associated with the target labels
print(cancer['target_names'])

In [None]:
# Print the target label of every sample
print(cancer['target'])

In [None]:
# Print the names of the input features (used below as DataFrame column labels)
print(cancer['feature_names'])

In [None]:
# Print the raw feature matrix
print(cancer['data'])


In [None]:
# Dimensions of the feature matrix
cancer['data'].shape

In [None]:
# Build one table holding every feature column plus the label as the last column.
# column_stack turns the 1-D target vector into a column and appends it to the features.
features_with_target = np.column_stack((cancer['data'], cancer['target']))
column_labels = np.append(cancer['feature_names'], ['target'])
df_cancer = pd.DataFrame(features_with_target, columns=column_labels)


In [None]:
# Preview the first rows of the combined features + target table
df_cancer.head()

In [None]:
# Preview the last rows of the combined features + target table
df_cancer.tail()

In [None]:
# Small demo: a three-element 1-D array has shape (3,)
x = np.array((1, 2, 3))
x.shape

In [None]:
# Small demo: two 1-D arrays placed side by side become the columns of a (3, 2) array
Example = np.column_stack((np.array([1, 2, 3]), np.array([4, 5, 6])))
Example.shape

# STEP #3: VISUALIZING THE DATA

In [None]:
# Pairwise plots of five "mean" features, coloured by the target label.
sns.pairplot(
    data=df_cancer,
    hue='target',
    vars=[
        'mean radius',
        'mean texture',
        'mean area',
        'mean perimeter',
        'mean smoothness',
    ],
)

In [None]:
# Bar chart of how many samples fall into each target class.
# The column is passed by keyword: since seaborn 0.12 the only positional
# parameter of countplot is `data`, so a Series passed positionally is no
# longer interpreted as `x`. The bar `label` argument is dropped because no
# legend is drawn and the count axis is labelled automatically.
sns.countplot(x='target', data=df_cancer)

In [None]:
# Mean area against mean smoothness for every sample, coloured by the target label.
sns.scatterplot(
    data=df_cancer,
    x='mean area',
    y='mean smoothness',
    hue='target',
)


In [None]:
# Alternative view: sns.lmplot(x='mean area', y='mean smoothness', hue='target', data=df_cancer, fit_reg=False)

In [None]:
# Inspect the pairwise correlations between all columns of the table.
# Author's observation: mean radius and mean perimeter are strongly correlated,
# as are mean area and mean perimeter.
correlation_matrix = df_cancer.corr()
plt.figure(figsize=(20, 10))
sns.heatmap(correlation_matrix, annot=True)

# STEP #4: MODEL TRAINING (FINDING A PROBLEM SOLUTION)

In [None]:

# Feature matrix: every column of the table except the target label
X = df_cancer.drop(columns=['target'])


In [None]:
# Display the feature matrix (the table without the 'target' column)
X

In [None]:
# Label vector: the 'target' column of the combined table
y = df_cancer['target']
# Display it to check the values
y

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=5,
)

In [None]:
# Rows x columns of the training features
X_train.shape

In [None]:
# Rows x columns of the test features
X_test.shape

In [None]:
# Number of training labels (should match the number of training rows)
y_train.shape

In [None]:
# Number of test labels (should match the number of test rows)
y_test.shape

In [None]:
from sklearn.svm import SVC 
from sklearn.metrics import classification_report, confusion_matrix

# Baseline model: an SVC with default hyperparameters,
# fitted on the training features exactly as they come out of the split (no scaling yet).
svc_model = SVC()
svc_model.fit(X_train, y_train)

# STEP #5: EVALUATING THE MODEL

In [None]:
# Predict labels for the held-out rows, then tabulate them against the true labels.
y_predict = svc_model.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_predict)

In [None]:
# Display the confusion matrix computed from y_test and y_predict
cm

In [None]:
# Draw the confusion matrix as an annotated heatmap.
# fmt="d" prints each cell as a plain integer count; the default format
# switches to scientific notation once a count reaches three digits.
sns.heatmap(cm, annot=True, fmt="d")

In [None]:
# Per-class metrics report for the baseline model's test predictions
print(classification_report(y_true=y_test, y_pred=y_predict))

# STEP #6: IMPROVING THE MODEL

In [None]:
# Per-feature minimum over the training rows (first ingredient of min-max scaling)
min_train = X_train.min(axis=0)
min_train

In [None]:
# Per-feature range (max - min) over the training rows:
# shift every column so its minimum is 0, then take the column maximum.
shifted_train = X_train - min_train
range_train = shifted_train.max()
range_train

In [None]:
# Min-max scale the training features: subtract the column minimum,
# then divide by the column range so every feature lies in [0, 1].
centered_train = X_train - min_train
X_train_scaled = centered_train / range_train

In [None]:
# Display the min-max scaled training features
X_train_scaled

In [None]:
# Unscaled training data: mean area against mean smoothness, coloured by label.
sns.scatterplot(
    x=X_train['mean area'],
    y=X_train['mean smoothness'],
    hue=y_train,
)

In [None]:
# Same two features after min-max scaling, for comparison with the unscaled plot.
sns.scatterplot(
    x=X_train_scaled['mean area'],
    y=X_train_scaled['mean smoothness'],
    hue=y_train,
)

In [None]:
# Scale the test features with the minimum and range learned from the TRAINING set.
# Scaling the test set with its own min/range would map it into a different
# feature space from the one the model was fitted on, and would use test-set
# statistics that are not available when predicting on new samples.
# Test values may therefore fall slightly outside [0, 1]; that is expected.
X_test_scaled = (X_test - min_train) / range_train

In [None]:
from sklearn.svm import SVC 
from sklearn.metrics import classification_report, confusion_matrix

# Re-create the default SVC and fit it again, this time on the min-max scaled training features.
svc_model = SVC()
svc_model.fit(X_train_scaled, y_train)

In [None]:
# Evaluate the SVC trained on scaled features against the scaled test rows.
y_predict = svc_model.predict(X_test_scaled)
cm = confusion_matrix(y_true=y_test, y_pred=y_predict)

# Integer format so each cell shows a plain count.
sns.heatmap(data=cm, annot=True, fmt="d")

In [None]:
# Per-class metrics report for the model trained on scaled features
print(classification_report(y_true=y_test, y_pred=y_predict))

# IMPROVING THE MODEL - PART 2

In [None]:
# Hyperparameter combinations to try: 4 values of C x 4 values of gamma, RBF kernel only.
param_grid = {
    'C': [0.1, 1, 10, 100],
    'gamma': [1, 0.1, 0.01, 0.001],
    'kernel': ['rbf'],
}

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Cross-validated search over param_grid; refit=True retrains the best
# combination afterwards, and verbose=4 logs progress while it runs.
grid = GridSearchCV(
    estimator=SVC(),
    param_grid=param_grid,
    refit=True,
    verbose=4,
)

In [None]:
# Run the grid search on the scaled training data
grid.fit(X_train_scaled,y_train)

In [None]:
# Parameter combination selected by the search
grid.best_params_

In [None]:
# The estimator configured with the selected parameters
grid.best_estimator_

In [None]:
# Predict the scaled test rows with the fitted grid-search object
grid_predictions = grid.predict(X_test_scaled)

In [None]:
# Confusion matrix of the grid-search predictions against the true test labels
cm = confusion_matrix(y_true=y_test, y_pred=grid_predictions)

In [None]:
# Draw the grid-search confusion matrix as an annotated heatmap.
# fmt="d" prints each cell as a plain integer count, matching the heatmap
# drawn for the scaled model; the default format switches to scientific
# notation once a count reaches three digits.
sns.heatmap(cm, annot=True, fmt="d")

In [None]:
# Per-class metrics report for the grid-search model's test predictions
print(classification_report(y_true=y_test, y_pred=grid_predictions))

In [None]:
# So the tuned model reaches about 97% accuracy on the test set