## Breast Cancer Prediction
#### In this notebook, we will try to classify a breast tumour as Malignant (M) or Benign (B) based on the features we provide to the model.

## Importing the libraries.

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt 

## Let's load the data and explore it.

In [None]:
# Read the breast-cancer CSV from the relative input directory and
# preview the first rows.
data_path = '../input/breast-cancer-wisconsin-data/data.csv'
breast_data = pd.read_csv(data_path)
breast_data.head()

### Deleting unnecessary columns.

In [None]:
# Remove the 'id' and 'Unnamed: 32' columns, which are not used as features.
# errors='ignore' keeps the cell idempotent: re-running it after the columns
# are already gone is a no-op instead of a KeyError.
breast_data = breast_data.drop(columns=['id', 'Unnamed: 32'], errors='ignore')
breast_data.head()

In [None]:
# Descriptive statistics of the frame, shown as the cell output.
summary_stats = breast_data.describe()
summary_stats

### Changing M and B in diagnosis column to 1 and 0 respectively.

In [None]:
# Encode the diagnosis labels as integers: 'M' -> 1 and 'B' -> 0.
# The mapping also accepts already-encoded 1/0 values, so re-running this
# cell leaves the column unchanged instead of silently turning every row
# into 0 (which comparing integers against 'M' would do).
diagnosis_codes = {'M': 1, 'B': 0, 1: 1, 0: 0}
diagnosis_encoded = breast_data['diagnosis'].map(diagnosis_codes)

# Labels outside the mapping come back as NaN; fail loudly rather than
# quietly counting them as benign.
unknown_labels = breast_data.loc[diagnosis_encoded.isna(), 'diagnosis'].unique()
if len(unknown_labels) > 0:
    raise ValueError("Unexpected diagnosis labels: {}".format(list(unknown_labels)))

breast_data['diagnosis'] = diagnosis_encoded.astype(int)
breast_data.head()

In [None]:
# Distinct values left in the diagnosis column, in order of first appearance.
pd.unique(breast_data['diagnosis'])

### Now our dataset has only numerical values.

## Now we will split the data by diagnosis for visualization purposes.

In [None]:
# Separate the rows by encoded diagnosis so each class can be plotted on
# its own: 1 is malignant (M), 0 is benign (B).
is_malignant = breast_data['diagnosis'].eq(1)
is_benign = breast_data['diagnosis'].eq(0)
cancer_M = breast_data.loc[is_malignant]
cancer_B = breast_data.loc[is_benign]

In [None]:
# Draw one stacked histogram per feature, malignant (red) on benign (blue),
# in a 5x2 grid.
# Columns 1..10 are taken as the features to plot.
# NOTE(review): presumably these are the ten "*_mean" columns that follow
# 'diagnosis' once 'id' has been dropped -- confirm against the CSV layout.
features_mean = list(breast_data.columns[1:11])
fig, axes = plt.subplots(nrows=5, ncols=2, figsize=(10, 13))
axes = axes.ravel()

# zip stops at the shorter sequence, so a frame with fewer than ten feature
# columns leaves the spare axes empty instead of raising IndexError.
for ax, feature in zip(axes, features_mean):
    ax.hist(
        [cancer_M[feature], cancer_B[feature]],
        bins=50,
        alpha=0.8,
        stacked=True,
        label=['M', 'B'],
        color=['red', 'blue'],
    )
    ax.legend(loc='upper right')
    ax.set_title(feature)

plt.tight_layout()
plt.show()

### Take a look at these plots before reading the next part.
As you may have noticed, red corresponds to Malignant and blue to Benign. Some features separate M from B clearly and make classification easy, whereas other features overlap heavily and might not help us classify properly.

#### Features that can help us:
    1] radius_mean
    2] perimeter_mean
    3] area_mean
    4] compactness_mean
    5] concavity_mean
    6] concave points_mean
#### Features that might not be as useful:
    1] texture_mean
    2] smoothness_mean
    3] symmetry_mean
    4] fractal_dimension_mean

In [None]:
from sklearn.model_selection import train_test_split, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

## Splitting the dataset into two parts: a training set and a testing set.

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible between runs.
train, test = train_test_split(
    breast_data,
    test_size=0.2,
    random_state=0,
)

### Function that takes type of model, training set, testing set, features & target as arguments and prints Accuracy.

In [None]:
def classification_model(model, train, test, features, target):
    """Fit ``model`` on the training data and print its accuracy on the test data.

    Parameters
    ----------
    model : object
        Estimator exposing ``fit(X, y)`` and ``predict(X)``.
    train, test : DataFrame-like
        Frames indexed with ``features`` and ``target`` to obtain inputs
        and labels.
    features : sequence of column labels
        Columns used as model inputs.
    target : sequence of column labels
        Column(s) holding the label; the selection is flattened to 1-D.

    Returns
    -------
    None
        The accuracy is printed as ``Accuracy : <percentage>``.
    """
    # Flatten both label selections so fit() and the metric see 1-D arrays.
    y_train = np.ravel(train[target])
    y_test = np.ravel(test[target])

    model.fit(train[features], y_train)
    pred = model.predict(test[features])

    # accuracy_score is documented as (y_true, y_pred); pass the ground
    # truth first so the call matches the API contract.
    accuracy = accuracy_score(y_test, pred)
    print("Accuracy : {0:.3%}".format(accuracy))

## Models based on Logistic Regression
#### Uses only one feature:

In [None]:
# Baseline: logistic regression trained on a single input column.
features, target = ['radius_mean'], ['diagnosis']

model_logistic = LogisticRegression()
classification_model(
    model=model_logistic,
    train=train,
    test=test,
    features=features,
    target=target,
)

#### Uses all the mean valued features:

In [None]:
# Logistic regression trained on all ten "*_mean" columns.
features = [
    'radius_mean',
    'texture_mean',
    'perimeter_mean',
    'area_mean',
    'smoothness_mean',
    'compactness_mean',
    'concavity_mean',
    'concave points_mean',
    'symmetry_mean',
    'fractal_dimension_mean',
]
target = ['diagnosis']

model_logistic_2 = LogisticRegression()
classification_model(
    model=model_logistic_2,
    train=train,
    test=test,
    features=features,
    target=target,
)

## Models based on Random Forest Classifier 
#### Uses features that we shortlisted by observing the plots above:

In [None]:
# Random forest restricted to the six columns shortlisted from the histograms.
features = [
    'radius_mean',
    'perimeter_mean',
    'area_mean',
    'compactness_mean',
    'concave points_mean',
    'concavity_mean',
]
target = ['diagnosis']

# random_state pins the forest's randomness so the score is repeatable.
model_random = RandomForestClassifier(random_state=0)
classification_model(
    model=model_random,
    train=train,
    test=test,
    features=features,
    target=target,
)

#### Model based on Random Forest Classifier (uses all the mean valued features):

In [None]:
# Random forest trained on all ten "*_mean" columns.
features = ['radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean',
                 'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean']
# Define the target here, as every other model cell does, so this cell does
# not depend on a variable left behind by whichever cell ran before it.
target = ['diagnosis']

# random_state pins the forest's randomness so the score is repeatable.
model_random_2 = RandomForestClassifier(random_state=0)
classification_model(model_random_2, train, test, features, target)

#### Random Forest Classifier has an attribute called 'feature_importances_'. It tells us which features are important in making the decision, i.e. in this case M or B.

In [None]:
# Pair each importance score from model_random_2 with its feature name and
# rank them from most to least important. `features` must still hold the
# list that model_random_2 was fitted on.
importances = model_random_2.feature_importances_
imp_features = pd.Series(data=importances, index=features).sort_values(ascending=False)
imp_features

### Now we will only use top 7 features according to feature_importances_ attribute.

In [None]:
# Keep only the seven highest-ranked feature names; imp_features is already
# sorted in descending order of importance.
features = imp_features.head(7).index
target = ['diagnosis']

# random_state pins the forest's randomness so the score is repeatable.
model_random_3 = RandomForestClassifier(random_state=0)
classification_model(
    model=model_random_3,
    train=train,
    test=test,
    features=features,
    target=target,
)