# Breast Cancer Data Analysis and Prediction

In [None]:
# We will start by importing the essential libraries for data preprocessing.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('display.max_columns',50)
sns.set_style('darkgrid')

In [None]:
%matplotlib inline

In [None]:
## Import the dataset

In [None]:
data=pd.read_csv('../input/breast-cancer-wisconsin-data/data.csv')

In [None]:
## Let's have a look at the dataset

In [None]:
print(data.shape)
data.head()

In [None]:
# Have a look at target: diagnosis
data['diagnosis'].value_counts().plot(kind='bar')

In [None]:
## The dataset has 33 columns and 569 rows in total. It has one categorical column, 'diagnosis', which is in fact our target column. 'id' is the index value column, and the last column, 'Unnamed: 32', appears to have lots of NA values. So next it is important to check the data for missing values.

In [None]:
data.isnull().sum()

In [None]:
## As we can see, only one column, 'Unnamed: 32', has missing values, so we will drop it. We also drop the 'id' index column at the same time.

In [None]:
# 'id' is the index value column and 'Unnamed: 32' has missing values,
# so neither is kept as a feature.
unused_columns = ['id', 'Unnamed: 32']
data.drop(columns=unused_columns, inplace=True)

In [None]:
## Let us have a look at trends in dataset.

In [None]:
data.describe()

In [None]:
# We can notice that the mean and median differ in every column. In some columns this gap is larger than in others. This gap indicates that the data is skewed to some degree.
# So let's plot the distribution of each column to have a better look at the data.

In [None]:
# Draw one distribution plot per numeric column, titled with that column's
# skewness rounded to four decimals.
numeric_columns = data._get_numeric_data().columns
for column in numeric_columns:
    series = data[column]
    sns.distplot(series)
    skewness = np.round(series.skew(), 4)
    plt.title('Skew : ' + str(skewness))
    plt.show()

In [None]:
## Looking at the plots, it is clearly evident that most columns are not normally distributed and appear to be skewed. Some columns have a skewness of more than +5.0. So it is important to deal with the skewness of the data and bring it, if not to a normal distribution, then at least close to one.

In [None]:
# Two of the many ways to deal with skewness are: logarithmic transformation and square root transformation.

In [None]:
# For every numeric column, print the skewness obtained after a log transform
# and after a square-root transform, side by side on a single output line.
for column in data._get_numeric_data().columns:
    values = data[column]
    log_skew = np.round(np.log(values).skew(), 4)
    print(column, ' : ', 'skew in Log transformation :', log_skew, ',', end='')
    sqrt_skew = np.round(np.sqrt(values).skew(), 4)
    print('\t', 'skew in Square root trasnformatioon', ' : ', sqrt_skew)

In [None]:
## Looking at the above output, it is evident that the logarithmic transformation has dealt with skewness better than the square root transformation. But the logarithmic transformation produces 'NaN' for some columns. This is because those columns contain zeroes, and the log of zero is undefined. On the other hand, the square root transformation handles these columns well.
## So we will use the square root transformation for columns that contain '0' values and the log transformation for the rest.

In [None]:
# Transform each numeric column in place to reduce skew: columns that contain
# a zero get a square-root transform, every other column gets a natural-log
# transform. The transformed values are rounded to four decimals and the new
# distribution is plotted with its skewness in the title.
for column in data._get_numeric_data().columns:
    plt.figure(figsize=(6, 6))
    contains_zero = (data[column] == 0).any()
    reduce_skew = np.sqrt if contains_zero else np.log
    data[column] = np.round(reduce_skew(data[column]), 4)
    sns.distplot(data[column])
    plt.title('Skew : ' + str(np.round(data[column].skew(), 4)))
    plt.show()

In [None]:
## All the features have skewness in the range of +-0.5 which is much less than what we had earlier.

In [None]:
# Now we will look at correlation between independent variables in data.

In [None]:
# Heatmap of pairwise correlations between the features.
# `data` still contains the string-valued 'diagnosis' column at this point,
# and DataFrame.corr() raises on non-numeric columns in pandas >= 2.0 (older
# versions silently dropped them). Selecting the numeric columns explicitly
# gives the same matrix on every pandas version.
plt.figure(figsize=(20, 10))
numeric_corr = data.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr, annot=True)
plt.show()

In [None]:
## WOW, this tells a completely different story about the data. Many features are strongly correlated and the data shows a high level of multicollinearity.
## One way to decrease multicollinearity is by dropping features that show strong correlation (>0.85). So we prepare a list of columns with strong correlation values and drop them.

In [None]:
# Columns to remove because they are strongly correlated with other features.
col_2_Drop = [
    'perimeter_mean',
    'area_mean',
    'concavity_mean',
    'concave points_mean',
    'radius_se',
    'area_se',
    'compactness_se',
    'concavity_se',
    'radius_worst',
    'area_worst',
    'perimeter_worst',
    'compactness_worst',
    'concave points_worst',
    'texture_worst',
]

In [None]:
# Heatmap of the correlations that remain once the columns listed in
# col_2_Drop are removed.
# The string-valued 'diagnosis' column is still present in `data`, and
# DataFrame.corr() raises on non-numeric columns in pandas >= 2.0, so the
# matrix is computed over the numeric columns only.
plt.figure(figsize=(20, 12))
reduced_features = data.drop(columns=col_2_Drop).select_dtypes(include='number')
sns.heatmap(reduced_features.corr(), annot=True)
plt.show()

In [None]:
# Much better now. Although there are fewer features than before, the data seems promising.

In [None]:
# Let's set our target variable now and prepare the data for the next step.

In [None]:
# Separate the label column from the feature matrix; the feature matrix also
# leaves out the columns listed in col_2_Drop.
target = data['diagnosis']
excluded_columns = [*col_2_Drop, 'diagnosis']
dataset = data.drop(columns=excluded_columns)

In [None]:
dataset.head()

In [None]:
## Now that we have dealt with correlation and normality, let's split the dataset into a train and a test set.

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the rows as the test set; random_state is fixed so the same
# split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(dataset,target, test_size=.25, random_state=42)

In [None]:
## Now we have a train and a test set. Let's scale them and bring all features onto the same scale.
from sklearn.preprocessing import StandardScaler
# The scaler is fitted on the training split only and then applied to both
# splits.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [None]:
# Our target is categorical data with labels M and B. We have to encode these labels as 0 and 1.
from sklearn.preprocessing import LabelEncoder
# The encoder is fitted on the training labels and then reused for the test
# labels so both splits share the same mapping.
le = LabelEncoder().fit(y_train)
y_train = le.transform(y_train)
y_test = le.transform(y_test)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score

In [None]:
# Per-model results. algorithm() appends one entry to each list for every
# model it evaluates, in evaluation order.
model_name=[]
mean_validadtion_score=[]
training_score=[]
test_accuracy_score=[]


def algorithm(models,X_train,y_train,X_test, Y_test):
    """Fit, cross-validate and evaluate every classifier class in ``models``.

    For each class the function creates an instance with no arguments, fits
    it on the training data, runs 5-fold cross-validation on the training
    data, and prints the mean cross-validation score, the training score, the
    test accuracy and the confusion matrix on the test data.

    Args:
        models: Iterable of classifier classes (not instances); each one is
            called with no arguments to create the estimator.
        X_train: Training features.
        y_train: Training labels.
        X_test: Test features.
        Y_test: True labels for ``X_test``.

    Returns:
        None. Results are appended to the module-level lists ``model_name``,
        ``mean_validadtion_score``, ``training_score`` and
        ``test_accuracy_score``. The lists are never cleared here, so calling
        the function again adds further rows.
    """
    for model_class in models:
        model = model_class()
        model.fit(X_train, y_train)

        cv_scores = cross_val_score(model, X_train, y_train, cv=5)
        mean_cv_score = cv_scores.mean()
        print('Mean cross-validation Score for',model,' is :',mean_cv_score)

        y_pred = model.predict(X_test)
        # Compute each metric once and reuse it for printing and recording.
        train_score = model.score(X_train, y_train)
        # Score against the Y_test argument. The previous body read a global
        # variable with a similar name, silently ignoring what the caller
        # passed in.
        test_score = accuracy_score(Y_test, y_pred)

        print('Training score of ',model,' is :',train_score)
        print('Accuracy score of ',model,' is :', test_score)
        print('\n','Confusion matrix \n',confusion_matrix(Y_test, y_pred))
        print('\n')
        print('-'*100)
        print('\n')

        model_name.append(model)
        mean_validadtion_score.append(np.round(mean_cv_score, 4))
        training_score.append(np.round(train_score, 4))
        test_accuracy_score.append(np.round(test_score, 4))

In [None]:
# Classifier classes (not instances) to compare with algorithm().
models = [
    LogisticRegression,
    SVC,
    GaussianNB,
    KNeighborsClassifier,
    DecisionTreeClassifier,
    RandomForestClassifier,
]
algorithm(models, X_train, y_train, X_test, y_test)

In [None]:
# Collect the per-model results into a single summary table, one row per
# evaluated model; the model is shown by its string representation.
df = pd.DataFrame({
    'Models': list(map(str, model_name)),
    'Mean cross-val-score': mean_validadtion_score,
    'Training Score': training_score,
    'Accuracy Score': test_accuracy_score,
})

In [None]:
df

### We can see the performance of the different models in the above dataframe.
### Linear models have consistent scores across all three metrics.
### One can choose SVC or LogisticRegression and tune its hyperparameters for further processing.