In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print the full path of
# every file found, so the available datasets are visible in the output.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Loading the Breast Cancer Dataset from Kaggle

In [None]:
# Read the breast-cancer CSV from the Kaggle input directory into a DataFrame.
data = pd.read_csv('../input/breast-cancer-wisconsin-data/data.csv')

# Show the first record transposed, so every column name and a sample value
# are listed vertically instead of being cut off horizontally.
data.iloc[:1].T

In [None]:
# Dimensions of the loaded frame as a (number of rows, number of columns) tuple.
data.shape

The dataset contains 569 rows and 33 columns.

# Finding the number of Instances of Target Variable

In [None]:
# Number of rows per distinct value of the 'diagnosis' column.
data['diagnosis'].value_counts()

Thus, we see that the dataset contains two diagnosis classes:
<ol>
    <li><strong>Benign</strong> : 357</li>
    <li><strong>Malignant</strong> : 212</li>
</ol>

# Finding the number of Attributes

In [None]:
# Count how many columns there are of each dtype.
data.dtypes.value_counts()

thus, there are 32 numerical attributes and 1 categorical attribute

## Finding the Attributes that have object/string dtype

In [None]:
# Names of the columns whose dtype is 'object', returned as a plain list.
list(data.select_dtypes(include='object').columns)

thus, diagnosis is the only categorical column

# Checking for Missing Values

In [None]:
# Labels of the columns that contain at least one missing value.
data.columns[data.isna().any(axis=0)]

In [None]:
# Inspect the column that was reported as containing missing values.
data.loc[:, 'Unnamed: 32']

We can go ahead and drop this column, as it is not related to the dataset at all; no column other than this one has missing data.

# Dropping irrelevant columns

In [None]:
# Remove the 'Unnamed: 32' column (flagged for missing values) and the 'id'
# column in a single call.
# `data` is rebound instead of using inplace=True, and errors='ignore' is
# passed, so that re-running this cell is a no-op rather than a KeyError on
# columns that were already removed by the first run.
data = data.drop(columns=['Unnamed: 32', 'id'], errors='ignore')
data.columns

# Checking Duplicate rows

In [None]:
# True if at least one row is an exact duplicate of an earlier row.
data.duplicated().any()

thus, there are no duplicate rows in the dataset.

# Data Scaling

In [None]:
# Collect the names of the numeric (int/float) columns that will be scaled.
# Selecting include='number' states the intent directly. Excluding 'object'
# would also let through bool, category, datetime or dedicated string dtypes,
# none of which should be passed to a numeric scaler.
num_cols = data.select_dtypes(include='number').columns.tolist()
print(num_cols)

here, we found all those features that have int/float datatype

## Using the MinMaxScaler from sklearn, which rescales each feature to the range 0 to 1

In [None]:
from sklearn.preprocessing import MinMaxScaler

# A single scaler instance, constructed with its default settings.
scaler = MinMaxScaler()

# Fit the scaler on every column listed in num_cols and overwrite those
# columns with the scaled values; the unscaled values are not kept in `data`.
# NOTE(review): the scaler is fitted on the full dataset. If a train/test split
# is performed later, fit on the training rows only to avoid leakage — confirm
# against the downstream modelling code.
data[num_cols] = scaler.fit_transform(data[num_cols])

# Preview the frame after scaling.
data.head()

# Creating Dummy Variable for ['diagnosis'] as it is object type

In [None]:
# One-hot encode 'diagnosis'. drop_first=True drops the first category level,
# leaving a single indicator column for the remaining level.
# dtype=int pins the indicator to 0/1 integers: on pandas >= 2.0 the default
# dummy dtype is bool, which would yield True/False instead of the 1/0
# encoding the rest of the notebook describes.
dummies = pd.get_dummies(data['diagnosis'], drop_first=True, dtype=int)
print(dummies.shape)
dummies.head(2)

In [None]:
# Swap the original 'diagnosis' column for its dummy-encoded counterpart:
# remove it from the frame first, then append the dummy column(s) on the right.
data = pd.concat([data.drop(columns='diagnosis'), dummies], axis=1)
data.head()

In [None]:
# Number of rows per value of the dummy-encoded 'M' column.
data['M'].value_counts()

Here, 1 represents <strong>malignant</strong> and 0 represents <strong>benign</strong>.<br>
Since this is also the target variable, let us rename 'M' to 'target'.

In [None]:
# Give the dummy column a descriptive name: 'M' becomes 'target'.
data = data.rename(columns={"M": "target"})
data.iloc[:2]

# Data Visualization

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

## Plotting the Correlation Heatmap

In [None]:
# Pairwise Pearson correlation between all columns of `data`.
corr = data.corr(method = 'pearson')
f, ax = plt.subplots(figsize=(11, 9))
cmap = sns.diverging_palette(10, 275, as_cmap=True)

# Pin the colour scale to the full correlation range [-1, 1] and centre it on
# zero, so the neutral midpoint of the diverging palette means "no correlation"
# and the two hues separate negative from positive correlations. Without this,
# the midpoint falls wherever the middle of the observed value range happens to be.
sns.heatmap(corr, cmap=cmap, vmin=-1, vmax=1, center=0, square=True,
            linewidths=0.5, cbar_kws={"shrink": 0.5}, ax=ax)
# A title lets the figure stand alone when the notebook is skimmed.
ax.set_title('Pearson correlation between columns')
plt.show()

Some features show strong correlation with each other. In order to reduce the dimensionality, some of them are dropped. Among radius, perimeter and area, I keep area. Among concavity, concave points and compactness, I keep concavity. For the pairs [texture_worst, texture_mean] and [area_worst, area_mean], I keep texture_mean and area_mean. Therefore, 17 columns are left (16 features plus the target).

## Dropping highly related features and keeping one of them

In [None]:
#drop those related features
drop_features = ['perimeter_mean','radius_mean','compactness_mean','concave points_mean','radius_se',
              'perimeter_se','radius_worst','perimeter_worst','compactness_worst','concave points_worst',
              'compactness_se','concave points_se','texture_worst','area_worst']
data.drop(drop_features, axis = 1, inplace = True)

In [None]:
# Preview the first rows of the frame after the correlated columns were dropped.
data.head()

## Thus, we have gone through the dataset and done all the necessary Data Preprocessing required