In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively visit the Kaggle input directory and print the full path of every file in it
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## AIM :
    To build the most accurate model for predicting the breast cancer type (malignant or benign).

### Data Information
    1. Features are computed from a digitized image of a fine needle aspirate (FNA) of a breast mass. They describe characteristics of the cell nuclei present in the image. A few of the images can be found at [Web Link]

    2. Separating plane described above was obtained using Multisurface Method-Tree (MSM-T) [K. P. Bennett, "Decision Tree Construction Via Linear Programming." Proceedings of the 4th Midwest Artificial Intelligence and Cognitive Science Society, pp. 97-101, 1992], a classification method which uses linear programming to construct a decision tree. Relevant features were selected using an exhaustive search in the space of 1-4 features and 1-3 separating planes.

    3. The actual linear program used to obtain the separating plane in the 3-dimensional space is that described in: [K. P. Bennett and O. L. Mangasarian: "Robust Linear Programming Discrimination of Two Linearly Inseparable Sets", Optimization Methods and Software 1, 1992, 23-34].

    4. This database is also available through the UW CS ftp server:
    ftp ftp.cs.wisc.edu
    cd math-prog/cpo-dataset/machine-learn/WDBC/


### Features Information
1. **ID NUMBER**<br>
    The patient's ID number, which makes each record easy to identify.
    
2. **Diagnosis**: <br>
    A categorical feature with two values:<br>
    **M** = malignant<br>
    **B** = benign<br>
<br>**Ten real-valued features are computed for each cell nucleus:**<br>
3. **radius_mean**:<br>
    Mean of the distances from the center to points on the perimeter of each cell (**a cell might not be circular**)<br><br>
4. **texture_mean**:<br>
    standard deviation of gray-scale values<br><br>
5. **perimeter_mean**:<br>
    mean size of core tumor<br><br>
6. **area_mean**:<br>
    area of tumor cell<br><br>
7. **smoothness_mean**:<br>
    mean of the local variation in radius lengths; basically measures how circular/spherical the tumor cell is<br><br>
8. **compactness_mean**:<br>
    mean of perimeter<sup>2</sup> / area - 1.0 <br><br>
    
9. **concavity_mean**:<br>
    mean of the severity of the concave portions of the contour<br><br>
10. **concave_points**:<br>
    number of concave portions of the contour
    <br><br>
11. **symmetry**:<br>
    symmetry of the cell nucleus
    <br><br>
12. **fractal dimension**:<br>
    "coastline approximation" - 1
    <br><br>
**The mean, standard error and "worst" or largest (mean of the three
largest values) of these features were computed for each image,
resulting in 30 features. For instance, field 3 is Mean Radius, field
13 is Radius SE, field 23 is Worst Radius.**


In [None]:
#importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline


## Load the data
# Show every column when a DataFrame is displayed (no column truncation)
pd.options.display.max_columns = None

DATA_PATH = "../input/breast-cancer-wisconsin-data/data.csv"
df = pd.read_csv(DATA_PATH)

# Preview the first rows
df.head()

In [None]:
# Number of (rows, columns) in the DataFrame as loaded
df.shape

In [None]:
# Count the missing values in each column
df.isna().sum()

In [None]:
# Remove the 'Unnamed: 32' column, which this analysis treats as useless.
# Rebinding `df` (rather than inplace=True) together with errors='ignore'
# keeps the cell idempotent: re-running it no longer raises KeyError once
# the column is already gone.
df = df.drop(columns='Unnamed: 32', errors='ignore')
df.head()

In [None]:
# Shape again, to confirm the column count after dropping 'Unnamed: 32'
df.shape

In [None]:
# Column dtypes and non-null counts
df.info()

**Important initial insights**
<br>
1. id can't be used for classification
2. diagnosis is our label column for classification
3. `Unnamed: 32` is useless and we have already dropped it **cheers**.
4. other features --- we need to learn more about them

In [None]:
## Data visualization: number of samples in each diagnosis class
sns.countplot(data=df, x='diagnosis');

There are more benign than malignant cases... but wait, that is not what we need to find out.<br>
Let's first do some data exploration.

In [None]:
# Target: the diagnosis label of each sample
y = df['diagnosis']

# Predictors: every base measurement appears three times, with the suffixes
# _mean, _se and _worst. The column order is all *_mean, then all *_se,
# then all *_worst.
base_measurements = [
    'radius', 'texture', 'perimeter', 'area', 'smoothness',
    'compactness', 'concavity', 'concave points', 'symmetry',
    'fractal_dimension',
]
feature_columns = [
    f'{measurement}_{suffix}'
    for suffix in ('mean', 'se', 'worst')
    for measurement in base_measurements
]
x = df[feature_columns]

In [None]:
## Summary statistics show the range of each feature, which tells us whether standard scaling is needed
x.describe()

Ohh!! The features are on very different scales, so we are going to apply standard scaling so that no feature biases the plots or the model.


In [None]:
# Standardise every column: subtract its mean, then divide by its standard deviation
x_scalled = x.sub(x.mean()).div(x.std())

In [None]:
## Violin plots for the first 10 standardised features (the *_mean columns)
first_ten = x_scalled.iloc[:, 0:10]

# Long format: one row per (sample, feature) pair, ready for seaborn
data = pd.concat([y, first_ten], axis=1).melt(
    id_vars="diagnosis",
    var_name="features",
    value_name='value',
)

plt.figure(figsize=(10, 10))
sns.violinplot(data=data, x="features", y="value", hue="diagnosis",
               split=True, inner="quart")
plt.xticks(rotation=90);


In [None]:
# Box plots of the melted feature values currently held in `data`, split by diagnosis
plt.figure(figsize=(10, 10))
sns.boxplot(data=data, x="features", y="value", hue="diagnosis")
plt.xticks(rotation=90);

**interpretation Example** of above plot data

In the **texture_mean, perimeter_mean, area_mean, concavity_mean** features, the medians of the malignant and benign classes look well separated, so these features can be good for classification.
<br>
However, in the fractal_dimension_mean feature, the medians of the malignant and benign classes do not look separated, so it does not give good information for classification.

**Now let's go for next 10 features**

In [None]:
# Violin plots for standardised features 10-19 (the *_se columns)
middle_ten = x_scalled.iloc[:, 10:20]

# Long format: one row per (sample, feature) pair, ready for seaborn
data = pd.concat([y, middle_ten], axis=1).melt(
    id_vars="diagnosis",
    var_name="features",
    value_name='value',
)

plt.figure(figsize=(10, 10))
sns.violinplot(data=data, x="features", y="value", hue="diagnosis",
               split=True, inner="quart")
plt.xticks(rotation=90);

In [None]:
# Box plots of the melted feature values currently held in `data`, split by diagnosis
plt.figure(figsize=(10, 10))
sns.boxplot(data=data, x="features", y="value", hue="diagnosis")
plt.xticks(rotation=90);

In [None]:
# Violin plots for the remaining standardised features, from position 20 on (the *_worst columns)
remaining = x_scalled.iloc[:, 20:]

# Long format: one row per (sample, feature) pair, ready for seaborn
data = pd.concat([y, remaining], axis=1).melt(
    id_vars="diagnosis",
    var_name="features",
    value_name='value',
)

plt.figure(figsize=(10, 10))
sns.violinplot(data=data, x="features", y="value", hue="diagnosis",
               split=True, inner="quart")
plt.xticks(rotation=90);

In [None]:
# Box plots of the melted feature values currently held in `data`, split by diagnosis
plt.figure(figsize=(10, 10))
sns.boxplot(data=data, x="features", y="value", hue="diagnosis")
plt.xticks(rotation=90);

In [None]:
# Pairwise correlation of all 30 features, annotated to one decimal place.
# The trailing semicolon suppresses the Axes repr that would otherwise be
# echoed under the figure.
f, ax = plt.subplots(figsize=(18, 18))
sns.heatmap(x.corr(), annot=True, linewidths=.5, fmt='.1f', ax=ax, cmap='YlGnBu');

**feature**&emsp; &emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;**having very high correlation or dependency with** <br>
radius_mean &emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp; perimeter_mean, area_mean, radius_worst, perimeter_worst, _area_worst_ &emsp;&emsp; we will be using **area_mean** in our model<br>
texture_mean&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp; texture_worst &emsp;&emsp; we keep texture_mean<br>
radius_worst &emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp; perimeter_worst and area_worst &emsp;&emsp; the drop list below removes all three of these, keeping **area_mean** as the size feature

radius_se &emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp;&emsp; perimeter_se, area_se &emsp;&emsp; we will be using **area_se** in our model<br>
compactness_mean, concavity_mean and concave points_mean are highly correlated with each other &emsp;&emsp; we keep concavity_mean

**I am not writing this out for every feature, but you can follow the same approach.**

In [None]:
# Columns to remove after inspecting the correlation heatmap
drop_list1 = [
    'perimeter_mean',
    'radius_mean',
    'compactness_mean',
    'concave points_mean',
    'radius_se',
    'perimeter_se',
    'radius_worst',
    'perimeter_worst',
    'texture_worst',
    'area_worst',
]


In [None]:
# Features that remain after the first round of selection (everything in drop_list1 removed)
x_1 = x.drop(columns=drop_list1)
x_1.head()

In [None]:
# Correlation heatmap of the reduced feature set
selected_corr = x_1.corr()
f, ax = plt.subplots(figsize=(14, 14))
sns.heatmap(selected_corr, ax=ax, annot=True, fmt='.1f',
            linewidths=.5, cmap='YlGnBu');

In [None]:
# Fit a baseline model to find out how well we selected features
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score,confusion_matrix
from sklearn.metrics import accuracy_score

# Fixed seed so the split and the forest (and therefore the reported
# accuracy) are identical on every Restart & Run All
SEED = 42

# Split the data: 70 % train, 30 % test
x_train, x_test, y_train, y_test = train_test_split(
    x_1, y, test_size=0.3, random_state=SEED
)

# Random forest with default hyper-parameters
# (n_estimators defaults to 100 since scikit-learn 0.22; it was 10 before that)
clf_rf = RandomForestClassifier(random_state=SEED)
# fit() returns the estimator itself, so clr_rf is the same object as clf_rf;
# the original name is kept in case other cells refer to it
clr_rf = clf_rf.fit(x_train, y_train)

# Predict once and reuse the result for both metrics
y_pred = clf_rf.predict(x_test)

ac = accuracy_score(y_test, y_pred)
print('Accuracy is: ',ac)

# Rows are the true classes, columns the predicted classes; passing
# labels explicitly keeps the matrix order aligned with the tick labels
cm = confusion_matrix(y_test, y_pred, labels=clf_rf.classes_)
cm_ax = sns.heatmap(cm, annot=True, fmt="d",
                    xticklabels=clf_rf.classes_, yticklabels=clf_rf.classes_)
cm_ax.set(xlabel='Predicted diagnosis', ylabel='Actual diagnosis');