## Exploratory Data Analysis

In [None]:
## Data Analysis Phase
## Main aim is to understand more about the data

import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
## Display all columns of the dataframe
## (call the documented top-level `pd.set_option`; `pd.pandas` is only an
## incidental alias of the package and is not part of the public API)
pd.set_option('display.max_columns', None)

In [None]:
## Read the CSV into a dataframe
csv_path = '../input/breast-cancer-wisconsin-data/data.csv'
dataset = pd.read_csv(csv_path)

## print shape of the dataset as (rows, columns)
print(dataset.shape)

In [None]:
## Preview the first rows of the freshly loaded dataset
dataset.head()

In [None]:
## Remove the 'id' and 'Unnamed: 32' columns, which are not used as features.
## Rebinding the name (instead of inplace=True) together with errors='ignore'
## keeps this cell safe to re-run: a second run does not raise KeyError for
## columns that are already gone.
dataset = dataset.drop(columns=['id', 'Unnamed: 32'], errors='ignore')

In [None]:
## Here we will check the percentage of missing values in each feature
## Step - 1: make the list of features which have missing values
## (even a single missing value counts, so test with `.any()`)
features_with_na = [feature for feature in dataset.columns if dataset[feature].isnull().any()]

## Step - 2: Print the feature name and the percentage of missing values
## (the mean of the null mask is a fraction, so scale it by 100 to match the '%' label)
for feature in features_with_na:
    print(feature, np.round(dataset[feature].isnull().mean() * 100, 4), '%missing values')

In [None]:
## No missing values are present in the dataset

In [None]:
## LabelEncoding (convert the diagnosis labels M and B into 1 and 0;
## LabelEncoder numbers the classes in sorted order, so B -> 0 and M -> 1)
from sklearn.preprocessing import LabelEncoder
labelEncoder_y = LabelEncoder()
## Assign by column name so the whole column is replaced and takes the integer
## dtype of the encoded values. Writing through `dataset.iloc[:, 0] = ...` sets
## the values in place on recent pandas, which leaves the column as object dtype
## and makes scikit-learn reject it later as an "unknown" label type.
## Addressing the column by name also removes the dependency on 'diagnosis'
## sitting at position 0.
dataset['diagnosis'] = labelEncoder_y.fit_transform(dataset['diagnosis'])

In [None]:
## Preview the dataset again after the label-encoding step
dataset.head()

In [None]:
## Every column except the 'diagnosis' target
continous_features = dataset.drop(columns='diagnosis')
continous_features.head()

In [None]:
## lets analyze the continous values by creating histogram to understand the distribution
## Each column is read straight from `dataset`: plotting does not modify it,
## so no per-iteration copy of the whole frame is needed.
for feature in continous_features:
    fig, ax = plt.subplots()
    dataset[feature].hist(bins=25, ax=ax)
    ax.set_xlabel(feature)
    ax.set_ylabel("count")
    ax.set_title(feature)
    plt.show()
    # release the figure so a long loop does not accumulate open figures
    plt.close(fig)

In [None]:
## The data is not normally distributed

## Outliers

In [None]:
## Check for outliers: box plot of each log-transformed feature
## (this cell only visualises; nothing is removed from `dataset` here)
for feature in continous_features:
    values = dataset[feature]
    # log is undefined at 0 and for negative values, so such features are skipped
    if (values <= 0).any():
        continue
    fig, ax = plt.subplots()
    # transform only the column being drawn instead of copying the whole frame
    np.log(values).to_frame().boxplot(column=feature, ax=ax)
    ax.set_ylabel(feature)
    ax.set_title(feature)
    plt.show()
    # release the figure so a long loop does not accumulate open figures
    plt.close(fig)

In [None]:
### There are many outliers

# Feature Selection

## Correlation

In [None]:
## Pairwise correlation over the columns of `dataset`, drawn as an annotated heatmap
corr = dataset.corr()
fig, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(
    corr,
    ax=ax,
    annot=True,
    fmt='.1f',
    annot_kws={'size': 15},
    cmap='GnBu',
    cbar=True,
    square=True,
)
plt.show()

### Data Preprocessing

In [None]:
## Start the feature frame from every column except the 'diagnosis' target
dataprocessed = dataset.drop(columns='diagnosis')

In [None]:
## Preview the feature frame (target column removed)
dataprocessed.head()

In [None]:
## Feature-to-feature correlation (target excluded), drawn as an annotated heatmap
corr = dataprocessed.corr()
fig, ax = plt.subplots(figsize=(15, 15))
sns.heatmap(
    corr,
    ax=ax,
    annot=True,
    fmt='.1f',
    annot_kws={'size': 10},
    cmap='GnBu',
    cbar=True,
    square=True,
)
plt.show()

In [None]:
## Features removed before modelling.
## NOTE(review): presumably picked from the correlation heatmap as strongly
## inter-correlated columns -- confirm the selection rule.
droplist = [
    'radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean',
    'compactness_mean', 'concavity_mean', 'concave points_mean',
    'radius_se', 'perimeter_se', 'area_se',
    'radius_worst', 'texture_worst', 'perimeter_worst', 'area_worst',
    'smoothness_worst', 'compactness_worst', 'concavity_worst',
    'concave points_worst',
]
## `dataprocessed` is rebound to its own reduced version, so without
## errors='ignore' a second run of this cell would raise KeyError on the
## columns that were already removed.
dataprocessed = dataprocessed.drop(columns=droplist, errors='ignore')

In [None]:
## Preview the feature frame after the columns in `droplist` were removed
dataprocessed.head()

In [None]:
## One seaborn distribution plot per remaining feature
for column_name, column_values in dataprocessed.items():
    sns.displot(column_values)

In [None]:
def outlierLimit(column, whisker=1.5):
    """Return the IQR-based outlier limits of ``column``.

    The limits are ``Q3 + whisker * IQR`` and ``Q1 - whisker * IQR`` where
    ``IQR = Q3 - Q1``. NaN entries are ignored when the quartiles are computed.

    Parameters
    ----------
    column : array-like of numbers
        Values to analyse (for example one dataframe column).
    whisker : float, default 1.5
        Multiplier applied to the inter-quartile range; the default reproduces
        the conventional 1.5 * IQR rule.

    Returns
    -------
    tuple
        ``(uplimit, lowlimit)`` -- note the order: upper limit first.
    """
    q1, q3 = np.nanpercentile(column, [25, 75])
    reach = whisker * (q3 - q1)
    return q3 + reach, q1 - reach

In [None]:
## Replace every value outside a column's outlier limits with NaN.
## Only non-object columns are processed.
## NOTE(review): the limits are computed on the full data, before the
## train/test split further down -- confirm this is intended.
non_object_columns = [
    name for name in dataprocessed.columns
    if dataprocessed[name].dtype != 'object'
]
for name in non_object_columns:
    values = dataprocessed[name]
    upper, lower = outlierLimit(values)
    is_outlier = (values > upper) | (values < lower)
    dataprocessed[name] = np.where(is_outlier, np.nan, values)

In [None]:
## Number of NaN entries per column (the outliers masked in the previous cell)
dataprocessed.isnull().sum()

In [None]:
## As shown above, the outliers have been replaced with NaN values

In [None]:
from sklearn.impute import KNNImputer

## Fill the NaN entries with KNNImputer (4 neighbours) and write the
## imputed values back into the same frame.
imputer = KNNImputer(n_neighbors=4)
imputed_values = imputer.fit_transform(dataprocessed)
dataprocessed.iloc[:, :] = imputed_values


In [None]:
## Re-count NaN entries per column after imputation
dataprocessed.isnull().sum()

In [None]:
## Preview the feature frame after imputation
dataprocessed.head()

# Model Training and Testing

In [None]:
## Features: the processed feature frame; target: the 'diagnosis' column of `dataset`
X, y = dataprocessed, dataset['diagnosis']

In [None]:
from sklearn.model_selection import train_test_split

## Hold out 25% of the rows for testing; the fixed random_state makes the split repeatable
split_parts = train_test_split(X, y, test_size=0.25, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
def models(X_train, y_train):
    """Fit three classifiers on the training data and print their training accuracy.

    Parameters
    ----------
    X_train : array-like
        Training features, passed unchanged to each estimator's ``fit``.
    y_train : array-like
        Training labels.

    Returns
    -------
    tuple
        ``(lr, tree, forest)`` -- the fitted LogisticRegression,
        DecisionTreeClassifier and RandomForestClassifier, in that order.
    """
    # All estimators are imported locally so the function does not depend on
    # an import made in a different cell.
    from sklearn.linear_model import LogisticRegression
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.ensemble import RandomForestClassifier

    ## LogisticRegression
    lr = LogisticRegression(random_state=42)
    lr.fit(X_train, y_train)

    ## DecisionTreeClassifier
    tree = DecisionTreeClassifier(random_state=42, criterion='entropy')
    tree.fit(X_train, y_train)

    ##  Random Forest
    forest = RandomForestClassifier(random_state=42, criterion='entropy', n_estimators=10)
    forest.fit(X_train, y_train)

    # The bracketed number is the model's position in the returned tuple.
    # The scores are computed on the data the models were fitted on, hence
    # "Training Accuracy" -- they are not an estimate of generalisation.
    print('[0]LogisticRegression Training Accuracy: ', lr.score(X_train, y_train))
    print('[1]DecisionTreeClassifier Training Accuracy: ', tree.score(X_train, y_train))
    print('[2]Random Forest Training Accuracy: ', forest.score(X_train, y_train))
    return lr, tree, forest


In [None]:
## Fit all three classifiers; `model` is the tuple (lr, tree, forest) returned by models()
model = models(X_train, y_train)

In [None]:
from sklearn.metrics import classification_report, accuracy_score, recall_score

## Evaluate every fitted model on the test split.
## Predictions are computed once per model and shared by all three metrics.
for i, fitted_model in enumerate(model):
    y_pred = fitted_model.predict(X_test)
    print("Model", i)
    print(classification_report(y_test, y_pred))
    print(accuracy_score(y_test, y_pred))
    print(recall_score(y_test, y_pred))

In [None]:
# print("Accuracy: {}".format(accuracy_score(y_test, y_pred)))
# print('Recall: {}'.format(recall_score(y_test, y_pred)))