In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and echo the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#Basic Computing Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn import preprocessing
from keras.models import Sequential
from keras.layers import Dense  
from keras.layers import Dropout
from sklearn import metrics
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Path of the breast-cancer CSV, relative to the notebook's working directory.
DATA_PATH = '../input/breast-cancer-wisconsin-prognostic-data-set/data 2.csv'
dataset = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first rows of the loaded frame as a sanity check on the read.
dataset.head()

In [None]:
# List the column labels of the loaded frame.
dataset.columns

In [None]:
# Raw values of the target column, echoed as the cell output.
labels = dataset.loc[:, 'diagnosis'].values
labels

In [None]:
# Print the per-column summary (dtype and non-null count) to spot missing data.
dataset.info()

**Apart from the target column `diagnosis` (string labels such as `M`/`B`), there aren't any categorical values in the dataset**

In [None]:
# Descriptive statistics for the columns of the loaded frame.
dataset.describe()

In [None]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
dataset.shape

In [None]:
# Correlate numeric columns only: the frame still contains the string-valued
# `diagnosis` column, and pandas >= 2.0 raises instead of silently skipping
# non-numeric columns when corr() is called on the whole frame.
dataset.select_dtypes(include='number').corr()

**Feature-Engineering - Selection + Extraction + Visualization**

In [None]:
# Select numeric columns positively. Testing `dtype != 'object'` would also let
# category, datetime or dedicated string dtypes through, and VarianceThreshold
# cannot be fitted on those.
numerical_columns = dataset.select_dtypes(include='number').columns.tolist()
dataset_copy_num = dataset[numerical_columns]

# threshold=0 keeps every feature whose variance is strictly positive, i.e. it
# only rejects columns that carry no information (constant or entirely missing).
var_threshold = VarianceThreshold(threshold=0)
var_threshold.fit(dataset_copy_num)

In [None]:
# Boolean mask over the columns of dataset_copy_num: True where the fitted selector keeps the feature.
var_threshold.get_support()

In [None]:
# Names of the numeric columns the fitted selector keeps.
dataset_copy_num.columns[var_threshold.get_support()]

In [None]:
# Independent copy of the loaded frame for the steps that follow.
dataset_copy = dataset.copy()
to_be_dropped = []

# Numeric columns rejected by the variance filter: the inverse of its support mask.
unnamed_cols = list(dataset_copy_num.columns[~var_threshold.get_support()])
unnamed_cols

In [None]:
# Hold out 30% of the rows for validation; the fixed seed makes the split repeatable.
# Features are every column except the empty 'Unnamed: 32', the row id and the target.
X_train, X_valid, y_train, y_valid = train_test_split(
    dataset_copy.drop(columns=['Unnamed: 32', 'id', 'diagnosis']),
    dataset_copy[['diagnosis']],
    test_size=0.3,
    random_state=42,
)

In [None]:
# Display the training feature frame produced by the split.
X_train

In [None]:
# Display the training target frame (single `diagnosis` column) produced by the split.
y_train

In [None]:
# Pairwise feature correlations on the training split, drawn as an annotated heatmap.
fig, ax = plt.subplots(figsize=(18, 18))
dataset_cors = X_train.corr()
sns.heatmap(data=dataset_cors, annot=True, ax=ax)

In [None]:
def correlation(dataset, threshold):
  """Return the names of columns that are highly correlated with an earlier column.

  Parameters
  ----------
  dataset : pandas.DataFrame
      Frame whose pairwise column correlations are examined via ``DataFrame.corr()``.
  threshold : float
      A column is flagged when the absolute correlation with any column that
      appears before it is strictly greater than this value.

  Returns
  -------
  set
      Labels of the flagged columns. The first column is never flagged because
      it has no predecessor to be compared against.
  """
  # Sign is irrelevant for redundancy: r = -0.95 is just as collinear as r = +0.95,
  # so compare magnitudes rather than signed coefficients.
  corr_matrix = dataset.corr().abs()
  columns = corr_matrix.columns
  col_corr = set()
  for i in range(len(columns)):
    # Strict lower triangle only: column i is compared with columns 0..i-1.
    if any(corr_matrix.iloc[i, j] > threshold for j in range(i)):
      col_corr.add(columns[i])
  return col_corr

In [None]:
# Flag features whose correlation with an earlier feature exceeds 0.8,
# then show how many were flagged.
returned_cols = correlation(X_train, threshold=0.8)
len(returned_cols)

In [None]:
# Number of feature columns before the correlated ones are removed.
X_train.shape[1]

In [None]:
# Remove the flagged columns from both splits so they keep the same feature set.
X_train_new = X_train.drop(columns=list(returned_cols))
X_valid_new = X_valid.drop(columns=list(returned_cols))

In [None]:
#We also need to label encode the label set
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

# Keep the encoded targets: calling fit_transform without storing its result only
# fits the encoder and leaves no numeric labels available to later cells.
# .values.ravel() flattens the single-column target frame to the 1-D array the
# encoder expects.
y_train_encoded = le.fit_transform(y_train.values.ravel())
# Reuse the mapping learned on the training labels for the validation labels.
y_valid_encoded = le.transform(y_valid.values.ravel())
y_train_encoded

In [None]:
#Since we are using tree-based algorithms, we won't be applying Standardization as of now
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier

def fine_tune_params(classifier, parameters, X_train, y_train,
                     n_iter=8, cv=9, n_jobs=-1, random_state=None):
  """Run a randomized hyper-parameter search and report the best result.

  Parameters
  ----------
  classifier : estimator
      Unfitted estimator handed to ``RandomizedSearchCV``.
  parameters : dict
      Mapping of parameter name to the candidate values to sample from.
  X_train, y_train :
      Training features and targets the search is fitted on.
  n_iter : int, default 8
      Number of parameter settings sampled.
  cv : int, default 9
      Number of cross-validation folds.
  n_jobs : int, default -1
      Parallel jobs used by the search (-1 uses all processors).
  random_state : int or None, default None
      Seed for the parameter sampling; pass an int to make the search repeatable.

  Returns
  -------
  tuple
      ``(best_score, best_params)`` taken from the fitted search object.
  """
  search = RandomizedSearchCV(
      estimator=classifier,
      param_distributions=parameters,
      n_iter=n_iter,
      n_jobs=n_jobs,
      cv=cv,
      random_state=random_state,
  )
  search.fit(X_train, y_train)
  return search.best_score_, search.best_params_

In [None]:
classifier=RandomForestClassifier()

#Specifying the parameters for classification
n_estimators=[int(x) for x in np.linspace(20, 2000, 11)]
criterion=['gini', 'entropy']

# Real booleans: the strings 'True' and 'False' are both truthy, so they never
# toggled bootstrapping, and recent scikit-learn releases reject them outright.
bootstrap = [True, False]
max_depth=[int(x) for x in np.linspace(10, 100, 11)]
max_depth.append(None)

min_sample_split=[2, 5, 10]
# 'auto' was only an alias of 'sqrt' for classifiers and has been removed from
# scikit-learn; 'log2' keeps a second, genuinely different option in the search.
max_features=['sqrt', 'log2']
min_samples_leaf=[1, 4, 9]

parameters = {
                'n_estimators': n_estimators,
                'criterion': criterion,
                'max_depth': max_depth,
                'max_features': max_features,
                'min_samples_leaf': min_samples_leaf,
                'min_samples_split': min_sample_split,
                'bootstrap': bootstrap
            }
# Flatten the single-column target frame to the 1-D shape estimators expect.
best_score, best_parameters = fine_tune_params(classifier, parameters, X_train_new, y_train.values.ravel())

In [None]:
# Report the best cross-validated score and the parameters that produced it.
print(best_score, best_parameters, sep='\n')

In [None]:
# bootstrap must be the boolean True (the string 'True' is not a valid value), and
# 'sqrt' is what the removed 'auto' alias meant for a classifier, so the intended
# model is unchanged.
classifier = RandomForestClassifier(n_estimators=416, min_samples_split=10, min_samples_leaf=1, max_features='sqrt', max_depth=None, criterion='gini', bootstrap=True)
# Flatten the single-column target frame to the 1-D shape fit() expects.
classifier.fit(X_train_new, y_train.values.ravel())

In [None]:
predictions = classifier.predict(X_valid_new)

#generating metrics report
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Take the class order from the fitted model instead of hard-coding it: scikit-learn
# reports classes in sorted label order, so a hand-written ['M', 'B'] list is
# attached to the wrong rows whenever the sorted order is ['B', 'M'].
class_labels = list(classifier.classes_)
target_names = [str(label) for label in class_labels]

# Flatten the single-column target frame to 1-D for the metric functions.
y_valid_true = y_valid.values.ravel()

# Print every metric explicitly: only a cell's last bare expression is displayed,
# so the confusion matrix and accuracy were previously computed and thrown away.
print(confusion_matrix(y_valid_true, predictions, labels=class_labels))
print(accuracy_score(y_valid_true, predictions))
print(classification_report(y_valid_true, predictions, labels=class_labels, target_names=target_names))