## Import Libraries

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn import svm
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

from sklearn import metrics
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score, f1_score, plot_roc_curve
from sklearn.preprocessing import StandardScaler, LabelEncoder

import warnings
warnings.filterwarnings("ignore")


import os
# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))


## Getting to Know the Dataset

In [None]:
# Read the breast-cancer CSV into a DataFrame and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='../input/breast-cancer-wisconsin-data/data.csv')
df.head(n=5)

In [None]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
df.shape

In [None]:
# Count the missing values in each column.
df.isna().sum()

In [None]:
# Number of rows for each distinct value of the 'diagnosis' column.
df['diagnosis'].value_counts()

In [None]:
# Visualize how the diagnosis classes are distributed throughout the dataset.
# The column is passed by keyword: newer seaborn releases treat the first
# positional argument as `data`, not as the variable to plot.
plt.figure(figsize=(10, 8.5))
sns.countplot(x='diagnosis', data=df)
plt.show()

In [None]:
# Drop the unwanted 'Unnamed: 32' column.
# Rebinding the name (instead of mutating in place) together with
# errors='ignore' keeps this cell safe to re-run once the column is gone.
df = df.drop(columns=['Unnamed: 32'], errors='ignore')

In [None]:
# Convert the categorical 'diagnosis' column to integer codes.
# The fitted encoder is kept in `label` so the codes can be mapped back later.
label = LabelEncoder().fit(df['diagnosis'])
df['diagnosis'] = label.transform(df['diagnosis'])
df

## Correlation Between Variables

In [None]:
# Annotated heatmap of the pairwise correlations between all columns.
corr_matrix = df.corr()
plt.subplots(figsize=(40, 40))
heatmap_axes = sns.heatmap(corr_matrix, annot=True, fmt="f")
heatmap_axes.set_title("Corelation Of Each Attributes")
plt.show()

## Preprocessing

In [None]:
# Split the frame into the feature matrix `x` and the target vector `y`.
# Besides the target, the 'id' column is excluded from the features.
# NOTE(review): assumes 'id' is a per-record identifier with no diagnostic
# meaning (as in the Kaggle data.csv) - confirm with df.head().
# errors='ignore' keeps the cell working if the frame has no 'id' column.
x = df.drop(columns=['diagnosis', 'id'], errors='ignore')
y = df['diagnosis']

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Learn the scaling statistics from the training split only, then apply the
# same fitted scaler to both splits.
sc = StandardScaler().fit(x_train)
x_train = sc.transform(x_train)
x_test = sc.transform(x_test)

## Model Building

In [None]:
# Helper that fits a model, predicts on the hold-out set and prints its metrics.
def models(mod, x_t, y_t, x_tes, y_tes, x_source, y_source):
    """Fit ``mod`` and print an evaluation report.

    Parameters
    ----------
    mod : estimator
        Object exposing ``fit`` and ``predict``; it is fitted in place on
        ``x_t`` / ``y_t``.
    x_t, y_t
        Training features and labels.
    x_tes, y_tes
        Hold-out features and labels used for the accuracy, confusion matrix,
        recall, precision and F1 figures.
    x_source, y_source
        Data handed unchanged to ``cross_val_score`` (5 folds).
        NOTE(review): no preprocessing is applied here, so if ``x_t`` was
        scaled by the caller, pass equivalently prepared data for the
        cross-validation score to be comparable with the hold-out metrics.

    Returns
    -------
    None
        Every figure is reported through ``print``.
    """
    separator = "-------------------------------------------"

    # Fit on the training split and predict the hold-out split.
    mod.fit(x_t, y_t)
    pred = mod.predict(x_tes)

    # Hold-out accuracy and 5-fold cross-validation scores.
    accuracy = accuracy_score(y_tes, pred)
    cross_validation = cross_val_score(mod, x_source, y_source, cv=5)

    print("Accuracy Is : ", accuracy*100, "%")
    print(separator)

    print('Cross validations mean score ', round(np.mean(cross_validation)*100, 4))
    print(separator)

    # Confusion matrix on the hold-out split.
    print(confusion_matrix(y_tes, pred))
    print(separator)

    # Support-weighted recall, precision and F1 on the hold-out split.
    print("Recall Score :", recall_score(y_tes, pred, average='weighted'))
    print("Precision Score :", precision_score(y_tes, pred, average='weighted'))
    print("F1 Score :", f1_score(y_tes, pred, average='weighted'))
    

## Logistic Regression Model

In [None]:
# Logistic regression baseline.
# Only the non-default settings are spelled out. The other arguments this cell
# used to list were library defaults, and pinning them (e.g. `multi_class`)
# triggers deprecation warnings or errors on newer scikit-learn releases.
model = LogisticRegression(max_iter=1500, random_state=123)

models(model, x_train, y_train, x_test, y_test, x, y)

## Decision Tree Classifier Model

In [None]:
# Single decision tree: Gini criterion, unrestricted depth, fixed seed.
model_1 = DecisionTreeClassifier(
    criterion='gini',
    max_depth=None,
    random_state=0,
)
models(mod=model_1, x_t=x_train, y_t=y_train, x_tes=x_test, y_tes=y_test,
       x_source=x, y_source=y)

## Random Forest Classifier Model

In [None]:
# Random forest of 120 trees with a fixed seed.
model_2 = RandomForestClassifier(
    n_estimators=120,
    random_state=0,
)
models(mod=model_2, x_t=x_train, y_t=y_train, x_tes=x_test, y_tes=y_test,
       x_source=x, y_source=y)

## Extra Trees Classifier Model

In [None]:
# Extra-trees ensemble of 100 trees with a fixed seed.
model_3 = ExtraTreesClassifier(
    n_estimators=100,
    random_state=0,
)
models(mod=model_3, x_t=x_train, y_t=y_train, x_tes=x_test, y_tes=y_test,
       x_source=x, y_source=y)

## Support Vector Model

In [None]:
# Support vector classifier with the library's default settings.
model_4 = svm.SVC()
models(mod=model_4, x_t=x_train, y_t=y_train, x_tes=x_test, y_tes=y_test,
       x_source=x, y_source=y)

## **According to the models above, the Random Forest Classifier and the Support Vector model both reached about 97% accuracy, but the Random Forest Classifier also achieved a mean cross-validation score of 95.96%, so we consider it the best model for hyperparameter tuning.**

## Hyperparameter Tuning with the Random Forest Classifier Model

In [None]:
# Hyperparameter grid for the random forest.
# 'auto' is left out of max_features: for classifiers it is the same setting
# as 'sqrt' (so it only duplicated a third of the grid), and recent
# scikit-learn releases reject it.
parameters = {'criterion': ('gini', 'entropy'),
              'n_estimators': list(range(100, 200, 10)),
              'min_samples_split': list(range(2, 10, 2)),
              'max_features': ['sqrt', 'log2']}

# A fixed seed makes the search repeatable from run to run.
best_model = RandomForestClassifier(random_state=0)

# Exhaustive 5-fold search over the grid, fitted on the training split.
clf = GridSearchCV(best_model, parameters, cv=5)
clf.fit(x_train, y_train)

In [None]:
# Tabulate the per-candidate cross-validation results and preview the first rows.
df_grid = pd.DataFrame(data=clf.cv_results_)
df_grid.head()

## Best Parameters

In [None]:
# Parameter combination selected by the grid search.
clf.best_params_

## Best Score

In [None]:
# Cross-validated score (cv=5) reported by the search for the selected parameters.
clf.best_score_

## Best Estimator

In [None]:
# Estimator object the search kept for the selected parameters.
clf.best_estimator_