In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
# Silence every warning category for the rest of the session. Note that this
# also hides library convergence and deprecation warnings raised further down.
warnings.filterwarnings('ignore')

In [None]:
# Read the CSV into a DataFrame and preview its first rows.
DATA_PATH = '../input/breast-cancer-wisconsin-data/data.csv'
data = pd.read_csv(DATA_PATH)
data.head(n=5)

In [None]:
# Show every column when a DataFrame is displayed (None removes the column limit).
pd.options.display.max_columns = None

In [None]:
# Preview the first five rows again, now with every column visible.
data.head(n=5)

In [None]:
# Show all column labels of the loaded frame.
data.columns

In [None]:
# Remove the 'Unnamed: 32' column. Rebinding `data` (instead of inplace=True)
# together with errors='ignore' keeps this cell safe to re-run: the original
# in-place drop raised KeyError on a second execution because the column was
# already gone.
data = data.drop(columns='Unnamed: 32', errors='ignore')

In [None]:
# Preview the first five rows after the column removal.
data.head(n=5)

In [None]:
# Correlation heatmap of the numeric columns.
# String columns are only converted to integer codes in a later cell, and
# DataFrame.corr() raises on non-numeric columns in pandas >= 2.0 (older
# versions silently skipped them), so restrict to numeric dtypes explicitly.
plt.figure(figsize=(30,20))
numeric_corr = data.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr,
           annot=True)
plt.title('Feature correlation matrix');

In [None]:
# Count missing values per column (isnull is an alias of isna).
data.isnull().sum()

In [None]:
# Inspect the column dtypes; string columns are converted to integer codes
# further down in the notebook.
data.dtypes

In [None]:
# Class balance of the target column.
diagnosis_counts = data['diagnosis'].value_counts()
diagnosis_counts

In [None]:
# Plot the distribution of the 'diagnosis' column (drawn before the column is
# converted to integer codes in a later cell).
sns.displot(data['diagnosis'])

In [None]:
# Replace every string-typed column with integer category codes.
# pd.Categorical assigns 0-based codes in sorted category order and -1 for
# missing values, so adding 1 yields 1-based codes with 0 standing for missing.
string_columns = [
    name for name, column in data.items()
    if pd.api.types.is_string_dtype(column)
]
for name in string_columns:
    data[name] = pd.Categorical(data[name]).codes + 1
    print(name)


In [None]:
# Re-check the dtypes after the categorical encoding above.
data.dtypes

In [None]:
# Split the dataset into features/target and train/test partitions
from sklearn.model_selection import train_test_split

# Drop the target and, if present, the 'id' column from the features.
# NOTE(review): 'id' is presumably a record identifier rather than a
# measurement, so it should not be fed to the models — confirm against the
# column listing. errors='ignore' keeps the cell working if it is absent.
X = data.drop(columns=['diagnosis', 'id'], errors='ignore')

# Re-encode the target as consecutive 0-based integers in sorted order.
# The earlier encoding step produces codes starting at 1, which recent XGBoost
# versions reject (they expect classes 0..n-1), and which makes scikit-learn's
# default pos_label=1 refer to the first class instead of the second.
target_codes, target_classes = pd.factorize(data['diagnosis'], sort=True)
y = pd.Series(target_codes, index=data.index, name='diagnosis')

# Fixed seed for reproducible results; stratify to keep the class ratio
# identical in the train and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
len(X_train), len(X_test), len(y_train), len(y_test)

In [None]:
# Random forest baseline: hold-out accuracy plus mean 5-fold CV accuracy.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

rf_test_accuracy = clf.score(X_test, y_test)
print(f"Score: {rf_test_accuracy}")

# cross_val_score clones and refits the estimator on each fold of the full data.
rf_cv_scores = cross_val_score(clf, X, y, cv=5)
print(f"Cross_val_score : {np.mean(rf_cv_scores)}")

In [None]:
# Logistic regression
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# The default lbfgs solver with max_iter=100 typically fails to converge on
# unscaled features, and the resulting ConvergenceWarning is hidden by the
# global warnings filter. Standardise the features inside a pipeline (so the
# scaler is re-fit on each CV training fold, avoiding leakage) and allow more
# iterations.
clf2 = make_pipeline(
    StandardScaler(),
    LogisticRegression(random_state=42, max_iter=1000),
)
clf2.fit(X_train, y_train)

print(f"Score : {clf2.score(X_test, y_test)}")
print(f"Cross_val_score : {np.mean(cross_val_score(clf2, X, y, cv=5))}")


In [None]:
# Gradient boosting: hold-out accuracy plus mean 5-fold CV accuracy.
from sklearn.ensemble import GradientBoostingClassifier

clf3 = GradientBoostingClassifier(random_state=42)
clf3.fit(X_train, y_train)

gb_test_accuracy = clf3.score(X_test, y_test)
print(f"Score : {gb_test_accuracy}")

gb_cv_scores = cross_val_score(clf3, X, y, cv=5)
print(f"Cross_val_score : {np.mean(gb_cv_scores)}")


In [None]:
# AdaBoost: hold-out accuracy plus mean 5-fold CV accuracy.
from sklearn.ensemble import AdaBoostClassifier

clf4 = AdaBoostClassifier(random_state=42)
clf4.fit(X_train, y_train)

ada_test_accuracy = clf4.score(X_test, y_test)
print(f"Score : {ada_test_accuracy}")

ada_cv_scores = cross_val_score(clf4, X, y, cv=5)
print(f"Cross_val_score : {np.mean(ada_cv_scores)}")

In [None]:
# XGBoost: hold-out accuracy plus mean 5-fold CV accuracy.
from xgboost import XGBClassifier

clf5 = XGBClassifier(random_state=42)
clf5.fit(X_train, y_train)

xgb_test_accuracy = clf5.score(X_test, y_test)
print(f"Score : {xgb_test_accuracy}")

xgb_cv_scores = cross_val_score(clf5, X, y, cv=5)
print(f"Cross_val_score : {np.mean(xgb_cv_scores)}")

In [None]:
# So we got the same accuracy for RandomForest, GradientBoost, AdaBoost and XGBoost,
# but when compared on "cross_val_score", XGBoost performed the best.



In [None]:
# Hyperparameter tuning of the best-performing model (XGBoost)
from sklearn.model_selection import RandomizedSearchCV

# Candidate values sampled by the randomized search.
# - 'n_estimators' was misspelled as 'n_estimator', so the number of trees was
#   never actually tuned.
# - 'max_leaf_nodes' is the scikit-learn tree name; XGBoost's equivalent is
#   'max_leaves' (NOTE(review): its effect depends on the tree method / grow
#   policy in use — confirm for the installed XGBoost version).
param_grid = {
    'n_estimators' : [1000, 2000, 3000, 4000, 5000],
    'min_child_weight' : [3, 6, 9, 12, 15],
    'max_depth' : [2, 4, 6, 8, 10],
    'max_leaves' : [2, 4, 6, 8, 10],
}

# 10 sampled configurations x 10-fold CV = 100 fits; random_state makes the
# sampled configurations (and therefore the selected model) reproducible.
model = RandomizedSearchCV(XGBClassifier(random_state=42),
                         param_distributions = param_grid,
                         cv=10,
                         n_iter=10,
                         random_state=42,
                         verbose=True)
model.fit(X_train,y_train)

In [None]:
# Accuracy of the refit best estimator on the training split. This is measured
# on data the model was fitted on, so it is an optimistic figure.
train_accuracy = model.score(X_train, y_train)
train_accuracy

In [None]:
# Predict labels for the held-out test features with the fitted search object
# (which delegates to its refit best estimator) and display them.
y_preds = model.predict(X_test)
y_preds


In [None]:
# Evaluation metrics: confusion matrix of test labels vs. predictions.
# Rows correspond to the true classes, columns to the predicted classes.
from sklearn.metrics import confusion_matrix

conn = confusion_matrix(y_true=y_test, y_pred=y_preds)
conn

In [None]:
# Heatmap of the confusion matrix.
# scikit-learn's confusion_matrix puts the true classes on the rows and the
# predicted classes on the columns; sns.heatmap draws rows along the y axis
# and columns along the x axis. The original axis labels were swapped.
sns.heatmap(conn,
           annot=True,
           fmt='d',  # counts are integers; avoid scientific notation for large cells
           cmap='YlGnBu')
plt.title('Confusion Matrix')
plt.xlabel('Predicted label')
plt.ylabel('True label');

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# precision/recall/F1 default to pos_label=1. With category codes assigned in
# sorted label order, the larger code is the alphabetically later label.
# NOTE(review): presumably that is 'M' (malignant) vs 'B' (benign) — confirm
# against the value_counts output. Passing the positive label explicitly makes
# the metrics describe that class regardless of whether the target is encoded
# as 0/1 or 1/2.
positive_label = np.max(y_test)

print(f" Accuracy : {accuracy_score(y_test, y_preds)*100:.2f}%")
print(f" Precision : {precision_score(y_test, y_preds, pos_label=positive_label)}")
print(f" Recall : {recall_score(y_test, y_preds, pos_label=positive_label)}")
print(f" F1 : {f1_score(y_test, y_preds, pos_label=positive_label)}")

In [None]:
from sklearn.metrics import classification_report

# classification_report returns a multi-line string; as a bare cell expression
# it is shown as an escaped repr full of '\n', so print it to get the table.
print(classification_report(y_test, y_preds))