In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from xgboost.sklearn import XGBClassifier

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score

In [None]:
# Read the breast-cancer CSV from the (relative) Kaggle input directory.
df = pd.read_csv(filepath_or_buffer="../input/breast-cancer-wisconsin-data/data.csv")
# Preview the first five rows.
df.head(n=5)

In [None]:
# Display pandas' default summary statistics for the loaded frame.
df.describe()

In [None]:
# Print the frame's column summary (dtypes and non-null counts) to spot missing data.
df.info()

In [None]:
# Remove the "Unnamed: 32" column (presumably an empty artefact column from the
# CSV export - confirm against the df.info() output).
# Reassigning instead of dropping in place, together with errors="ignore",
# keeps this cell idempotent: re-running it no longer raises KeyError because
# the column is already gone.
df = df.drop(columns=["Unnamed: 32"], errors="ignore")

In [None]:
# Encode the diagnosis label numerically: 'B' -> 0.0, 'M' -> 1.0.
rep_dict = {'B': 0.0, 'M': 1.0}
# Assign the encoded column back explicitly. Calling an inplace method on
# df['diagnosis'] acts on an intermediate object, which newer pandas versions
# warn about and which leaves df untouched under Copy-on-Write.
# Values that are not keys of rep_dict pass through unchanged, so re-running
# the cell on an already-encoded column is a no-op; astype(float) fails loudly
# if an unexpected, non-numeric label is present.
df['diagnosis'] = (
    df['diagnosis']
    .map(lambda label: rep_dict.get(label, label))
    .astype(float)
)

In [None]:
# Count the samples per diagnosis class and give both columns explicit names.
# The column names produced by value_counts().reset_index() differ between
# pandas versions, so relying on a "diagnosis" column holding the counts can
# end up plotting the class codes instead. Labels are derived from the values
# themselves (encoded or raw) rather than from the row order of the counts.
d_df = (
    df["diagnosis"]
    .map({0.0: "Benign", 1.0: "Malignant", "B": "Benign", "M": "Malignant"})
    .value_counts()
    .rename_axis("label")
    .reset_index(name="count")
)
fig = px.pie(d_df, values="count", names="label", hole=0.3, opacity = 0.8)
fig.update_layout(title = dict(text = "Pie Chart of Breast Cancer Diagnosis"))
# Show the percentage and class label outside each slice.
fig.update_traces(textposition = "outside", textinfo = "percent+label")
fig.show()

In [None]:
# Spearman rank correlation between every pair of columns in df,
# rendered as an annotated heatmap on a large square canvas.
corr = df.corr(method="spearman")
plt.figure(figsize=(25, 25))
ax = sns.heatmap(
    corr,
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    square=True,
    linewidth=1,
)
plt.show()

In [None]:
# Features: every column except the target and the row identifier.
X = df.drop(columns=["diagnosis", "id"])
# Target: the diagnosis column.
y = df["diagnosis"]

# Hold out 30% of the rows; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)

# Candidate classifiers keyed by display name.
models = {
    "KNN": KNeighborsClassifier(),
    "Random Forest": RandomForestClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "Naive Bayes": GaussianNB(),
    "Support Vector Classifier": SVC(),
    "xgb Classifier": XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
}

def fit_and_score(models, X_train, X_test, y_train, y_test):
    """Fit each model on the training split and score it on the held-out split.

    Parameters
    ----------
    models : dict
        Maps a display name to an estimator exposing ``fit(X, y)`` and
        ``score(X, y)``. The estimators are fitted in place.
    X_train, y_train
        Features and labels used for fitting.
    X_test, y_test
        Held-out features and labels, used only for scoring.

    Returns
    -------
    dict
        ``{name: model.score(X_test, y_test)}`` in the iteration order of
        ``models``. For scikit-learn style classifiers ``score`` is the mean
        accuracy.
    """
    # Seed NumPy's global RNG so estimators that draw from it give repeatable results.
    np.random.seed(0)
    model_scores = {}
    for name, model in models.items():
        model.fit(X_train, y_train)
        # Score the estimator that was just fitted. Running cross_val_score on
        # the test split would clone and refit the estimator on test folds,
        # throwing the training fit away and never evaluating it.
        model_scores[name] = model.score(X_test, y_test)

    return model_scores

In [None]:
# Run every candidate model through fit_and_score and keep the name -> score mapping.
model_score = fit_and_score(models,X_train,X_test,y_train,y_test)

In [None]:
# Display the per-model scores collected above.
model_score

In [None]:
# Baseline XGBoost classifier trained on the training split.
model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
model.fit(X_train, y_train)
# Report the accuracy of the fitted model on the held-out split.
# cross_val_score on (X_test, y_test) would refit clones of the estimator on
# folds of the test data and never evaluate the model trained above.
print(accuracy_score(y_test, model.predict(X_test)))

In [None]:
def plot_features(columns, importances, n=None):
    """Draw a horizontal bar chart of feature importances, largest on top.

    Parameters
    ----------
    columns
        Feature names, aligned element-wise with ``importances``.
    importances
        Importance value for each feature.
    n : int, optional
        Number of leading (most important) rows to plot. A falsy value
        (``None`` or ``0``) plots every feature.
    """
    ranked = pd.DataFrame({"features": columns,
                           "feature_importances": importances})
    ranked = ranked.sort_values("feature_importances", ascending=False)
    ranked = ranked.reset_index(drop=True)

    # Keep only the first n ranked rows when a truthy n is supplied.
    shown = ranked.iloc[:n] if n else ranked

    _, ax = plt.subplots(figsize=(20, 15))
    ax.barh(shown["features"], shown["feature_importances"])
    ax.set_xlabel("Feature Importance")
    ax.set_ylabel("Features")
    # barh draws the first row at the bottom; flip so the top-ranked feature is on top.
    ax.invert_yaxis()
    
# Plot the importances of `model` against the feature columns of df
# (everything except the target and the id column).
plot_features(
    df.drop(columns=['diagnosis', 'id']).columns,
    model.feature_importances_,
)

In [None]:
def backward_elim(model, X, y, patience=3):
    """Greedy backward feature elimination driven by feature importances.

    The model is fitted on a 70/30 split of ``X``/``y`` and scored on the
    held-out 30%. While the score does not decrease, the feature with the
    lowest ``model.feature_importances_`` is removed and the model is refitted.

    Parameters
    ----------
    model
        Estimator exposing ``fit``, ``score`` and ``feature_importances_``.
        It is refitted in place on every round.
    X : pandas.DataFrame
        Feature matrix. NOTE: eliminated columns are dropped from this frame
        IN PLACE, so after the call ``X.columns`` matches the features the
        returned model was last fitted on. Pass a copy to keep the original.
    y
        Target labels aligned with ``X``.
    patience : int, optional
        Accepted for backward compatibility; currently not used by the loop.

    Returns
    -------
    tuple
        ``(model, drop_arr, acc)`` where ``drop_arr`` lists the removed
        features in order and ``acc`` is the score history, starting with a
        sentinel ``0`` followed by one score per fit. When the loop stops
        because the score dropped, the last entry of ``drop_arr`` is the
        feature whose removal caused that drop, and the returned model is the
        one fitted without it.
    """
    def _fit_and_score():
        # Same seed every round so each feature set is judged on the same rows.
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.3, random_state=0
        )
        model.fit(X_train, y_train)
        # Score the model that was just fitted. cross_val_score on the test
        # split would refit clones on test folds and ignore this fit.
        return model.score(X_test, y_test)

    drop_arr = []
    acc = [0, _fit_and_score()]
    # Stop once the score drops, or when only one feature is left, so the
    # model is never asked to fit on an empty frame.
    while acc[-1] >= acc[-2] and X.shape[1] > 1:
        weakest_feature = X.columns[int(np.argmin(model.feature_importances_))]
        drop_arr.append(weakest_feature)
        # In-place on purpose: callers read X.columns after the call.
        X.drop(columns=[weakest_feature], inplace=True)
        acc.append(_fit_and_score())
    return model, drop_arr, acc
        
        

In [None]:
# Rebuild the target and a fresh feature frame from df; backward_elim drops
# columns from the frame it is given, so it must not be a frame reused elsewhere.
y = df["diagnosis"]
X = df.drop(columns=["diagnosis", "id"])
model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
# Returns the refitted model, the dropped feature names and the score history.
model, drop_arr, acc = backward_elim(model, X, y)

In [None]:
# Plot importances for the features that survived elimination. This relies on
# backward_elim having dropped the eliminated columns from X in place, so that
# X.columns lines up with model.feature_importances_.
plot_features(X.columns, model.feature_importances_)