In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, roc_auc_score, roc_curve
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objs as go
import plotly.offline as py
from sklearn.neighbors import KNeighborsClassifier, NeighborhoodComponentsAnalysis, LocalOutlierFactor, NeighborhoodComponentsAnalysis
from sklearn.decomposition import PCA
from lightgbm import LGBMClassifier

# READING DATA

In [None]:
cancer = pd.read_csv("../input/breast-cancer-wisconsin-data/data.csv")
df = cancer.copy()
df.head()

The "id" and "Unnamed: 32" columns are not useful for this project, so we can drop them.

In [None]:
# Derive `df` from the untouched raw frame rather than mutating it in place, so this
# cell can be re-run without a KeyError on columns that were already dropped.
df = cancer.drop(columns=["id", "Unnamed: 32"])

This data set has exactly one object-typed (non-numeric) column: `diagnosis`.

In [None]:
df.info()

# Missing Values
There are no missing values.

In [None]:
df.isnull().sum()

# EXAMINE TARGET VARIABLE AND LABEL ENCODER

In [None]:
# Pass the column by keyword: the first positional parameter of countplot is `data`,
# so a positionally-passed Series is not interpreted as the x variable.
sns.countplot(x="diagnosis", data=df)

In [None]:
le = LabelEncoder()
df["Diagnosis"] = le.fit_transform(df["diagnosis"])

In [None]:
df.drop(columns=["diagnosis"], axis=1, inplace=True)
df.head()

# EDA

Correlation is a term that is a measure of the strength of a linear relationship between two quantitative variables.

In [None]:
corr_matrix = df.corr()
plt.figure(figsize=(15,15))
plt.title("Correlation Between Features")
sns.heatmap(corr_matrix, annot=True, fmt=".2f")

In [None]:
# Select the columns whose absolute correlation with the encoded target exceeds the
# threshold. "Diagnosis" itself always passes (it correlates 1.0 with itself), which is
# why it is available as the `hue` column in the pairplot below.
threshold = 0.7
# Named mask instead of `filter`, which would shadow the Python builtin notebook-wide.
is_strongly_correlated = np.abs(corr_matrix["Diagnosis"]) > threshold
corr_features = corr_matrix.columns[is_strongly_correlated].tolist()
plt.title("CORRELATION BETWEEN FEATURES (CORR > 0.7)")
sns.heatmap(df[corr_features].corr(), annot=True, fmt=".2f")

In [None]:
sns.pairplot(df[corr_features], hue="Diagnosis")

### Histogram
A histogram is a bar-chart representation of a grouped data distribution. The repeated values are first binned into a frequency table, and the table is then drawn as a chart in which each data group is displayed as a rectangular column.

In [None]:
df.hist(corr_features, figsize=(10,10));

# OUTLIERS

An outlier is an observation that lies an abnormal distance from other values in a random sample from a population. In a sense, this definition leaves it up to the analyst (or a consensus process) to decide what will be considered abnormal. Before abnormal observations can be singled out, it is necessary to characterize normal observations.

In [None]:
def OutliersBox(df, nameOfFeature):
    """Show four plotly box plots of a single column, side by side.

    The same column is drawn four times with different ``boxpoints`` settings:
    all points, whiskers only, suspected outliers (highlighted), and
    whiskers plus outliers.

    Parameters
    ----------
    df : indexable by column name (e.g. a pandas DataFrame)
        Source of the values; only ``df[nameOfFeature]`` is read.
    nameOfFeature : str
        Column to plot; also used in the figure title.

    Returns
    -------
    None
        The figure is rendered through ``py.iplot``.
    """
    values = df[nameOfFeature]
    box_color = 'rgb(8,81,156)'
    outlier_color = 'rgba(219, 64, 82, 0.6)'

    all_points = go.Box(y=values, name="All Points",
                        jitter=0.3, pointpos=-1.8, boxpoints="all")
    whiskers_only = go.Box(y=values, name="Only Whiskers", boxpoints=False)
    suspected = go.Box(
        y=values,
        name="Suspected Outliers",
        boxpoints="suspectedoutliers",
        marker=dict(
            color=box_color,
            outliercolor=outlier_color,
            line=dict(outliercolor=outlier_color, outlierwidth=2),
        ),
        line=dict(color=box_color),
    )
    whiskers_and_outliers = go.Box(y=values, name="Whiskers and Outliers",
                                   boxpoints="outliers")

    figure = go.Figure(
        data=[all_points, whiskers_only, suspected, whiskers_and_outliers],
        layout=go.Layout(title="{} Outliers".format(nameOfFeature)),
    )
    py.iplot(figure, filename="Outliers")

We will examine corr_features outliers.

In [None]:
OutliersBox(df, corr_features[0])

In [None]:
OutliersBox(df, corr_features[1])

In [None]:
OutliersBox(df, corr_features[2])

In [None]:
OutliersBox(df, corr_features[3])

In [None]:
OutliersBox(df, corr_features[4])

In [None]:
OutliersBox(df, corr_features[5])

In [None]:
OutliersBox(df, corr_features[6])

In [None]:
OutliersBox(df, corr_features[7])

In [None]:
y = df.Diagnosis
X = df.drop(["Diagnosis"], axis=1)
columns = X.columns.tolist()

In [None]:
# Fit Local Outlier Factor with its default settings on the full feature matrix and
# collect the per-row negative outlier factor in a one-column frame.
clf = LocalOutlierFactor()
y_pred_outlier = clf.fit_predict(X)
X_score = clf.negative_outlier_factor_
outlier_score = pd.DataFrame({"score": X_score})

In [None]:
threshold = -1.75
filter_outlier = outlier_score["score"] < threshold
outlier_index = outlier_score[filter_outlier].index.tolist()

In [None]:
# Visualise the LOF result on the first two feature columns of X.
plt.figure(figsize=(14,8))
# Rows flagged as outliers (score below the threshold), drawn as large blue dots.
plt.scatter(X.iloc[outlier_index,0], X.iloc[outlier_index,1], color="blue", s=50,
            label="Outliers")
# Every row, drawn as a small black dot on top.
plt.scatter(X.iloc[:,0], X.iloc[:,1], color="k", s=3, label="Data Points")

# Min-max normalise the scores so the most negative score maps to 1 and the
# largest score maps to 0; the value is used as the circle size below.
radius = (X_score.max() - X_score) / (X_score.max() - X_score.min()) 
outlier_score["radius"] = radius
# Hollow red circles whose area grows with the normalised outlier score.
plt.scatter(X.iloc[:,0], X.iloc[:,1], s=1000*radius, edgecolors="r", 
            facecolors="none", label="Outlier Scores")
plt.legend()
plt.show()

In [None]:
X = X.drop(outlier_index)
y = y.drop(outlier_index).values

# SPLITTING AND SCALING DATA

Above, we assigned every column except "Diagnosis" to X and the "Diagnosis" column to y. Now we split the data into training and test sets. X_train and y_train hold the independent and dependent variables used to develop (fit) the model, while X_test and y_test are used to test it. test_size specifies the fraction of the data (20%) that is held out for testing. random_state makes the split reproducible every time we run the program. stratify keeps the class proportions of y balanced across the two subsets.

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 3, stratify=y)

We use StandardScaler to bring the features onto a comparable scale (zero mean and unit variance per feature). Real-world features often differ greatly in magnitude, and that has a direct impact on the performance of the model. So it is always good practice to scale the data before processing it.

In [None]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [None]:
X_train_df = pd.DataFrame(X_train_scaled, columns=columns)
X_train_df["target"] = y_train
data_melted_2 = pd.melt(X_train_df, id_vars="target",
                        var_name="features",
                        value_name="value")
plt.figure(figsize=(18,10))
plt.title("BOX PLOT AFTER SCALING")
sns.boxplot(x="features", y="value", hue="target", data=data_melted_2)
plt.xticks(rotation=90);

# USING KNN

* K-Nearest Neighbour is one of the simplest Machine Learning algorithms based on Supervised Learning technique.
* K-NN algorithm assumes the similarity between the new case/data and available cases and put the new case into the category that is most similar to the available categories.
* K-NN algorithm stores all the available data and classifies a new data point based on the similarity. This means when new data appears then it can be easily classified into a well suite category by using K- NN algorithm.
* K-NN algorithm can be used for Regression as well as for Classification but mostly it is used for the Classification problems.

We will use the `plot_roc_curve` helper to plot the ROC curve inside the model-evaluation functions below.

In [None]:
def plot_roc_curve(fpr, tpr, label=None):
    """Draw a ROC curve on the current matplotlib axes.

    Parameters
    ----------
    fpr, tpr : array-like
        False-positive and true-positive rates, plotted as x and y.
    label : optional
        Forwarded to ``plt.plot`` as the legend label of the curve.
    """
    plt.plot(fpr, tpr, label=label, linewidth=2)
    # Dashed black diagonal from (0, 0) to (1, 1) as a visual reference.
    plt.plot([0, 1], [0, 1], "k--")
    # Clamp both axes to the unit square.
    plt.axis([0, 1, 0, 1])
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")

In [None]:
def KNN_best_params(X_train, X_test, y_train, y_test):
    """Grid-search a KNN classifier, then report its train/test performance.

    Searches ``n_neighbors`` in 1..30 and ``weights`` in {"uniform", "distance"}
    with 10-fold cross-validation on accuracy, prints the best CV score and
    parameters, evaluates the best model on both splits (accuracy, confusion
    matrices, precision, recall, ROC AUC) and plots the test ROC curve.

    Parameters
    ----------
    X_train, X_test : array-like
        Feature matrices for fitting and for evaluation.
    y_train, y_test : array-like
        Binary targets matching the feature matrices.

    Returns
    -------
    GridSearchCV
        The fitted search object.
    """
    params = dict(n_neighbors=np.arange(1, 31), weights=["uniform", "distance"])

    grid = GridSearchCV(KNeighborsClassifier(), params, cv=10, scoring="accuracy",
                        n_jobs=-1, verbose=2)
    grid.fit(X_train, y_train)

    print("Best training score: {} wtih params: {}".format(grid.best_score_,grid.best_params_))

    # GridSearchCV refits the best configuration on all of X_train by default
    # (refit=True), so a second manual fit is unnecessary.
    knn = grid.best_estimator_
    y_pred_test = knn.predict(X_test)
    y_pred_train = knn.predict(X_train)

    cm_test = confusion_matrix(y_test, y_pred_test)
    cm_train = confusion_matrix(y_train, y_pred_train)

    acc_test = accuracy_score(y_test, y_pred_test)
    acc_train = accuracy_score(y_train, y_pred_train)

    # Probability of the positive class (column 1) drives the ROC curve and AUC.
    y_pred_proba = knn.predict_proba(X_test)[:,1]
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
    auc = roc_auc_score(y_test, y_pred_proba)

    print("Test Score: {}, Train Score: {}".format(acc_test,acc_train))
    print("CM TEST")
    print(cm_test)
    print("CM TRAIN")
    print(cm_train)
    print("Precision Score", precision_score(y_test, y_pred_test))
    print("recall Score",recall_score(y_test, y_pred_test))
    print("ROC Score", auc)
    # The third argument is the legend label; the thresholds array returned by
    # roc_curve must not be passed here.
    plot_roc_curve(fpr, tpr, label="KNN (AUC = {:.3f})".format(auc))
    plt.legend(loc="lower right")

    return grid

In [None]:
grid = KNN_best_params(X_train_scaled, X_test_scaled, y_train, y_test)

# USING LGBM 

LightGBM, short for Light Gradient Boosting Machine, is a free and open source distributed gradient boosting framework for machine learning originally developed by Microsoft. It is based on decision tree algorithms and used for ranking, classification and other machine learning tasks. The development focus is on performance and scalability.

In [None]:
def lgbm_best_params(X_train, X_test, y_train, y_test):
    """Randomized-search an LGBMClassifier, then report its train/test performance.

    Samples hyper-parameter combinations with 10-fold cross-validation, prints the
    best CV score and parameters, evaluates the best model on both splits
    (accuracy, confusion matrices, precision, recall, ROC AUC) and plots the
    test ROC curve.

    Parameters
    ----------
    X_train, X_test : array-like
        Feature matrices for fitting and for evaluation.
    y_train, y_test : array-like
        Binary targets matching the feature matrices.

    Returns
    -------
    RandomizedSearchCV
        The fitted search object.
    """
    lgbm_params = {"n_estimators" : [100,200,500,1000,2000],
                   "subsample" : [0.6,0.8,1.0],
                   "max_depth" : [5,10,15,20,25,30,35],
                   "learning_rate" : [0.1, 0.01, 0.02, 0.5],
                   "min_child_samples" : np.arange(2,50)}
    # Named `search` rather than `random` to avoid shadowing the stdlib module name.
    search = RandomizedSearchCV(LGBMClassifier(), lgbm_params, cv=10, random_state=1,
                                n_jobs=-1, verbose=2)
    search.fit(X_train, y_train)

    print("Best training score: {} wtih params: {}".format(search.best_score_,search.best_params_))

    # RandomizedSearchCV refits the best configuration on all of X_train by default
    # (refit=True), so a second manual fit is unnecessary.
    lgbm = search.best_estimator_
    y_pred_test = lgbm.predict(X_test)
    y_pred_train = lgbm.predict(X_train)

    cm_test = confusion_matrix(y_test, y_pred_test)
    cm_train = confusion_matrix(y_train, y_pred_train)

    acc_test = accuracy_score(y_test, y_pred_test)
    acc_train = accuracy_score(y_train, y_pred_train)

    # Probability of the positive class (column 1) drives the ROC curve and AUC.
    y_pred_proba = lgbm.predict_proba(X_test)[:,1]
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
    auc = roc_auc_score(y_test, y_pred_proba)

    print("Test Score: {}, Train Score: {}".format(acc_test,acc_train))
    print("CM TEST")
    print(cm_test)
    print("CM TRAIN")
    print(cm_train)
    print("Precision Score", precision_score(y_test, y_pred_test))
    print("recall Score",recall_score(y_test, y_pred_test))
    print("ROC Score", auc)
    # The third argument is the legend label; the thresholds array returned by
    # roc_curve must not be passed here.
    plot_roc_curve(fpr, tpr, label="LGBM (AUC = {:.3f})".format(auc))
    plt.legend(loc="lower right")

    # Return this function's own search object. The previous `return grid` referred
    # to a name that is not defined here, so it returned whatever global `grid`
    # existed (the KNN search) or raised NameError on a fresh kernel.
    return search

In [None]:
random_lgbm = lgbm_best_params(X_train_scaled, X_test_scaled, y_train, y_test)

# USING PCA

Principal component analysis, or PCA, is a statistical procedure that allows you to summarize the information content in large data tables by means of a smaller set of “summary indices” that can be more easily visualized and analyzed.

In [None]:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [None]:
pca = PCA(n_components=2)
pca.fit(X_scaled)
X_reduced_pca = pca.transform(X_scaled)

In [None]:
pca_data = pd.DataFrame(X_reduced_pca, columns=["p1","p2"])
pca_data["target"] = y
plt.figure(figsize=(14,8))
sns.scatterplot(x="p1", y="p2", hue="target", data=pca_data)

In [None]:
X_train_pca, X_test_pca, y_train_pca, y_test_pca = train_test_split(X_reduced_pca, y, test_size = 0.20, random_state = 3, stratify=y)

In [None]:
grid_pca = KNN_best_params(X_train_pca, X_test_pca, y_train_pca, y_test_pca)

# USING NCA

Neighbourhood components analysis is a supervised learning method for classifying multivariate data into distinct classes according to a given distance metric over the data. Functionally, it serves the same purposes as the K-nearest neighbors algorithm, and makes direct use of a related concept termed stochastic nearest neighbours.

In [None]:
# NCA is supervised, so it must not see the labels of rows that are later used for
# evaluation. Reproduce the train/test partition used for the NCA features below:
# with the same test_size, random_state and stratify labels, train_test_split
# assigns the same rows to the training set. Fit on those rows only, then project
# every row with the learned transform.
nca_train_idx, _ = train_test_split(np.arange(len(y)), test_size = 0.20, random_state = 3, stratify=y)
nca = NeighborhoodComponentsAnalysis(n_components=2, random_state=42)
nca.fit(X_scaled[nca_train_idx], y[nca_train_idx])
X_reduced_nca = nca.transform(X_scaled)

In [None]:
nca_data = pd.DataFrame(X_reduced_nca, columns=["p1","p2"])
nca_data["target"] = y
plt.figure(figsize=(14,8))
sns.scatterplot(x="p1", y="p2", hue="target", data=nca_data)

In [None]:
X_train_nca, X_test_nca, y_train_nca, y_test_nca = train_test_split(X_reduced_nca, y, test_size = 0.20, random_state = 3, stratify=y)

In [None]:
grid_nca = KNN_best_params(X_train_nca, X_test_nca, y_train_nca, y_test_nca)

### As a result we have 0.99 accuracy score