## Read and clean data as needed

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.colors import ListedColormap
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, auc

In [None]:
# List every file under the relative Resources/ folder, i.e. the same folder
# the dataset is read from below. The previous absolute path '/Resources'
# pointed at the filesystem root rather than at the project's data folder.
for dirname, _, filenames in os.walk('Resources'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [None]:
# Load the raw dataset from the relative Resources/ folder into a DataFrame
df = pd.read_csv("Resources/data.csv")
# Preview the first rows to sanity-check the load
df.head()

In [None]:
# check dataset: summary statistics for the DataFrame's columns
df.describe()

In [None]:
# check columns: list the column labels available in the loaded data
df.columns

In [None]:
# Drop the last column ('Unnamed: 32'). errors='ignore' keeps this cell safe
# to re-run: df is rebound here, so a second run would otherwise raise
# KeyError because the column is already gone.
df = df.drop(columns='Unnamed: 32', errors='ignore')

In [None]:
# check data: display the DataFrame after the column drop
df

## Preliminary Analysis (Descriptives/Statistics Summary)

In [None]:
# Build the analysis frame: a new DataFrame without the 'id' column
# (df itself is left unchanged)
updated_df = df.drop('id', axis=1)

# preview the first rows of the analysis frame
updated_df.head()

In [None]:
# Apply seaborn's dark grid theme to the plots
sns.set(style="darkgrid")

# Bar chart with one bar per diagnosis label, showing how many rows carry it
ax_bar = sns.countplot(
    data=updated_df,
    x="diagnosis",
    palette="Set3",
)

In [None]:
# recode diagnosis column to 1 (malignant) and 0 (benign)
def tumor(row):
    """Return the numeric code for one row's diagnosis.

    'B' maps to 0 and 'M' maps to 1; any other value yields None.
    """
    label = row['diagnosis']
    if label == 'B':
        return 0
    return 1 if label == 'M' else None
    
# create a new column with the recoded values (tumor maps 'B' -> 0, 'M' -> 1)
updated_df['tumor'] = updated_df.apply(tumor, axis=1)

# calculate correlation coefficients on the numeric columns only: the raw
# 'diagnosis' column still holds the 'B'/'M' strings, and DataFrame.corr()
# raises on non-numeric columns in pandas >= 2.0
corr_df = updated_df.select_dtypes(include='number').corr()
corr_df.head()

In [None]:
# correlation heatmap
# Correlate the numeric columns only: the string 'diagnosis' column makes
# DataFrame.corr() raise on pandas >= 2.0.
f, ax1 = plt.subplots(figsize=(18, 18))
numeric_corr = updated_df.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr, cmap='BuPu', annot=True, linewidths=.5, fmt='.1f', ax=ax1)
plt.xticks(fontsize=11, rotation=70)
plt.show()

## Prep Data for Classification Models

In [None]:
# Target vector: encode the diagnosis labels as integers
# (benign 'B' -> 0, malignant 'M' -> 1)
y = df['diagnosis'].map({'B': 0, 'M': 1})

In [None]:
# Feature matrix: the columns selected from the correlation results
# (keeping those with a coefficient of .5 and above)
X = df[[
    # "mean" measurements
    'radius_mean',
    'perimeter_mean',
    'area_mean',
    'compactness_mean',
    'concavity_mean',
    'concave points_mean',
    # "se" measurements
    'radius_se',
    'perimeter_se',
    'area_se',
    # "worst" measurements
    'radius_worst',
    'texture_worst',
    'perimeter_worst',
    'smoothness_worst',
    'compactness_worst',
    'concavity_worst',
    'concave points_worst',
]]

In [None]:
# create train and test dataset

from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=.33,
    random_state=42,
)
# N = number of training rows, D = number of features
N, D = X_train.shape

In [None]:
# scale/normalize the train and test data

from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply those same
# training-set statistics to the test split. Calling fit_transform on the test
# data would re-fit the scaler, so the two splits would be scaled differently
# and test-set statistics would leak into the evaluation.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
def _positive_class_scores(classifier, features):
    """Return one continuous positive-class score per row, for ROC analysis.

    Prefers ``predict_proba`` (second column, i.e. the larger class label),
    then ``decision_function``, and only falls back to the hard ``predict``
    labels when the classifier offers neither.
    """
    if hasattr(classifier, "predict_proba"):
        return classifier.predict_proba(features)[:, 1]
    if hasattr(classifier, "decision_function"):
        return classifier.decision_function(features)
    return classifier.predict(features)


def Predictor(classifier, name):
    """Evaluate a fitted classifier on the test split and plot the results.

    Prints how many test predictions were wrong, split into false positives
    and false negatives (i.e. whether the model over- or under-predicts),
    then draws a labelled confusion-matrix heatmap and the ROC curve.

    Reads the module-level ``X_test`` and ``y_test``.

    Parameters
    ----------
    classifier : fitted estimator with a ``predict`` method; ``predict_proba``
        or ``decision_function`` is used for the ROC curve when available.
    name : str
        Model name shown in the printed summary and the plot titles.
    """
    predictions = classifier.predict(X_test)

    # labels=[0, 1] pins the matrix to 2x2 (rows = actual, columns = predicted)
    # even when one class is absent, so the unpacking and the reshape below
    # cannot fail on a degenerate split.
    cf_matrix = confusion_matrix(y_test, predictions, labels=[0, 1])
    _, false_positives, false_negatives, _ = cf_matrix.ravel()
    inaccurate_predictions = false_positives + false_negatives
    difference = false_positives - false_negatives

    print(f"Results for {name} classifier using test data:\n"
          f"Inaccurate Predictions: {inaccurate_predictions}\n"
          f"False Positives: {false_positives}\n"
          f"False Negatives: {false_negatives}\n"
          f"Difference (positive is good): {difference}\n")

    # plot confusion matrix: each cell shows its name, count and share of the total
    group_names = ['TN', 'FP', 'FN', 'TP']
    group_counts = ["{:.0f}".format(value) for value in cf_matrix.flatten()]
    group_percentages = ["{0:.2%}".format(value) for value in cf_matrix.flatten()/np.sum(cf_matrix)]
    labels = [f"{v1}\n{v2}\n{v3}" for v1, v2, v3 in zip(group_names, group_counts, group_percentages)]
    labels = np.asarray(labels).reshape(2, 2)
    categories = ['Benign', 'Malignant']
    # start a fresh figure so the heatmap never draws over an existing plot
    plt.figure()
    plt.title(f"Confusion matrix - {name}")
    sns.heatmap(cf_matrix, fmt='', annot=labels, xticklabels=categories, yticklabels=categories, cmap='BuPu')
    plt.ylabel('Actual')
    plt.xlabel('Prediction')

    # plot ROC curve: roc_curve expects (y_true, y_score), i.e. the true labels
    # first and a continuous score second, so that every threshold is evaluated
    scores = _positive_class_scores(classifier, X_test)
    fpr, tpr, _ = roc_curve(y_test, scores)
    roc_auc = auc(fpr, tpr)

    plt.figure()
    plt.title(f"Receiver operating characteristic - {name}")
    plt.plot(fpr, tpr, color='darkorange', lw=1, label='ROC curve (area = %0.2f)' % roc_auc)
    # diagonal = chance level
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    # grey guides along the left edge (x = 0) and the top edge (y = 1)
    plt.plot([0, 0], [1, 0], c=".7")
    plt.plot([1, 1], c=".7")
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.legend(loc="lower right")
    plt.show()

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression classifier created with scikit-learn's default settings
logreg = LogisticRegression()

In [None]:
# Train the logistic regression model on the training split
logreg.fit(X_train, y_train)

# Score the fitted model on both splits to compare train and test performance
train_accuracy = logreg.score(X_train, y_train)
test_accuracy = logreg.score(X_test, y_test)
print(f"Training Data Score: {train_accuracy}")
print(f"Testing Data Score: {test_accuracy}")

In [None]:
# Print the error summary and draw the confusion matrix / ROC curve for logistic regression
Predictor(logreg, 'Logistic Regression')

## Support Vector Machines

## Decision Tree Algorithm

In [None]:
from sklearn import tree

In [None]:
# Keep the feature column labels from X; they are paired with the random
# forest's feature importances further down
feature_names = X.columns

In [None]:
# Fit a decision tree on the training split and report its test accuracy.
# random_state is pinned (same seed as the train/test split) so the fitted
# tree, and therefore the score, is reproducible between runs.
clf = tree.DecisionTreeClassifier(random_state=42)
clf = clf.fit(X_train, y_train)
clf.score(X_test, y_test)

In [None]:
# Print the error summary and draw the confusion matrix / ROC curve for the decision tree
Predictor(clf, "Decision Tree")

## Random Forest Classification

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Fit a 200-tree random forest on the training split and report its test
# accuracy. random_state is pinned (same seed as the train/test split) so the
# forest, its score and its feature importances are reproducible between runs.
rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf = rf.fit(X_train, y_train)
rf.score(X_test, y_test)

In [None]:
# Pair each importance score with its feature name and list them from most to least important
sorted(zip(rf.feature_importances_, feature_names), reverse=True)

In [None]:
# Print the error summary and draw the confusion matrix / ROC curve for the random forest
Predictor(rf, "Random Forest")

## K Nearest Neighbor

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Compare train and test accuracy for every odd k from 1 to 19
k_values = range(1, 20, 2)
train_scores = []
test_scores = []
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    train_score = knn.score(X_train, y_train)
    test_score = knn.score(X_test, y_test)
    train_scores.append(train_score)
    test_scores.append(test_score)
    print(f"k: {k}, Train/Test Score: {train_score:.3f}/{test_score:.3f}")

# Both curves share one axis, so each gets a legend entry and the y-axis is
# named for accuracy in general rather than for the test split alone.
plt.plot(k_values, train_scores, marker='o', label="Train")
plt.plot(k_values, test_scores, marker="x", label="Test")
plt.xlabel("k neighbors")
plt.ylabel("Accuracy Score")
plt.legend()
plt.show()

In [None]:
# Final KNN model with k=3: fit on the training split, then report test accuracy
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)
knn_test_accuracy = knn.score(X_test, y_test)
print('k=3 Test Acc: %.3f' % knn_test_accuracy)

In [None]:
# Print the error summary and draw the confusion matrix / ROC curve for the k=3 KNN model
Predictor(knn, "K Nearest Neighbor")

## Running the Predictor on all models

In [None]:
# Run the evaluation for every fitted model.
# TODO: add the Support Vector Machines classifier once that section is implemented.
Predictor(logreg, "Logistic Regression")
Predictor(clf, "Decision Tree")
Predictor(rf, "Random Forest")
Predictor(knn, "K Nearest Neighbor")

In [None]:
# TODO: add the SVM classifier once it has been fitted

# ROC chart comparing all classifiers on the test split.
# roc_curve expects (y_true, y_score): the true labels first, then a
# continuous score for the positive class. The predicted probability of
# class 1 (second predict_proba column) is used instead of the hard 0/1
# predictions so that every decision threshold contributes to the curve.
lr_scores = logreg.predict_proba(X_test)[:, 1]
clf_scores = clf.predict_proba(X_test)[:, 1]
rf_scores = rf.predict_proba(X_test)[:, 1]
knn_scores = knn.predict_proba(X_test)[:, 1]

lr_fpr, lr_tpr, _ = roc_curve(y_test, lr_scores)
clf_fpr, clf_tpr, _ = roc_curve(y_test, clf_scores)
rf_fpr, rf_tpr, _ = roc_curve(y_test, rf_scores)
knn_fpr, knn_tpr, _ = roc_curve(y_test, knn_scores)

lr_roc_auc = auc(lr_fpr, lr_tpr)
clf_roc_auc = auc(clf_fpr, clf_tpr)
rf_roc_auc = auc(rf_fpr, rf_tpr)
knn_roc_auc = auc(knn_fpr, knn_tpr)

plt.subplots(1, figsize=(10, 10))
plt.plot(lr_fpr, lr_tpr, color='darkorange',
         lw=1, label='Logistic Regression (area = %0.2f)' % lr_roc_auc)
plt.plot(clf_fpr, clf_tpr, color='darkgreen',
         lw=1, label='Decision Tree (area = %0.2f)' % clf_roc_auc)
plt.plot(rf_fpr, rf_tpr, color='purple',
         lw=1, label='Random Forest (area = %0.2f)' % rf_roc_auc)
plt.plot(knn_fpr, knn_tpr, color='red',
         lw=1, label='K Nearest Neighbor (area = %0.2f)' % knn_roc_auc)
# diagonal = chance level
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
# grey guides along the left edge (x = 0) and the top edge (y = 1)
plt.plot([0, 0], [1, 0], c=".7")
plt.plot([1, 1], c=".7")
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc="lower right")
plt.show()