## Read and clean data as needed

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.colors import ListedColormap
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, auc

In [None]:
# List every file under the local Resources/ folder, the same relative
# directory the dataset is read from. An absolute '/Resources' points at the
# filesystem root, and os.walk silently yields nothing when that is missing.
for dirname, _, filenames in os.walk('Resources'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [None]:
# load the raw dataset and preview its first rows
data_path = "Resources/data.csv"
df = pd.read_csv(data_path)
df.head()

In [None]:
df.dtypes

In [None]:
# check dataset
df.describe()

In [None]:
# check columns
df.columns

In [None]:
# remove the trailing 'Unnamed: 32' column
df = df.drop('Unnamed: 32', axis=1)

In [None]:
# check if there is any missing data
df.isna().sum()

In [None]:
# check data
df.head()

## Preliminary Analysis (Descriptives/Statistics Summary)

In [None]:
# understanding the data variables

# a) radius (mean of distances from center to points on the perimeter)
# b) texture (standard deviation of gray-scale values)
# c) perimeter
# d) area
# e) smoothness (local variation in radius lengths)
# f) compactness (perimeter^2 / area - 1.0)
# g) concavity (severity of concave portions of the contour)
# h) concave points (number of concave portions of the contour)
# i) symmetry
# j) fractal dimension ("coastline approximation" - 1)

In [None]:
# the id column is not needed for the analysis; keep a frame without it
updated_df = df.drop('id', axis=1)

# preview the result
updated_df.head()

In [None]:
# number of rows per diagnosis value (benign / malignant)
count_df = updated_df.groupby('diagnosis')
print(count_df['diagnosis'].count())

sns.set(style="darkgrid")

# bar chart of the same counts; labels are set on the returned axes
ax_bar = sns.countplot(x="diagnosis", data=updated_df, palette="Set1")
ax_bar.set_title("Count of Benign and Malignant Tumors")
ax_bar.set_ylabel("Count")
ax_bar.set_xlabel("Diagnosis")
# plt.savefig(f'Resources/Count - tumor diagnosis.jpg', dpi=300)
plt.show()

In [None]:
# recode diagnosis column to 1 (malignant) and 0 (benign)
def tumor(row):
    """Return the numeric tumor label for one dataframe row.

    Args:
        row: a row (or mapping) exposing a 'diagnosis' entry.

    Returns:
        0 when the diagnosis is 'B', 1 when it is 'M', and None for any
        other value.
    """
    if row['diagnosis'] == 'B':
        return 0
    if row['diagnosis'] == 'M':
        return 1
    return None


# create a new column with the recoded values
updated_df['tumor'] = updated_df.apply(tumor, axis=1)

# calculate correlation coefficients
# 'diagnosis' still holds the strings 'B'/'M'. Restrict to numeric columns so
# corr() does not raise on pandas >= 2.0, where non-numeric columns are no
# longer dropped silently. select_dtypes also works on older pandas versions.
corr_df = updated_df.select_dtypes(include='number').corr()
corr_df

In [None]:
# With this many candidate features, use a correlation heatmap to decide
# which ones to include in the classification models.

# correlation heatmap
# Only numeric columns can be correlated; the string 'diagnosis' column would
# make corr() raise on pandas >= 2.0, so it is filtered out first.
numeric_corr = updated_df.select_dtypes(include='number').corr()
f, ax1 = plt.subplots(figsize=(20, 20))
sns.heatmap(numeric_corr, cmap='BuPu', annot=True, linewidths=.5, fmt='.2f', ax=ax1)
plt.xticks(fontsize=11, rotation=70)
# plt.savefig(f'Resources/Corr Heatmap.jpg', dpi=300)
plt.show()

## Prep Data for Classification Models

In [None]:
# create the y variable
y= df['diagnosis'].map({'M':1,'B':0})

In [None]:
# create a dataframe with selected features based on correlation results (keeping those with coefficient of .5 and above)
X = df[['radius_mean', 'perimeter_mean', 'area_mean', 'compactness_mean', 'concavity_mean', 'concave points_mean', 
        'radius_se', 'perimeter_se', 'area_se', 'radius_worst', 'area_worst', 'perimeter_worst', 'compactness_worst', 
        'concavity_worst', 'concave points_worst']]

In [None]:
# split the samples into train and test sets (33% held out for testing)

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.33, random_state=42
)
# N = number of training rows, D = number of feature columns
N = X_train.shape[0]
D = X_train.shape[1]

In [None]:
# scale/normalize the train and test data

from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only and reuse those statistics for
# the test split. Re-fitting on X_test would standardize it with its own
# mean/std, leaking test information and putting the two splits on
# different scales.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# create a list to store classifier accuracy
classifier_accu = []
classifier_list = []

In [None]:
# This function runs the Predictor. It prints how many inaccurate predictions were made and whether the
# model is over predicting or under predicting
def Predictor(classifier, name):
    """Evaluate a fitted classifier on the test split and plot the results.

    Prints the score plus false-positive / false-negative counts, then draws
    an annotated confusion-matrix heatmap and a ROC curve.

    Args:
        classifier: a fitted model. When name == 'Sequential Model' it must
            offer Keras-style predict() and evaluate(); otherwise it must
            offer scikit-learn style predict() and score().
        name: display name used in the printed report and the plot titles.
            The exact value 'Sequential Model' selects the Keras code path.

    Reads the module-level X_test and y_test; labels are expected to be
    0 (benign) and 1 (malignant).
    """
    if name == 'Sequential Model':
        # predict_classes() was removed from Keras (TF 2.6). Thresholding the
        # sigmoid output at 0.5 is what it did for a single-unit output.
        class_probabilities = np.asarray(classifier.predict(X_test))
        predictions = (class_probabilities > 0.5).astype("int32").ravel()
        # evaluate() returns the loss followed by the compiled metrics;
        # index 1 is the first metric
        score = classifier.evaluate(X_test, y_test)[1]
    else:
        predictions = classifier.predict(X_test)
        score = classifier.score(X_test, y_test)

    # Pin the label order so the matrix is always 2x2, even if one class is
    # missing from the test split: rows are actual, columns are predicted.
    cf_matrix = confusion_matrix(y_test, predictions, labels=[0, 1])
    false_positives = cf_matrix[0, 1]
    false_negatives = cf_matrix[1, 0]
    inaccurate_predictions = false_positives + false_negatives
    difference = false_positives - false_negatives

    results_string = (f"Results for {name} classifier:\n"
                      f"-------------------------------------\n"
                      f"Score: {score}\n"
                      f"Inaccurate Predictions: {inaccurate_predictions}\n"
                      f"False Positives: {false_positives}\n"
                      f"False Negatives: {false_negatives}\n"
                      f"Difference (positive is good): {difference}\n")
    print(results_string)

#     text_file = open(f'Resources/Results/{name}.txt', 'w')
#     text_file.write(results_string)
#     text_file.close()

    # plot confusion matrix, annotating each cell with name, count and share
    group_names = ['TN', 'FP', 'FN', 'TP']
    group_counts = ["{:.0f}".format(value) for value in cf_matrix.flatten()]
    group_percentages = ["{0:.2%}".format(value) for value in cf_matrix.flatten()/np.sum(cf_matrix)]
    labels = [f"{v1}\n{v2}\n{v3}" for v1, v2, v3 in zip(group_names, group_counts, group_percentages)]
    labels = np.asarray(labels).reshape(2, 2)
    categories = ['Benign', 'Malignant']
    sns.heatmap(cf_matrix, fmt='', annot=labels, xticklabels=categories, yticklabels=categories, cmap='BuPu')
    plt.ylabel('Actual')
    plt.xlabel('Prediction')
    plt.title(name)
#     plt.savefig(f'Resources/Results/{name}.jpg', dpi=300)

    # plot ROC curves
    # roc_curve's signature is (y_true, y_score): ground truth goes first.
    # NOTE: hard 0/1 predictions are used as the score, so the curve has a
    # single operating point rather than a full threshold sweep.
    fpr, tpr, _ = roc_curve(y_test, predictions)
    roc_auc = auc(fpr, tpr)

    plt.figure()
    plt.title(name)
    plt.plot(fpr, tpr, color='darkorange', lw=1, label='ROC curve (area = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    # light grey guides along the left edge and the top of the plot
    plt.plot([0, 0], [1, 0], c=".7")
    plt.plot([1, 1], c=".7")
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.legend(loc="lower right")
#     plt.savefig(f'Resources/Results/{name}2.jpg', dpi=300)
    plt.show()
    

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression()

In [None]:
# train the logistic regression model
logreg.fit(X_train, y_train)

# record the test accuracy for the final model comparison
log_reg_accu = logreg.score(X_test, y_test)
classifier_list.append("Logistic Regression")
classifier_accu.append(log_reg_accu)

print(f"Training Data Score: {logreg.score(X_train, y_train)}")
print(f"Testing Data Score: {log_reg_accu}")

In [None]:
# Feature names taken from X itself, so they always match the column order
# the model was trained on instead of a second hand-maintained list.
features = list(X.columns)

# get importance
importance = logreg.coef_[0]

# summarize feature importance
# a plain f-string avoids mixing '%' formatting with f-string interpolation,
# which would break on a feature name containing '%'
for i, (score, feature) in enumerate(zip(importance, features)):
    print(f"Feature {i}: {feature}, Score: {score:.5f}")

# plot feature importance, labelling each bar with its feature name
positions = range(len(importance))
plt.bar(positions, importance)
plt.xticks(positions, features, rotation=90)
# plt.savefig(f'Resources/LogReg - Features Importance.jpg', dpi=300)
plt.show()

In [None]:
Predictor(logreg, 'Logistic Regression')

## Support Vector Machines

In [None]:
updated_df.head()

In [None]:
target = updated_df['diagnosis']
# Class names must follow sorted label order (0 / 'B' = benign first,
# 1 / 'M' = malignant second), because classification_report assigns
# target_names to the labels in that order.
target_names = ['B', 'M']

In [None]:
svm_data = updated_df.drop('diagnosis', axis=1)
feature_names = svm_data.columns
svm_data.head()

In [None]:
 # Support vector machine linear classifier
from sklearn.svm import SVC 
vector = SVC(kernel='linear')
vector.fit(X_train, y_train)

In [None]:
# NOTE(review): this looks like a pasted estimator repr. It builds a new,
# unfitted SVC that is not assigned to any name, so the fitted `vector`
# model is unaffected by this cell.
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [None]:
# Model Accuracy

# score once, store it for the model comparison, and report it
svc_accu = vector.score(X_test, y_test)
classifier_list.append("SVM")
classifier_accu.append(svc_accu)

print('Test Acc: %.3f' % svc_accu)

In [None]:
 # Calculate classification report
from sklearn.metrics import classification_report
predictions = vector.predict(X_test)
print(classification_report(y_test, predictions,
                            target_names=target_names))

In [None]:
Predictor(vector, 'Support Vector Machine')

## Decision Tree Algorithm

In [None]:
from sklearn import tree

In [None]:
feature_names = X.columns

In [None]:
# fit() returns the estimator itself, so build and train in one expression
clf = tree.DecisionTreeClassifier().fit(X_train, y_train)

# record the test accuracy for the final model comparison
clf_accu = clf.score(X_test, y_test)
classifier_list.append("Decision Tree")
classifier_accu.append(clf_accu)

clf_accu

In [None]:
Predictor(clf, "Decision Tree")

## Random Forest Classification

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# fit() returns the estimator itself, so build and train in one expression
rf = RandomForestClassifier(n_estimators=200).fit(X_train, y_train)

# record the test accuracy for the final model comparison
rf_accu = rf.score(X_test, y_test)
classifier_list.append("Random Forest")
classifier_accu.append(rf_accu)

rf_accu

In [None]:
sorted(zip(rf.feature_importances_, feature_names), reverse=True)

In [None]:
Predictor(rf, "Random Forest")

## K Nearest Neighbor

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Compare train and test accuracy over odd neighbor counts. The range is
# defined once so the loop and both curves share the same x values.
k_values = range(1, 20, 2)
train_scores = []
test_scores = []
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    train_score = knn.score(X_train, y_train)
    test_score = knn.score(X_test, y_test)
    train_scores.append(train_score)
    test_scores.append(test_score)
    print(f"k: {k}, Train/Test Score: {train_score:.3f}/{test_score:.3f}")

# both curves are drawn, so label them and use a neutral y-axis title
plt.plot(k_values, train_scores, marker='o', label="Train")
plt.plot(k_values, test_scores, marker="x", label="Test")
plt.xlabel("k neighbors")
plt.ylabel("Accuracy Score")
plt.legend()
# plt.savefig(f'Resources/Results/KNN.jpg', dpi=300)
plt.show()

In [None]:
# final KNN model with k = 3
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)

# score once, store it for the model comparison, and report it
knn_accu = knn.score(X_test, y_test)
classifier_list.append("KNN")
classifier_accu.append(knn_accu)

print('k=3 Test Acc: %.3f' % knn_accu)

In [None]:
Predictor(knn, "K Nearest Neighbor")

## Sequential Model

In [None]:
import tensorflow as tf

In [None]:
### Now creating the model

model = tf.keras.models.Sequential([tf.keras.layers.Input(shape=(D,)),
                                    tf.keras.layers.Dense(1,activation='sigmoid')
])


In [None]:
# Compiling the model

model.compile(optimizer='adam',
             loss='binary_crossentropy',
             metrics=['accuracy'])

In [None]:
# Convert features and labels to NumPy arrays before calling model.fit();
# passing the labels in their original form reportedly raises an error.
X_train, y_train, X_test, y_test = (
    np.asarray(split) for split in (X_train, y_train, X_test, y_test)
)

In [None]:
r= model.fit(X_train,y_train,validation_data=(X_test,y_test),epochs=100)

In [None]:
# train and test score

seq_accu = model.evaluate(X_test, y_test)
seq_accu = seq_accu[1]
classifier_accu.append(seq_accu)
classifier_list.append("Sequential Model")

print("Train score",model.evaluate(X_train,y_train))
print("Test score",model.evaluate(X_test,y_test))

In [None]:
import matplotlib.pyplot as plt
plt.plot(r.history['loss'],label='loss')
plt.plot(r.history['val_loss'],label='val_loss')
plt.legend()
# plt.savefig(f'Resources/Sequential.jpg', dpi=300)
plt.show()

In [None]:
Predictor(model, 'Sequential Model')

## Overall view of all models

In [None]:
#Predictor(classifier, "Logistic Regression") <-- needs to be updated with correct classifier
#Predictor(classifier, "Support Vector Machines") <-- needs to be updated with correct classifier
#Predictor(clf, "Decision Tree")
#Predictor(rf, "Random Forest")
#Predictor(knn, "K Nearest Neighbor")
#Predictor(vector, "Support Vector Machine")

In [None]:
# creating a roc chart with all classifiers

# hard 0/1 predictions of every fitted model on the test split
lr_predictions = logreg.predict(X_test)
clf_predictions = clf.predict(X_test)
rf_predictions = rf.predict(X_test)
knn_predictions = knn.predict(X_test)
svm_predictions = vector.predict(X_test)

# predict_classes() was removed from Keras (TF 2.6). Thresholding the sigmoid
# output at 0.5 is what it did for a single-unit output layer.
seq_predict = (np.asarray(model.predict(X_test)) > 0.5).astype("int32")
seq_predictions = seq_predict.ravel()

# roc_curve's signature is (y_true, y_score): ground truth goes first.
# NOTE: hard labels are used as scores, so each curve has a single operating
# point rather than a full threshold sweep.
lr_fpr, lr_tpr, _ = roc_curve(y_test, lr_predictions)
clf_fpr, clf_tpr, _ = roc_curve(y_test, clf_predictions)
rf_fpr, rf_tpr, _ = roc_curve(y_test, rf_predictions)
knn_fpr, knn_tpr, _ = roc_curve(y_test, knn_predictions)
svm_fpr, svm_tpr, _ = roc_curve(y_test, svm_predictions)
seq_fpr, seq_tpr, _ = roc_curve(y_test, seq_predictions)

lr_roc_auc = auc(lr_fpr, lr_tpr)
clf_roc_auc = auc(clf_fpr, clf_tpr)
rf_roc_auc = auc(rf_fpr, rf_tpr)
knn_roc_auc = auc(knn_fpr, knn_tpr)
svm_roc_auc = auc(svm_fpr, svm_tpr)
seq_roc_auc = auc(seq_fpr, seq_tpr)

plt.subplots(1, figsize=(10, 10))
plt.plot(lr_fpr, lr_tpr, color='darkorange',
         lw=1, label='Logistic Regression (area = %0.2f)' % lr_roc_auc)
plt.plot(clf_fpr, clf_tpr, color='darkgreen',
         lw=1, label='Decision Tree (area = %0.2f)' % clf_roc_auc)
plt.plot(rf_fpr, rf_tpr, color='purple',
         lw=1, label='Random Forest (area = %0.2f)' % rf_roc_auc)
plt.plot(knn_fpr, knn_tpr, color='red',
         lw=1, label='K Nearest Neighbor (area = %0.2f)' % knn_roc_auc)
plt.plot(svm_fpr, svm_tpr, color='pink',
         lw=1, label='Support Vector Machine (area = %0.2f)' % svm_roc_auc)
plt.plot(seq_fpr, seq_tpr, color='blue',
         lw=1, label='Sequential Model (area = %0.2f)' % seq_roc_auc)

# chance diagonal plus light grey guides along the left edge and the top
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.plot([0, 0], [1, 0], c=".7")
plt.plot([1, 1], c=".7")
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc="lower right")
# plt.savefig(f'Resources/ROC - ALL models.jpg', dpi=300)
plt.show()

In [None]:
# creating dataframe with classifier accuracy, best model first
accu_df = pd.DataFrame(list(zip(classifier_list, classifier_accu)),
                       columns=['Classifier', 'Accuracy'])
accu_df = accu_df.sort_values(by='Accuracy', ascending=False)

# write html to file; create the output folder first so to_html does not
# fail when 'templates/' is missing
os.makedirs("templates", exist_ok=True)
accu_df.to_html("templates/table.html", index=False, header=True)
accu_df

In [None]:
# bar chart of test accuracy per classifier; labels are set on the returned axes
ax_accu_bar = accu_df.plot.bar(x='Classifier', y='Accuracy', rot=45, legend=False)
ax_accu_bar.set_title("Classifiers Accuracy")
ax_accu_bar.set_ylabel("Accuracy")
ax_accu_bar.set_xlabel("Classification Models")
# zoom the y-axis into the 0.85-1 band
ax_accu_bar.set_ylim(0.85, 1)
plt.tight_layout()
# plt.savefig(f'Resources/Classifiers Accuracy.jpg', dpi=300)
plt.show()