## Read and clean data as needed

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.colors import ListedColormap

In [None]:
# List every file under the local Resources folder. The path is relative,
# matching the relative "Resources/data.csv" path the dataset is loaded from;
# the previous absolute '/Resources' pointed at the filesystem root instead.
for dirname, _, filenames in os.walk('Resources'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [None]:
# Load the dataset from the local Resources folder and preview the first rows
data_path = "Resources/data.csv"
df = pd.read_csv(data_path)
df.head()

In [None]:
# check dataset: summary statistics for the columns describe() covers by default
df.describe()

In [None]:
# check columns: show the column labels of the loaded dataframe
df.columns

In [None]:
# Remove the trailing 'Unnamed: 32' column.
# (presumably an empty artifact column from the CSV — TODO confirm)
df = df.drop('Unnamed: 32', axis=1)

In [None]:
# check data: display the dataframe to confirm 'Unnamed: 32' is gone
df

## Preliminary Analysis (Descriptives/Statistics Summary)

In [None]:
# Work on a copy without the 'id' column for the analysis that follows
updated_df = df.drop('id', axis=1)

# Preview the first rows of the reduced dataset
updated_df.head()

In [None]:
# Use seaborn's dark grid theme for the plots
sns.set(style="darkgrid")

# Bar chart with the number of rows per diagnosis label (benign vs malignant)
ax_bar = sns.countplot(data=updated_df, x="diagnosis", palette="Set3")

In [None]:
# Recode a row's diagnosis label as a number: 1 (malignant) or 0 (benign)
def tumor(row):
    """Return 0 for a 'B' diagnosis, 1 for 'M', and None for anything else.

    `row` is anything indexable by the key 'diagnosis' (e.g. a dataframe row).
    """
    label = row['diagnosis']
    if label == 'B':
        return 0
    return 1 if label == 'M' else None
    
# create a new column with the recoded values (tumor() is applied row by row)
updated_df['tumor'] = updated_df.apply(tumor, axis=1)

# calculate correlation coefficients on the numeric columns only: the
# string-valued 'diagnosis' column cannot be correlated, and pandas >= 2.0
# raises a ValueError on it instead of silently skipping it
corr_df = updated_df.select_dtypes(include='number').corr()
corr_df.head()

In [None]:
# correlation map of the numeric columns
# (the string-valued 'diagnosis' column is left out: it cannot be correlated,
# and pandas >= 2.0 raises a ValueError rather than silently dropping it)
numeric_corr = updated_df.select_dtypes(include='number').corr()
f, ax1 = plt.subplots(figsize=(18, 18))
sns.heatmap(numeric_corr, cmap='BuPu', annot=True, linewidths=.5, fmt='.1f', ax=ax1)
# tilt the x tick labels so the long feature names stay readable
plt.xticks(fontsize=11, rotation=70)
plt.show()

## Prep Data for Classification Models

In [None]:
# Target variable: malignant ('M') -> 1, benign ('B') -> 0
diagnosis_codes = {'M': 1, 'B': 0}
y = df['diagnosis'].map(diagnosis_codes)

In [None]:
# Feature matrix: the columns selected from the correlation results
# (intended rule: keep those with a coefficient of .5 and above).
# NOTE(review): confirm each entry against the correlation table — TODO verify
selected_features = [
    'radius_mean', 'perimeter_mean', 'area_mean',
    'compactness_mean', 'concavity_mean', 'concave points_mean',
    'radius_se', 'perimeter_se', 'area_se',
    'radius_worst', 'texture_worst',
    'perimeter_worst', 'smoothness_worst',
    'compactness_worst', 'concavity_worst', 'concave points_worst',
]
X = df[selected_features]

In [None]:
# Split the features and target into train and test partitions

from sklearn.model_selection import train_test_split

# random_state=1 pins the shuffle so the same split is produced on every run
split_parts = train_test_split(X, y, random_state=1)
X_train, X_test, y_train, y_test = split_parts

# N = number of training rows, D = number of feature columns
N, D = X_train.shape

## Logistic Regression

## Support Vector Machines

## Decision Tree Algorithm

## Random Forest Classification

## Nearest Neighbor