In [1]:
# generic data science libraries
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np

# data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# import scikit-learn
from sklearn.model_selection import train_test_split

# models
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

# metrics
from sklearn.metrics import accuracy_score, classification_report, auc, confusion_matrix, roc_curve

In [2]:
# Load the output of the previous cleaning step; the file is expected to sit
# next to this notebook.  The bare expression on the last line renders the
# frame as the cell output.
df = pd.read_csv(filepath_or_buffer="Final_Clean.csv")
df

Unnamed: 0.1,Unnamed: 0,diagnosis,concave_points_worst_bins,perimeter_worst_bins,concave_points_mean_bins,radius_worst_bins,perimeter_mean,area_worst,radius_mean,area_mean
0,0,1,D,D,D,D,122.80,2019.0,17.99,1001.0
1,1,1,D,D,C,D,132.90,1956.0,20.57,1326.0
2,2,1,D,D,D,D,130.00,1709.0,19.69,1203.0
3,3,1,D,C,D,B,77.58,567.7,11.42,386.1
4,4,1,D,D,D,D,135.10,1575.0,20.29,1297.0
...,...,...,...,...,...,...,...,...,...,...
564,564,1,D,D,D,D,142.00,2027.0,21.56,1479.0
565,565,1,D,D,D,D,131.20,1731.0,20.13,1261.0
566,566,1,C,D,C,C,108.30,1124.0,16.60,858.1
567,567,1,D,D,D,D,140.10,1821.0,20.60,1265.0


In [3]:
# 'Unnamed: 0' mirrors the row number (see the output of the previous cell) —
# presumably an index that an earlier to_csv call wrote into the file.  It
# carries no information, so work on a copy without it.
df2 = df.drop('Unnamed: 0', axis='columns')
df2

Unnamed: 0,diagnosis,concave_points_worst_bins,perimeter_worst_bins,concave_points_mean_bins,radius_worst_bins,perimeter_mean,area_worst,radius_mean,area_mean
0,1,D,D,D,D,122.80,2019.0,17.99,1001.0
1,1,D,D,C,D,132.90,1956.0,20.57,1326.0
2,1,D,D,D,D,130.00,1709.0,19.69,1203.0
3,1,D,C,D,B,77.58,567.7,11.42,386.1
4,1,D,D,D,D,135.10,1575.0,20.29,1297.0
...,...,...,...,...,...,...,...,...,...
564,1,D,D,D,D,142.00,2027.0,21.56,1479.0
565,1,D,D,D,D,131.20,1731.0,20.13,1261.0
566,1,C,D,C,C,108.30,1124.0,16.60,858.1
567,1,D,D,D,D,140.10,1821.0,20.60,1265.0


In [4]:
# Summary statistics of the numeric columns; the min and quartile values shown
# here are what the bin edges in the following cells are based on.
df2.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,diagnosis,perimeter_mean,area_worst,radius_mean,area_mean
count,569.0,569.0,569.0,569.0,569.0
mean,0.372583,91.969033,880.583128,14.127292,654.889104
std,0.483918,24.298981,569.356993,3.524049,351.914129
min,0.0,43.79,185.2,6.981,143.5
25%,0.0,75.17,515.3,11.7,420.3
50%,0.0,86.24,686.5,13.37,551.1
75%,1.0,104.1,1084.0,15.78,782.7
max,1.0,188.5,4254.0,28.11,2501.0


In [5]:
# Bin edges for perimeter_mean, read off the df2.describe() output: a lower
# edge just below the minimum (43.79) and the quartiles (75.17, 86.24, 104.1)
# truncated to whole numbers.  The top bin is open-ended (np.inf) so that no
# arbitrary sentinel value is needed.
bins = [43, 75, 86, 104, np.inf]

# One label per bin, ordered from the lowest ("A") to the highest ("D") range.
group_labels = ["A", "B", "C", "D"]

# include_lowest=True makes the first interval closed on the left, so a value
# equal to the lowest edge still lands in bin "A".
df2['perimeter_mean_bins'] = pd.cut(df2['perimeter_mean'], bins, labels=group_labels, include_lowest=True)

# pd.cut returns NaN (without any warning) for values outside the edges; fail
# loudly if binning introduced missing values that were not in the source column.
assert df2['perimeter_mean_bins'].isna().sum() == df2['perimeter_mean'].isna().sum(), \
    "perimeter_mean contains values outside the bin edges"

df2.head()

Unnamed: 0,diagnosis,concave_points_worst_bins,perimeter_worst_bins,concave_points_mean_bins,radius_worst_bins,perimeter_mean,area_worst,radius_mean,area_mean,perimeter_mean_bins
0,1,D,D,D,D,122.8,2019.0,17.99,1001.0,D
1,1,D,D,C,D,132.9,1956.0,20.57,1326.0,D
2,1,D,D,D,D,130.0,1709.0,19.69,1203.0,D
3,1,D,C,D,B,77.58,567.7,11.42,386.1,B
4,1,D,D,D,D,135.1,1575.0,20.29,1297.0,D


In [6]:
# The raw perimeter_mean measurement is now represented by perimeter_mean_bins,
# so continue with a frame that no longer carries the numeric column.
df3 = df2.drop('perimeter_mean', axis='columns')
df3.head()

Unnamed: 0,diagnosis,concave_points_worst_bins,perimeter_worst_bins,concave_points_mean_bins,radius_worst_bins,area_worst,radius_mean,area_mean,perimeter_mean_bins
0,1,D,D,D,D,2019.0,17.99,1001.0,D
1,1,D,D,C,D,1956.0,20.57,1326.0,D
2,1,D,D,D,D,1709.0,19.69,1203.0,D
3,1,D,C,D,B,567.7,11.42,386.1,B
4,1,D,D,D,D,1575.0,20.29,1297.0,D


In [7]:
# Bin edges for area_worst, read off the df2.describe() output: a lower edge
# just below the minimum (185.2) and the quartiles (515.3, 686.5, 1084.0)
# truncated to whole numbers.  The top bin is open-ended (np.inf) so that no
# arbitrary sentinel value is needed.
# NOTE(review): the median edge is 686, matching the reported median of 686.5.
# This cell previously used 696, which corresponds to no quartile and looks
# like a digit-transposition typo — confirm before relying on older outputs.
bins = [185, 515, 686, 1084, np.inf]

# One label per bin, ordered from the lowest ("A") to the highest ("D") range.
group_labels = ["A", "B", "C", "D"]

# include_lowest=True makes the first interval closed on the left, so a value
# equal to the lowest edge still lands in bin "A".
df3['area_worst_bins'] = pd.cut(df3['area_worst'], bins, labels=group_labels, include_lowest=True)

# pd.cut returns NaN (without any warning) for values outside the edges; fail
# loudly if binning introduced missing values that were not in the source column.
assert df3['area_worst_bins'].isna().sum() == df3['area_worst'].isna().sum(), \
    "area_worst contains values outside the bin edges"

df3.head()

Unnamed: 0,diagnosis,concave_points_worst_bins,perimeter_worst_bins,concave_points_mean_bins,radius_worst_bins,area_worst,radius_mean,area_mean,perimeter_mean_bins,area_worst_bins
0,1,D,D,D,D,2019.0,17.99,1001.0,D,D
1,1,D,D,C,D,1956.0,20.57,1326.0,D,D
2,1,D,D,D,D,1709.0,19.69,1203.0,D,D
3,1,D,C,D,B,567.7,11.42,386.1,B,B
4,1,D,D,D,D,1575.0,20.29,1297.0,D,D


In [8]:
# The raw area_worst measurement is now represented by area_worst_bins, so
# continue with a frame that no longer carries the numeric column.
df4 = df3.drop('area_worst', axis='columns')
df4.head()

Unnamed: 0,diagnosis,concave_points_worst_bins,perimeter_worst_bins,concave_points_mean_bins,radius_worst_bins,radius_mean,area_mean,perimeter_mean_bins,area_worst_bins
0,1,D,D,D,D,17.99,1001.0,D,D
1,1,D,D,C,D,20.57,1326.0,D,D
2,1,D,D,D,D,19.69,1203.0,D,D
3,1,D,C,D,B,11.42,386.1,B,B
4,1,D,D,D,D,20.29,1297.0,D,D


In [9]:
# Bin edges for radius_mean, read off the df2.describe() output: a lower edge
# below the minimum (6.981) and the quartiles (11.7, 13.37, 15.78) rounded up
# to whole numbers.  The top bin is open-ended (np.inf) so that no arbitrary
# sentinel value is needed.
bins = [6, 12, 14, 16, np.inf]

# One label per bin, ordered from the lowest ("A") to the highest ("D") range.
group_labels = ["A", "B", "C", "D"]

# include_lowest=True makes the first interval closed on the left, so a value
# equal to the lowest edge still lands in bin "A".
df4['radius_mean_bins'] = pd.cut(df4['radius_mean'], bins, labels=group_labels, include_lowest=True)

# pd.cut returns NaN (without any warning) for values outside the edges; fail
# loudly if binning introduced missing values that were not in the source column.
assert df4['radius_mean_bins'].isna().sum() == df4['radius_mean'].isna().sum(), \
    "radius_mean contains values outside the bin edges"

df4.head()

Unnamed: 0,diagnosis,concave_points_worst_bins,perimeter_worst_bins,concave_points_mean_bins,radius_worst_bins,radius_mean,area_mean,perimeter_mean_bins,area_worst_bins,radius_mean_bins
0,1,D,D,D,D,17.99,1001.0,D,D,D
1,1,D,D,C,D,20.57,1326.0,D,D,D
2,1,D,D,D,D,19.69,1203.0,D,D,D
3,1,D,C,D,B,11.42,386.1,B,B,A
4,1,D,D,D,D,20.29,1297.0,D,D,D


In [10]:
# The raw radius_mean measurement is now represented by radius_mean_bins, so
# continue with a frame that no longer carries the numeric column.
df5 = df4.drop('radius_mean', axis='columns')
df5.head()

Unnamed: 0,diagnosis,concave_points_worst_bins,perimeter_worst_bins,concave_points_mean_bins,radius_worst_bins,area_mean,perimeter_mean_bins,area_worst_bins,radius_mean_bins
0,1,D,D,D,D,1001.0,D,D,D
1,1,D,D,C,D,1326.0,D,D,D
2,1,D,D,D,D,1203.0,D,D,D
3,1,D,C,D,B,386.1,B,B,A
4,1,D,D,D,D,1297.0,D,D,D


In [11]:
# Bin edges for area_mean, read off the df2.describe() output: a lower edge
# just below the minimum (143.5) and the quartiles (420.3, 551.1, 782.7)
# truncated to whole numbers.  The top bin is open-ended (np.inf) so that no
# arbitrary sentinel value is needed.
bins = [143, 420, 551, 782, np.inf]

# One label per bin, ordered from the lowest ("A") to the highest ("D") range.
group_labels = ["A", "B", "C", "D"]

# include_lowest=True makes the first interval closed on the left, so a value
# equal to the lowest edge still lands in bin "A".
df5['area_mean_bins'] = pd.cut(df5['area_mean'], bins, labels=group_labels, include_lowest=True)

# pd.cut returns NaN (without any warning) for values outside the edges; fail
# loudly if binning introduced missing values that were not in the source column.
assert df5['area_mean_bins'].isna().sum() == df5['area_mean'].isna().sum(), \
    "area_mean contains values outside the bin edges"

df5.head()

Unnamed: 0,diagnosis,concave_points_worst_bins,perimeter_worst_bins,concave_points_mean_bins,radius_worst_bins,area_mean,perimeter_mean_bins,area_worst_bins,radius_mean_bins,area_mean_bins
0,1,D,D,D,D,1001.0,D,D,D,D
1,1,D,D,C,D,1326.0,D,D,D,D
2,1,D,D,D,D,1203.0,D,D,D,D
3,1,D,C,D,B,386.1,B,B,A,A
4,1,D,D,D,D,1297.0,D,D,D,D


In [12]:
# The raw area_mean measurement is now represented by area_mean_bins; after
# this drop only the diagnosis column and the categorical *_bins columns remain.
df6 = df5.drop('area_mean', axis='columns')
df6.head()

Unnamed: 0,diagnosis,concave_points_worst_bins,perimeter_worst_bins,concave_points_mean_bins,radius_worst_bins,perimeter_mean_bins,area_worst_bins,radius_mean_bins,area_mean_bins
0,1,D,D,D,D,D,D,D,D
1,1,D,D,C,D,D,D,D,D
2,1,D,D,D,D,D,D,D,D
3,1,D,C,D,B,B,B,A,A
4,1,D,D,D,D,D,D,D,D


In [13]:
# Persist the fully binned frame for the next step of the workflow.
# NOTE(review): the row index is written as well (to_csv default), so the file
# will load back with an extra 'Unnamed: 0' column — the same artefact this
# notebook had to drop from Final_Clean.csv.  Left unchanged here because
# downstream readers may expect that column; consider index=False once they
# have been checked.
df6.to_csv(path_or_buf="Final_Clean2.csv")