In [None]:
import warnings
# Suppress every warning for the rest of the session to keep cell output clean.
# NOTE(review): this also hides warnings that point at real problems (e.g.
# deprecations or convergence issues); consider narrowing the filter.
warnings.filterwarnings('ignore')

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the dataset; the path is relative to the notebook's working directory
# (presumably the Kaggle input mount - adjust when running elsewhere).
df = pd.read_csv("../input/breast-cancer-wisconsin-data/data.csv")

In [None]:
# Preview the first rows to check that the file parsed as expected.
df.head()

In [None]:
# List the column labels of the raw frame.
df.columns

In [None]:
# Per-column dtypes and non-null counts; useful for spotting empty columns.
df.info()

In [None]:
# Look at the 'Unnamed: 32' column on its own before deciding what to do with it.
df['Unnamed: 32']

In [None]:
# Remove the 'Unnamed: 32' column, which the author found to hold only nulls.
df = df.drop(columns=["Unnamed: 32"])

In [None]:
# Re-check the first three rows after dropping 'Unnamed: 32'.
df.head(3)

In [None]:
# Column labels that remain at this point.
df.columns

In [None]:
# Drop the 'id' column: it is an identifier, not a predictive feature.
# Rebinding `df` (instead of inplace=True) matches how 'Unnamed: 32' was
# removed and avoids mutating a frame that earlier cells already displayed.
df = df.drop(columns=['id'])


In [None]:
# Column labels after removing 'id'.
df.columns

In [None]:
# Check what kind of object `df.columns` is before converting it to a list.
type(df.columns)

In [None]:
# Materialise the column labels as a plain Python list so they can be sliced.
l = df.columns.tolist()
print(l)

In [None]:
# Partition the column names by position in `l`: entries 1-10, entries 11-20,
# and everything from 21 onward. Entry 0 is skipped (presumably the
# 'diagnosis' target - verify against the printed list).
features_mean, features_se, features_worst = l[1:11], l[11:21], l[21:]

In [None]:
# Show the first group of feature names.
print(features_mean)

In [None]:
# Show the second group of feature names.
print(features_se)

In [None]:
# Show the third group of feature names.
print(features_worst)

In [None]:
# Quick look at the first two rows of the cleaned frame.
df.head(2)

In [None]:
# Distinct labels present in the target column.
df['diagnosis'].unique()
# M = malignant, B = benign

In [None]:
# Bar chart of how many samples carry each diagnosis label.
# The column is passed by keyword: seaborn 0.12+ treats the first positional
# argument as `data`, so passing the Series positionally no longer plots it as x.
sns.countplot(x='diagnosis', data=df, label="Count");

In [None]:
# Number of samples per diagnosis label.
df['diagnosis'].value_counts()

In [None]:
# (rows, columns) of the cleaned frame.
df.shape

# Explore the data

In [None]:
# Summary statistics for all the numeric columns.
df.describe()
# summary of all the numeric columns

In [None]:
# Total number of columns currently in the frame.
len(df.columns)

In [None]:
# Correlation matrix of the numeric features.
# 'diagnosis' is still a string label here (it is encoded further down), and
# DataFrame.corr() raises on non-numeric columns in pandas >= 2.0, so restrict
# the frame to numeric dtypes first. Older pandas dropped such columns
# silently, so the result is the same there.
corr = df.select_dtypes(include='number').corr()
corr

In [None]:
# Dimensions of the correlation matrix.
corr.shape

In [None]:
# Draw the correlation matrix as a heatmap on an 8x8 inch figure.
fig, ax = plt.subplots(figsize=(8, 8))
sns.heatmap(corr, ax=ax);

In [None]:
# Look at the frame once more before encoding the target column.
df.head()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace the diagnosis labels with integer codes.
# The encoded array is assigned directly: wrapping it in a DataFrame would make
# the assignment align on index labels, which yields NaN whenever `df` does not
# carry a default 0..n-1 index.
encode = LabelEncoder()
df['diagnosis'] = encode.fit_transform(df['diagnosis'])

In [None]:
# Verify that 'diagnosis' now holds the encoded values.
df.head()

In [None]:
# Distinct values in the target column after encoding.
df['diagnosis'].unique()

In [None]:
# Feature matrix: every column except the target.
X = df.drop(columns=['diagnosis'])
X.head(3)

In [None]:
# Target vector: the encoded diagnosis column.
y = df['diagnosis']
y.head(3)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the samples for evaluation.
# random_state pins the shuffle so that the split - and therefore every
# accuracy reported below - is the same on each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)

In [None]:
# Size of the full frame, for comparison with the split sizes below.
df.shape

In [None]:
# Size of the training feature matrix.
X_train.shape

In [None]:
# Size of the test feature matrix.
X_test.shape

In [None]:
# Number of training labels.
y_train.shape

In [None]:
# Number of test labels.
y_test.shape

In [None]:
# One training row before scaling, to see the raw feature magnitudes.
X_train.head(1)

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply the same
# transformation to both splits so no test-set statistics leak into training.
ss = StandardScaler().fit(X_train)
X_train = ss.transform(X_train)
X_test = ss.transform(X_test)

In [None]:
# Display the scaled training features.
X_train

# Machine Learning Models

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score  # also used by the evaluation cells further down

# Fit a logistic regression model on the scaled training data.
lr = LogisticRegression(random_state=0)
lr.fit(X=X_train, y=y_train)

In [None]:
# Predict labels for the held-out test features with the fitted logistic regression.
y_pred = lr.predict(X_test)

In [None]:
# Fraction of test samples whose prediction in `y_pred` matches the true label.
lr_acc = accuracy_score(y_test, y_pred)
print(lr_acc)


In [None]:
# Start an empty table that the following sections extend with one row per model.
results = pd.DataFrame()
results

In [None]:
# Append the `lr_acc` score to the comparison table.
# ignore_index=True renumbers the rows; without it every appended row keeps
# index label 0 and the table ends up with duplicate labels.
tempResults = pd.DataFrame({'Algorithm':['Logistic Regression Method'], 'Accuracy':[lr_acc]})
results = pd.concat([results, tempResults], ignore_index=True)
results = results[['Algorithm','Accuracy']]
results

# Decision Tree Classifier

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier

# 5-fold grid search over the split criterion of a decision tree.
dtc = DecisionTreeClassifier(random_state=0)
param_grid = {'criterion': ['gini', 'entropy']}
CV_dtc = GridSearchCV(dtc, param_grid, cv=5)
CV_dtc.fit(X_train, y_train)

In [None]:
# Predict test labels with the fitted decision-tree grid search
# (overwrites `y_pred` from the previous model).
y_pred = CV_dtc.predict(X_test)

In [None]:
# Accuracy of the current `y_pred` against the test labels.
dtc_acc = accuracy_score(y_test, y_pred)
print(dtc_acc)

In [None]:
# Append the `dtc_acc` score to the comparison table.
# ignore_index=True renumbers the rows; without it every appended row keeps
# index label 0 and the table ends up with duplicate labels.
tempResults = pd.DataFrame({'Algorithm':['Decision tree Classifier Method'], 'Accuracy':[dtc_acc]})
results = pd.concat([results, tempResults], ignore_index=True)
results = results[['Algorithm','Accuracy']]
results

# Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

# 5-fold grid search over the number of trees in a random forest.
param_grid = {'n_estimators': [20, 50]}
rfc = RandomForestClassifier(random_state=1)
CV_rfc = GridSearchCV(rfc, param_grid, cv=5)
CV_rfc.fit(X_train, y_train)

In [None]:
# Predict test labels with the fitted random-forest grid search
# (overwrites `y_pred` from the previous model).
y_pred = CV_rfc.predict(X_test)

In [None]:
# Accuracy of the current `y_pred` against the test labels.
rfc_acc = accuracy_score(y_test, y_pred)
print(rfc_acc)

In [None]:
# Append the `rfc_acc` score to the comparison table.
# ignore_index=True renumbers the rows; without it every appended row keeps
# index label 0 and the table ends up with duplicate labels.
tempResults = pd.DataFrame({'Algorithm':['Random Forest Classifier Method'], 'Accuracy':[rfc_acc]})
results = pd.concat([results, tempResults], ignore_index=True)
results = results[['Algorithm','Accuracy']]
results

# Support Vector Classifier

In [None]:
from sklearn import svm
# Fit a support vector classifier with its default settings on the scaled
# training data; only the random seed is set explicitly.
svc = svm.SVC(random_state = 0)
svc.fit(X_train,y_train)

In [None]:
# Predict test labels with the fitted SVC (overwrites `y_pred` from the previous model).
y_pred = svc.predict(X_test)

In [None]:
# Accuracy of the current `y_pred` against the test labels.
svc_acc = accuracy_score(y_test, y_pred)
print(svc_acc)

In [None]:
# Append the `svc_acc` score to the comparison table.
# ignore_index=True renumbers the rows; without it every appended row keeps
# index label 0 and the table ends up with duplicate labels.
tempResults = pd.DataFrame({'Algorithm':['Support Vector Classifier Method'], 'Accuracy':[svc_acc]})
results = pd.concat([results, tempResults], ignore_index=True)
results = results[['Algorithm','Accuracy']]
results

# AdaBoost Classifier

In [None]:
from sklearn.ensemble import AdaBoostClassifier

# 5-fold grid search over the number of boosting rounds for AdaBoost.
param_grid = {'n_estimators': [20, 50, 70, 100]}
adb = AdaBoostClassifier(random_state=0)
CV_adb = GridSearchCV(adb, param_grid, cv=5)
CV_adb.fit(X_train, y_train)


In [None]:
# Store the AdaBoost predictions in `y_pred`. Previously the result was only
# displayed, so the accuracy cell below scored the stale SVC predictions and
# reported the SVC accuracy under the AdaBoost name.
y_pred = CV_adb.predict(X_test)

In [None]:
# Score whatever `y_pred` currently holds against the test labels.
# NOTE(review): confirm the preceding cell stored the AdaBoost predictions in
# `y_pred`; otherwise this measures an earlier model's output.
adb_acc = accuracy_score(y_test, y_pred)
print(adb_acc)

In [None]:
# Append the `adb_acc` score to the comparison table.
# ignore_index=True renumbers the rows; without it every appended row keeps
# index label 0 and the table ends up with duplicate labels.
tempResults = pd.DataFrame({'Algorithm':['Adaboost Classifier Method'], 'Accuracy':[adb_acc]})
results = pd.concat([results, tempResults], ignore_index=True)
results = results[['Algorithm','Accuracy']]
results