In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Read the dataset CSV into a DataFrame.
DATA_PATH = "../input/breast-cancer-wisconsin-data/data.csv"
df = pd.read_csv(DATA_PATH)

In [None]:
# Preview the first rows of the freshly loaded table.
df.head()

In [None]:
# (rows, columns) of the raw table.
df.shape

In [None]:
# List the column names of the raw table.
df.columns

In [None]:
# Per-column dtype and non-null count.
df.info()

In [None]:
# Summary statistics; include = "all" also covers non-numeric columns.
df.describe(include = "all")

In [None]:
# Number of missing (NaN) entries in each column.
df.isna().sum()

In [None]:
# Remove the "Unnamed: 32" column (presumably the empty column surfaced by the
# missing-value count above — confirm against that output).
# errors="ignore" makes the cell safe to re-run: without it a second run
# raises KeyError because the column has already been dropped. It also
# tolerates an input file that never had the column.
df = df.drop(columns=["Unnamed: 32"], errors="ignore")

In [None]:
# Group the rows by diagnosis label so each class can be fetched with get_group().
diag_group = df.groupby("diagnosis")

In [None]:
# Report how many rows carry the "M" diagnosis label.
malignant_rows = diag_group.get_group("M")
print("Total number of rows diagnosed malignant: ", len(malignant_rows))

In [None]:
# Report how many rows carry the "B" diagnosis label.
benign_rows = diag_group.get_group("B")
print("Total number of rows diagnosed benign: ", len(benign_rows))

# Feature Selection

In [None]:
from sklearn import preprocessing

# Remove the id column before modelling.
df = df.drop(columns=["id"])

# Encoding categorical data values: LabelEncoder replaces every diagnosis
# label with an integer code.
le = preprocessing.LabelEncoder()
df["diagnosis"] = le.fit_transform(df["diagnosis"])
df.head()

In [None]:
import seaborn as sns

# Pairwise correlation matrix of all columns, rendered as a heatmap.
# `corr` is reused by the correlation-based column filter further down.
corr = df.corr()
sns.heatmap(data=corr)

In [None]:
#fill true value array with shape of corr.shape[0]
#print(np.full(corr.shape[0],True, dtype=bool))

In [None]:
#Compare correlation between the features and remove features that have correlation higher than 0.9
# For every pair (i, j) with i < j, column j is flagged for removal when the
# absolute correlation reaches 0.9. Using the absolute value means a strongly
# negative pair is treated as redundant too. "diagnosis" is the prediction
# target rather than a feature, so it is neither used as a reference column
# nor ever removed; a feature that tracks the target closely must not be
# discarded for that reason.
abs_corr = corr.abs().to_numpy()
is_target = (corr.columns == "diagnosis")
columns = np.full((corr.shape[0],), True, dtype=bool)
for i in range(corr.shape[0]):
    if is_target[i]:
        continue
    for j in range(i+1, corr.shape[0]):
        if not is_target[j] and abs_corr[i,j] >= 0.9:
            columns[j] = False
# Select through corr.columns so the boolean mask always has a matching
# length; indexing df.columns fails with a length mismatch when this cell is
# re-run after df has already been narrowed.
selected_columns = corr.columns[columns]
df = df[selected_columns]

In [None]:
# (rows, columns) after the correlation-based column filter.
df.shape

In [None]:
import statsmodels.api as sm

# Fit an ordinary-least-squares regression of the first column on all
# remaining columns and collect one p-value per predictor. The next step
# removes predictors whose p-value is above 0.05, treating them as not
# contributing significantly to predicting the cancer type.
# NOTE(review): sm.OLS does not add an intercept on its own and none is added
# here, so this is a regression through the origin — confirm that is intended.
Y = df.iloc[:, 0]    # response: first column of df
X = df.iloc[:, 1:]   # predictors: every other column
mod = sm.OLS(Y, X)
fii = mod.fit()
coef_table = fii.summary2().tables[1]
# Reshape the 'P>|t|' column into a two-column frame: feature name, p-value.
p_values = (
    coef_table['P>|t|']
    .rename_axis('feature')
    .reset_index(name='PVal')
)

In [None]:
# Display the feature / p-value table.
p_values

In [None]:
# Remove every feature whose p-value is above the significance threshold.
threshold = 0.05
weak_features = p_values.loc[p_values['PVal'] > threshold, 'feature'].tolist()
df = df.drop(columns=weak_features)

In [None]:
# (rows, columns) after the p-value filter.
df.shape

In [None]:
# Columns that remain after the p-value filter.
df.columns

# Outlier Identification

In [None]:
import seaborn as sns
# Box plot of concavity_mean to look for extreme values.
sns.boxplot(x = df['concavity_mean'])

In [None]:
# Set aside the rows whose concavity_mean is strictly above 0.25.
outlier = df.loc[df['concavity_mean'].gt(0.25)]

In [None]:
# Keep only rows whose concavity_mean is strictly below 0.25
# (a row equal to exactly 0.25 is removed as well).
df = df.loc[df['concavity_mean'].lt(0.25)]

In [None]:
# Redraw the concavity_mean box plot now that only rows below 0.25 remain.
sns.boxplot(x = df['concavity_mean'])

In [None]:
# (rows, columns) after the concavity_mean filter.
df.shape

In [None]:
# Box plot of radius_mean to look for extreme values.
sns.boxplot(x = df['radius_mean'])

In [None]:
# Store the rows with radius_mean above 21 in `outlier`, keep only the rows
# below 21 in df, then redraw the box plot. A row equal to exactly 21 ends up
# in neither.
radius = df['radius_mean']
outlier = df.loc[radius > 21]
df = df.loc[radius < 21]
sns.boxplot(x = df['radius_mean'])

# Machine Learning Application

In [None]:
from sklearn.model_selection import train_test_split

# Separate the diagnosis target from the predictors, then hold out part of
# the rows for testing (fixed random_state for a repeatable split).
# NOTE(review): test_size = 0.6 places 60% of the rows in the test split and
# leaves 40% for training — confirm this ratio is intended.
Y = df['diagnosis']
X = df.drop(columns='diagnosis')
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size = 0.6, random_state = 0
)

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the features. The scaler is fitted on the training split only
# and the same fitted transformation is then applied to both splits.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score

# Logistic regression: fit on the training split, predict the test split and
# print the confusion matrix followed by the accuracy.
# This cell also (re)creates clf_scores, the list the later model cells
# append their accuracy percentages to.
clf = LogisticRegression(random_state = 0).fit(X_train, Y_train)
Y_pred = clf.predict(X_test)
acc_logreg = accuracy_score(Y_test, Y_pred)
cm = confusion_matrix(Y_test, Y_pred)
clf_scores = [acc_logreg * 100]
print(cm, acc_logreg, sep="\n")

In [None]:
#KNN
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score

# Compare candidate neighbour counts with 5-fold cross-validation on the
# training split. The test split is deliberately not used here: picking
# n_neighbors from test accuracy and then reporting accuracy on that same
# split would make the reported figure optimistically biased.
k_values = list(range(1,7))
list1 = []
for neighbors in k_values:
    clf = KNeighborsClassifier(n_neighbors=neighbors, metric='minkowski')
    fold_scores = cross_val_score(clf, X_train, Y_train, cv=5, scoring='accuracy')
    list1.append(fold_scores.mean())
plt.plot(k_values, list1)
plt.xlabel("n_neighbors")
plt.ylabel("Mean cross-validated accuracy")
plt.show()

In [None]:
# Final KNN model with 6 neighbours: fit, predict the test split, print the
# confusion matrix and accuracy, and record the accuracy percentage.
# NOTE(review): the score is stored under the name acc_logreg although it
# belongs to the KNN model.
clf = KNeighborsClassifier(n_neighbors = 6).fit(X_train, Y_train)
Y_pred = clf.predict(X_test)
acc_logreg = accuracy_score(Y_test, Y_pred)
cm = confusion_matrix(Y_test, Y_pred)
clf_scores.append(acc_logreg * 100)
print(cm, acc_logreg, sep="\n")

In [None]:
#SVM
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix, accuracy_score

# Compare candidate C values for an RBF-kernel SVM with 5-fold
# cross-validation on the training split. The test split is not touched
# here, so the accuracy reported for the final model is not biased by this
# selection step.
c_values = [0.5,0.6,0.7,0.8,0.9,1.0]
list1 = []
for c in c_values:
    clf = SVC(C = c, random_state=0, kernel = 'rbf')
    fold_scores = cross_val_score(clf, X_train, Y_train, cv=5, scoring='accuracy')
    list1.append(fold_scores.mean())
plt.plot(c_values, list1)
plt.xlabel("C")
plt.ylabel("Mean cross-validated accuracy")
plt.show()

In [None]:
from sklearn.svm import SVC

# Final RBF-kernel SVM with C = 0.8: fit, predict the test split, print the
# confusion matrix and accuracy, and record the accuracy percentage.
# NOTE(review): the score is stored under the name acc_logreg although it
# belongs to the SVM model.
clf = SVC(kernel = 'rbf', C = 0.8, random_state=0)
clf.fit(X_train, Y_train)
Y_pred = clf.predict(X_test)
acc_logreg = accuracy_score(Y_test, Y_pred)
cm = confusion_matrix(Y_test, Y_pred)
clf_scores.append(acc_logreg * 100)
print(cm, acc_logreg, sep="\n")

In [None]:
#Decision Tree Classifier
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score

# Compare candidate max_leaf_nodes values with 5-fold cross-validation on the
# training split. The test split is not touched here, so the accuracy
# reported for the final tree is not biased by this selection step.
leaf_counts = list(range(2,10))
list1 = []
for leaves in leaf_counts:
    clf = DecisionTreeClassifier(max_leaf_nodes = leaves, random_state=0, criterion='entropy')
    fold_scores = cross_val_score(clf, X_train, Y_train, cv=5, scoring='accuracy')
    list1.append(fold_scores.mean())
plt.plot(leaf_counts, list1)
plt.xlabel("max_leaf_nodes")
plt.ylabel("Mean cross-validated accuracy")
plt.show()

In [None]:
# Final entropy-criterion decision tree limited to 8 leaf nodes: fit, predict
# the test split, print the confusion matrix and accuracy, and record the
# accuracy percentage.
# NOTE(review): the score is stored under the name acc_logreg although it
# belongs to the decision tree.
clf = DecisionTreeClassifier(criterion='entropy', max_leaf_nodes = 8, random_state=0)
clf.fit(X_train, Y_train)
Y_pred = clf.predict(X_test)
acc_logreg = accuracy_score(Y_test, Y_pred)
cm = confusion_matrix(Y_test, Y_pred)
clf_scores.append(acc_logreg * 100)
print(cm, acc_logreg, sep="\n")

In [None]:
# Accuracy percentages collected so far, in the order the model cells appended them.
clf_scores

In [None]:
import matplotlib.pyplot as plt

# Bar chart comparing the test accuracy (in percent) recorded for each model.
# x holds the bar heights and y the bar labels; they are matched by position.
x = clf_scores
y = ["LogisticRegression", "KNN", "SVM", "DecisionTree"]
# clf_scores is filled by appends in the model cells, so re-running one of
# those cells adds a duplicate entry and the scores stop lining up with the
# labels. Fail with an actionable message instead of matplotlib's generic
# shape-mismatch error.
if len(x) != len(y):
    raise ValueError(
        "expected {} scores (one per model) but clf_scores has {}; "
        "re-run the model cells in order, starting with the logistic "
        "regression cell".format(len(y), len(x))
    )
plt.bar( y,x,color=['aqua', 'coral', 'gold', 'orchid'])
plt.ylabel("Accuracy (%)")
plt.title("Test accuracy by classifier")
plt.show()