## Project summary:
- The objective of the project was to build various models and compare their prediction performance based on accuracy.



In [None]:
#Importing essential packages and modules
%matplotlib inline
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

import warnings
warnings.simplefilter('ignore')

# Loading the breast cancer dataset from a CSV file

# Data Cleaning and preprocessing 

In [None]:
# Read the breast cancer CSV into a DataFrame; index_col=False stops pandas
# from using the first column as the row index.
brstcancer_data = pd.read_csv(
    '../input/breast-cancer-dataset/Breast Cancer Data.csv',
    index_col=False,
)

In [None]:
# Remove the leftover 'Unnamed: 0' column written by the CSV export, then
# preview the first rows.
# Reassigning (rather than inplace=True) with errors='ignore' keeps this cell
# idempotent: running it a second time no longer raises KeyError.
brstcancer_data = brstcancer_data.drop(columns='Unnamed: 0', errors='ignore')
brstcancer_data.head()

In [None]:
# Show the dimensions of the dataset as (rows, columns)
brstcancer_data.shape

In [None]:
# Quick yes/no check: True if at least one cell in the frame is null
brstcancer_data.isnull().values.any() 

In [None]:
# Count the missing values in each column
brstcancer_data.isnull().sum()

In [None]:
# Total number of missing values across the whole frame
brstcancer_data.isnull().sum().sum() # sum of the per-column counts

In [None]:
# Summary statistics of the dataset via describe()
brstcancer_data.describe()

In [None]:
# Frequency table: number of samples per diagnosis class
diagnosis_column = brstcancer_data['diagnosis']
pd.crosstab(diagnosis_column, 'count')

In [None]:
# Confirm the type of the loaded object
type(brstcancer_data)

In [None]:
# Same describe() summary, reshaped with unstack() for a long-form view
brstcancer_data.describe().unstack()

# Violin Plot of features

In [None]:
# Violin plot comparing the diagnosis classes on the first 15 features
import seaborn as sns

data_dia = brstcancer_data['diagnosis']
feature_frame = brstcancer_data.drop(columns='diagnosis')

# Standardize every feature (zero mean, unit std) so they share one y-axis
data_n_2 = (feature_frame - feature_frame.mean()) / feature_frame.std()

# Re-attach the label to the first 15 standardized features and reshape to
# long form: one row per (diagnosis, feature, value)
labelled_subset = pd.concat(
    [brstcancer_data['diagnosis'], data_n_2.iloc[:, 0:15]],
    axis=1,
)
data = labelled_subset.melt(
    id_vars="diagnosis",
    var_name="features",
    value_name='value',
)

plt.figure(figsize=(14, 5))
sns.violinplot(
    x="features",
    y="value",
    hue="diagnosis",
    data=data,
    split=True,
    inner="quart",
)
plt.xticks(rotation=45, fontsize=13)

## Splitting the dataset into train and test

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
from sklearn.preprocessing import LabelEncoder

# Column 0 is used as the class label and columns 1..30 as the features
# (assumes 'diagnosis' is the first column after the index column was dropped)
raw_values = brstcancer_data.values
X = raw_values[:, 1:31]

# The estimators need numeric targets, so the string class labels (M and B)
# are mapped to integers
le = LabelEncoder()
Y = le.fit_transform(raw_values[:, 0])

In [None]:
# Hold out 30% of the samples for testing; the fixed random_state makes the
# split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=12,
)

# Collects one accuracy percentage per model as the sections below run
accuracy = list()

# Random Forest

In [None]:
# Initialize the random forest with default hyperparameters.
# A fixed random_state makes the fitted forest, and therefore the reported
# accuracy, reproducible across Restart & Run All.
model1 = RandomForestClassifier(random_state=12)

In [None]:
# Train the random forest on the training split
model1.fit(x_train, y_train)

In [None]:
# Predict labels for the held-out test split
predict1 = model1.predict(x_test)

In [None]:
# Fraction of test samples the random forest classified correctly
acc1 = accuracy_score(y_test, predict1)
# Store the score as a percentage rounded to two decimals. int() truncation
# understates the score and can be off by one through floating-point error
# (e.g. 0.29 * 100 evaluates to 28.999999999999996).
accuracy.append(round(acc1 * 100, 2))
acc1

# KNN

In [None]:
# Create a k-nearest-neighbours classifier with default hyperparameters
model2 = KNeighborsClassifier()

In [None]:
# Train the KNN classifier on the training split
model2.fit(x_train, y_train)

In [None]:
# Predict labels for the held-out test split
predict2 = model2.predict(x_test)

In [None]:
# Fraction of test samples the KNN classifier classified correctly
acc2 = accuracy_score(y_test, predict2)
# Store the score as a percentage rounded to two decimals. int() truncation
# understates the score and can be off by one through floating-point error
# (e.g. 0.29 * 100 evaluates to 28.999999999999996).
accuracy.append(round(acc2 * 100, 2))
acc2

# SVM 


In [None]:
# Create a support vector classifier with default hyperparameters
model3 = SVC()

In [None]:
# Train the SVM on the training split
model3.fit(x_train, y_train)

In [None]:
# Predict labels for the held-out test split
predict3 = model3.predict(x_test)

In [None]:
# Fraction of test samples the SVM classified correctly
acc3 = accuracy_score(y_test, predict3)
# Store the score as a percentage rounded to two decimals. int() truncation
# understates the score and can be off by one through floating-point error
# (e.g. 0.29 * 100 evaluates to 28.999999999999996).
accuracy.append(round(acc3 * 100, 2))
acc3

# Comparison

# Conclusion

The Logistic Regression model performed best at classifying the dataset, with an accuracy of 94.15%.

In [None]:
from sklearn.model_selection import cross_val_score, KFold

# Test options and evaluation metric
num_folds = 4  # the per-fold line plots that follow draw exactly 4 points per model
num_instances = len(x_train)
seed = 7
scoring = 'accuracy'

# Spot-check algorithms, all with default hyperparameters. The random forest
# is seeded so its cross-validation scores are repeatable between runs.
models = [
    ('LR', LogisticRegression()),
    ('KNN', KNeighborsClassifier()),
    ('RF', RandomForestClassifier(random_state=seed)),
    ('SVM', SVC()),
]

# KFold only accepts random_state together with shuffle=True; current
# scikit-learn raises ValueError for random_state + shuffle=False. The folds
# stay contiguous and unshuffled, as before. The splitter does not depend on
# the model, so it is built once outside the loop.
kfold = KFold(n_splits=num_folds, shuffle=False)

results = []  # one array of per-fold accuracies per model
names = []    # model labels, in the same order as `results`
for name, model in models:
    cv_results = cross_val_score(model, x_train, y_train, cv=kfold, scoring=scoring)
    print("accuracies for", name)
    print(cv_results)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)
# Report the fold count that was actually used (the old text claimed 10-fold)
print('-> %d-Fold cross-validation accuracy score for the training data for four classifiers' % num_folds)

In [None]:
import matplotlib.pyplot as pylt

# Line plot of the per-fold cross-validation accuracy for Logistic Regression
# (results[0]). Positions and tick labels are derived from the number of
# scores, so the plot stays valid if the fold count changes.
fold_scores = results[0]
Index = list(range(1, len(fold_scores) + 1))
name = ["Iteration %d" % i for i in range(len(fold_scores))]
pylt.plot(Index, fold_scores)
pylt.xticks(Index, name, rotation=45)
pylt.yticks(np.arange(0.9, 1.2, 0.1))
pylt.ylabel('Accuracy')
# The x axis enumerates CV folds, not models
pylt.xlabel('Cross-validation fold')
pylt.title('Comparison of accuracy for each Iteration in LR')

In [None]:
import matplotlib.pyplot as pylt

# Line plot of the per-fold cross-validation accuracy for KNN (results[1]).
# Positions and tick labels are derived from the number of scores, so the
# plot stays valid if the fold count changes.
fold_scores = results[1]
Index = list(range(1, len(fold_scores) + 1))
name = ["Iteration %d" % i for i in range(len(fold_scores))]
pylt.plot(Index, fold_scores)
pylt.xticks(Index, name, rotation=45)
pylt.yticks(np.arange(0.8, 1.2, 0.1))
pylt.ylabel('Accuracy')
# The x axis enumerates CV folds, not models
pylt.xlabel('Cross-validation fold')
pylt.title('Comparison of accuracy for each Iteration in KNN')

In [None]:
import matplotlib.pyplot as pylt

# Line plot of the per-fold cross-validation accuracy for Random Forest
# (results[2]). Positions and tick labels are derived from the number of
# scores, so the plot stays valid if the fold count changes.
fold_scores = results[2]
Index = list(range(1, len(fold_scores) + 1))
name = ["Iteration %d" % i for i in range(len(fold_scores))]
pylt.plot(Index, fold_scores)
pylt.xticks(Index, name, rotation=45)
pylt.yticks(np.arange(0.9, 1.2, 0.1))
pylt.ylabel('Accuracy')
# The x axis enumerates CV folds, not models
pylt.xlabel('Cross-validation fold')
pylt.title('Comparison of accuracy for each Iteration in RF')

In [None]:
import matplotlib.pyplot as pylt

# Line plot of the per-fold cross-validation accuracy for SVM (results[3]).
# Positions and tick labels are derived from the number of scores, so the
# plot stays valid if the fold count changes.
fold_scores = results[3]
Index = list(range(1, len(fold_scores) + 1))
name = ["Iteration %d" % i for i in range(len(fold_scores))]
pylt.plot(Index, fold_scores)
pylt.xticks(Index, name, rotation=45)
pylt.yticks(np.arange(0.5, 0.9, 0.1))
pylt.ylabel('Accuracy')
# The x axis enumerates CV folds, not models
pylt.xlabel('Cross-validation fold')
pylt.title('Comparison of accuracy for each Iteration in SVM')

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
# Compare Algorithms
fig = plt.figure()
fig.suptitle( 'Algorithm Comparison' )
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.show()

Observation
The results show a similar distribution of accuracies for all classifiers except SVM, which suggests low variance.

It is possible that the differing distributions of the attributes affect the accuracy of algorithms such as SVM. We will therefore repeat the above comparison on a standardized copy of the training dataset.

# standardised data

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Standardize the dataset: each classifier is wrapped in a Pipeline with a
# StandardScaler, so scaling is fitted inside cross_val_score for each fold.
# The random forest is seeded so its scores are repeatable between runs.
pipelines = [
    ('ScaledLR', Pipeline([('Scaler', StandardScaler()),
                           ('LR', LogisticRegression())])),
    ('ScaledKNN', Pipeline([('Scaler', StandardScaler()),
                            ('KNN', KNeighborsClassifier())])),
    ('ScaledRF', Pipeline([('Scaler', StandardScaler()),
                           ('RF', RandomForestClassifier(random_state=seed))])),
    ('ScaledSVM', Pipeline([('Scaler', StandardScaler()),
                            ('SVM', SVC())])),
]

# KFold only accepts random_state together with shuffle=True; current
# scikit-learn raises ValueError for random_state + shuffle=False. The same
# 4 contiguous folds as the unscaled comparison are used, and the splitter is
# built once because it does not depend on the model.
kfold = KFold(n_splits=4, shuffle=False)

results = []  # one array of per-fold accuracies per pipeline
names = []    # pipeline labels, in the same order as `results`
for name, model in pipelines:
    cv_results = cross_val_score(model, x_train, y_train, cv=kfold,
                                 scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

In [None]:
# Box plot of the per-fold accuracies of the standardized pipelines, one box
# per pipeline
fig, ax = plt.subplots()
fig.suptitle('Scaled Algorithm Comparison')
ax.boxplot(results)
ax.set_xticklabels(names)
plt.show()

The results show that standardization of the data has lifted the skill of SVM to be the most accurate algorithm tested so far.