In [1]:
import time
from tqdm import *
import random
from math import *
import operator
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline 

import seaborn as sns
sns.set(style="white", color_codes=True)

from pandas.plotting import scatter_matrix
from sklearn import preprocessing, cross_validation
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

from sklearn.neighbors import KNeighborsClassifier

ImportError: cannot import name 'cross_validation' from 'sklearn' (C:\Users\Amrita\anaconda3\lib\site-packages\sklearn\__init__.py)

'''
4. Relevant Information:
   Samples arrive periodically as Dr. Wolberg reports his clinical cases.
   The database therefore reflects this chronological grouping of the data.
   This grouping information appears immediately below, having been removed
   from the data itself:
     Group 1: 367 instances (January 1989)
     Group 2:  70 instances (October 1989)
     Group 3:  31 instances (February 1990)
     Group 4:  17 instances (April 1990)
     Group 5:  48 instances (August 1990)
     Group 6:  49 instances (Updated January 1991)
     Group 7:  31 instances (June 1991)
     Group 8:  86 instances (November 1991)
     -----------------------------------------
     Total:   699 points (as of the donated datbase on 15 July 1992)

   Note that the results summarized above in Past Usage refer to a dataset
   of size 369, while Group 1 has only 367 instances.  This is because it
   originally contained 369 instances; 2 were removed.  The following
   statements summarizes changes to the original Group 1's set of data:

   #####  Group 1 : 367 points: 200B 167M (January 1989)
   #####  Revised Jan 10, 1991: Replaced zero bare nuclei in 1080185 & 1187805
   #####  Revised Nov 22,1991: Removed 765878,4,5,9,7,10,10,10,3,8,1 no record
   #####                  : Removed 484201,2,7,8,8,4,3,10,3,4,1 zero epithelial
   #####                  : Changed 0 to 1 in field 6 of sample 1219406
   #####                  : Changed 0 to 1 in field 8 of following sample:
   #####                  : 1182404,2,3,1,1,1,2,0,1,1,1

5. Number of Instances: 699 (as of 15 July 1992)
6. Number of Attributes: 10 plus the class attribute
7. Attribute Information: (class attribute has been moved to last column)
   #  Attribute                     Domain
   -- -----------------------------------------
   1. Sample code number            id number
   2. Clump Thickness               1 - 10
   3. Uniformity of Cell Size       1 - 10
   4. Uniformity of Cell Shape      1 - 10
   5. Marginal Adhesion             1 - 10
   6. Single Epithelial Cell Size   1 - 10
   7. Bare Nuclei                   1 - 10
   8. Bland Chromatin               1 - 10
   9. Normal Nucleoli               1 - 10
  10. Mitoses                       1 - 10
  11. Class:                        (2 for benign, 4 for malignant)

8. Missing attribute values: 16
   There are 16 instances in Groups 1 to 6 that contain a single missing 
   (i.e., unavailable) attribute value, now denoted by "?".  
9. Class distribution:

   Benign: 458 (65.5%)
   Malignant: 241 (34.5%)
'''

In [2]:
location = r"E:\MYLEARN\2-ANALYTICS-DataScience\datasets\breastcancer_orig.csv"

In [3]:
# load the training data from breast cancer data set
df_training = pd.read_csv(location)

In [4]:
# Handle missing value
df_training.replace('?', -99999, inplace=True)

In [5]:
# remove the code column
df_training.drop(['code'], axis=1, inplace=True)

In [6]:
df_training.shape[0]-1

698

In [7]:
# Extract % samples as test from the training file

pct_test = 20
test_patterns = (df_training.shape[0]-1) * pct_test/100

index_list = random.sample(range(0, df_training.shape[0]-1), int(test_patterns))

In [8]:
# initialize a test dataframe
df_test = pd.DataFrame()
df_test

In [9]:
# copy the n training samples as test & drop the rows from the training df
for elem in index_list:
    df_test = df_test.append(df_training.iloc[elem: elem+1])

# drop the rows from the training set (which r now in test dataframe)
df_training.drop(df_training.index[index_list], inplace=True)     

In [10]:
df_training = df_training.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

print(df_training.shape)
print(df_test.shape)

(560, 10)
(139, 10)


In [11]:
# add a column for 'predicted_class'
df_test['predicated_class'] = ''

In [12]:
df_training['distance'] = 0

# set the value of k number of neighbors to choose.
# the code will execute for k = 1 to specified value of k and store the accuracy % in a temp DF
k = 3

# initialize a test kacc results dataframe
df_kacc = pd.DataFrame()

df_kacc['k'] = 0
df_kacc['accuracy_pct'] = 0

In [None]:
%%time
features_list = ['Clump_Thickness','Cell_Size','Cell_Shape','Adhesion','Epithelial_Cell_Size','Bare_Nuclei',
                 'Bland_Chromatin', 'Normal_Nucleoli','Mitoses']

for k_val in range(1, k+1):
    
    # for each test sample
    for index_test, row_test in df_test.iterrows():

        # for each row in the dataframe, calculate the distance
        for index, row in df_training.iterrows():
            
            # initialize dist_sq 
            dist_sq = 0
            for feature in features_list:
                dist_sq = dist_sq + (float(row_test[feature]) - float(row[feature])) ** 2

            eucDist = sqrt(dist_sq)
            df_training.loc[index, 'distance'] = eucDist

        # sort on distance, ascending.
        df_training.sort_values('distance', ascending=True, inplace=True)

        # select the first k rows, into a new df
        df_training_k = df_training.iloc[0:k_val, :].copy()

        df_training_k_grouped = df_training_k['Class']

        # predicted class
        pred_class = df_training_k_grouped.max()

        # save the predicated class in the test data frame
        df_test.at[index_test, 'predicted_class'] = pred_class
        
    # “accuracy” as a metric. 
    # Accuracy is the ratio of no. of data points correctly classified to total no. of data points.
    # on test data set
    
    # find the mismatch count of class predication
    no_mismatch_class = df_test.loc[~(df_test['predicted_class'] == df_test['Class'])].shape[0]

    # accuracy of prediction
    accuracy_pct = 100-(no_mismatch_class/ df_test.shape[0])*100

    print('For k = {}, Accuracy = {:6.2f} %'.format(k_val, accuracy_pct))
        
    # store the value of k and accuracy %
    df_kacc.at[k_val, 'k'] = k_val
    df_kacc.at[k_val, 'accuracy_pct'] = accuracy_pct

In [None]:
df_kacc

In [None]:
df_kacc.plot.line(x='k', y='accuracy_pct')