In [7]:
import pandas as pd #to read the dataset and perform data cleaning
import numpy as np #to convert pandas dataframe to numpy array
from sklearn import preprocessing
from sklearn.model_selection import train_test_split #to split dataset into training, validation and test
from sklearn.metrics import accuracy_score, classification_report #to find accuracy, precision and recall metrics
import matplotlib.pyplot as plt #to plot curves

#reading the dataset
fullDataSet = pd.read_csv("/Users/deep/Downloads/CSE574/Projects/Project 1/proj1code/wdbc.csv", header = None)

#dropping the 'ID' column from the dataset
fullDataSet = fullDataSet.drop(fullDataSet.columns[0], axis=1) 

#adding headers to the file for ease-of-use
fullDataSet.columns = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "aa", "ab", "ac", "ad", "ae"]

#mapping Malignant and Benign to 1 and 0
fullDataSet.a = fullDataSet.a.replace('M',1)
fullDataSet.a = fullDataSet.a.replace('B',0)

#partitioning the data into training, validation and test sets into 80%, 20% and 20%
X_train, X_valtest = train_test_split(fullDataSet, test_size=0.2)
X_val, X_test = train_test_split(X_valtest, test_size=0.5)

#separating target values from sets before normalizing the features
Y_train = X_train.a.value
Y_val = X_val.a.values
Y_test = X_test.a.values

#dropping the target values column from each set
X_train = X_train.drop(X_train.columns[0], axis=1)
X_val = X_val.drop(X_val.columns[0], axis=1)
X_test = X_test.drop(X_test.columns[0], axis=1)

#defining normaliize method to convert pandas dataframes to a numpy arrays and normalize
def normalize(dframe):
    Xtrain = dframe.values
    normalizer = preprocessing.MinMaxScaler()
    Xtrain = normalizer.fit_transform(Xtrain)
    return pd.DataFrame(Xtrain)

#normalizing the datasets
X_train = normalize(X_train)
X_val = normalize(X_val)
X_test = normalize(X_test)


#transforming the sets for matrix multiplication and fitting to equation z = wx + b
X_train = X_train.T
X_val = X_val.T
X_test = X_test.T

Y_train = Y_train.reshape(1, Y_train.shape[0])
Y_val = Y_val.reshape(1, Y_val.shape[0])
Y_test = Y_test.reshape(1, Y_test.shape[0])


# ---logistic regression implementation start---

#defining sigmoid function for logistic regression
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

#function to convert sigmoid values to 0 and 1
def predicted_values(p):
    p[p>=0.5] = 1
    p[p<0.5] = 0
    return p

#defining arrays to get training and validation costs after each epoch
losstrack_train = []
losstrack_val = []

#defining arrays to get training and validation accuracy after each epoch
accuracy_train = []
accuracy_val = []

#m is the sample size of training and validation sets
m_train = X_train.shape[1]
m_val = X_val.shape[1]

#initializing the weights for each of 30 parameters with random values and bias with zero
w = np.random.randn(X_train.shape[0], 1)*0.01
b = 0

#hyperparameters for the model
epochs = 10000
learningrate = 0.01

for epoch in range(epochs):
    
    #calculating value of sigmoid for training and validation sets respectively
    z = np.dot(w.T, X_train) + b
    z_val = np.dot(w.T, X_val) + b
    
    p_train = sigmoid(z)
    p_val = sigmoid(z_val)
    
    #calculating costs for each training and validation data set and averaging them out over sample sizes
    cost_train = -np.sum(np.multiply(np.log(p_train), Y_train) + np.multiply((1 - Y_train), np.log(1 - p_train)))/m_train
    cost_val = -np.sum(np.multiply(np.log(p_val), Y_val) + np.multiply((1 - Y_val), np.log(1 - p_val)))/m_val
    
    #storing the cost for each epoch
    losstrack_train.append(np.squeeze(cost_train))
    losstrack_val.append(np.squeeze(cost_val))
    
    #difference between predicted and target values
    dz = p_train-Y_train
    
    #Calculating gradients of weights and bias
    dw = (np.dot(X_train, dz.T))/m_train
    db = (np.sum(dz))/m_train
    
    #updating the weights and bias
    w = w - learningrate * dw
    b = b - learningrate * db
    
    #calculating accuracy for traing and validation sets and storing them in arrays
    p_train = predicted_values(p_train)
    accuracy_train.append(accuracy_score(Y_train[0], p_train[0]))
    
    p_val = predicted_values(p_val)
    accuracy_val.append(accuracy_score(Y_val[0], p_val[0]))
    
# ---logistic regression implementation end---

    
#plotting cost and accuracy curves with respect to epochs
loss_train = plt.plot(losstrack_train, label = 'Training Loss')
loss_val = plt.plot(losstrack_val, label = 'Validation Loss')
plt.title('Epochs:10000, Learning rate: 0.01, Cost Plot')
plt.legend()
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.show()

acc_train = plt.plot(accuracy_train, label = 'Training Accuracy')
acc_val = plt.plot(accuracy_val, label = 'Validation Accuracy')
plt.title('Epochs:10000, Learning rate: 0.01, Accuracy Plot')
plt.legend()
plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.show()

#running the model on test set and calculating accuracy
z_test = np.dot(w.T, X_test) + b
p_test = sigmoid (z_test)
p_test = predicted_values(p_test)
accuracy_test = accuracy_score(Y_test[0], p_test[0])

#calculating accuracy, precision, recall and fscore for all sets

target_names = ['Benign', 'Malignant']

print '\033[1m' + 'For Training dataset:' + '\033[0m', '\n' '\n', "Accuracy is:", '{:.1%}'.format(accuracy_train[epochs-1])
print classification_report(Y_train[0], p_train[0], target_names = target_names)

print '\033[1m'+ 'For Validation dataset:'+ '\033[0m', '\n' '\n', "Accuracy is:", '{:.1%}'.format(accuracy_val[epochs-1])
print classification_report(Y_val[0], p_val[0], target_names = target_names)

print '\033[1m' + 'For Test dataset:' + '\033[0m', '\n' '\n', "Accuracy is:",'{:.1%}'.format(accuracy_test)
print classification_report(Y_test[0], p_test[0], target_names = target_names)

AttributeError: 'Series' object has no attribute 'value'