In [5]:
import pandas as pd # Import Pandas library 
import numpy as np # Import Numpy library
#import five_fold_stratified_cv
#import logistic_regression
 
# File name: logistic_regression_driver.py
# Author: Addison Sears-Collins
# Date created: 7/19/2019
# Python version: 3.7
# Description: Driver of the logistic_regression.py program
 
# Required Data Set Format for Disrete Class Values
# Columns (0 through N)
# 0: Instance ID
# 1: Attribute 1 
# 2: Attribute 2
# 3: Attribute 3 
# ...
# N: Actual Class
 
# The logistic_regression.py program then adds 2 additional columns 
# for the test set.
# N + 1: Predicted Class
# N + 2: Prediction Correct? (1 if yes, 0 if no)
 
ALGORITHM_NAME = "Logistic Regression"
SEPARATOR = ","  # Separator for the data set (e.g. "\t" for tab data)
 
def main():
 
    print("Welcome to the " +  ALGORITHM_NAME + " Program!")
    print()
 
    # Directory where data set is located
    #data_path = input("Enter the path to your input file: ") 
    data_path = "data_set.csv"
 
    # Read the full text file and store records in a Pandas dataframe
    pd_data_set = pd.read_csv(data_path, sep=SEPARATOR)
 
    # Show functioning of the program
    trace_runs_file = "trace_runs_file.txt"
    #trace_runs_file = "iris_logistic_regression_trace_runs.txt"
 
    # Open a new file to save trace runs
    outfile_tr = open(trace_runs_file,"w") 
 
    # Testing statistics
    test_stats_file = "test_stats_file.txt" 
    #test_stats_file = "iris_logistic_regression_test_stats.txt"
 
    # Open a test_stats_file 
    outfile_ts = open(test_stats_file,"w")
 
    # The number of folds in the cross-validation
    NO_OF_FOLDS = 5
 
    # Generate the five stratified folds
    fold0, fold1, fold2, fold3, fold4 = get_five_folds(
        pd_data_set)
 
    training_dataset = None
    test_dataset = None
 
    # Create an empty array of length 5 to store the accuracy_statistics 
    # (classification accuracy)
    accuracy_statistics = np.zeros(NO_OF_FOLDS)
 
    # Run Logistic Regression the designated number of times as indicated by the 
    # number of folds
    for experiment in range(0, NO_OF_FOLDS):
 
        print()
        print("Running Experiment " + str(experiment + 1) + " ...")
        print()
        outfile_tr.write("Running Experiment " + str(experiment + 1) + " ...\n")
        outfile_tr.write("\n")
 
        # Each fold will have a chance to be the test data set
        if experiment == 0:
            test_dataset = fold0
            training_dataset = pd.concat([
               fold1, fold2, fold3, fold4], ignore_index=True, sort=False)                
        elif experiment == 1:
            test_dataset = fold1
            training_dataset = pd.concat([
               fold0, fold2, fold3, fold4], ignore_index=True, sort=False) 
        elif experiment == 2:
            test_dataset = fold2
            training_dataset = pd.concat([
               fold0, fold1, fold3, fold4], ignore_index=True, sort=False) 
        elif experiment == 3:
            test_dataset = fold3
            training_dataset = pd.concat([
               fold0, fold1, fold2, fold4], ignore_index=True, sort=False) 
        else:
            test_dataset = fold4
            training_dataset = pd.concat([
               fold0, fold1, fold2, fold3], ignore_index=True, sort=False) 
         
        accuracy, predictions, weights_for_each_class, no_of_instances_test = (
        logistic_regression(training_dataset,test_dataset))
 
        # Print the trace runs of each experiment
        print("Accuracy:")
        print(str(accuracy * 100) + "%")
        print()
        print("Classifications:")
        print(predictions)
        print()
        print("Learned Model:")
        print(weights_for_each_class)
        print()
        print("Number of Test Instances:")
        print(str(no_of_instances_test))
        print() 
 
        outfile_tr.write("Accuracy:")
        outfile_tr.write(str(accuracy * 100) + "%\n\n")
        outfile_tr.write("Classifications:\n")
        outfile_tr.write(str(predictions) + "\n\n")
        outfile_tr.write("Learned Model:\n")
        outfile_tr.write(str(weights_for_each_class) + "\n\n")
        outfile_tr.write("Number of Test Instances:")
        outfile_tr.write(str(no_of_instances_test) + "\n\n")
 
        # Store the accuracy in the accuracy_statistics array
        accuracy_statistics[experiment] = accuracy
 
    outfile_tr.write("Experiments Completed.\n")
    print("Experiments Completed.\n")
 
    # Write to a file
    outfile_ts.write("----------------------------------------------------------\n")
    outfile_ts.write(ALGORITHM_NAME + " Summary Statistics\n")
    outfile_ts.write("----------------------------------------------------------\n")
    outfile_ts.write("Data Set : " + data_path + "\n")
    outfile_ts.write("\n")
    outfile_ts.write("Accuracy Statistics for All 5 Experiments:")
    outfile_ts.write(np.array2string(
        accuracy_statistics, precision=2, separator=',',
        suppress_small=True))
    outfile_ts.write("\n")
    outfile_ts.write("\n")
    accuracy = np.mean(accuracy_statistics)
    accuracy *= 100
    outfile_ts.write("Classification Accuracy : " + str(accuracy) + "%\n")
    
    # Print to the console
    print()
    print("----------------------------------------------------------")
    print(ALGORITHM_NAME + " Summary Statistics")
    print("----------------------------------------------------------")
    print("Data Set : " + data_path)
    print()
    print()
    print("Accuracy Statistics for All 5 Experiments:")
    print(accuracy_statistics)
    print()
    print()
    print("Classification Accuracy : " + str(accuracy) + "%")
    print()
 
    # Close the files
    outfile_tr.close()
    outfile_ts.close()
 
main()

Welcome to the Logistic Regression Program!


Running Experiment 1 ...

Accuracy:
92.19858156028369%

Classifications:
     Bias Clump Thickness Uniformity of Cell Size Uniformity of Cell Shape  \
0       1               0                       0                        0   
1       1               0                       0                        0   
2       1               0                       0                        0   
3       1               0                       0                        0   
4       1               0                       0                        0   
5       1               0                       0                        0   
6       1               0                       0                        0   
7       1               0                       0                        0   
8       1               0                       0                        0   
9       1               0                       0                        0   
10      1              

Accuracy:
95.71428571428572%

Classifications:
     Bias Clump Thickness Uniformity of Cell Size Uniformity of Cell Shape  \
0       1               1                       0                        0   
1       1               0                       0                        0   
2       1               0                       0                        0   
3       1               0                       0                        0   
4       1               0                       0                        0   
5       1               0                       0                        0   
6       1               0                       0                        0   
7       1               0                       0                        0   
8       1               1                       0                        0   
9       1               0                       0                        0   
10      1               0                       0                        0   
11      1        

Accuracy:
97.14285714285714%

Classifications:
     Bias Clump Thickness Uniformity of Cell Size Uniformity of Cell Shape  \
0       1               0                       0                        0   
1       1               0                       0                        0   
2       1               0                       0                        0   
3       1               0                       0                        0   
4       1               1                       0                        0   
5       1               0                       0                        0   
6       1               0                       0                        0   
7       1               0                       0                        0   
8       1               0                       0                        0   
9       1               0                       0                        0   
10      1               0                       0                        0   
11      1        

Accuracy:
94.24460431654677%

Classifications:
     Bias Clump Thickness Uniformity of Cell Size Uniformity of Cell Shape  \
0       1               0                       0                        0   
1       1               0                       0                        0   
2       1               0                       0                        0   
3       1               0                       0                        0   
4       1               1                       0                        0   
5       1               0                       1                        1   
6       1               0                       0                        0   
7       1               0                       0                        0   
8       1               0                       0                        0   
9       1               0                       0                        0   
10      1               0                       0                        0   
11      1        

Accuracy:
95.68345323741008%

Classifications:
     Bias Clump Thickness Uniformity of Cell Size Uniformity of Cell Shape  \
0       1               0                       0                        0   
1       1               0                       0                        0   
2       1               0                       0                        0   
3       1               0                       0                        0   
4       1               1                       0                        0   
5       1               0                       0                        0   
6       1               0                       0                        0   
7       1               0                       0                        0   
8       1               0                       0                        0   
9       1               0                       0                        0   
10      1               0                       0                        0   
11      1        

In [3]:
import pandas as pd # Import Pandas library 
import numpy as np # Import Numpy library
 
# File name: five_fold_stratified_cv.py
# Author: Addison Sears-Collins
# Date created: 7/17/2019
# Python version: 3.7
# Description: Implementation of five-fold stratified cross-validation
# Divide the data set into five random groups. Make sure 
# that the proportion of each class in each group is roughly equal to its 
# proportion in the entire data set.
 
# Required Data Set Format for Disrete Class Values
# Columns (0 through N)
# 0: Instance ID
# 1: Attribute 1 
# 2: Attribute 2
# 3: Attribute 3 
# ...
# N: Actual Class
 
def get_five_folds(instances):
    """
    Parameters:
        instances: A Pandas data frame containing the instances
    Returns: 
        fold0, fold1, fold2, fold3, fold4
        Five folds whose class frequency distributions are 
        each representative of the entire original data set (i.e. Five-Fold 
        Stratified Cross Validation)
    """
    # Shuffle the data set randomly
    instances = instances.sample(frac=1).reset_index(drop=True)
 
    # Record the number of columns in the data set
    no_of_columns = len(instances.columns) # number of columns
 
    # Record the number of rows in the data set
    no_of_rows = len(instances.index) # number of rows
 
    # Create five empty folds (i.e. Panda Dataframes: fold0 through fold4)
    fold0 = pd.DataFrame(columns=(instances.columns))
    fold1 = pd.DataFrame(columns=(instances.columns))
    fold2 = pd.DataFrame(columns=(instances.columns))
    fold3 = pd.DataFrame(columns=(instances.columns))
    fold4 = pd.DataFrame(columns=(instances.columns))
 
    # Record the column of the Actual Class
    actual_class_column = no_of_columns - 1
 
    # Generate an array containing the unique 
    # Actual Class values
    unique_class_list_df = instances.iloc[:,actual_class_column]
    unique_class_list_df = unique_class_list_df.sort_values()
    unique_class_list_np = unique_class_list_df.unique() #Numpy array
    unique_class_list_df = unique_class_list_df.drop_duplicates()#Pandas df
 
    unique_class_list_np_size = unique_class_list_np.size
 
    # For each unique class in the unique Actual Class array
    for unique_class_list_np_idx in range(0, unique_class_list_np_size):
 
        # Initialize the counter to 0
        counter = 0
 
        # Go through each row of the data set and find instances that
        # are part of this unique class. Distribute them among one
        # of five folds
        for row in range(0, no_of_rows):
 
            # If the value of the unique class is equal to the actual
            # class in the original data set on this row
            if unique_class_list_np[unique_class_list_np_idx] == (
                instances.iloc[row,actual_class_column]):
 
                    # Allocate instance to fold0
                    if counter == 0:
 
                        # Extract data for the new row
                        new_row = instances.iloc[row,:]
 
                        # Append that entire instance to fold
                        fold0.loc[len(fold0)] = new_row
                                     
                        # Increase the counter by 1
                        counter += 1
 
                    # Allocate instance to fold1
                    elif counter == 1:
 
                        # Extract data for the new row
                        new_row = instances.iloc[row,:]
 
                        # Append that entire instance to fold
                        fold1.loc[len(fold1)] = new_row
                                     
                        # Increase the counter by 1
                        counter += 1
 
                    # Allocate instance to fold2
                    elif counter == 2:
 
                        # Extract data for the new row
                        new_row = instances.iloc[row,:]
 
                        # Append that entire instance to fold
                        fold2.loc[len(fold2)] = new_row
                                     
                        # Increase the counter by 1
                        counter += 1
 
                    # Allocate instance to fold3
                    elif counter == 3:
 
                        # Extract data for the new row
                        new_row = instances.iloc[row,:]
 
                        # Append that entire instance to fold
                        fold3.loc[len(fold3)] = new_row
                                     
                        # Increase the counter by 1
                        counter += 1
 
                    # Allocate instance to fold4
                    else:
 
                        # Extract data for the new row
                        new_row = instances.iloc[row,:]
 
                        # Append that entire instance to fold
                        fold4.loc[len(fold4)] = new_row
                                     
                        # Reset counter to 0
                        counter = 0
         
    return fold0, fold1, fold2, fold3, fold4

In [4]:
import pandas as pd # Import Pandas library 
import numpy as np # Import Numpy library
  
# File name: logistic_regression.py
# Author: Addison Sears-Collins
# Date created: 7/19/2019
# Python version: 3.7
# Description: Multi-class logistic regression using one-vs-all. 
  
# Required Data Set Format for Disrete Class Values
# Columns (0 through N)
# 0: Instance ID
# 1: Attribute 1 
# 2: Attribute 2
# 3: Attribute 3 
# ...
# N: Actual Class
  
# This program then adds 2 additional columns for the test set.
# N + 1: Predicted Class
# N + 2: Prediction Correct? (1 if yes, 0 if no)
 
def sigmoid(z):
    """
    Parameters:
        z: A real number
    Returns: 
        1.0/(1 + np.exp(-z))
    """
    return 1.0/(1 + np.exp(-z))
 
def gradient_descent(training_set):
    """
    Gradient descent for logistic regression. Follows method presented
    in the textbook Introduction to Machine Learning 3rd Edition by     
    Ethem Alpaydin (pg. 252)
 
    Parameters:
      training_set: The training instances as a Numpy array
    Returns:
      weights: The vector of weights, commonly called w or THETA
    """  
 
    no_of_columns_training_set = training_set.shape[1]
    no_of_rows_training_set = training_set.shape[0]
 
    # Extract the attributes from the training set.
    # x is still a 2d array
    x = training_set[:,:(no_of_columns_training_set - 1)]
    no_of_attributes = x.shape[1]
 
    # Extract the classes from the training set.
    # actual_class is a 1d array.
    actual_class = training_set[:,(no_of_columns_training_set - 1)]
 
    # Set a learning rate
    LEARNING_RATE = 0.01
 
    # Set the maximum number of iterations
    MAX_ITER = 10000
 
    # Set the iteration variable to 0
    iter = 0
 
    # Set a flag to determine if we have exceeded the maximum number of
    # iterations
    exceeded_max_iter = False
 
    # Set the tolerance. When the euclidean norm of the gradient vector 
    # (i.e. magnitude of the changes in the weights) gets below this value, 
    # stop iterating through the while loop
    GRAD_TOLERANCE = 0.001
    norm_of_gradient = None
 
    # Set a flag to determine if we have reached the minimum of the 
    # cost (i.e. error) function.
    converged = False
 
    # Create the weights vector with random floats between -0.01 and 0.01
    # The number of weights is equal to the number of attributes
    weights = np.random.uniform(-0.01,0.01,(no_of_attributes))
    changes_in_weights = None
 
    # Keep running the loop below until convergence on the minimum of the 
    # cost function or we exceed the max number of iterations
    while(not(converged) and not(exceeded_max_iter)):
         
        # Initialize a weight change vector that stores the changes in 
        # the weights at each iteration
        changes_in_weights = np.zeros(no_of_attributes)
 
        # For each training instance
        for inst in range(0, no_of_rows_training_set):
 
            # Calculate weighted sum of the attributes for
            # this instance
            output = np.dot(weights, x[inst,:])
                 
            # Calculate the sigmoid of the weighted sum
            # This y is the probability that this instance belongs
            # to the positive class
            y =  sigmoid(output)
 
            # Calculate difference
            difference = (actual_class[inst] - y)
 
            # Multiply the difference by the attribute vector
            product = np.multiply(x[inst,:], difference)
 
            # For each attribute, update the weight changes 
            # i.e. the gradient vector
            changes_in_weights = np.add(changes_in_weights,product)
         
        # Calculate the step size
        step_size = np.multiply(changes_in_weights, LEARNING_RATE)
 
        # Update the weights vector
        weights = np.add(weights, step_size)
 
        # Test to see if we have converged on the minimum of the error
        # function
        norm_of_gradient = np.linalg.norm(changes_in_weights)
 
        if (norm_of_gradient < GRAD_TOLERANCE):
            converged = True
 
        # Update the number of iterations
        iter += 1
 
        # If we have exceeded the maximum number of iterations
        if (iter > MAX_ITER):
            exceeded_max_iter = True
 
    #For debugging purposes
    #print("Number of Iterations: " + str(iter - 1))
    #print("Norm of the gradient: " + str(norm_of_gradient))
    #print(changes_in_weights)
    #print()
    return weights
 
def logistic_regression(training_set, test_set):
    """
    Multi-class one-vs-all logistic regression
    Parameters:
      training_set: The training instances as a Pandas dataframe
      test_set: The test instances as a Pandas dataframe
    Returns:
      accuracy: Classification accuracy as a decimal
      predictions: Classifications of all the test instances as a 
        Pandas dataframe
      weights_for_each_class: The weight vectors for each class (one-vs-all)
      no_of_instances_test: The number of test instances
    """  
 
    # Remove the instance ID column
    training_set = training_set.drop(
        training_set.columns[[0]], axis=1)
    test_set = test_set.drop(
        test_set.columns[[0]], axis=1)
 
    # Make a list of the unique classes
    list_of_unique_classes = pd.unique(training_set["Actual Class"])
 
    # Replace all the class values with numbers, starting from 0
    # in both the test and training sets.
    for cl in range(0, len(list_of_unique_classes)):
        training_set["Actual Class"].replace(
            list_of_unique_classes[cl], cl ,inplace=True)
        test_set["Actual Class"].replace(
            list_of_unique_classes[cl], cl ,inplace=True)
 
    # Insert a column of 1s in column 0 of both the training
    # and test sets. This is the bias and helps with gradient
    # descent. (i.e. X0 = 1 for all instances)
    training_set.insert(0, "Bias", 1)
    test_set.insert(0, "Bias", 1)
 
    # Convert dataframes to numpy arrays
    np_training_set = training_set.values
    np_test_set = test_set.values
 
    # Add 2 additional columns to the testing dataframe
    test_set = test_set.reindex(
        columns=[*test_set.columns.tolist(
        ), 'Predicted Class', 'Prediction Correct?'])
 
    ############################# Training Phase ##############################
 
    no_of_columns_training_set = np_training_set.shape[1]
    no_of_rows_training_set = np_training_set.shape[0]
 
    # Create and store a training set for each unique class
    # to create separate binary classification
    # problems
    trainingsets = []
    for cl in range(0, len(list_of_unique_classes)):
 
        # Create a copy of the training set
        temp = np.copy(np_training_set)
 
        # This class becomes the positive class 1
        # and all other classes become the negative class 0
        for row in range(0, no_of_rows_training_set):
            if (temp[row, (no_of_columns_training_set - 1)]) == cl:
                temp[row, (no_of_columns_training_set - 1)] = 1
            else:
                temp[row, (no_of_columns_training_set - 1)] = 0
         
        # Add the new training set to the trainingsets list
        trainingsets.append(temp)
 
    # Calculate and store the weights for the training set
    # of each class. Execute gradient descent on each training set
    # in order to calculate the weights
    weights_for_each_class = []
 
    for cl in range(0, len(list_of_unique_classes)):
        weights_for_this_class = gradient_descent(trainingsets[cl])
        weights_for_each_class.append(weights_for_this_class)
 
    # Used for debugging
    #print(weights_for_each_class[0])
    #print()
    #print(weights_for_each_class[1])
    #print()
    #print(weights_for_each_class[2])
 
    ########################### End of Training Phase #########################
 
    ############################# Testing Phase ###############################
 
    no_of_columns_test_set = np_test_set.shape[1]
    no_of_rows_test_set = np_test_set.shape[0]
 
    # Extract the attributes from the test set.
    # x is still a 2d array
    x = np_test_set[:,:(no_of_columns_test_set - 1)]
    no_of_attributes = x.shape[1]
 
    # Extract the classes from the test set.
    # actual_class is a 1d array.
    actual_class = np_test_set[:,(no_of_columns_test_set - 1)]
 
    # Go through each row (instance) of the test data
    for inst in range(0,  no_of_rows_test_set):
 
        # Create a scorecard that keeps track of the probabilities of this
        # instance being a part of each class
        scorecard = []
 
        # Calculate and store the probability for each class in the scorecard
        for cl in range(0, len(list_of_unique_classes)):
 
            # Calculate weighted sum of the attributes for
            # this instance
            output = np.dot(weights_for_each_class[cl], x[inst,:])
 
            # Calculate the sigmoid of the weighted sum
            # This is the probability that this instance belongs
            # to the positive class
            this_probability = sigmoid(output)
 
            scorecard.append(this_probability)
 
        most_likely_class = scorecard.index(max(scorecard))
 
        # Store the value of the most likely class in the "Predicted Class" 
        # column of the test_set data frame
        test_set.loc[inst, "Predicted Class"] = most_likely_class
 
        # Update the 'Prediction Correct?' column of the test_set data frame
        # 1 if correct, else 0
        if test_set.loc[inst, "Actual Class"] == test_set.loc[
            inst, "Predicted Class"]:
            test_set.loc[inst, "Prediction Correct?"] = 1
        else:
            test_set.loc[inst, "Prediction Correct?"] = 0
 
    # accuracy = (total correct predictions)/(total number of predictions)
    accuracy = (test_set["Prediction Correct?"].sum())/(len(test_set.index))
 
    # Store the revamped dataframe
    predictions = test_set
 
    # Replace all the class values with the name of the class
    for cl in range(0, len(list_of_unique_classes)):
        predictions["Actual Class"].replace(
            cl, list_of_unique_classes[cl] ,inplace=True)
        predictions["Predicted Class"].replace(
            cl, list_of_unique_classes[cl] ,inplace=True)
 
    # Replace 1 with Yes and 0 with No in the 'Prediction 
    # Correct?' column
    predictions['Prediction Correct?'] = predictions[
        'Prediction Correct?'].map({1: "Yes", 0: "No"})
 
    # Reformat the weights_for_each_class list of arrays
    weights_for_each_class = pd.DataFrame(np.row_stack(weights_for_each_class))
  
    # Rename the row names
    for cl in range(0, len(list_of_unique_classes)):
        row_name = str(list_of_unique_classes[cl] + " weights")        
        weights_for_each_class.rename(index={cl:row_name}, inplace=True)
 
    # Get a list of the names of the attributes
    training_set_names = list(training_set.columns.values)
    training_set_names.pop() # Remove 'Actual Class'
 
    # Rename the column names
    for col in range(0, len(training_set_names)):
        col_name = str(training_set_names[col])        
        weights_for_each_class.rename(columns={col:col_name}, inplace=True)
 
    # Record the number of test instances
    no_of_instances_test = len(test_set.index)
 
    # Return statement
    return accuracy, predictions, weights_for_each_class, no_of_instances_test

In [26]:
pd_data_set = pd.read_csv("unprocessed.csv", sep=',')

In [40]:
pd_data_set

Unnamed: 0,Instance ID,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Actual Class
0,1000025,0,0,0,0,0,0,0,0,0,Benign
1,1002945,0,0,0,0,1,0,0,0,0,Benign
2,1015425,0,0,0,0,0,0,0,0,0,Benign
3,1016277,1,1,1,0,0,0,0,1,0,Benign
4,1017023,0,0,0,0,0,0,0,0,0,Benign
5,1017122,1,1,1,1,1,0,1,1,0,Malignant
6,1018099,0,0,0,0,0,0,0,0,0,Benign
7,1018561,0,0,0,0,0,0,0,0,0,Benign
8,1033078,0,0,0,0,0,0,0,0,0,Benign
9,1033078,0,0,0,0,0,0,0,0,0,Benign


In [27]:
pd_data_set['Clump Thickness'] = pd_data_set['Clump Thickness'].apply(lambda x: 0 if x<=5 else 1)
pd_data_set['Uniformity of Cell Size'] = pd_data_set['Uniformity of Cell Size'].apply(lambda x: 0 if x<=5 else 1)
pd_data_set['Uniformity of Cell Shape'] = pd_data_set['Uniformity of Cell Shape'].apply(lambda x: 0 if x<=5 else 1)
pd_data_set['Marginal Adhesion'] = pd_data_set['Marginal Adhesion'].apply(lambda x: 0 if x<=5 else 1)
pd_data_set['Single Epithelial Cell Size'] = pd_data_set['Single Epithelial Cell Size'].apply(lambda x: 0 if x<=5 else 1)


TypeError: '<=' not supported between instances of 'str' and 'int'

In [32]:
pd_data_set['Bare Nuclei'] = pd_data_set['Bare Nuclei'].apply(lambda x: 0 if x <= '5' else 1)
pd_data_set['Bland Chromatin'] = pd_data_set['Bland Chromatin'].apply(lambda x: 0 if x<=5 else 1)
pd_data_set['Normal Nucleoli'] = pd_data_set['Normal Nucleoli'].apply(lambda x: 0 if x<=5 else 1)


KeyError: 'Mitoses'

In [37]:
pd_data_set['Mitoses '] = pd_data_set['Mitoses '].apply(lambda x: 0 if x <= 5 else 1)

In [39]:
pd_data_set['Actual Class'] = pd_data_set['Actual Class'].apply(lambda x: 'Benign' if x==2 else 'Malignant')

In [61]:
pd_data_set.to_csv('preprocessed.csv',index=False)

In [62]:
pd_data_set = pd.read_csv("preprocessed.csv", sep=',')
df = pd.DataFrame(pd_data_set)


In [66]:
df.to_csv('preprocessed.csv',index=False)

In [7]:
df1=pd.read_csv('unprocessed.csv')

In [8]:
df1

Unnamed: 0,Instance ID,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Actual Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2
5,1017122,8,10,10,8,7,10,9,7,1,4
6,1018099,1,1,1,1,2,10,3,1,1,2
7,1018561,2,1,2,1,2,1,3,1,1,2
8,1033078,2,1,1,1,2,1,1,1,5,2
9,1033078,4,2,1,1,2,1,2,1,1,2
