# Prediction of Breast Cancer Diagnosis using Logistic Regression

by: Vince Emmanuel Bive (4th Year BS ECE), Course: COE005 - Prediction and Machine Learning

In [1]:
# Author handle for this notebook; not referenced by any of the visible cells.
author = 'qvecbive'

#Import libraries
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

In [2]:
# Read the three CSV inputs in one pass:
#   data        - labelled breast-cancer records used for training
#   data_test   - feature rows to be predicted later
#   data_output - frame whose first column supplies the patient ids of the result

data, data_test, data_output = (
    pd.read_csv(csv_name) for csv_name in ('data.csv', 'test.csv', 'outputs.csv')
)

In [3]:
# Keep only the first 500 rows of the labelled dataset for the rest of the notebook
data = data.iloc[:500]

In [4]:
# Data preparation: count the missing (NaN) entries in every column
checking = data.isnull().sum(axis=0)
checking

id                           0
diagnosis                    0
radius_mean                  0
texture_mean                 0
perimeter_mean               0
area_mean                    0
smoothness_mean              0
compactness_mean             0
concavity_mean               0
concave points_mean          0
symmetry_mean                0
fractal_dimension_mean       0
radius_se                    0
texture_se                   0
perimeter_se                 0
area_se                      0
smoothness_se                0
compactness_se               0
concavity_se                 0
concave points_se            0
symmetry_se                  0
fractal_dimension_se         0
radius_worst                 0
texture_worst                0
perimeter_worst              0
area_worst                   0
smoothness_worst             0
compactness_worst            0
concavity_worst              0
concave points_worst         0
symmetry_worst               0
fractal_dimension_worst      0
Unnamed:

In [5]:
# Data preparation: keep the first 32 columns only, which discards the
# trailing "Unnamed: 32" column
data_new = data.iloc[:, :32]
data_new

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
0,842302,M,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,...,25.38,17.33,184.60,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.11890
1,842517,M,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,...,24.99,23.41,158.80,1956.0,0.1238,0.1866,0.2416,0.1860,0.2750,0.08902
2,84300903,M,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,...,23.57,25.53,152.50,1709.0,0.1444,0.4245,0.4504,0.2430,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,...,14.91,26.50,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.17300
4,84358402,M,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,...,22.54,16.67,152.20,1575.0,0.1374,0.2050,0.4000,0.1625,0.2364,0.07678
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,914333,B,14.87,20.21,96.12,680.9,0.09587,0.08345,0.06824,0.04951,...,16.01,28.48,103.90,783.6,0.1216,0.1388,0.1700,0.1017,0.2369,0.06599
496,914366,B,12.65,18.17,82.69,485.6,0.10760,0.13340,0.08017,0.05074,...,14.38,22.15,95.29,633.7,0.1533,0.3842,0.3582,0.1407,0.3230,0.10330
497,914580,B,12.47,17.31,80.45,480.1,0.08928,0.07630,0.03609,0.02369,...,14.06,24.34,92.82,607.3,0.1276,0.2506,0.2028,0.1053,0.3035,0.07661
498,914769,M,18.49,17.52,121.30,1068.0,0.10120,0.13170,0.14910,0.09183,...,22.75,22.88,146.40,1600.0,0.1412,0.3089,0.3533,0.1663,0.2510,0.09445


In [6]:
# Data preparation: count the missing (NaN) entries per column of the test file
checking_test = data_test.isnull().sum(axis=0)
checking_test

radius_mean                0
texture_mean               0
perimeter_mean             0
area_mean                  0
smoothness_mean            0
compactness_mean           0
concavity_mean             0
concave points_mean        0
symmetry_mean              0
fractal_dimension_mean     0
radius_se                  0
texture_se                 0
perimeter_se               0
area_se                    0
smoothness_se              0
compactness_se             0
concavity_se               0
concave points_se          0
symmetry_se                0
fractal_dimension_se       0
radius_worst               0
texture_worst              0
perimeter_worst            0
area_worst                 0
smoothness_worst           0
compactness_worst          0
concavity_worst            0
concave points_worst       0
symmetry_worst             0
fractal_dimension_worst    0
dtype: int64

In [7]:
# Data preparation: separate the target from the predictors.
#   y_var - column 1 (diagnosis)
#   x_var - columns 2..31 (the input variables)

y_var, x_var = data_new.iloc[:, 1], data_new.iloc[:, 2:32]

In [8]:
#Prepare Data - Split dataset to training and testing data
from sklearn.model_selection import train_test_split

# Hold out 4% of the rows for evaluation.
# random_state pins the shuffle so a fresh "Restart & Run All" reproduces the
# same split (and therefore the same score); stratify keeps the class ratio of
# the small held-out set close to that of the full dataset.
x_train, x_test, y_train, y_test = train_test_split(
    x_var, y_var, test_size=0.04, random_state=42, stratify=y_var
)

In [9]:
#Choosing a Model - Logistic Regression
from sklearn.linear_model import LogisticRegression
# Unfitted classifier with the solver iteration limit set to 500; it is used
# as the final step of the scaling pipeline built in the training cell.
model_LR = LogisticRegression(max_iter=500)

In [10]:
#Training - Train data with the Chosen Model
from sklearn import preprocessing
from sklearn.datasets import make_classification
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# x_var / y_var are deliberately left untouched here: rebinding them to
# make_classification() output would replace the real features and labels with
# synthetic data, so re-running the split cell would train on the wrong dataset.

# Standardise the features, then fit the logistic-regression model.
# The scaler is fitted on the training split only and re-applied automatically
# whenever the pipeline scores or predicts.
model_adj = make_pipeline(StandardScaler(), model_LR)
model_adj.fit(x_train, y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregression', LogisticRegression(max_iter=500))])

In [11]:
# Accuracy of the fitted pipeline on the held-out split
model_adj.score(X=x_test, y=y_test)

1.0

In [12]:
# Prediction: run the fitted pipeline over the rows loaded from the test file
y_pred = model_adj.predict(X=data_test)
y_pred

array(['B', 'B', 'B', 'B', 'B', 'M', 'M', 'B', 'B', 'M', 'M', 'B', 'B',
       'B', 'M', 'M', 'M', 'M', 'M', 'M'], dtype=object)

In [13]:
# Collect the predicted labels into a plain Python list.
# list() takes every prediction, however many rows the test file has, so the
# cell neither raises IndexError on fewer than 20 rows nor silently drops
# predictions beyond the 20th.
pred_val = list(y_pred)

pred_val

['B',
 'B',
 'B',
 'B',
 'B',
 'M',
 'M',
 'B',
 'B',
 'M',
 'M',
 'B',
 'B',
 'B',
 'M',
 'M',
 'M',
 'M',
 'M',
 'M']

In [15]:
# Take the first column of the output template (the patient ids) as the
# identifier column of the final result.
# pandas is already imported in the first cell, so it is not re-imported here.
output = data_output.iloc[:, 0]
output

0      1
1      2
2      3
3      4
4      5
5      6
6      7
7      8
8      9
9     10
10    11
11    12
12    13
13    14
14    15
15    16
16    17
17    18
18    19
19    20
Name: id, dtype: int64

In [16]:
# Assemble the result table: patient ids first, predicted diagnosis second
df = {'Patient ID': output}
final_df = pd.DataFrame(df)
final_df['Diagnosis'] = pred_val  # appended after 'Patient ID', i.e. at position 1
final_df

Unnamed: 0,Patient ID,Diagnosis
0,1,B
1,2,B
2,3,B
3,4,B
4,5,B
5,6,M
6,7,M
7,8,B
8,9,B
9,10,M


In [17]:
#Create the csv file, and check your folder
import csv
# index=False keeps the pandas row index out of the file; without it the CSV
# gains an extra unnamed first column in front of 'Patient ID' and 'Diagnosis'.
final_df.to_csv('bive_output.csv', index=False)