In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("BreastCancerWc.csv",header=None)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [3]:
df.columns = ['Sample code number', 'Clump Thickness', 'Uniformity of Cell Size','Uniformity of Cell Shape','Marginal Adhesion','Single Epithelial Cell Size','Bare Nuclei','Bland Chromatin','Normal Nucleoli','Mitoses','Class']
df.head()

Unnamed: 0,Sample code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 11 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   Sample code number           699 non-null    int64 
 1   Clump Thickness              699 non-null    int64 
 2   Uniformity of Cell Size      699 non-null    int64 
 3   Uniformity of Cell Shape     699 non-null    int64 
 4   Marginal Adhesion            699 non-null    int64 
 5   Single Epithelial Cell Size  699 non-null    int64 
 6   Bare Nuclei                  699 non-null    object
 7   Bland Chromatin              699 non-null    int64 
 8   Normal Nucleoli              699 non-null    int64 
 9   Mitoses                      699 non-null    int64 
 10  Class                        699 non-null    int64 
dtypes: int64(10), object(1)
memory usage: 60.2+ KB


**Data cleaning(Remove NA, ?, Negative values etc.)**

In [5]:
df.isnull().sum().sum()

0

In [6]:
(df.values == np.NaN).sum()

0

In [7]:
(df.values == '?').sum()

16

In [8]:
df = df.replace('?',np.nan)

In [9]:
df.dropna(inplace=True)

In [10]:
(df.values == '?').sum()

0

**Error correcting(Outlier detection and removal)**

In [11]:
df.shape

(683, 11)

In [12]:
df=df.astype(float)

In [15]:
#Using Z-Score

def zscore_filter(data,threshold=3):
  zscore = np.abs((data-data.mean())/data.std())
  out = zscore > 3
  data = data[~out.any(axis=1)]

  return data

df = zscore_filter(df)

In [16]:
df.shape

(630, 11)

**Data Transformation**

In [17]:
from sklearn.preprocessing import LabelEncoder

In [18]:
df['Class'].unique()

array([2., 4.])

Unnamed: 0,Sample code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025.0,5.0,1.0,1.0,1.0,2.0,1.0,3.0,1.0,1.0,Y
1,1002945.0,5.0,4.0,4.0,5.0,7.0,10.0,3.0,2.0,1.0,Y
2,1015425.0,3.0,1.0,1.0,1.0,2.0,2.0,3.0,1.0,1.0,Y
3,1016277.0,6.0,8.0,8.0,1.0,3.0,4.0,3.0,7.0,1.0,Y
4,1017023.0,4.0,1.0,1.0,3.0,2.0,1.0,3.0,1.0,1.0,Y


In [20]:
le = LabelEncoder()
le.fit(df['Class'])
le.classes_

array(['N', 'Y'], dtype=object)

In [21]:
df['Class'] = le.transform(df['Class'])

In [22]:
df.head()

Unnamed: 0,Sample code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025.0,5.0,1.0,1.0,1.0,2.0,1.0,3.0,1.0,1.0,1
1,1002945.0,5.0,4.0,4.0,5.0,7.0,10.0,3.0,2.0,1.0,1
2,1015425.0,3.0,1.0,1.0,1.0,2.0,2.0,3.0,1.0,1.0,1
3,1016277.0,6.0,8.0,8.0,1.0,3.0,4.0,3.0,7.0,1.0,1
4,1017023.0,4.0,1.0,1.0,3.0,2.0,1.0,3.0,1.0,1.0,1


**Build Data model using regression and Naïve Bayes methods and compare accuracy of benign and malignant tumors in Breast Cancer Dataset.**

In [24]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

In [25]:
x = df.drop('Class', axis=1)
y = df['Class']

In [26]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.3,random_state=42)

In [27]:
reg = LogisticRegression()

In [28]:
reg.fit(x_train,y_train)

In [29]:
reg_pred = reg.predict(x_test)

In [30]:
print("Accuracy by Logistic Regression is: ",accuracy_score(y_test,reg_pred))

Accuracy by Logistic Regression is:  0.9470899470899471


In [31]:
naive = GaussianNB()

In [32]:
naive.fit(x_train,y_train)

In [33]:
naive_pred = naive.predict(x_test)

In [34]:
print("Accuracy by Naive Bayes is: ",accuracy_score(y_test,naive_pred))

Accuracy by Naive Bayes is:  0.783068783068783
