## Python / Models / Breast cancer dataset

### Importing libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

### Importing and cleaning dataset

In [2]:
# Reading dataset into a pandas dataframe

df = pd.read_csv("BreastCancerWc.csv")
df

Unnamed: 0,1000025,5,1,1.1,1.2,2,1.3,3,1.4,1.5,2.1
0,1002945,5,4,4,5,7,10,3,2,1,2
1,1015425,3,1,1,1,2,2,3,1,1,2
2,1016277,6,8,8,1,3,4,3,7,1,2
3,1017023,4,1,1,3,2,1,3,1,1,2
4,1017122,8,10,10,8,7,10,9,7,1,4
...,...,...,...,...,...,...,...,...,...,...,...
693,776715,3,1,1,1,3,2,1,1,1,2
694,841769,2,1,1,1,2,1,1,1,1,2
695,888820,5,10,10,3,7,3,8,10,2,4
696,897471,4,8,6,4,3,4,10,6,1,4


In [3]:
# Redifiing the column names

cols = ["Sample code number", "Clump Thickness", "Uniformity of Cell Size", "Uniformity of Cell Shape", "Marginal Adhesion", "Single Epithelial Cell Size", "Bare Nuclei", "Bland Chromatin", "Normal Nucleoli", "Mitoses", "Class"]
df.columns = cols
df

Unnamed: 0,Sample code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1002945,5,4,4,5,7,10,3,2,1,2
1,1015425,3,1,1,1,2,2,3,1,1,2
2,1016277,6,8,8,1,3,4,3,7,1,2
3,1017023,4,1,1,3,2,1,3,1,1,2
4,1017122,8,10,10,8,7,10,9,7,1,4
...,...,...,...,...,...,...,...,...,...,...,...
693,776715,3,1,1,1,3,2,1,1,1,2
694,841769,2,1,1,1,2,1,1,1,1,2
695,888820,5,10,10,3,7,3,8,10,2,4
696,897471,4,8,6,4,3,4,10,6,1,4


In [4]:
# Checking for missing values

df.isna().sum()

Sample code number             0
Clump Thickness                0
Uniformity of Cell Size        0
Uniformity of Cell Shape       0
Marginal Adhesion              0
Single Epithelial Cell Size    0
Bare Nuclei                    0
Bland Chromatin                0
Normal Nucleoli                0
Mitoses                        0
Class                          0
dtype: int64

In [5]:
# Replacing '?' with NaN values and dropping the rows with NaN values

df.replace('?', np.nan, inplace = True)
df.dropna(inplace = True)

In [6]:
# Converting columns to numeric

df.apply(pd.to_numeric, errors = 'ignore')
df

Unnamed: 0,Sample code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1002945,5,4,4,5,7,10,3,2,1,2
1,1015425,3,1,1,1,2,2,3,1,1,2
2,1016277,6,8,8,1,3,4,3,7,1,2
3,1017023,4,1,1,3,2,1,3,1,1,2
4,1017122,8,10,10,8,7,10,9,7,1,4
...,...,...,...,...,...,...,...,...,...,...,...
693,776715,3,1,1,1,3,2,1,1,1,2
694,841769,2,1,1,1,2,1,1,1,1,2
695,888820,5,10,10,3,7,3,8,10,2,4
696,897471,4,8,6,4,3,4,10,6,1,4


In [7]:
# Cleaning outliers

df_numeric = df.apply(pd.to_numeric, errors = 'coerce')
z_scores = np.abs((df_numeric - df_numeric.mean()) / df_numeric.std())
outliers = z_scores > 3
df_cleaned = df[~outliers.any(axis=1)]

In [8]:
# Mapping the class values to 0 and 1

df['Class'] = df['Class'].map({2: 0, 4: 1})
df

Unnamed: 0,Sample code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1002945,5,4,4,5,7,10,3,2,1,0
1,1015425,3,1,1,1,2,2,3,1,1,0
2,1016277,6,8,8,1,3,4,3,7,1,0
3,1017023,4,1,1,3,2,1,3,1,1,0
4,1017122,8,10,10,8,7,10,9,7,1,1
...,...,...,...,...,...,...,...,...,...,...,...
693,776715,3,1,1,1,3,2,1,1,1,0
694,841769,2,1,1,1,2,1,1,1,1,0
695,888820,5,10,10,3,7,3,8,10,2,1
696,897471,4,8,6,4,3,4,10,6,1,1


In [9]:
numerical_cols = df.select_dtypes(include = np.number).columns

### Data analysis

In [10]:
# Creating two variables for analysis

x = df.drop(columns = ['Sample code number', 'Class'])
y = df['Class']

In [11]:
# Scaling data

scaler = StandardScaler()
x_scaled = scaler.fit_transform(x)
x_scaled

array([[ 0.19805542,  0.27611976,  0.26160925, ..., -0.18196482,
        -0.28587284, -0.34868607],
       [-0.51099338, -0.70298082, -0.74261687, ..., -0.18196482,
        -0.61354557, -0.34868607],
       [ 0.55257982,  1.58158721,  1.6005774 , ..., -0.18196482,
         1.35249082, -0.34868607],
       ...,
       [ 0.19805542,  2.23432094,  2.27006148, ...,  1.85915376,
         2.33550901,  0.22850786],
       [-0.15646898,  1.58158721,  0.93109333, ...,  2.67560119,
         1.02481809, -0.34868607],
       [-0.15646898,  1.58158721,  1.6005774 , ...,  2.67560119,
         0.36947263, -0.34868607]])

In [12]:
# Creating train and test sets

x_train, x_test, y_train, y_test = train_test_split(x_scaled, y, test_size = 0.2, random_state = 42)

In [13]:
# Logistic Regression

model1 = LogisticRegression()
model1.fit(x_train, y_train)
model1.predict(x_test)
print(accuracy_score(y_test, model1.predict(x_test)))

0.9562043795620438


In [14]:
# Naive Bayes

model2 = GaussianNB()
model2.fit(x_train, y_train)
model2.predict(x_test)
print(accuracy_score(y_test, model2.predict(x_test)))

0.9635036496350365
