In [252]:
#Data Cleansing with goal of not removing any rows
#Data From https://archive.ics.uci.edu/ml/datasets/Mammographic+Mass

In [300]:
import pandas as pd
import seaborn as sns

In [277]:
#import data
df = pd.read_csv('D:/DataSets/mammographic_masses.txt', header = None)

In [278]:
df.head()

Unnamed: 0,0,1,2,3,4,5
0,5,67,3,5,3,1
1,4,43,1,1,?,1
2,5,58,4,5,3,1
3,4,28,1,1,3,0
4,5,74,1,5,?,1


In [279]:
#create header columns from data description
df.columns = ['BI-RADS assessment','Age' , 'Shape', 'Margin', 'Density','Severity']
df.head()

Unnamed: 0,BI-RADS assessment,Age,Shape,Margin,Density,Severity
0,5,67,3,5,3,1
1,4,43,1,1,?,1
2,5,58,4,5,3,1
3,4,28,1,1,3,0
4,5,74,1,5,?,1


In [280]:
#replace '?' with -1 to change datatypes
df.replace('?', np.nan, inplace=True)
df.head()

Unnamed: 0,BI-RADS assessment,Age,Shape,Margin,Density,Severity
0,5,67,3,5,3.0,1
1,4,43,1,1,,1
2,5,58,4,5,3.0,1
3,4,28,1,1,3.0,0
4,5,74,1,5,,1


In [281]:
df.dtypes

BI-RADS assessment    object
Age                   object
Shape                 object
Margin                object
Density               object
Severity               int64
dtype: object

In [282]:
#changedatypes to float for NA replacement
datatype_dict = {'BI-RADS assessment': float, 
                 'Age': float,
                 'Shape': float,
                 'Margin': float,
                 'Density': float,       
                 'Severity': float}

df = df.astype(datatype_dict)


In [283]:
df.dtypes

BI-RADS assessment    float64
Age                   float64
Shape                 float64
Margin                float64
Density               float64
Severity              float64
dtype: object

In [284]:
# find NA values
df.isnull().sum()

BI-RADS assessment     2
Age                    5
Shape                 31
Margin                48
Density               76
Severity               0
dtype: int64

In [285]:
#total NA values
df.isnull().sum().sum()

162

In [286]:
#Use mean to fill in missing AGE values
df['Age'] = df['Age'].fillna(df['Age'].mean())
df['Age'].head()

0    67.0
1    43.0
2    58.0
3    28.0
4    74.0
Name: Age, dtype: float64

In [287]:
# use KNN multivariate method to fill in rest of data since one column could be corrolate to another
cols = ['BI-RADS assessment','Age','Shape','Margin','Density', 'Severity']
X = df[cols]
X

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
impute_it = IterativeImputer()
array = impute_it.fit_transform(X)

df = pd.DataFrame(array)

In [288]:
df.head()

Unnamed: 0,0,1,2,3,4,5
0,5.0,67.0,3.0,5.0,3.0,1.0
1,4.0,43.0,1.0,1.0,2.883641,1.0
2,5.0,58.0,4.0,5.0,3.0,1.0
3,4.0,28.0,1.0,1.0,3.0,0.0
4,5.0,74.0,1.0,5.0,2.937004,1.0


AttributeError: 'DataFrame' object has no attribute 'unique'

In [289]:
round(df, 0)

Unnamed: 0,0,1,2,3,4,5
0,5.0,67.0,3.0,5.0,3.0,1.0
1,4.0,43.0,1.0,1.0,3.0,1.0
2,5.0,58.0,4.0,5.0,3.0,1.0
3,4.0,28.0,1.0,1.0,3.0,0.0
4,5.0,74.0,1.0,5.0,3.0,1.0
...,...,...,...,...,...,...
956,4.0,47.0,2.0,1.0,3.0,0.0
957,4.0,56.0,4.0,5.0,3.0,1.0
958,4.0,64.0,4.0,5.0,3.0,0.0
959,5.0,66.0,4.0,5.0,3.0,1.0


In [291]:
df.columns = ['BI-RADS assessment','Age' , 'Shape', 'Margin', 'Density','Severity']

In [292]:
#changedatypes to int to remove fractions
datatype_dict = {'BI-RADS assessment': int,
                 'Age': int,
                 'Shape': int,
                 'Margin': int,
                 'Density': int, 
                 'Severity': int}

df = df.astype(datatype_dict)

In [293]:
df.head()

Unnamed: 0,BI-RADS assessment,Age,Shape,Margin,Density,Severity
0,5,67,3,5,3,1
1,4,43,1,1,2,1
2,5,58,4,5,3,1
3,4,28,1,1,3,0
4,5,74,1,5,2,1


In [294]:
df.isna().sum()

BI-RADS assessment    0
Age                   0
Shape                 0
Margin                0
Density               0
Severity              0
dtype: int64

In [None]:
#data cleansing is done!