In [1]:
import pandas as pd

In [2]:
# Remove the limit on how many columns pandas renders when displaying a frame.
pd.set_option('display.max_columns', None)

In [3]:
# Load mushrooms_raw.csv, parsing ";" as the field separator.
df = pd.read_csv("mushrooms_raw.csv", delimiter=";")

In [4]:
# Render the loaded frame (pandas abbreviates the row listing with "...").
df

Unnamed: 0,class,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,stem-width,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
0,p,15.26,x,g,o,f,e,,w,16.95,17.09,s,y,w,u,w,t,g,,d,w
1,p,16.60,x,g,o,f,e,,w,17.99,18.19,s,y,w,u,w,t,g,,d,u
2,p,14.07,x,g,o,f,e,,w,17.80,17.74,s,y,w,u,w,t,g,,d,w
3,p,14.17,f,h,e,f,e,,w,15.77,15.98,s,y,w,u,w,t,p,,d,w
4,p,14.64,x,h,o,f,e,,w,16.53,17.20,s,y,w,u,w,t,p,,d,w
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61064,p,1.18,s,s,y,f,f,f,f,3.93,6.22,,,y,,,f,f,,d,a
61065,p,1.27,f,s,y,f,f,f,f,3.18,5.43,,,y,,,f,f,,d,a
61066,p,1.27,s,s,y,f,f,f,f,3.86,6.37,,,y,,,f,f,,d,u
61067,p,1.24,f,s,y,f,f,f,f,3.56,5.44,,,y,,,f,f,,d,u


In [5]:
def missing_values(df):
    """
    Summarise the missing values of every column of ``df``.

    Returns a DataFrame indexed by the column labels of ``df`` with two
    columns: "absolute" (number of null entries) and "relative" (the same
    number as a percentage of the row count, rounded to two decimals).
    """
    null_counts = df.isnull().sum(axis=0)
    null_percent = (null_counts / len(df) * 100).round(2)

    # `keys` labels the two concatenated Series directly, so no rename is needed.
    return pd.concat([null_counts, null_percent], axis=1, keys=["absolute", "relative"])

In [6]:
# Per-column count and percentage of missing values in the loaded frame.
missing_values(df)

Unnamed: 0,absolute,relative
class,0,0.0
cap-diameter,0,0.0
cap-shape,0,0.0
cap-surface,14120,23.12
cap-color,0,0.0
does-bruise-or-bleed,0,0.0
gill-attachment,9884,16.18
gill-spacing,25063,41.04
gill-color,0,0.0
stem-height,0,0.0


In [7]:
# Fraction of missing values (1/5 = 20%) used below to decide which columns are dropped.
threshold_drop = 1/5

In [8]:
# Keep only the columns that have at least len(df) * (1 - threshold_drop)
# non-null entries; columns with more missing values than that are dropped.
# NOTE(review): `thresh` is documented as an int but receives a float here —
# confirm this is accepted by the pandas version in use.
df_nan_columns = df.dropna(thresh=len(df) * (1-threshold_drop), axis=1)

In [9]:
# Missing-value summary of the columns that survived the column-level drop.
missing_values(df_nan_columns)

Unnamed: 0,absolute,relative
class,0,0.0
cap-diameter,0,0.0
cap-shape,0,0.0
cap-color,0,0.0
does-bruise-or-bleed,0,0.0
gill-attachment,9884,16.18
gill-color,0,0.0
stem-height,0,0.0
stem-width,0,0.0
stem-color,0,0.0


In [10]:
# Share of missing values (1/10 = 10%) above which a remaining column is
# "pruned": the column itself is kept, but the rows where it is missing are
# removed afterwards via dropna(subset=pruned_attributes).
threshold_prune = 1/10

# Fraction of missing values per column of df_nan_columns.
missing_share = df_nan_columns.isnull().sum(axis=0) / len(df_nan_columns)

# Boolean selection replaces the former Series.iteritems() loop, which no
# longer exists in pandas >= 2.0. Column order is preserved.
pruned_attributes = missing_share[missing_share > threshold_prune].index.tolist()

In [11]:
# Remove every row in which at least one of the pruned attributes is missing.
df_nan_rows = df_nan_columns.dropna(subset=pruned_attributes)

In [12]:
# Missing-value summary after the row-level pruning.
missing_values(df_nan_rows)

Unnamed: 0,absolute,relative
class,0,0.0
cap-diameter,0,0.0
cap-shape,0,0.0
cap-color,0,0.0
does-bruise-or-bleed,0,0.0
gill-attachment,0,0.0
gill-color,0,0.0
stem-height,0,0.0
stem-width,0,0.0
stem-color,0,0.0


In [13]:
# Columns of df_nan_rows that still contain at least one missing value; these
# are the attributes whose gaps get filled by random sampling.
# isnull().any() replaces the former Series.iteritems() loop, which no longer
# exists in pandas >= 2.0. Column order is preserved.
recreated_attributes = df_nan_rows.columns[df_nan_rows.isnull().any(axis=0)].tolist()

In [14]:
# Work on a copy so that later column assignments on df_nan leave df_nan_rows unchanged.
df_nan = df_nan_rows.copy()

In [15]:
import random

In [16]:
def recreate_attribute_categorical(df, attribute, seed=None):
    """
    Replace all missing values of an attribute by randomly drawn values.

    Each replacement is sampled from the probability distribution induced by
    the occurrences of the existing values: a value that appears in a share p
    of the non-missing entries is drawn with probability p.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the attribute. It is not modified.
    attribute : str
        Label of the column whose missing values are filled.
    seed : int, optional
        If given, a private ``random.Random(seed)`` generator is used, so the
        result is reproducible and the module-level generator is untouched.
        If omitted, the module-level ``random`` generator is used.

    Returns
    -------
    pandas.Series
        Copy of ``df[attribute]`` with every missing entry filled; the index
        and all non-missing entries are unchanged.

    Raises
    ------
    ValueError
        If the column has missing entries but no observed value to sample from.
    """
    column = df[attribute].copy()
    is_missing = pd.isnull(column)
    n_missing = int(is_missing.sum())

    # Nothing to fill: hand back the copy as is (this also covers an empty column).
    if n_missing == 0:
        return column

    observed = column.loc[~is_missing]
    counts = observed.value_counts()
    if counts.empty:
        raise ValueError(
            f"cannot recreate attribute {attribute!r}: it has no non-missing values to sample from"
        )

    generator = random if seed is None else random.Random(seed)
    drawn = generator.choices(
        population=list(counts.index),
        weights=list(counts / len(observed)),
        k=n_missing,
    )

    # Positional assignment through the boolean mask; unlike index-aligned
    # assignment this also works when the index contains duplicate labels.
    column.loc[is_missing] = drawn

    return column

def recreate_attribute_categorical_debug(df, attribute):
    """
    Show how filling the gaps of ``attribute`` changes its value frequencies.

    Displays the relative frequencies of the non-missing values, the relative
    frequencies after recreate_attribute_categorical has filled the missing
    entries, and the sum of the absolute differences between the two.
    """
    present = df[attribute].loc[df[attribute].notnull()]
    freq_before = present.value_counts(dropna=False).div(len(present))
    print("pdf old:")
    display(freq_before)

    filled = recreate_attribute_categorical(df, attribute)
    freq_after = filled.value_counts(dropna=False).div(len(filled))
    print("pdf new:")
    display(freq_after)

    print("pdf difference:")
    display((freq_before - freq_after).abs().sum())

# Compare the "ring-type" value frequencies before and after filling the gaps
# (the returned column is only displayed, df_nan is not changed here).
recreate_attribute_categorical_debug(df_nan, "ring-type")

pdf old:


f    0.841727
e    0.042432
z    0.028777
r    0.028512
p    0.021950
l    0.021888
g    0.014715
Name: ring-type, dtype: float64

pdf new:


f    0.841751
e    0.042122
z    0.028837
r    0.028583
p    0.022096
l    0.021764
g    0.014848
Name: ring-type, dtype: float64

pdf difference:


0.0008686222849404665

In [17]:
# Seed the module-level `random` generator that recreate_attribute_categorical
# draws from, so the filled-in values are the same on every Restart & Run All.
IMPUTATION_SEED = 42
random.seed(IMPUTATION_SEED)

# Fill the gaps of every attribute listed in recreated_attributes.
for attribute in recreated_attributes:
    df_nan[attribute] = recreate_attribute_categorical(df_nan, attribute)

In [18]:
# Missing-value summary after the remaining gaps have been filled.
missing_values(df_nan)

Unnamed: 0,absolute,relative
class,0,0.0
cap-diameter,0,0.0
cap-shape,0,0.0
cap-color,0,0.0
does-bruise-or-bleed,0,0.0
gill-attachment,0,0.0
gill-color,0,0.0
stem-height,0,0.0
stem-width,0,0.0
stem-color,0,0.0


In [19]:
# Columns removed by the column-level dropna. They are read off the two frames
# instead of re-deriving them from threshold_drop, so the report cannot
# disagree with what dropna actually did. This also avoids Series.iteritems(),
# which no longer exists in pandas >= 2.0. Original column order is kept.
dropped_attributes = [attribute for attribute in df.columns if attribute not in df_nan_columns.columns]

print(f"dropped attributes (over {threshold_drop * 100}% missing):")
for attribute in dropped_attributes:
    print("-", attribute)
print()

print(f"pruned attributes (over {threshold_prune * 100}% missing):")
for attribute in pruned_attributes:
    print("-", attribute)
print()

print("recreated attributes:")
for attribute in recreated_attributes:
    print("-", attribute)

dropped attributes (over 20.0% missing):
- cap-surface
- gill-spacing
- stem-root
- stem-surface
- veil-type
- veil-color
- spore-print-color

pruned attributes (over 10.0% missing):
- gill-attachment

recreated attributes:
- ring-type


In [20]:
# Shape of the frame after each cleaning stage, in pipeline order.
# NOTE(review): the label "df_nan_colums" in the message is misspelled (variable is df_nan_columns).
print(f"df: {df.shape} -> df_nan_colums: {df_nan_columns.shape} -> df_nan_rows: {df_nan_rows.shape} -> df_nan: {df_nan.shape}")

df: (61069, 21) -> df_nan_colums: (61069, 14) -> df_nan_rows: (51185, 14) -> df_nan: (51185, 14)
