# How to deal with missing data

In [1]:
import pandas as pd
import numpy as np

## Dealing with missing data in different formats

In [2]:
# Load the raw assessment scores from 'assessment.csv' (path is relative to the working directory)
df = pd.read_csv('assessment.csv')

In [3]:
# Preview the first five rows to see the columns and some typical values (nothing is dropped here)
df.head()

Unnamed: 0,assessment score 1,assessment score 2
0,37.0,12
1,72.0,9
2,75.0,5
3,79.0,64
4,16.0,1


In [4]:
# Summary statistics of the numeric columns. Only 'assessment score 1' appears in the
# recorded output, a first hint that 'assessment score 2' was not parsed as numbers.
df.describe()

Unnamed: 0,assessment score 1
count,9996.0
mean,49.433373
std,28.710692
min,0.0
25%,24.0
50%,49.0
75%,74.0
max,99.0


In [5]:
# Inspect dtypes and non-null counts per column; the recorded output reports
# 'assessment score 2' as object dtype rather than a numeric one.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 2 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   assessment score 1  9996 non-null   float64
 1   assessment score 2  10000 non-null  object 
dtypes: float64(1), object(1)
memory usage: 156.4+ KB


In [6]:
# Look at a random handful of rows; the fixed seed keeps the sample reproducible
df.sample(n=5, random_state=70)

Unnamed: 0,assessment score 1,assessment score 2
39,83.0,38
1360,37.0,7
3714,84.0,14
5981,83.0,74
1295,66.0,#


In [7]:
# Show every row whose 'assessment score 2' holds the '#' placeholder
df[df['assessment score 2'] == '#']

Unnamed: 0,assessment score 1,assessment score 2
239,17.0,#
396,21.0,#
398,43.0,#
417,7.0,#
484,52.0,#
...,...,...
9357,60.0,#
9507,50.0,#
9638,44.0,#
9816,37.0,#


In [8]:
# Turn the '#' placeholder into NaN so pandas treats those entries as missing.
# No dtype cast is applied in this cell; only the placeholder values change.
df['assessment score 2'] = df['assessment score 2'].replace(to_replace='#', value=np.nan)
df

Unnamed: 0,assessment score 1,assessment score 2
0,37.0,12
1,72.0,9
2,75.0,5
3,79.0,64
4,16.0,1
...,...,...
9995,44.0,21
9996,71.0,55
9997,80.0,34
9998,11.0,62


In [9]:
# Run the placeholder filter again; an empty result means no '#' entries are left
df[df['assessment score 2'] == '#']

Unnamed: 0,assessment score 1,assessment score 2


In [10]:
# Count missing values per column now that the '#' placeholders are NaN
df.isna().sum()

assessment score 1     4
assessment score 2    92
dtype: int64

## Option 1: drop rows

In [11]:
# Option 1: discard every row that has a missing value in at least one column
cleaned_df = df.dropna(axis=0, how='any')
# Summary statistics of the surviving rows. In the recorded output only
# 'assessment score 1' is listed; 'assessment score 2' has not been cast to a
# numeric dtype at this point in the notebook.
cleaned_df.describe()

Unnamed: 0,assessment score 1
count,9904.0
mean,49.471123
std,28.70159
min,0.0
25%,24.0
50%,49.0
75%,74.0
max,99.0


In [12]:
# Confirm that no missing values remain once the incomplete rows are dropped
cleaned_df.isna().sum()

assessment score 1    0
assessment score 2    0
dtype: int64

## Option 2: drop columns

In [13]:
# Load a second dataset and preview its first rows; in the recorded output
# 'assessment score 2' is empty for every row shown.
problem_df = pd.read_csv("assessment_problem.csv")
problem_df.head()

Unnamed: 0,assessment score 1,assessment score 2
0,357,
1,514,
2,686,
3,39,
4,963,


In [14]:
# Count missing values per column to see which column the gaps are concentrated in
problem_df.isna().sum()

assessment score 1       0
assessment score 2    9890
dtype: int64

In [15]:
# Option 2: all of the missing values sit in 'assessment score 2' (score 1 has none),
# so remove that column instead of removing rows
problem_df_cleaned = problem_df.drop(columns=['assessment score 2'])
problem_df_cleaned.head()

Unnamed: 0,assessment score 1
0,357
1,514
2,686
3,39
4,963


In [16]:
# Verify that the remaining column has no missing values
problem_df_cleaned.isna().sum()

assessment score 1    0
dtype: int64

## Option 3: impute NaNs

In [17]:
# Reload the raw file so this section starts from unmodified data.
# Declaring '#' as a missing-value marker for 'assessment score 2' makes the
# parser emit NaN for it while reading, and the explicit dtype loads the column
# as float directly, so no separate replace / astype pass over an object column
# is needed.
df = pd.read_csv(
    'assessment.csv',
    na_values={'assessment score 2': ['#']},
    dtype={'assessment score 2': 'float64'},
)

In [18]:
# Missing values per column before imputation
df.isna().sum()

assessment score 1     4
assessment score 2    92
dtype: int64

In [19]:
# Option 3: fill the gaps in each column with that column's own mean.
# df.mean() yields one mean per column and fillna matches them up by column label.
cleaned_df = df.fillna(value=df.mean())

In [20]:
# Confirm that mean imputation left no missing values in either column
cleaned_df.isna().sum()

assessment score 1    0
assessment score 2    0
dtype: int64

In [21]:
# Impute only 'assessment score 2' with its mean. A dict-valued fillna fills just
# the listed column and returns a new frame, so df itself is left unchanged and
# 'assessment score 1' keeps its NaNs.
t_df = df.fillna({'assessment score 2': df['assessment score 2'].mean()})

In [22]:
# Only 'assessment score 2' was imputed, so 'assessment score 1' still reports its missing values
t_df.isna().sum()

assessment score 1    4
assessment score 2    0
dtype: int64

In [23]:
# A quick check on the summary statistics after mean imputation
# (compare with df.describe() on the un-imputed data)
cleaned_df.describe()

Unnamed: 0,assessment score 1,assessment score 2
count,10000.0,10000.0
mean,49.433373,49.570751
std,28.704948,28.782446
min,0.0,0.0
25%,24.0,25.0
50%,49.0,50.0
75%,74.0,74.0
max,99.0,99.0


In [24]:
# The same statistics on the un-imputed frame, for comparison with the imputed one
df.describe()

Unnamed: 0,assessment score 1,assessment score 2
count,9996.0,9908.0
mean,49.433373,49.570751
std,28.710692,28.915779
min,0.0,0.0
25%,24.0,24.0
50%,49.0,50.0
75%,74.0,74.0
max,99.0,99.0


## Option 4: create bins

In [25]:
# Option 4: replace each score column with 4 equal-width bins (pd.cut with an integer bin count).
# NOTE(review): this overwrites the numeric columns of df, so the cell is meant to be
# run once per load of df; reload df before running it again.
df = df.assign(**{
    score_col: pd.cut(df[score_col], bins=4)
    for score_col in ('assessment score 1', 'assessment score 2')
})

In [26]:
# Number of rows that fall into each bin of 'assessment score 2'
df['assessment score 2'].value_counts()

(49.5, 74.25]      2560
(-0.099, 24.75]    2482
(74.25, 99.0]      2475
(24.75, 49.5]      2391
Name: assessment score 2, dtype: int64

In [27]:
# Rows that still have a missing value in either column after binning;
# the recorded output shows that pd.cut leaves NaN entries as NaN.
df[df.isna().any(axis=1)]

Unnamed: 0,assessment score 1,assessment score 2
56,,"(49.5, 74.25]"
152,,"(74.25, 99.0]"
231,,"(-0.099, 24.75]"
239,"(-0.099, 24.75]",
275,,"(24.75, 49.5]"
...,...,...
9357,"(49.5, 74.25]",
9507,"(49.5, 74.25]",
9638,"(24.75, 49.5]",
9816,"(24.75, 49.5]",
