# Data Cleaning and Preparation

In [1]:
import numpy as np
import pandas as pd 
# Capture the row-display limit in effect before changing it
# (presumably so it can be restored later), then cap output at 20 rows.
PREVIOUS_MAX_ROWS = pd.get_option('display.max_rows')
pd.set_option('display.max_rows', 20)
# Seed NumPy's global generator so the random examples are reproducible.
np.random.seed(12345)

In [2]:
import matplotlib.pyplot as plt
# Print NumPy arrays with 4 decimal places and without scientific notation.
np.set_printoptions(precision=4, suppress=True)
# Default figure size (width, height) for every matplotlib figure.
plt.rcParams['figure.figsize'] = (10, 6)

# Handling Missing Data

In [3]:
# A small Series of strings with one missing entry (NaN at position 2).
string_data = pd.Series(
    ['aardvark', 'artichoke', np.nan, 'avocado']
)
print(string_data)
# Boolean mask: True wherever a value is missing (isna is an alias of isnull).
string_data.isna()

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object


0    False
1    False
2     True
3    False
dtype: bool

In [4]:
# Python's built-in None is also treated as a missing value:
# overwrite the first entry with None and re-check the mask.
string_data[0] = None
string_data.isna()

0     True
1    False
2     True
3    False
dtype: bool

# Filtering Out Missing Data

In [5]:
from numpy import nan as NA

# Float Series with a single missing value at position 1.
data = pd.Series([1.0, NA, 3.4, 9.0])
# dropna returns a new Series holding only the non-missing values
# (original index labels are kept).
data.dropna()

0    1.0
2    3.4
3    9.0
dtype: float64

In [6]:
# Same result as data.dropna(): select rows through a boolean "not missing" mask.
data.loc[data.notna()]

0    1.0
2    3.4
3    9.0
dtype: float64

In [7]:
# Rows have different lengths; pandas pads the shorter ones with NaN,
# which yields a 4-column frame.
# NOTE(review): `6, 5` creates two separate columns -- if `6.5` was intended,
# the frame would have three columns instead. Confirm before changing, since
# the saved outputs of this and the following cells depend on it.
data = pd.DataFrame([
    [1., 6, 5, 3.],
    [1, NA, NA],
    [NA, NA, NA],
    [NA, 6, 5, 3.],
])
# Default dropna(): discard every row that contains at least one NaN.
cleaned = data.dropna()
print(data, '*' * 20, cleaned, sep='\n')

     0    1    2    3
0  1.0  6.0  5.0  3.0
1  1.0  NaN  NaN  NaN
2  NaN  NaN  NaN  NaN
3  NaN  6.0  5.0  3.0
********************
     0    1    2    3
0  1.0  6.0  5.0  3.0


In [8]:
# Drop only the rows in which every value is missing.
data.dropna(axis='index', how='all')

Unnamed: 0,0,1,2,3
0,1.0,6.0,5.0,3.0
1,1.0,,,
3,,6.0,5.0,3.0


In [9]:
# Add a column labelled 4 that is entirely NaN, then show the widened frame.
data[4] = NA
print(data)
# Drop only the columns in which every value is missing.
data.dropna(axis='columns', how='all')

     0    1    2    3   4
0  1.0  6.0  5.0  3.0 NaN
1  1.0  NaN  NaN  NaN NaN
2  NaN  NaN  NaN  NaN NaN
3  NaN  6.0  5.0  3.0 NaN


Unnamed: 0,0,1,2,3
0,1.0,6.0,5.0,3.0
1,1.0,,,
2,,,,
3,,6.0,5.0,3.0


In [10]:
# 7x3 frame of standard-normal draws with NaNs punched into it:
# column 1 is missing in the first four rows, column 2 in the first two.
df = pd.DataFrame(np.random.randn(7, 3))
df.iloc[:4, 1] = NA
df.iloc[:2, 2] = NA
# Show, separated by rules: the frame itself, the rows with no NaN at all,
# and the rows holding at least two non-missing values (thresh=2).
print(
    df,
    '-' * 30,
    df.dropna(),
    '-' * 30,
    df.dropna(thresh=2),
    sep='\n',
)

          0         1         2
0 -0.204708       NaN       NaN
1 -0.555730       NaN       NaN
2  0.092908       NaN  0.769023
3  1.246435       NaN -1.296221
4  0.274992  0.228913  1.352917
5  0.886429 -2.001637 -0.371843
6  1.669025 -0.438570 -0.539741
------------------------------
          0         1         2
4  0.274992  0.228913  1.352917
5  0.886429 -2.001637 -0.371843
6  1.669025 -0.438570 -0.539741
------------------------------
          0         1         2
2  0.092908       NaN  0.769023
3  1.246435       NaN -1.296221
4  0.274992  0.228913  1.352917
5  0.886429 -2.001637 -0.371843
6  1.669025 -0.438570 -0.539741


# Filling In Missing Data

In [11]:
# Replace every missing value with 0; returns a new frame, df itself is untouched.
df.fillna(value=0)

Unnamed: 0,0,1,2
0,-0.204708,0.0,0.0
1,-0.55573,0.0,0.0
2,0.092908,0.0,0.769023
3,1.246435,0.0,-1.296221
4,0.274992,0.228913,1.352917
5,0.886429,-2.001637,-0.371843
6,1.669025,-0.43857,-0.539741


In [12]:
# Per-column fill values: keys are column labels, values are the replacement
# used for NaNs in that column (0.5 for column 1, 0 for column 2).
df.fillna(value={1: 0.5, 2: 0})

Unnamed: 0,0,1,2
0,-0.204708,0.5,0.0
1,-0.55573,0.5,0.0
2,0.092908,0.5,0.769023
3,1.246435,0.5,-1.296221
4,0.274992,0.228913,1.352917
5,0.886429,-2.001637,-0.371843
6,1.669025,-0.43857,-0.539741


In [36]:
# Replace every NaN in df with 0 and keep the result under the same name.
# fillna returns a new frame, so rebinding avoids inplace=True and avoids
# assigning to `_`, which would stop IPython from updating its last-output
# variable for the rest of the session.
# NOTE(review): this cell's execution count (36) is out of sequence -- re-run
# the notebook top to bottom so the saved output reflects the df built above.
df = df.fillna(0)
df

Unnamed: 0,0,1,2
0,0.670216,0.0,0.0
1,-0.023493,0.0,0.0
2,-1.218302,0.0,1.074623
3,0.723642,0.0,1.001543
4,-0.503087,-0.622274,-0.921169
5,-0.726213,0.222896,0.051316
6,-1.157719,0.816707,0.43361
