### Handling Missing Data

In [14]:
import pandas as pd
import numpy as np
from numpy import nan as NA


# --- Missing values in a Series ---------------------------------------------
string_data = pd.Series(['aardvark', 'artichoke', np.nan, 'avocado'])

# Boolean mask, True where a value is missing. Both NaN and None are
# treated as NA ("not available").
string_data.isnull()

# Boolean indexing with notnull() keeps only the non-missing entries;
# string_data.dropna() is the shorthand for the same filter.
string_data[string_data.notnull()]

# --- Missing values in a DataFrame ------------------------------------------
data = pd.DataFrame([[1., 6.5, 3.], [1., NA, NA], [NA, NA, NA], [NA, 6.5, 3.]])
data.columns = ['a', 'b', 'c']

# axis=1 targets COLUMNS: with how='all' a column is dropped only when every
# value in it is NA. No column here is entirely NA, so nothing is removed.
# (axis=0, the default, would apply the same rule to rows instead.)
data.dropna(how='all', axis=1)

# A dict gives a separate fill value per column; column 'b' is not listed,
# so its missing values are left untouched.
data.fillna({'a': 99, 'c': -99})

# Forward fill: propagate the last valid value downwards, but across at most
# one consecutive NA (limit=1). DataFrame.ffill replaces the deprecated
# fillna(method='ffill') spelling.
forward_filled = data.ffill(limit=1)
forward_filled

Unnamed: 0,a,b,c
0,1.0,6.5,3.0
1,1.0,6.5,3.0
2,1.0,,
3,,6.5,3.0


### Data Transformation

##### Removing Duplicates

In [23]:
# Small frame with one fully repeated row (index 6 repeats index 5).
# The IPython continuation prompts ("....:") that were pasted into this cell
# have been removed so the cell is valid Python.
data = pd.DataFrame({
    'k1': ['one', 'two'] * 3 + ['two'],
    'k2': [1, 1, 2, 3, 3, 4, 4],
})

# Boolean Series: True for every row that is an exact repeat of an earlier row.
data.duplicated()

# Deduplicate on column 'k1' only, keeping the LAST row seen for each
# distinct 'k1' value.
data.drop_duplicates(subset='k1', keep='last')

Unnamed: 0,k1,k2
4,one,3
6,two,4


##### Transforming Data using a function or mapping

In [27]:
# Food names with inconsistent capitalisation, plus a weight in ounces.
# The IPython continuation prompts ("....:") that were pasted into this cell
# have been removed so the cell is valid Python.
data = pd.DataFrame({
    'food': ['bacon', 'pulled pork', 'bacon',
             'Pastrami', 'corned beef', 'Bacon',
             'pastrami', 'honey ham', 'nova lox'],
    'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6],
})

# Normalise case first: the mapping keys below are all lower case, so
# 'Pastrami' and 'Bacon' would otherwise fail to match and map to NaN.
LC = data['food'].str.lower()

# Lookup table: kind of meat -> animal it comes from.
meat_to_animal = {
    'bacon': 'pig',
    'pulled pork': 'pig',
    'pastrami': 'cow',
    'corned beef': 'cow',
    'honey ham': 'pig',
    'nova lox': 'salmon',
}

# Series.map accepts a dict and translates each element through it.
data['animal'] = LC.map(meat_to_animal)

##### Replacing Values

In [29]:
# Element-wise substitution with parallel lists: every 4 becomes 10000 and
# every 6 becomes -10000. A new frame is returned; `data` is not modified.
data.replace(to_replace=[4, 6], value=[10000, -10000])

Unnamed: 0,food,ounces,animal
0,bacon,10000.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,Pastrami,-10000.0,cow
4,corned beef,7.5,cow
5,Bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,-10000.0,salmon


##### Renaming Axis Indexes

In [34]:
# 3x4 frame of the integers 0..11 with string row and column labels.
# The IPython continuation prompts ("....:") that were pasted into this cell
# have been removed so the cell is valid Python.
data = pd.DataFrame(
    np.arange(12).reshape((3, 4)),
    index=['Ohio', 'Colorado', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)


def transform(x):
    """Return the first three characters of ``x`` in upper case."""
    return x[:3].upper()


# Index.map applies the function to every label; assigning the result back
# relabels the rows of `data` in place.
data.index = data.index.map(transform)

# rename() is the cleanest way to relabel an axis: it returns a new frame
# and leaves `data` itself unchanged.
data.rename(columns=str.upper)

Unnamed: 0,ONE,TWO,THREE,FOUR
OHI,0,1,2,3
COL,4,5,6,7
NEW,8,9,10,11


##### Discretization and Binning

In [42]:
ages = [20, 22, 25, 27, 21, 23, 37, 31, 61, 45, 41, 32]
bins = [18, 25, 35, 60, 100]

# pd.cut assigns each age to one of the intervals delimited by `bins` and
# returns a Categorical; the four intervals are labelled 'a' through 'd'.
cats = pd.cut(x=ages, bins=bins, labels=list('abcd'))
cats.codes       # integer position of each age's bin
cats.categories  # the bin labels themselves

# pd.qcut chooses the edges from the sample quantiles instead; q=4 yields quartiles.
cats2 = pd.qcut(x=ages, q=4)



##### Detecting and Filtering Outliers

In [55]:
# Seed NumPy's global generator so the random draws below are reproducible.
np.random.seed(100)
data = pd.DataFrame(np.random.randn(1000, 4))

# Keep each row in which at least one column exceeds 3 in absolute value.
# `axis` must be passed by keyword: pandas >= 2.0 rejects the positional
# form `.any(1)` with a TypeError.
outlier_rows = data[(np.abs(data) > 3).any(axis=1)]

# Element-wise sign: 1 for positive values, -1 for negative, 0 for zero.
np.sign(data)

Unnamed: 0,0,1,2,3
0,-1.0,1.0,1.0,-1.0
1,1.0,1.0,1.0,-1.0
2,-1.0,1.0,-1.0,1.0
3,-1.0,1.0,1.0,-1.0
4,-1.0,1.0,-1.0,-1.0
5,1.0,1.0,-1.0,-1.0
6,1.0,1.0,1.0,1.0
7,-1.0,1.0,1.0,-1.0
8,-1.0,1.0,1.0,-1.0
9,1.0,-1.0,-1.0,-1.0


##### Permutation and Random Sampling

In [58]:
# Three rows drawn without replacement (the default), so no row can repeat.
data.sample(3)

# Ten rows drawn with replacement: the same row may be picked more than once.
data.sample(10, replace=True)

Unnamed: 0,0,1,2,3
533,1.110187,1.721231,1.642727,0.426041
406,-0.834704,0.898382,-1.100488,0.633871
423,1.389714,1.183656,-1.23787,2.135546
92,-2.131315,0.263077,-1.271058,0.9291
194,-0.079366,-0.318285,0.919544,-0.561177
931,-0.337503,-0.390401,1.146488,0.132778
103,1.487894,-0.357817,0.356431,1.031282
426,-0.015274,-2.272439,2.570765,-0.055599
718,-0.412299,-2.313658,0.332495,0.16931
335,1.397031,-0.861922,-0.115156,0.010537
