In [2]:
import pandas as pd

### Fill NA

In [3]:
import numpy as np
# Pin NumPy's global seed so the random frame below is the same on every run.
np.random.seed(1)

# 10 rows x 4 columns of standard-normal draws, labelled a-d.
df = pd.DataFrame(data=np.random.randn(10, 4), columns=list('abcd'))
df

Unnamed: 0,a,b,c,d
0,1.624345,-0.611756,-0.528172,-1.072969
1,0.865408,-2.301539,1.744812,-0.761207
2,0.319039,-0.24937,1.462108,-2.060141
3,-0.322417,-0.384054,1.133769,-1.099891
4,-0.172428,-0.877858,0.042214,0.582815
5,-1.100619,1.144724,0.901591,0.502494
6,0.900856,-0.683728,-0.12289,-0.935769
7,-0.267888,0.530355,-0.691661,-0.396754
8,-0.687173,-0.845206,-0.671246,-0.012665
9,-1.11731,0.234416,1.659802,0.742044


In [4]:
# Take the square root of every cell. Negative inputs have no real square
# root, so those cells come back as NaN ("Not a Number").
nan_test = df.apply(func=np.sqrt)
nan_test

Unnamed: 0,a,b,c,d
0,1.274498,,,
1,0.930273,,1.320913,
2,0.564835,,1.209177,
3,,,1.064786,
4,,,0.20546,0.763423
5,,1.069918,0.949521,0.708868
6,0.949134,,,
7,,0.728255,,
8,,,,
9,,0.484165,1.288333,0.86142


In [24]:
# One way of dealing with missing values: swap every NaN for a placeholder.
# fillna is the idiomatic spelling of `nan_test[nan_test.isnull()] = ...`.
# The result is rebound to the same name because the following cell reads
# nan_test. Mixing a string into the float columns turns them into object
# columns.
nan_test = nan_test.fillna("To Replace")
nan_test

Unnamed: 0,a,b,c,d
0,1.2745,To Replace,To Replace,To Replace
1,0.930273,To Replace,1.32091,To Replace
2,0.564835,To Replace,1.20918,To Replace
3,To Replace,To Replace,1.06479,To Replace
4,To Replace,To Replace,0.20546,0.763423
5,To Replace,1.06992,0.949521,0.708868
6,0.949134,To Replace,To Replace,To Replace
7,To Replace,0.728255,To Replace,To Replace
8,To Replace,To Replace,To Replace,To Replace
9,To Replace,0.484165,1.28833,0.86142


## Question: Can we also do this with categorical variables?

## Intro to Regex

In [25]:
# Regex matching on column d of nan_test through the .str accessor.
# '^' anchors the pattern to the start of the string, so '^p' asks whether a
# cell begins with the letter p. Cells that are not strings (the floats that
# survived the replacement) come back as NaN instead of True/False.
nan_test['d'].str.contains('^p')
# A pattern that does match the placeholder cells:
# nan_test['d'].str.contains('To Replace')

0    False
1    False
2    False
3    False
4      NaN
5      NaN
6    False
7    False
8    False
9      NaN
Name: d, dtype: object

## The `pd.melt` function

In [13]:
# Small wide-format table: one row per person, one column per attribute.
cheese = pd.DataFrame({
    'first': ['John', 'Mary'],
    'last': ['Doe', 'Bo'],
    'height': [5.5, 6.0],
    'weight': [130, 150],
})

In [26]:

# Show the wide-format frame as it looks before any reshaping.
cheese


Unnamed: 0,first,height,last,weight
0,John,5.5,Doe,130
1,Mary,6.0,Bo,150


In [33]:
# Unpivot: keep first/last as identifier columns and fold every remaining
# column into (variable, value) pairs, one row per person and attribute.
pd.melt(frame=cheese, id_vars=['first', 'last'])

Unnamed: 0,first,last,variable,value
0,John,Doe,height,5.5
1,Mary,Bo,height,6.0
2,John,Doe,weight,130.0
3,Mary,Bo,weight,150.0


In [35]:
# Same unpivot, but the column holding the former column names is labelled
# 'quantity' instead of the default 'variable'.
pd.melt(frame=cheese, id_vars=['first', 'last'], var_name='quantity')

Unnamed: 0,first,last,quantity,value
0,John,Doe,height,5.5
1,Mary,Bo,height,6.0
2,John,Doe,weight,130.0
3,Mary,Bo,weight,150.0


In [15]:
# Keep the long-format frame for later use (same melt call as the previous cell).
a = pd.melt(frame=cheese, id_vars=['first', 'last'], var_name='quantity')

In [20]:
# Move the column labels into an inner index level: the result is a single
# Series indexed by (row label, column name).
a.stack()

0  first         John
   last           Doe
   quantity    height
   value          5.5
1  first         Mary
   last            Bo
   quantity    height
   value            6
2  first         John
   last           Doe
   quantity    weight
   value          130
3  first         Mary
   last            Bo
   quantity    weight
   value          150
dtype: object

### Categorical variables

In [29]:
# Build a categorical Series directly: with dtype="category" the constructor
# derives the categories from the values it is given.
s = pd.Series(list("abca"), dtype="category")
s

0    a
1    b
2    c
3    a
dtype: category
Categories (3, object): [a, b, c]

In [27]:
# Alternatively, cast an existing column: B is column A converted to the
# category dtype, while A itself is left untouched.
df = (
    pd.DataFrame({"A": list("abca")})
    .assign(B=lambda frame: frame["A"].astype("category"))
)
df

Unnamed: 0,A,B
0,a,a
1,b,b
2,c,c
3,a,a


In [31]:
# A Series can also be built from a pandas.Categorical object. The categories
# are listed explicitly here and do not include "a".
raw_cat = pd.Categorical(
    values=list("abca"),
    categories=list("bcd"),
    ordered=False,
)

In [6]:
 # Wrap the Categorical in a Series. Values that are not among the declared
 # categories ("a" here) show up as NaN.
 s = pd.Series(raw_cat)
 s

0    NaN
1      b
2      c
3    NaN
dtype: category
Categories (3, object): [b, c, d]

In [9]:
# Boolean mask of the missing entries: True where s holds NaN.
s.isnull()

0     True
1    False
2    False
3     True
dtype: bool

In [22]:
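# s[s.isnull()] = "To Replace"
# NOTE(review): presumably left commented out because "To Replace" is not one
# of s's categories [b, c, d], and pandas rejects assigning a value outside
# the categories -- it would have to be registered first (e.g. with
# s.cat.add_categories). TODO confirm.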

### Dummy variables

In [33]:
# pd.get_dummies converts a categorical column into indicator (dummy) columns.
# Start with a small frame holding one categorical column, `key`, and a
# running counter, `data1`.
df = pd.DataFrame({
    'key': ['b', 'b', 'a', 'c', 'a', 'b'],
    'data1': list(range(6)),
})
df

Unnamed: 0,data1,key
0,0,b
1,1,b
2,2,a
3,3,c
4,4,a
5,5,b


In [32]:
# Expand `key` into one indicator column per distinct value; each row is
# flagged only in the column that matches its key.
pd.get_dummies(data=df['key'])

Unnamed: 0,a,b,c
0,0.0,1.0,0.0
1,0.0,1.0,0.0
2,1.0,0.0,0.0
3,0.0,0.0,1.0
4,1.0,0.0,0.0
5,0.0,1.0,0.0
