# Working with Data

## Reshaping

In [1]:
import numpy as np
import pandas as pd
from pandas import DataFrame,Series

In [3]:
import webbrowser
web = 'https://github.com/jmportilla/Udemy-notes/blob/master/Lec%2033%20-%20Reshaping.ipynb'
webbrowser.open(web)

True

In [2]:
# Create a Dataframe
# pd.Index

dframe = DataFrame(np.arange(8).reshape(2,-1),index = pd.Index(['LA','SF'],name = 'city'),columns=pd.Index(list('ABCD'),name = 'letter'))
dframe

letter,A,B,C,D
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
LA,0,1,2,3
SF,4,5,6,7


In [4]:
# Use stack to pivot the columns into the rows
dframe_st = dframe.stack()

#Show
dframe_st

city  letter
LA    A         0
      B         1
      C         2
      D         3
SF    A         4
      B         5
      C         6
      D         7
dtype: int32

In [6]:
# unstack
dframe_st.unstack()

letter,A,B,C,D
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
LA,0,1,2,3
SF,4,5,6,7


In [8]:
# We can choose which level to unstack by
print(dframe_st.unstack(0))
dframe.unstack(1)

city    LA  SF
letter        
A        0   4
B        1   5
C        2   6
D        3   7


letter  city
A       LA      0
        SF      4
B       LA      1
        SF      5
C       LA      2
        SF      6
D       LA      3
        SF      7
dtype: int32

In [9]:
# Also by which name to unstack by
dframe_st.unstack('letter')

letter,A,B,C,D
city,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
LA,0,1,2,3
SF,4,5,6,7


In [10]:
# Also by which name to unstack by
dframe_st.unstack('city')

city,LA,SF
letter,Unnamed: 1_level_1,Unnamed: 2_level_1
A,0,4
B,1,5
C,2,6
D,3,7


In [11]:
# Let's see how stack and unstack handle NAN

#Make two series
ser1 = Series([0, 1, 2], index=['Q', 'X', 'Y'])
ser2 = Series([4, 5, 6], index=['X', 'Y', 'Z'])

#Concat to make a dframe
dframe = pd.concat([ser1, ser2], keys=['Alpha', 'Beta'])

# Unstack resulting DataFrame
print(dframe)
dframe.unstack()

Alpha  Q    0
       X    1
       Y    2
Beta   X    4
       Y    5
       Z    6
dtype: int64


Unnamed: 0,Q,X,Y,Z
Alpha,0.0,1.0,2.0,
Beta,,4.0,5.0,6.0


In [12]:
# Now stack will filter out NAN by default
dframe.unstack().stack()

Alpha  Q    0.0
       X    1.0
       Y    2.0
Beta   X    4.0
       Y    5.0
       Z    6.0
dtype: float64

In [13]:
# IF we dont want this we can set it to False
dframe.unstack().stack(dropna=False)

Alpha  Q    0.0
       X    1.0
       Y    2.0
       Z    NaN
Beta   Q    NaN
       X    4.0
       Y    5.0
       Z    6.0
dtype: float64

## pivoting

In [14]:
import numpy as np
import pandas as pd
from pandas import DataFrame,Series

In [15]:
import webbrowser
web = 'https://github.com/jmportilla/Udemy-notes/blob/master/Lec%2034%20-%20Pivoting.ipynb'
webbrowser.open(web)

True

In [44]:
# Lets create some data to play with:

# Note: It is not necessary to understand how this dataset was made to understand this Lecture.

#import pandas testing utility
import pandas.util.testing as tm; tm.N = 3

#Create a unpivoted function
def unpivot(frame):
    N, K = frame.shape
    
    data = {'value' : frame.values.ravel('F'),
            'variable' : np.asarray(frame.columns).repeat(N),
            'date' : np.tile(np.asarray(frame.index), K)}
    
    # Return the DataFrame
    return DataFrame(data, columns=['date', 'variable', 'value'])

#Set the DataFrame we'll be using
dframe = unpivot(tm.makeTimeDataFrame())
dframe

Unnamed: 0,date,variable,value
0,2000-01-03,A,-1.139885
1,2000-01-04,A,0.926286
2,2000-01-05,A,-0.910769
3,2000-01-03,B,-0.50852
4,2000-01-04,B,-0.351311
5,2000-01-05,B,-0.855642
6,2000-01-03,C,-0.876595
7,2000-01-04,C,-0.38454
8,2000-01-05,C,0.000364
9,2000-01-03,D,0.486091


In [45]:
# Now let's pivot the data

# First two value spassed are teh row and column indexes, then finally an optional fill value
dframe_piv = dframe.pivot('date','variable','value')

#Show
dframe_piv

variable,A,B,C,D
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2000-01-03,-1.139885,-0.50852,-0.876595,0.486091
2000-01-04,0.926286,-0.351311,-0.38454,-1.539858
2000-01-05,-0.910769,-0.855642,0.000364,0.577362


In [46]:
dframe2 = DataFrame(np.ones(len(dframe.index)),columns=['value2'])
df = pd.concat([dframe,dframe2],axis = 1)
df

Unnamed: 0,date,variable,value,value2
0,2000-01-03,A,-1.139885,1.0
1,2000-01-04,A,0.926286,1.0
2,2000-01-05,A,-0.910769,1.0
3,2000-01-03,B,-0.50852,1.0
4,2000-01-04,B,-0.351311,1.0
5,2000-01-05,B,-0.855642,1.0
6,2000-01-03,C,-0.876595,1.0
7,2000-01-04,C,-0.38454,1.0
8,2000-01-05,C,0.000364,1.0
9,2000-01-03,D,0.486091,1.0


In [47]:
df.pivot('date','variable','value2')

variable,A,B,C,D
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2000-01-03,1.0,1.0,1.0,1.0
2000-01-04,1.0,1.0,1.0,1.0
2000-01-05,1.0,1.0,1.0,1.0
