# 4 | Data Structures and Transformation (Pandas)

   > **`pandas`** is a Python package providing fast, flexible, and expressive data structures designed to make working with “relational” or “labeled” data both easy and intuitive.

Some basic topics we cover here include:
   - Series vs DataFrame
   - Slicing and indexing  
   - Joining DFs
   - Filtering and selection   
   - Aggregation and Groupby

*Resources*:
- **Coursera Python for Informatics** (University of Michigan)
- **Coursera Applied Data Science in Python** (University of Michigan)
- **Python for Data Analysis** - McKinney (O'Reilly)

In [2]:
import numpy as np
import pandas as pd

### Series vs DataFrame

In [3]:
# Convert a list into a Series: a one-dimensional array with an attached index
# (a default 0..n-1 integer index here, as the output below shows).
animals = ['Tiger', 'Bear', 'Moose']
pd.Series(animals)

0    Tiger
1     Bear
2    Moose
dtype: object

In [5]:
# Build a Series from a dict: the dict keys become the index labels.
# Use the .iloc attribute to query by integer position and .loc to query by label.
sports = {'Basketball': 'USA',
          'Golf': 'Scotland',
          'Sumo': 'Japan',
          'Soccer': 'Germany'}
s=pd.Series(sports)
# query by position (the fourth entry)
print(s.iloc[3])
# query by index label
print(s.loc['Golf'])

Germany
Scotland


In [57]:
# Create a DataFrame from a dict of equal-length lists (one list per column).
data = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
        'year': [2000, 2001, 2002, 2001, 2002, 2003],
        'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2]}
# `columns` fixes the column order; 'debt' is not a key of `data`, so it is
# added as an all-missing column. `index` supplies the row labels.
frame = pd.DataFrame(data, columns=['year', 'state', 'pop', 'debt'],
                      index=['one', 'two', 'three', 'four',
                             'five', 'six'])
# head() shows the first five rows only
frame.head()

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,


In [59]:
# Summarize the frame: index range, per-column non-null count and dtype, memory use.
frame.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6 entries, one to six
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   year    6 non-null      int64  
 1   state   6 non-null      object 
 2   pop     6 non-null      float64
 3   debt    0 non-null      object 
dtypes: float64(1), int64(1), object(2)
memory usage: 240.0+ bytes


In [60]:
# Two ways to select a column (1 of 2): dict-style bracket indexing returns it as a Series.
frame['state']

one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
six      Nevada
Name: state, dtype: object

In [61]:
# Two ways to select a column (2 of 2): attribute access returns the same Series.
# This form only works when the column name is a valid Python identifier.
frame.state

one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
six      Nevada
Name: state, dtype: object

### Slicing and indexing

In [11]:
# Reindex a DataFrame: conform its rows to a new list of labels.
frame = pd.DataFrame(np.arange(9).reshape((3, 3)),
                     index=['a', 'c', 'd'],
                     columns=['Ohio', 'Texas', 'California'])
frame  # not the cell's last expression, so nothing is displayed for this line
# 'b' is absent from the original index, so its row is filled with missing
# values (the integer columns show up as floats in the output as a result).
frame2 = frame.reindex(['a', 'b', 'c', 'd'])
frame2

Unnamed: 0,Ohio,Texas,California
a,0.0,1.0,2.0
b,,,
c,3.0,4.0,5.0
d,6.0,7.0,8.0


In [24]:
# Look up a single value by its index label.
obj = pd.Series(np.arange(4.), index=['a', 'b', 'c', 'd'])
obj['b']

1.0

In [28]:
# ...or by integer position. Use .iloc explicitly: a bare obj[1] on a
# label-indexed Series relies on a positional fallback that pandas has deprecated.
obj.iloc[1]

1.0

In [29]:
# Slice by label: unlike positional slicing, the end label ('c') is included.
obj['b':'c']

b    1.0
c    2.0
dtype: float64

In [31]:
# 4x4 frame holding 0..15, with states as row labels and number words as column labels.
data = pd.DataFrame(
    data=np.arange(16).reshape(4, 4),
    index=['Ohio', 'Colorado', 'Utah', 'New York'],
    columns=['one', 'two', 'three', 'four'],
)
# Passing a list of names selects just those columns, in the order given.
data.loc[:, ['three', 'one']]

Unnamed: 0,three,one
Ohio,2,0
Colorado,6,4
Utah,10,8
New York,14,12


In [51]:
# Sort a DataFrame.
frame = pd.DataFrame(np.arange(8).reshape((2, 4)),
                     index=['three', 'one'],
                     columns=['d', 'a', 'b', 'c'])
frame.sort_index() # by row label, ascending ('one' sorts before 'three')

Unnamed: 0,d,a,b,c
one,4,5,6,7
three,0,1,2,3


In [52]:
# ...or sort the columns by label (axis=1), in descending order: d, c, b, a
frame.sort_index(axis=1, ascending=False)

Unnamed: 0,d,c,b,a
three,0,3,2,1
one,4,7,6,5


In [53]:
# ...or sort the rows by the values held in column 'b'
frame.sort_values(by='b')

Unnamed: 0,d,a,b,c
three,0,1,2,3
one,4,5,6,7


### Filtering and selection

In [32]:
# Boolean indexing: keep only the rows whose 'three' value is greater than 5
data[data['three'] > 5]

Unnamed: 0,one,two,three,four
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [33]:
# Label-based selection: row 'Colorado', columns 'two' and 'three' (returned as a Series)
data.loc['Colorado', ['two', 'three']]

two      5
three    6
Name: Colorado, dtype: int64

In [None]:
# Position-based selection with .iloc (a notebook cell displays only its last expression).
data.iloc[2, [3, 0, 1]]  # third row; fourth, first and second columns
data.iloc[2]  # the whole third row
data.iloc[[1, 2], [3, 0, 1]]  # second and third rows; the same three columns

In [None]:
# Label slice: rows up to and including 'Utah', column 'two'
data.loc[:'Utah', 'two']
# First three columns by position, then only the rows where 'three' is greater than 5
data.iloc[:, :3][data.three > 5]

In [None]:
# Arithmetic between Series aligns on index labels (this is addition, not
# concatenation); a label present in only one operand yields NaN in the result.
s1 = pd.Series([7.3, -2.5, 3.4, 1.5], index=list('acde'))
s2 = pd.Series(
    [-2.1, 3.6, -1.5, 4, 3.1],
    index=list('acefg'),
)
s1 + s2

### Join DFs

In [None]:
# Arithmetic between DataFrames aligns on both row and column labels (again
# addition, not concatenation); a cell missing from either operand becomes NaN.
df1 = pd.DataFrame(np.arange(9.).reshape(3, 3),
                   index=['Ohio', 'Texas', 'Colorado'],
                   columns=['b', 'c', 'd'])
df2 = pd.DataFrame(np.arange(12.).reshape(4, 3),
                   index=['Utah', 'Ohio', 'Texas', 'Oregon'],
                   columns=['b', 'd', 'e'])
df1 + df2

In [None]:
# Merge (join) two DataFrames on a shared key: the person's name.
staff_df = pd.DataFrame([{'Name': 'Kelly', 'Role': 'Director'},
                         {'Name': 'Sally', 'Role': 'Liason'},
                         {'Name': 'James', 'Role': 'Grader'}])
staff_df = staff_df.set_index('Name')
student_df = pd.DataFrame([{'Name': 'Mike', 'School': 'Law'},
                           {'Name': 'Sally', 'School': 'Engineering'},
                           {'Name': 'James', 'School': 'Business'}])
student_df = student_df.set_index('Name')

print(staff_df)
print()
print(student_df)

# Move 'Name' back into an ordinary column so that it can serve as the merge key.
staff_df = staff_df.reset_index()
student_df = student_df.reset_index()

# how='left': every staff member (the left frame); School is NaN for non-students.
left_join = pd.merge(staff_df, student_df, how='left', on='Name')
# how='right': every student (the right frame); Role is NaN for non-staff.
right_join = pd.merge(staff_df, student_df, how='right', on='Name')
# how='outer': everyone who appears in either frame.
outer_join = pd.merge(staff_df, student_df, how='outer', on='Name')
# how='inner': only people who appear in both frames. The key must be the 'Name'
# column: after reset_index() both frames carry a default 0..n-1 index, so an
# index-based merge would pair unrelated people by row position.
inner_join = pd.merge(staff_df, student_df, how='inner', on='Name')
inner_join


In [None]:
# Blank out one cell (second row, column 'b') to demonstrate fill values.
# df2 is indexed by state name, so .loc[1, 'b'] would not touch the second row:
# it would append a brand-new row labelled 1. Look the row label up by position.
df2.loc[df2.index[1], 'b'] = np.nan


In [None]:
# Add with a fill value: a cell missing from just one frame is treated as 0
# instead of producing NaN (a cell missing from both frames is still NaN).
df1.add(df2, fill_value=0)

In [None]:
### Apply functions to the rows and columns of a DataFrame

In [None]:
# Random 4x3 frame used by the function-application examples that follow.
frame = pd.DataFrame(np.random.randn(4, 3), columns=list('bde'),
                     index=['Utah', 'Ohio', 'Texas', 'Oregon'])
frame  # not the cell's last expression, so nothing is displayed for this line
# NumPy element-wise functions can be applied directly to a DataFrame
np.abs(frame)

In [None]:
def f(x):
    """Return the spread of x: its largest value minus its smallest."""
    return x.max() - x.min()

# With the default axis, apply calls f once per column.
frame.apply(f)

In [None]:
# axis='columns' applies f across the columns, i.e. once per row
frame.apply(f, axis='columns')

In [None]:
def f(x):
    """Return the smallest and largest value of x as a Series labelled 'min' and 'max'."""
    lowest, highest = x.min(), x.max()
    return pd.Series([lowest, highest], index=['min', 'max'])
# f returns a Series, so apply assembles the per-column results into a DataFrame
frame.apply(f)

In [None]:
# Element-wise formatting: render every value as a string with two decimal places.
# NOTE(review): the name `format` shadows the Python builtin (a later cell reuses
# it, so rename both together), and DataFrame.applymap is deprecated in recent
# pandas releases in favour of DataFrame.map.
format = lambda x: '%.2f' % x
frame.applymap(format)

In [None]:
# Series.map applies a function element-wise to a single column
frame['e'].map(format)

In [None]:
# Method chaining: most DataFrame methods return a new object, so calls can be
# strung together instead of being written as separate statements.
# NOTE(review): `df` is not created in this cell. It is presumably the census
# data read from census.csv; confirm it is loaded before running.

# traditional method: one statement per step
df = df[df['SUMLEV'] == 50]
df.set_index(['STNAME','CTYNAME'], inplace=True)
# rename() returns a new frame rather than modifying df, so the result must be
# assigned back; without the assignment the rename is silently discarded.
df = df.rename(columns={'ESTIMATESBASE2010': 'Estimates Base 2010'})

# "pandorable" method: the same steps written as a single chained expression
# NOTE(review): set_index below needs STNAME/CTYNAME as columns, so this chain
# expects a freshly loaded df rather than the one indexed just above.
(df.where(df['SUMLEV'] == 50)
    .dropna()
    .set_index(['STNAME', "CTYNAME"])
    .rename(columns={'ESTIMATESBASE2010': 'Estimates Base 2010'}))

# NOTE(review): this example presumably targets a different frame, one with
# 'Quantity' and 'Weight' columns; confirm before running.
print(df.drop(df[df['Quantity'] == 0].index)
    .rename(columns={'Weight': 'Weight (oz.)'}))


### Aggregation

In [None]:
#group and manipulate df
def min_max(row):
    """Return the smallest and largest 2010-2015 population estimate found in `row`.

    `row` must be indexable by the six POPESTIMATE2010..POPESTIMATE2015 labels.
    The result is a Series with the entries 'min' and 'max'.
    """
    estimate_labels = [
        'POPESTIMATE2010', 'POPESTIMATE2011', 'POPESTIMATE2012',
        'POPESTIMATE2013', 'POPESTIMATE2014', 'POPESTIMATE2015',
    ]
    estimates = row[estimate_labels]
    lowest = np.min(estimates)
    highest = np.max(estimates)
    return pd.Series({'min': lowest, 'max': highest})
# axis=1 makes apply call min_max once per row
df.apply(min_max, axis=1)

# The same idea with a lambda: the per-row maximum over the estimate columns.
# (Despite its name, `rows` holds column labels.)
rows = ['POPESTIMATE2010',
         'POPESTIMATE2011',
         'POPESTIMATE2012',          
         'POPESTIMATE2013',
         'POPESTIMATE2014',
         'POPESTIMATE2015']
df.apply(lambda x: np.max(x[rows]), axis=1)

# groupby splits the frame by the values of a column in one pass, which is
# more efficient than filtering the frame once per state in a hand-written loop.
# To time this loop, move it into a cell of its own with `%%timeit -n 10` as the
# very first line: a cell magic is only recognised at the top of a cell.
for group, state_rows in df.groupby('STNAME'):
    avg = np.average(state_rows['CENSUS2010POP'])
    print('Counties in state ' + group + ' have an average population of ' + str(avg))

# Split the frame into batches of work: make the state name the index so that
# a grouping function can be applied to it.
df = df.set_index('STNAME')
def fun(item):
    """Bucket `item` by its first element: 0 if below 'M', 1 if below 'Q', otherwise 2."""
    initial = item[0]
    if initial < 'M':
        bucket = 0
    elif initial < 'Q':
        bucket = 1
    else:
        bucket = 2
    return bucket

# Passing a function to groupby calls it on each index label; the value it
# returns is the group that the record is assigned to.
for batch, members in df.groupby(fun):
    print(f'There are {len(members)} records in group {batch} for processing.')

# Aggregate with groupby().agg(), on a whole frame or on selected columns.
df = pd.read_csv('/Users/wiseer85/Desktop/census.csv')
df = df[df['SUMLEV'] == 50]  # keep only the rows whose SUMLEV is 50

# dict form: map a column name to the aggregation to run on that column
df.groupby('STNAME').agg({'CENSUS2010POP': np.average})

# Several columns, several aggregations. The columns must be selected with a
# list (double brackets), and a dict passed to agg is keyed by column name, so
# the aggregations are given as a list and 'mean' is relabelled 'avg' afterwards.
(df.set_index('STNAME')
   .groupby(level=0)[['POPESTIMATE2010', 'POPESTIMATE2011']]
   .agg(['mean', 'sum'])
   .rename(columns={'mean': 'avg'}, level=1))


In [None]:

#scales
#ratio scales: units are equally spaced, mathematical operations are valid, eg height or weight
#interval scales: units are equally spaced, but there is no true zero
#ordinal scale: order of units is important, but not spacing, eg letter grades
#nominal scales: categories of data without order, eg sports teams

# Recast as ordered categorical data. The categories and their order are
# described by a CategoricalDtype; astype() itself does not accept
# categories= or ordered= keywords.
s = pd.Series(['Low', 'Low', 'High', 'Medium', 'Low', 'High', 'Low'])
level_dtype = pd.CategoricalDtype(categories=['Low', 'Medium', 'High'], ordered=True)
s_ordered = s.astype(level_dtype)
s_ordered

#pd.get_dummies turns categorical data into indicator columns of ones and zeros

# Bin a continuous variable: the average CENSUS2010POP per state, cut into 10 bins.
df = pd.read_csv('/Users/wiseer85/Desktop/census.csv')
df = df[df['SUMLEV'] == 50]
# Named aggregation (avg='mean') labels the result column 'avg'; passing a
# {'avg': func} dict to a single selected column is rejected by current pandas.
df = df.set_index('STNAME').groupby(level=0)['CENSUS2010POP'].agg(avg='mean')
pd.cut(df['avg'], 10)

# Cut the values into 3 bins; passing labels names the bins
# [Small < Medium < Large] instead of showing their interval bounds.
s = pd.Series([
    168, 180, 174, 190, 170, 185, 179,
    181, 175, 169, 182, 177, 180, 171,
])
pd.cut(s, bins=3)
pd.cut(s, bins=3, labels=['Small', 'Medium', 'Large'])

# Pivot tables compare one column against two others: here the mean '(kW)'
# value for every YEAR (rows) and Make (columns).
df = pd.read_csv('/Users/wiseer85/Desktop/cars.csv')
# The aggregation is passed by name: recent pandas emits a FutureWarning when
# handed the NumPy callable np.mean and recommends the string alias instead.
df.pivot_table(values='(kW)', index='YEAR', columns='Make', aggfunc='mean')
#print(pd.pivot_table(Bikes, index=['Manufacturer','Bike Type']))

