# Working with DataFrames

In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Use the fully-qualified option key: the bare 'max_columns' pattern is ambiguous
# in newer pandas (it also matches styler.render.max_columns) and raises OptionError.
pd.set_option('display.max_columns', 50)  # render up to 50 columns before truncating
%matplotlib inline
import os
os.getcwd()

'C:\\Users\\Kelley\\PycharmProjects\\Python-Library\\Pandas'

### References

[Official Pandas Documentation](https://pandas.pydata.org/pandas-docs/stable/api.html#dataframe)

[Greg Reda Intro](http://www.gregreda.com/2013/10/26/working-with-pandas-dataframes/)

[R Comparison](https://pandas.pydata.org/pandas-docs/stable/comparison_with_r.html)

[Modern Pandas](https://github.com/TomAugspurger/effective-pandas/blob/master/modern_1_intro.ipynb)

[Pandas Cheat Sheet](https://pandas.pydata.org/Pandas_Cheat_Sheet.pdf)

## Import / Create Data

In [7]:
# Import DF
# NOTE(review): the CSV is read from ../Data/ while the workbook is read from ../ —
# confirm both relative paths are intended.
from_csv = pd.read_csv('../Data/mariano-rivera.csv')  # load a CSV file into a DataFrame
football = pd.read_excel('../football_data.xlsx')  # load an Excel workbook into a DataFrame

# DataFrame from a dict: each key becomes a column, each list holds that column's values
extradata = pd.DataFrame({
    'year': [2013, 2014, 2013],
    'team': ['Wily Coyotes', None, 'Wily Coyotes'],  # None = missing value (NULL)
    'wins': [15, 16, 15],
    'losses': [None, 3, None],  # missing numeric values are stored as NaN
})

# DataFrame from a list of tuples: one tuple per row, column labels given separately
moredata = pd.DataFrame.from_records(
    [
        ('Bears', 'brown'),
        ('Packers', 'green'),
        ('Lions', 'blue'),
    ],
    columns=['team', 'color'],
)

# DataFrame from a Series: the unnamed Series becomes a single column labelled 0
notes = pd.Series([
    "this one wasn't good",
    "NCR-12345 has a problem",
    "zip code was at 10101 and ticket #12345",
    "Bob and bobbert had a good time",
    "mascot was there",
    "had a free drink",
    "",
    "Pandas are fun. Mr Bob would know.",
    "i think this is a fake team",
    "missing team",
])
notes_df = pd.DataFrame(notes)
football  # last expression in the cell, so the frame read from the Excel file is displayed

Unnamed: 0,year,team,wins,losses
0,2010,Bears,11,5
1,2011,Bears,8,8
2,2012,Bears,10,6
3,2011,Packers,15,1
4,2012,Packers,11,5
5,2010,Lions,6,10
6,2011,Lions,10,6
7,2012,Lions,4,12


### Concat, .join on index

In [12]:
# Row-wise concat: stack extradata's rows beneath football's.
# ignore_index=True discards the original labels and builds a fresh index for the result.
append_rows = pd.concat([football, extradata], ignore_index=True)

# Attach the notes column by matching index labels.
join_on_index = append_rows.join(notes_df)
# Alternative spelling: a column-wise concat (axis=1) also aligns the frames on their index.
append_cols = pd.concat([append_rows, notes_df], axis=1)

# NOTE(review): rebinding `football` makes this cell non-idempotent — the concat above
# reads `football`, so running the cell again appends extradata and joins notes a second time.
football = join_on_index
football

Unnamed: 0,year,team,wins,losses,0
0,2010,Bears,11,5.0,this one wasn't good
1,2011,Bears,8,8.0,NCR-12345 has a problem
2,2012,Bears,10,6.0,zip code was at 10101 and ticket #12345
3,2011,Packers,15,1.0,Bob and bobbert had a good time
4,2012,Packers,11,5.0,mascot was there
5,2010,Lions,6,10.0,had a free drink
6,2011,Lions,10,6.0,
7,2012,Lions,4,12.0,Pandas are fun. Mr Bob would know.
8,2013,Wily Coyotes,15,,i think this is a fake team
9,2014,,16,3.0,missing team


### Merge (SQL Join)

In [14]:
# No `on`/`how` given: merge falls back to its defaults (inner join on the columns
# the two frames have in common).
inner_join = pd.merge(left=football, right=moredata)
# Left join on 'team': every football row is kept; color is filled where the team matches.
left_join = pd.merge(left=football, right=moredata, how='left', on='team')

f = left_join
f

Unnamed: 0,year,team,wins,losses,0,color
0,2010,Bears,11,5.0,this one wasn't good,brown
1,2011,Bears,8,8.0,NCR-12345 has a problem,brown
2,2012,Bears,10,6.0,zip code was at 10101 and ticket #12345,brown
3,2011,Packers,15,1.0,Bob and bobbert had a good time,green
4,2012,Packers,11,5.0,mascot was there,green
5,2010,Lions,6,10.0,had a free drink,blue
6,2011,Lions,10,6.0,,blue
7,2012,Lions,4,12.0,Pandas are fun. Mr Bob would know.,blue
8,2013,Wily Coyotes,15,,i think this is a fake team,
9,2014,,16,3.0,missing team,


### Explore

In [17]:
# Quick inspection helpers. Only the cell's final expression (describe) is rendered;
# the earlier expressions are evaluated but their values are discarded, apart from
# info(), which prints its report directly.
f.head()  # first rows of the frame
f.shape  # (rows, columns)
f.sample(2)  # 2 randomly chosen rows
f.dtypes  # dtype of each column
f.isnull()  # boolean mask marking missing values
f.info()  # was `f.info` without parentheses, which only referenced the method and never ran it
f.columns  # column labels
f.describe()  # statistics summary

Unnamed: 0,year,wins,losses
count,8.0,8.0,8.0
mean,2011.125,9.375,6.625
std,0.834523,3.377975,3.377975
min,2010.0,4.0,1.0
25%,2010.75,7.5,5.0
50%,2011.0,10.0,6.0
75%,2012.0,11.0,8.5
max,2012.0,15.0,12.0


# DataFrame Manipulation

### Select

In [84]:
# Indexing with a list of labels returns a DataFrame holding just those columns
year_team_cols = ['year', 'team']
f[year_team_cols]

Unnamed: 0,year,team
0,2010,Bears
1,2011,Bears
2,2012,Bears
3,2011,Packers
4,2012,Packers
5,2010,Lions
6,2011,Lions
7,2012,Lions
8,2013,Wily Coyotes
9,2014,


### Filter by Values

In [64]:
# Build one boolean mask per condition, then keep the rows where both hold
after_2011 = f['year'] > 2011
not_bears = f['team'] != 'Bears'
f[after_2011 & not_bears]

Unnamed: 0,year,team,wins,losses,0
4,2012,Packers,11,5.0,mascot was there
7,2012,Lions,4,12.0,Pandas are fun. Mr Bob would know.
8,2013,Wily Coyotes,15,,i think this is a fake team
9,2014,,16,3.0,missing team
10,2013,Wily Coyotes,15,,


### Filter duplicate rows

In [72]:
# Rows count as duplicates when they agree on the `subset` columns;
# omit `subset` to compare entire rows instead.
f.drop_duplicates(subset=['year', 'team'])

Unnamed: 0,year,team,wins,losses,0
0,2010,Bears,11,5.0,this one wasn't good
1,2011,Bears,8,8.0,NCR-12345 has a problem
2,2012,Bears,10,6.0,zip code was at 10101 and ticket #12345
3,2011,Packers,15,1.0,Bob and bobbert had a good time
4,2012,Packers,11,5.0,mascot was there
5,2010,Lions,6,10.0,had a free drink
6,2011,Lions,10,6.0,
7,2012,Lions,4,12.0,Pandas are fun. Mr Bob would know.
8,2013,Wily Coyotes,15,,i think this is a fake team
9,2014,,16,3.0,missing team


### Filter NULLs (None, NaN)

In [81]:
# Keep only the rows whose team value is present
has_team = f['team'].notnull()
f[has_team]

Unnamed: 0,year,team,wins,losses,0
0,2010,Bears,11,5.0,this one wasn't good
1,2011,Bears,8,8.0,NCR-12345 has a problem
2,2012,Bears,10,6.0,zip code was at 10101 and ticket #12345
3,2011,Packers,15,1.0,Bob and bobbert had a good time
4,2012,Packers,11,5.0,mascot was there
5,2010,Lions,6,10.0,had a free drink
6,2011,Lions,10,6.0,
7,2012,Lions,4,12.0,Pandas are fun. Mr Bob would know.
8,2013,Wily Coyotes,15,,i think this is a fake team
10,2013,Wily Coyotes,15,,


In [None]:
# Keep only the rows whose losses value is missing
losses_missing = f['losses'].isnull()
f[losses_missing]

Unnamed: 0,year,team,wins,losses,0
8,2013,Wily Coyotes,15,,i think this is a fake team
10,2013,Wily Coyotes,15,,


### Group by

In [23]:
# Group by color, sum wins. The aggregation is named by string: passing the numpy
# callable (np.sum) emits a FutureWarning in recent pandas releases.
f.groupby('color')['wins'].agg('sum')
# Average wins & losses by team. Several columns must be selected with a list ([[...]]);
# the tuple form ['wins', 'losses'] was deprecated and is rejected by pandas >= 2.0.
f.groupby('team')[['wins', 'losses']].agg('mean')

team     color
Bears    brown     9.666667
Lions    blue      6.666667
Packers  green    13.000000
Name: wins, dtype: float64

In [29]:
# more data for grouping
more_categories = pd.DataFrame({'team': ['Bears', 'Lions', 'Packers', 'Wily Coyotes'], 'fan_of': [1, 1, 0, 1]})
more_categories

# No `on`/`how` given: merge falls back to its defaults (inner join on the shared columns).
df = pd.merge(f, more_categories)
df

# Group by 2 cols and apply the same aggregations to both value columns.
# Columns are selected with a list ([[...]]): the tuple form ['wins', 'losses'] was
# deprecated and is rejected by pandas >= 2.0. Aggregations are named by string because
# numpy callables (np.sum, np.mean) emit a FutureWarning in recent pandas releases.
df.groupby(['fan_of', 'team'])[['wins', 'losses']].agg(['sum', 'mean'])
# group by 2 cols. agg 2 other cols. map aggregations to cols.
df.groupby(['fan_of', 'team'])[['wins', 'losses']].agg({'wins': ['sum', 'mean'], 'losses': 'mean'})

Unnamed: 0_level_0,Unnamed: 1_level_0,wins,wins,losses
Unnamed: 0_level_1,Unnamed: 1_level_1,sum,mean,mean
fan_of,team,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,Packers,26,13.0,3.0
1,Bears,29,9.666667,6.333333
1,Lions,20,6.666667,9.333333
1,Wily Coyotes,30,15.0,
