# Basic Overview of Pandas

###### Importing pandas (Python Data Analysis Library), and numpy (Scientific Computing package)

In [2]:
import pandas as pd
import numpy as np

###### Creating a DataFrame object; usually this would be read from a CSV or JSON file

In [3]:
# Toy data: one entry per salesperson, kept as parallel lists
# (position i in every list describes the same person).
names      = ['Jon', 'Matt', 'Sarah', 'Ashley', 'Sam']
ids        = [23, 34, 83, 86, 12]
sales      = [10.2, 84.3, 72.9, 27.1, 223.1]
department = ['A', 'A', 'B', 'B', 'B']

# Each keyword becomes a column label and each list becomes that column's values.
users = pd.DataFrame(dict(id=ids, name=names, sales=sales, dept=department))

###### dataframe.head(n=5) returns the first n rows (the default is 5)

In [4]:
# Preview the first rows; n=5 is head()'s default, spelled out here for clarity.
print(users.head(n=5))

  dept  id    name  sales
0    A  23     Jon   10.2
1    A  34    Matt   84.3
2    B  83   Sarah   72.9
3    B  86  Ashley   27.1
4    B  12     Sam  223.1


In [5]:
# Rearrange the columns into a more readable order (name first, sales last).
column_order = ['name', 'id', 'dept', 'sales']
users = users.reindex(columns=column_order)

In [6]:
# Show the whole (small) frame to confirm the new column order.
print(users)

     name  id dept  sales
0     Jon  23    A   10.2
1    Matt  34    A   84.3
2   Sarah  83    B   72.9
3  Ashley  86    B   27.1
4     Sam  12    B  223.1


### What are the total sales per dept?

###### First, we can get a subset of the columns since we only need **sales** and **dept**, by indexing

In [7]:
# Indexing a DataFrame with a list of labels selects just those columns.
needed_columns = ['dept', 'sales']
dept_sales = users[needed_columns]
print(dept_sales)

  dept  sales
0    A   10.2
1    A   84.3
2    B   72.9
3    B   27.1
4    B  223.1


###### We want to group by dept and sum the sales

In [8]:
# Split dept_sales into one group per distinct value in the 'dept' column.
grp_dept_sales = dept_sales.groupby(by='dept')

# head() on a grouped object keeps the first n rows (default 5) of each group;
# every group here is smaller than that, so all rows are printed.
print(grp_dept_sales.head(n=5))

  dept  sales
0    A   10.2
1    A   84.3
2    B   72.9
3    B   27.1
4    B  223.1


###### What do groups look like?

In [9]:
# .groups maps each group key (a dept value) to the row labels belonging to that group.
print(grp_dept_sales.groups)

{'B': [2, 3, 4], 'A': [0, 1]}


In [10]:
# Iterating over a grouped object yields (group key, sub-frame) pairs.
for dept_key, dept_frame in grp_dept_sales:
    # Show the key together with the type of the per-group object ...
    print(dept_key, type(dept_frame))
    # ... and then the rows that fell into this group.
    print(dept_frame)

A <class 'pandas.core.frame.DataFrame'>
  dept  sales
0    A   10.2
1    A   84.3
B <class 'pandas.core.frame.DataFrame'>
  dept  sales
2    B   72.9
3    B   27.1
4    B  223.1


###### Apply an aggregate function to the groups, for example sum

In [11]:
# Collapse every group to a single row holding the sum of its sales.
dept_totals = grp_dept_sales.sum()
print(dept_totals)

      sales
dept       
A      94.5
B     323.1


###### Could have done groupby and sum together:

In [12]:
# Same result as above, written as one chained expression: group, then sum.
print(
    dept_sales
    .groupby('dept')
    .sum()
)

      sales
dept       
A      94.5
B     323.1


###### There are other aggregate methods for mean, median, min, max, etc.

In [13]:
# Largest sales value within each dept.
dept_max = grp_dept_sales.max()
print(dept_max)

      sales
dept       
A      84.3
B     223.1


###### You can also pass a list of aggregate function names (or callables, such as np.size) to agg()

In [15]:
# Several aggregations at once: mostly names given as strings, plus one
# callable (np.size). Each entry produces its own column in the result.
aggregations = ['sum', 'count', 'mean', 'median', 'min', 'max', 'std', 'var', np.size]
print(grp_dept_sales.agg(aggregations))

      sales                                                              
        sum count    mean median   min    max         std        var size
dept                                                                     
A      94.5     2   47.25  47.25  10.2   84.3   52.396612   2745.405  2.0
B     323.1     3  107.70  72.90  27.1  223.1  102.529410  10512.280  3.0
