In [1]:
import pandas as pd
import numpy as np
import sys
%matplotlib inline

# Select
How to select a random sub group of a data frame

In [2]:
# Initial dataframe
df = pd.DataFrame({'group1' : ["a","b","a","a","b","c","c","c","c",
                            "c","a","a","a","b","b","b","b"],
                'group2' : [1,2,3,4,1,3,5,6,5,4,1,2,3,4,3,2,1],
                'value'  : ["apple","pear","orange","apple",
                            "banana","durian","lemon","lime",
                            "raspberry","durian","peach","nectarine",
                            "banana","lemon","guava","blackberry","grape"]})
df

Unnamed: 0,group1,group2,value
0,a,1,apple
1,b,2,pear
2,a,3,orange
3,a,4,apple
4,b,1,banana
5,c,3,durian
6,c,5,lemon
7,c,6,lime
8,c,5,raspberry
9,c,4,durian


In [3]:
from random import choice
df.groupby("value").size()

value
apple         2
banana        2
blackberry    1
durian        2
grape         1
guava         1
lemon         2
lime          1
nectarine     1
orange        1
peach         1
pear          1
raspberry     1
dtype: int64

In [4]:
grouped=df.groupby(['group1','group2'])
grouped.size()

group1  group2
a       1         2
        2         1
        3         2
        4         1
b       1         2
        2         2
        3         1
        4         1
c       3         1
        4         1
        5         2
        6         1
dtype: int64

In [5]:
choice?

In [6]:
df.loc[[choice(x) for x in grouped.groups.values()]]

Unnamed: 0,group1,group2,value
10,a,1,peach
11,a,2,nectarine
12,a,3,banana
3,a,4,apple
16,b,1,grape
15,b,2,blackberry
14,b,3,guava
13,b,4,lemon
5,c,3,durian
9,c,4,durian


# How do I slice each row of a column?


In [9]:
df=pd.DataFrame(data=['abcdef']*10,columns=['text']
               )
df

Unnamed: 0,text
0,abcdef
1,abcdef
2,abcdef
3,abcdef
4,abcdef
5,abcdef
6,abcdef
7,abcdef
8,abcdef
9,abcdef


In [10]:
#Selecting the first 2 characters of each row
df['text'].apply(lambda x:x[:2])

0    ab
1    ab
2    ab
3    ab
4    ab
5    ab
6    ab
7    ab
8    ab
9    ab
Name: text, dtype: object

# Selecting rows of dataframe based on complex filter applied to multiple columns

In [19]:
d = {'Dates':[pd.Timestamp('2013-01-02'),
              pd.Timestamp('2013-01-03'),
              pd.Timestamp('2013-01-04')],
     'Num1':[1,2,3],
     'Num2':[-1,-2,-3]}
                 

df = pd.DataFrame(data=d)
df

Unnamed: 0,Dates,Num1,Num2
0,2013-01-02,1,-1
1,2013-01-03,2,-2
2,2013-01-04,3,-3


In [21]:
# where all values in column num1 are positive
positive =df['Num1']>0

#where values in column num2 is equal to -1
negativeOne=df['Num2']==-1

#where values in the column dates are in (1/2/2013 or 1/20/2013)
Dates=df['Dates'].isin(['2013-01-02','2013-01-20'])

df[positive & negativeOne & Dates]



Unnamed: 0,Dates,Num1,Num2
0,2013-01-02,1,-1


### How to get the max value of a group?


In [22]:
df=pd.DataFrame({'col1':['minus','minus','positive','nan'],
                'col2':[10,20,30,40],
                'col3':[-10,-20,30,np.nan]
                })

df

Unnamed: 0,col1,col2,col3
0,minus,10,-10.0
1,minus,20,-20.0
2,positive,30,30.0
3,,40,


In [23]:
df.groupby("col1").apply(lambda x:x.max())

Unnamed: 0_level_0,col1,col2,col3
col1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
minus,minus,20.0,-10.0
,,40.0,
positive,positive,30.0,30.0


In [24]:
df.groupby("col1").agg("max")

Unnamed: 0_level_0,col2,col3
col1,Unnamed: 1_level_1,Unnamed: 2_level_1
minus,20,-10.0
,40,
positive,30,30.0


In [25]:
df.groupby("col1").max()

Unnamed: 0_level_0,col2,col3
col1,Unnamed: 1_level_1,Unnamed: 2_level_1
minus,20,-10.0
,40,
positive,30,30.0


In [27]:
df.groupby("col1").apply(lambda x:x.min())

Unnamed: 0_level_0,col1,col2,col3
col1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
minus,minus,10.0,-20.0
,,40.0,
positive,positive,30.0,30.0


# How to select records from one level of a multi-index data frame e

In [28]:
df = pd.DataFrame({'group1' : ["a","b","a","a","b","c","c","c","c",
                            "c","a","a","a","b","b","b","b"],
                'value' : [1,2,3,4,1,3,5,6,5,4,1,2,3,4,3,2,1],
                'group2'  : ["apple","pear","orange","apple",
                            "banana","durian","lemon","lime",
                            "raspberry","durian","peach","nectarine",
                            "banana","lemon","guava","blackberry","grape"]})
df = df.set_index(['group1','group2'])
df

Unnamed: 0_level_0,Unnamed: 1_level_0,value
group1,group2,Unnamed: 2_level_1
a,apple,1
b,pear,2
a,orange,3
a,apple,4
b,banana,1
c,durian,3
c,lemon,5
c,lime,6
c,raspberry,5
c,durian,4


In [29]:
df.index

MultiIndex(levels=[['a', 'b', 'c'], ['apple', 'banana', 'blackberry', 'durian', 'grape', 'guava', 'lemon', 'lime', 'nectarine', 'orange', 'peach', 'pear', 'raspberry']],
           labels=[[0, 1, 0, 0, 1, 2, 2, 2, 2, 2, 0, 0, 0, 1, 1, 1, 1], [0, 11, 9, 0, 1, 3, 6, 7, 12, 3, 10, 8, 1, 6, 5, 2, 4]],
           names=['group1', 'group2'])

In [30]:
df.xs("a",level="group1")

Unnamed: 0_level_0,value
group2,Unnamed: 1_level_1
apple,1
orange,3
apple,4
peach,1
nectarine,2
banana,3


In [32]:
df.xs("b",level="group1")

Unnamed: 0_level_0,value
group2,Unnamed: 1_level_1
pear,2
banana,1
lemon,4
guava,3
blackberry,2
grape,1


In [33]:
df.xs("b")

Unnamed: 0_level_0,value
group2,Unnamed: 1_level_1
pear,2
banana,1
lemon,4
guava,3
blackberry,2
grape,1


## How do I reset the index when the index names are the same as the column names?


In [34]:
df = pd.DataFrame({"Name":["Alice", "Bob", "Mallory", "Mallory", "Bob" , "Mallory"] , 
                "City":["Seattle", "Seattle", "Portland", "Seattle", "Seattle", "Portland"]}
               )
df

Unnamed: 0,City,Name
0,Seattle,Alice
1,Seattle,Bob
2,Portland,Mallory
3,Seattle,Mallory
4,Seattle,Bob
5,Portland,Mallory


In [35]:
group=df.groupby(['City','Name'])
s=group.agg("size")
s.add_suffix("_size").reset_index()

Unnamed: 0,City,Name,0
0,Portland_size,Mallory_size,2
1,Seattle_size,Alice_size,1
2,Seattle_size,Bob_size,2
3,Seattle_size,Mallory_size,1
