# Pandas - Operations

In [1]:
import numpy as np
import pandas as pd

## Sample DataSet

In [21]:
# Column-oriented sample data: each key becomes a DataFrame column and
# position i across all seven-element lists describes one customer.
d = {
    'Name': ['Wanda', 'Eric', 'John', 'Andy', 'Deborah', 'Charles', 'James'],
    'Sex': ['Female', 'Male', 'Male', 'Male', 'Female', 'Male', 'Male'],
    'City': [
        'Stamford', 'Stamford', 'New York', 'Philadelphia',
        'Philadelphia', 'Stamford', 'New York',
    ],
    'Frequency': ['Weekly', 'Daily', 'Weekly', 'Monthly', 'Daily', 'Weekly', 'Daily'],
    'Item': ['Burger', 'Chalupa', 'Sushi', 'Ice Cream', 'Chalupa', 'Sushi', 'Donut'],
    'Spend': [15, 10, 42, 11, 23, 93, 37],
}

In [22]:
# build the DataFrame from the dict: keys become column names, lists become column values
df = pd.DataFrame(d)

In [23]:
df

Unnamed: 0,Name,Sex,City,Frequency,Item,Spend
0,Wanda,Female,Stamford,Weekly,Burger,15
1,Eric,Male,Stamford,Daily,Chalupa,10
2,John,Male,New York,Weekly,Sushi,42
3,Andy,Male,Philadelphia,Monthly,Ice Cream,11
4,Deborah,Female,Philadelphia,Daily,Chalupa,23
5,Charles,Male,Stamford,Weekly,Sushi,93
6,James,Male,New York,Daily,Donut,37


### .head() and .tail()

In [24]:
df.head() # by default first 5

Unnamed: 0,Name,Sex,City,Frequency,Item,Spend
0,Wanda,Female,Stamford,Weekly,Burger,15
1,Eric,Male,Stamford,Daily,Chalupa,10
2,John,Male,New York,Weekly,Sushi,42
3,Andy,Male,Philadelphia,Monthly,Ice Cream,11
4,Deborah,Female,Philadelphia,Daily,Chalupa,23


In [25]:
df.tail() # by default last 5

Unnamed: 0,Name,Sex,City,Frequency,Item,Spend
2,John,Male,New York,Weekly,Sushi,42
3,Andy,Male,Philadelphia,Monthly,Ice Cream,11
4,Deborah,Female,Philadelphia,Daily,Chalupa,23
5,Charles,Male,Stamford,Weekly,Sushi,93
6,James,Male,New York,Daily,Donut,37


In [26]:
df.head(3) # pass n to show only the first n rows (here 3)

Unnamed: 0,Name,Sex,City,Frequency,Item,Spend
0,Wanda,Female,Stamford,Weekly,Burger,15
1,Eric,Male,Stamford,Daily,Chalupa,10
2,John,Male,New York,Weekly,Sushi,42


In [27]:
df.tail(3) # pass n to show only the last n rows (here 3)

Unnamed: 0,Name,Sex,City,Frequency,Item,Spend
4,Deborah,Female,Philadelphia,Daily,Chalupa,23
5,Charles,Male,Stamford,Weekly,Sushi,93
6,James,Male,New York,Daily,Donut,37


### .unique()

In [34]:
df['Sex'].unique() # useful for listing the distinct values in a column

array(['Female', 'Male'], dtype=object)

In [35]:
df['City'].unique() # useful for listing the distinct values in a column

array(['Stamford', 'New York', 'Philadelphia'], dtype=object)

In [36]:
df['Item'].unique() # useful for listing the distinct values in a column

array(['Burger', 'Chalupa', 'Sushi', 'Ice Cream', 'Donut'], dtype=object)

In [141]:
# Counting the distinct items by hand; .nunique() returns this count directly
# (note: .nunique() skips NaN by default, whereas .unique() would include it).
len(df['Item'].unique()) # count unique items using len

5

### .nunique()

In [37]:
df['Sex'].nunique() # returns the number of distinct values, not the values themselves

2

In [38]:
df['City'].nunique() # returns the number of distinct values, not the values themselves

3

In [39]:
df['Item'].nunique() # returns the number of distinct values, not the values themselves

5

In [40]:
df.nunique() # the whole data frame itself, count number of unique values

Name         7
Sex          2
City         3
Frequency    3
Item         5
Spend        7
dtype: int64

### .value_counts()

In [45]:
df['City'].value_counts() # how many rows hold each distinct value, most frequent first

Stamford        3
Philadelphia    2
New York        2
Name: City, dtype: int64

In [46]:
df['Item'].value_counts() # how many rows hold each distinct value, most frequent first

Chalupa      2
Sushi        2
Donut        1
Ice Cream    1
Burger       1
Name: Item, dtype: int64

## .apply()

In [51]:
def vat(x, rate=0.02):
    """Return the tax due on an amount.

    Parameters
    ----------
    x : number (or any object supporting ``*`` with a float)
        The amount to tax. When used with ``Series.apply`` this is one
        element of the Series per call.
    rate : float, default 0.02
        Tax rate expressed as a fraction of ``x``. The default keeps the
        original hard-coded 2% so existing ``.apply(vat)`` calls are
        unaffected.

    Returns
    -------
    The product ``x * rate``.
    """
    return x * rate

In [57]:
# call vat() on every Spend value and store the results in a new 'Tax' column
df['Tax'] = df['Spend'].apply(vat)

In [58]:
df

Unnamed: 0,Name,Sex,City,Frequency,Item,Spend,Tax
0,Wanda,Female,Stamford,Weekly,Burger,15,0.3
1,Eric,Male,Stamford,Daily,Chalupa,10,0.2
2,John,Male,New York,Weekly,Sushi,42,0.84
3,Andy,Male,Philadelphia,Monthly,Ice Cream,11,0.22
4,Deborah,Female,Philadelphia,Daily,Chalupa,23,0.46
5,Charles,Male,Stamford,Weekly,Sushi,93,1.86
6,James,Male,New York,Daily,Donut,37,0.74


In [59]:
# The same column can be computed with an inline lambda instead of a named function.

df['Tax'] = df['Spend'].apply(lambda x: x * 0.02)
df

Unnamed: 0,Name,Sex,City,Frequency,Item,Spend,Tax
0,Wanda,Female,Stamford,Weekly,Burger,15,0.3
1,Eric,Male,Stamford,Daily,Chalupa,10,0.2
2,John,Male,New York,Weekly,Sushi,42,0.84
3,Andy,Male,Philadelphia,Monthly,Ice Cream,11,0.22
4,Deborah,Female,Philadelphia,Daily,Chalupa,23,0.46
5,Charles,Male,Stamford,Weekly,Sushi,93,1.86
6,James,Male,New York,Daily,Donut,37,0.74


### .sum(), .mean(), .std(), .var(), .count()

In [61]:
df['Spend'].sum()

231

In [62]:
df['Spend'].mean()

33.0

In [63]:
# sample standard deviation (pandas default ddof=1, i.e. divides by n - 1)
df['Spend'].std()

29.25178057258509

In [64]:
# sample variance (pandas default ddof=1, i.e. divides by n - 1)
df['Spend'].var()

855.6666666666666

In [66]:
# number of non-null entries in the column
df['Spend'].count()

7

### .columns, .index

In [67]:
df.columns

Index(['Name', 'Sex', 'City', 'Frequency', 'Item', 'Spend', 'Tax'], dtype='object')

In [68]:
df.index

RangeIndex(start=0, stop=7, step=1)

### .sort_values()

In [71]:
# highest spender first; this returns a new sorted DataFrame and leaves df itself unchanged
df.sort_values(by = 'Spend', ascending = False)

Unnamed: 0,Name,Sex,City,Frequency,Item,Spend,Tax
5,Charles,Male,Stamford,Weekly,Sushi,93,1.86
2,John,Male,New York,Weekly,Sushi,42,0.84
6,James,Male,New York,Daily,Donut,37,0.74
4,Deborah,Female,Philadelphia,Daily,Chalupa,23,0.46
0,Wanda,Female,Stamford,Weekly,Burger,15,0.3
3,Andy,Male,Philadelphia,Monthly,Ice Cream,11,0.22
1,Eric,Male,Stamford,Daily,Chalupa,10,0.2


### .isnull()

In [76]:
df.isnull()

Unnamed: 0,Name,Sex,City,Frequency,Item,Spend,Tax
0,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False
5,False,False,False,False,False,False,False
6,False,False,False,False,False,False,False


In [77]:
# chain .sum() to get the number of missing values per column

df.isnull().sum()

Name         0
Sex          0
City         0
Frequency    0
Item         0
Spend        0
Tax          0
dtype: int64

## .pivot_table()

In [117]:
df

Unnamed: 0,Name,Sex,City,Frequency,Item,Spend,Tax
0,Wanda,Female,Stamford,Weekly,Burger,15,0.3
1,Eric,Male,Stamford,Daily,Chalupa,10,0.2
2,John,Male,New York,Weekly,Sushi,42,0.84
3,Andy,Male,Philadelphia,Monthly,Ice Cream,11,0.22
4,Deborah,Female,Philadelphia,Daily,Chalupa,23,0.46
5,Charles,Male,Stamford,Weekly,Sushi,93,1.86
6,James,Male,New York,Daily,Donut,37,0.74


In [118]:
# pivot table 1: total Spend per Item (one output column per Item)

df.pivot_table(values = 'Spend', columns = ['Item'], aggfunc = 'sum') # aggfunc by default is 'mean'

Item,Burger,Chalupa,Donut,Ice Cream,Sushi
Spend,15,33,37,11,135


In [120]:
# pivot table 2: total Spend and total Tax per Item (one output row per value column)

df.pivot_table(values = ['Spend','Tax'], columns = ['Item'], aggfunc = 'sum')

Item,Burger,Chalupa,Donut,Ice Cream,Sushi
Spend,15.0,33.0,37.0,11.0,135.0
Tax,0.3,0.66,0.74,0.22,2.7


In [135]:
# pivot table 3: average Spend per City

df.pivot_table(values = 'Spend', columns = ['City'], aggfunc = 'mean')

City,New York,Philadelphia,Stamford
Spend,39.5,17.0,39.333333


In [142]:
# pivot table 4: total Spend and total Tax per Sex

df.pivot_table(values = ['Spend','Tax'], columns = ['Sex'], aggfunc = 'sum')

Sex,Female,Male
Spend,38.0,193.0
Tax,0.76,3.86
