# Pandas Operations

In [54]:
import numpy as np
import pandas as pd

In [55]:
# small demo frame: two integer columns and one string column
df = pd.DataFrame(
    {
        "col1": [1, 2, 3, 4],
        "col2": [444, 555, 666, 444],
        "col3": ["abc", "def", "ghi", "xyz"],
    }
)

# head() shows the first few rows as a DataFrame;
# a frame this small (4 rows) is shown in full
df.head()

Unnamed: 0,col1,col2,col3
0,1,444,abc
1,2,555,def
2,3,666,ghi
3,4,444,xyz


In [56]:
col2_series = df["col2"]

# the distinct values in "col2", returned as a NumPy array
print(col2_series.unique())
print()

# how many distinct values "col2" contains
print(col2_series.nunique())
print()

# a Series mapping each distinct "col2" value to the number of times it occurs
print(col2_series.value_counts())

[444 555 666]

3

444    2
555    1
666    1
Name: col2, dtype: int64


### Pandas Apply Method
The `apply` method lets us run our own custom functions (as well as lambdas and built-ins) on every element of a Series.

In [57]:
# multiply a value by 2
def times2(x):
    """Return ``x`` multiplied by 2 (whatever ``x * 2`` means for the type of ``x``)."""
    doubled = x * 2
    return doubled

# run times2 on every element of "col1"; apply returns the results as a new Series
doubled_by_function = df["col1"].apply(times2)
print(doubled_by_function)
print()

# the same transformation, written inline as a lambda expression
doubled_by_lambda = df["col1"].apply(lambda value: value * 2)
print(doubled_by_lambda)
print()

# built-in functions work too: len gives the length of each string in "col3"
col3_lengths = df["col3"].apply(len)
print(col3_lengths)

0    2
1    4
2    6
3    8
Name: col1, dtype: int64

0    2
1    4
2    6
3    8
Name: col1, dtype: int64

0    3
1    3
2    3
3    3
Name: col3, dtype: int64


### DataFrame Attributes

In [58]:
# the column labels, returned as a pandas Index object
# (list-like, but not a plain Python list)
df.columns

Index(['col1', 'col2', 'col3'], dtype='object')

In [59]:
"""
returns the index/rows as an Index Object that contains a list,
this specific DataFrame uses a range from 0 to 4 with step-size of 1
"""
df.index

RangeIndex(start=0, stop=4, step=1)

In [60]:
# order the rows by ascending "col2"; every row keeps its original index label,
# so the index of the result is no longer in 0..3 order
df.sort_values(by="col2")

Unnamed: 0,col1,col2,col3
0,1,444,abc
3,4,444,xyz
1,2,555,def
2,3,666,ghi


In [61]:
# returns a DataFrame of booleans with the same shape as df:
# True where a value is null, False otherwise (all False here, nothing is missing)
df.isnull()

Unnamed: 0,col1,col2,col3
0,False,False,False
1,False,False,False
2,False,False,False
3,False,False,False


### Pandas Pivot Table
Similar to a spreadsheet-style pivot table in Microsoft Excel; the resulting DataFrame can use a MultiIndex for its rows.

In [62]:
# column-oriented input: A, B and C hold string labels, D holds the numbers
data = {
    "A": ["foo", "foo", "foo", "bar", "bar", "bar"],
    "B": ["one", "one", "two", "two", "one", "one"],
    "C": ["x", "y", "x", "y", "x", "y"],
    "D": [1, 3, 2, 5, 4, 1],
}

# NOTE: this rebinds `df`, replacing the col1/col2/col3 frame used in the cells above
df = pd.DataFrame(data)

df

Unnamed: 0,A,B,C,D
0,foo,one,x,1
1,foo,one,y,3
2,foo,two,x,2
3,bar,two,y,5
4,bar,one,x,4
5,bar,one,y,1


In [63]:
"""
create a multi-index DataFrame using values from the column A and B,
the columns use the values from the column C,
and the values are from the column D
"""
df.pivot_table(values="D", index=["A", "B"], columns=["C"])

Unnamed: 0_level_0,C,x,y
A,B,Unnamed: 2_level_1,Unnamed: 3_level_1
bar,one,4.0,1.0
bar,two,,5.0
foo,one,1.0,3.0
foo,two,2.0,
