# Pandas DataFrames
A DataFrame is a two-dimensional, labeled table of data (rows and columns), much like a spreadsheet in Microsoft Excel.

In [39]:
import numpy as np
import pandas as pd

In [40]:
from numpy.random import randn

In [41]:
# seed NumPy's global random generator so that the randn() call used to
# build the DataFrame produces the same numbers on every run
np.random.seed(101)

In [42]:
"""
like Series, it has a data and index parameters,
but DataFrames have a columns parameters as the 3rd parameter
"""
df = pd.DataFrame(randn(5, 4), ['A', 'B', 'C', 'D', 'E'], ['W', 'X', 'Y', 'Z'])

"""
data is the data parameter, the rows are the index
parameter, columns are the column parameter
"""
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


### Indexing a DataFrame using Columns

In [43]:
# indexing with a single column label pulls that column out of df
w_series = df["W"]

# capture both types: df is a DataFrame, one column of it is a Series
df_type = type(df)  # DataFrame
w_type = type(w_series)  # Series

print(df_type)
print(w_type)

w_series

<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.series.Series'>


A    2.706850
B    0.651118
C   -2.018168
D    0.188695
E    0.190794
Name: W, dtype: float64

In [44]:
# a column can also be read with attribute ("dot") access; this is not
# recommended, because it only works when the column name is a valid Python
# identifier and does not clash with an existing DataFrame attribute or method
df.W

A    2.706850
B    0.651118
C   -2.018168
D    0.188695
E    0.190794
Name: W, dtype: float64

In [45]:
"""
index the "W" and "Z" columns, returns as a
DataFrame because it requires multiple column
"""
w_z_df = df[["W", "Z"]]
print(type(w_z_df))

w_z_df

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,W,Z
A,2.70685,0.503826
B,0.651118,0.605965
C,-2.018168,-0.589001
D,0.188695,0.955057
E,0.190794,0.683509


In [46]:
# add a column named "new" to df holding the row-by-row sum of columns
# "W" and "Y"; assigning to df["new"] modifies df itself
df["new"] = df["W"] + df["Y"]

df

Unnamed: 0,W,X,Y,Z,new
A,2.70685,0.628133,0.907969,0.503826,3.614819
B,0.651118,-0.319318,-0.848077,0.605965,-0.196959
C,-2.018168,0.740122,0.528813,-0.589001,-1.489355
D,0.188695,-0.758872,-0.933237,0.955057,-0.744542
E,0.190794,1.978757,2.605967,0.683509,2.796762


In [47]:
"""
delete (drop) the "new" column from df

axis=0 (default) refers to deleting an index (row),
axis=1 refers to deleting a column,
inplace=True means to modify the DataFrame (safety measure in Pandas)
"""
df.drop("new", axis=1)

# if you see, the "new" column actually did NOT get removed
print(df)

# you can use df = df.drop("new", axis=1)
# better yet, or use inplace=True in order to modify the DataFrame
df.drop("new", axis=1, inplace=True)
print(df)

          W         X         Y         Z       new
A  2.706850  0.628133  0.907969  0.503826  3.614819
B  0.651118 -0.319318 -0.848077  0.605965 -0.196959
C -2.018168  0.740122  0.528813 -0.589001 -1.489355
D  0.188695 -0.758872 -0.933237  0.955057 -0.744542
E  0.190794  1.978757  2.605967  0.683509  2.796762
          W         X         Y         Z
A  2.706850  0.628133  0.907969  0.503826
B  0.651118 -0.319318 -0.848077  0.605965
C -2.018168  0.740122  0.528813 -0.589001
D  0.188695 -0.758872 -0.933237  0.955057
E  0.190794  1.978757  2.605967  0.683509


In [48]:
# drop the "E" row (axis=0, rows, is the default); the result is only
# displayed, not assigned back and not inplace, so df itself is unchanged
df.drop("E")

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057


In [49]:
# shape is a (rows, columns) tuple; the tuple positions match the axis
# numbers, which is why axis=0 refers to rows and axis=1 to columns
df.shape

(5, 4)

### Indexing a DataFrame using Rows

In [50]:
# label-based lookup: the row labelled "A", returned as a Series
# (end="\n\n" leaves one blank line before the next printout)
print(df.loc["A"], end="\n\n")

# position-based lookup: the row at integer position 0, returned as a Series
print(df.iloc[0])

W    2.706850
X    0.628133
Y    0.907969
Z    0.503826
Name: A, dtype: float64

W    2.706850
X    0.628133
Y    0.907969
Z    0.503826
Name: A, dtype: float64


In [51]:
# .loc[row_label, column_label] returns the single value at row "B", column "Y"
print(df.loc["B", "Y"])

# show the full DataFrame for comparison
df

-0.8480769834036315


Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [52]:
# lists of labels select several rows and columns at once; the result is
# the DataFrame formed by rows A and B crossed with columns W and Y
row_labels = ["A", "B"]
col_labels = ["W", "Y"]
print(df.loc[row_labels, col_labels])

df

          W         Y
A  2.706850  0.907969
B  0.651118 -0.848077


Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


### Conditional Indexing on DataFrames

In [53]:
# element-wise comparison: a boolean DataFrame that is True wherever the
# value in df is greater than 0 (.gt(0) is the method form of `df > 0`)
bool_df = df.gt(0)

bool_df

Unnamed: 0,W,X,Y,Z
A,True,True,True,True
B,True,False,False,True
C,False,True,True,False
D,True,False,False,True
E,True,True,True,True


In [54]:
# indexing df with a boolean DataFrame keeps the values where bool_df is
# True and puts NaN where it is False
df[bool_df]

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,,,0.605965
C,,0.740122,0.528813,
D,0.188695,,,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [55]:
# boolean Series with the same index as df: True for the rows whose "W"
# value is greater than 0, False otherwise
bool_w = df["W"] > 0

bool_w

A     True
B     True
C    False
D     True
E     True
Name: W, dtype: bool

In [56]:
# a boolean Series filters rows: only the rows where bool_w is True are kept
# (end="\n\n" leaves one blank line before the next printout)
print(df[bool_w], end="\n\n")

# the mask can also be built inline: keep only the rows where "Z" is negative
print(df[df["Z"] < 0])

          W         X         Y         Z
A  2.706850  0.628133  0.907969  0.503826
B  0.651118 -0.319318 -0.848077  0.605965
D  0.188695 -0.758872 -0.933237  0.955057
E  0.190794  1.978757  2.605967  0.683509

          W         X         Y         Z
C -2.018168  0.740122  0.528813 -0.589001


In [57]:
# Series of column "X" restricted to the rows where "W" is greater than 0.
# A single .loc[row_mask, column_label] call does the row filter and the
# column selection in one step, instead of chaining df[mask]["X"], which
# first builds an intermediate filtered DataFrame and then indexes it again.
df.loc[df["W"] > 0, "X"]

A    0.628133
B   -0.319318
D   -0.758872
E    1.978757
Name: X, dtype: float64

### Multiple Conditional Indexing on DataFrames

In [58]:
"""
return the DataFrame where df["W"] > 0 and df["Y"] > 1

cannot use Python's normal "and" operator because df["W"] > 0
and df["Y"] > 1 are Series NOT single booleans. Therefore,
the "and" uses & in Pandas, and "or" uses | in Pandas.
"""
df[(df["W"] > 0) & (df["Y"] > 1)]

Unnamed: 0,W,X,Y,Z
E,0.190794,1.978757,2.605967,0.683509


### Setting the Index of DataFrames

In [59]:
# display df as it currently stands, for reference before changing its index
df

Unnamed: 0,W,X,Y,Z
A,2.70685,0.628133,0.907969,0.503826
B,0.651118,-0.319318,-0.848077,0.605965
C,-2.018168,0.740122,0.528813,-0.589001
D,0.188695,-0.758872,-0.933237,0.955057
E,0.190794,1.978757,2.605967,0.683509


In [60]:
"""
reset the index (must inplace=True to modify the DataFrame)
the original index becomes a column of the DataFrame
"""
df.reset_index()

Unnamed: 0,index,W,X,Y,Z
0,A,2.70685,0.628133,0.907969,0.503826
1,B,0.651118,-0.319318,-0.848077,0.605965
2,C,-2.018168,0.740122,0.528813,-0.589001
3,D,0.188695,-0.758872,-0.933237,0.955057
4,E,0.190794,1.978757,2.605967,0.683509


In [63]:
# build the list of state codes by splitting the string on whitespace
new_index = "CA NY WY OR CO".split()

# add the list to df as a new "States" column, one value per row
# (the list length must match the number of rows in df)
df["States"] = new_index

df

Unnamed: 0,W,X,Y,Z,States
A,2.70685,0.628133,0.907969,0.503826,CA
B,0.651118,-0.319318,-0.848077,0.605965,NY
C,-2.018168,0.740122,0.528813,-0.589001,WY
D,0.188695,-0.758872,-0.933237,0.955057,OR
E,0.190794,1.978757,2.605967,0.683509,CO


In [65]:
"""
set the index as the "States" column,
must inplace=True to modify the DataFrame
"""
df.set_index("States")

Unnamed: 0_level_0,W,X,Y,Z
States,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
CA,2.70685,0.628133,0.907969,0.503826
NY,0.651118,-0.319318,-0.848077,0.605965
WY,-2.018168,0.740122,0.528813,-0.589001
OR,0.188695,-0.758872,-0.933237,0.955057
CO,0.190794,1.978757,2.605967,0.683509
