# Data frame

1. Creating a data frame
    - Manually
    - From CSV and other files
3. Methods on a data frame
4. `.loc` and `.iloc`
5. `nan` and data frames
6. Boolean indexes and data frames



In [1]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [2]:
df = DataFrame([[10, 20, 30],
               [40, 50, 60],
               [70, 80, 90],
               [100, 110, 120]])
df

Unnamed: 0,0,1,2
0,10,20,30
1,40,50,60
2,70,80,90
3,100,110,120


In [3]:
df.dtypes

0    int64
1    int64
2    int64
dtype: object

In [4]:
df = DataFrame([[10, 20, 30],
               [40, 50, 60],
               [70, 80, 90],
               [100, 110, 120]],
              index=list('abcd'))
df

Unnamed: 0,0,1,2
a,10,20,30
b,40,50,60
c,70,80,90
d,100,110,120


In [5]:
df = DataFrame([[10, 20, 30],
               [40, 50, 60],
               [70, 80, 90],
               [100, 110, 120]],
              index=list('abcd'),
              columns=list('xyz'))
df

Unnamed: 0,x,y,z
a,10,20,30
b,40,50,60
c,70,80,90
d,100,110,120


In [6]:
# retrieve from a row... using loc/iloc

df.loc['a']

x    10
y    20
z    30
Name: a, dtype: int64

In [7]:
df.iloc[2]

x    70
y    80
z    90
Name: c, dtype: int64

In [8]:
# fancy indexing

df.loc[['a', 'c']]

Unnamed: 0,x,y,z
a,10,20,30
c,70,80,90


In [9]:
# slice

df.loc['a':'c']

Unnamed: 0,x,y,z
a,10,20,30
b,40,50,60
c,70,80,90


In [10]:
df.iloc[0:2]

Unnamed: 0,x,y,z
a,10,20,30
b,40,50,60


In [11]:
# columns -- use []

df['x']

a     10
b     40
c     70
d    100
Name: x, dtype: int64

In [12]:
# more than one column

df[['x', 'z']]

Unnamed: 0,x,z
a,10,30
b,40,60
c,70,90
d,100,120


In [13]:
# watch out -- slices!

df['b':'d']

Unnamed: 0,x,y,z
b,40,50,60
c,70,80,90
d,100,110,120


In [14]:
df['b':'b']

Unnamed: 0,x,y,z
b,40,50,60


In [15]:
# we can use dots for columns

df.x

a     10
b     40
c     70
d    100
Name: x, dtype: int64

In [16]:
# general rule: any series method will also work on a data frame,
# and we will get a result for each column

df

Unnamed: 0,x,y,z
a,10,20,30
b,40,50,60
c,70,80,90
d,100,110,120


In [17]:
df['x'].mean()

np.float64(55.0)

In [18]:
df.mean()

x    55.0
y    65.0
z    75.0
dtype: float64

In [19]:
df.std()

x    38.729833
y    38.729833
z    38.729833
dtype: float64

In [20]:
df.min()

x    10
y    20
z    30
dtype: int64

In [22]:
df.mean(axis='columns')

a     20.0
b     50.0
c     80.0
d    110.0
dtype: float64

In [23]:
df['x'].describe()

count      4.000000
mean      55.000000
std       38.729833
min       10.000000
25%       32.500000
50%       55.000000
75%       77.500000
max      100.000000
Name: x, dtype: float64

In [24]:
df.describe()

Unnamed: 0,x,y,z
count,4.0,4.0,4.0
mean,55.0,65.0,75.0
std,38.729833,38.729833,38.729833
min,10.0,20.0,30.0
25%,32.5,42.5,52.5
50%,55.0,65.0,75.0
75%,77.5,87.5,97.5
max,100.0,110.0,120.0


In [25]:
df['x'] > df['x'].mean()

a    False
b    False
c     True
d     True
Name: x, dtype: bool

In [27]:
# we can use .loc to filter elements with a boolean index
df['x'].loc[   df['x'] > df['x'].mean()   ]

c     70
d    100
Name: x, dtype: int64

In [28]:
# find elements of y where x is greater than the mean
df['y'].loc[   df['x'] > df['x'].mean()   ]

c     80
d    110
Name: y, dtype: int64

In [29]:
# find all rows of df where x > mean
df.loc[   df['x'] > df['x'].mean()   ]

Unnamed: 0,x,y,z
c,70,80,90
d,100,110,120


In [30]:
# change the index

df

Unnamed: 0,x,y,z
a,10,20,30
b,40,50,60
c,70,80,90
d,100,110,120


In [31]:
df.index

Index(['a', 'b', 'c', 'd'], dtype='object')

In [32]:
df.index = ['one', 'two', 'three', 'four']
df

Unnamed: 0,x,y,z
one,10,20,30
two,40,50,60
three,70,80,90
four,100,110,120


In [33]:
df.columns

Index(['x', 'y', 'z'], dtype='object')

In [34]:
df.columns = ['col1', 'col2', 'col3']
df

Unnamed: 0,col1,col2,col3
one,10,20,30
two,40,50,60
three,70,80,90
four,100,110,120


In [35]:
df['col4'] = [11, 12, 13, 14]

In [36]:
df

Unnamed: 0,col1,col2,col3,col4
one,10,20,30,11
two,40,50,60,12
three,70,80,90,13
four,100,110,120,14


In [37]:
df['col4'] = [110, 120, 130, 140]
df

Unnamed: 0,col1,col2,col3,col4
one,10,20,30,110
two,40,50,60,120
three,70,80,90,130
four,100,110,120,140


In [38]:
df = DataFrame(np.random.randint(0, 1000, [4, 3]),
              index=list('abcd'),
              columns=list('xyz'))
df

Unnamed: 0,x,y,z
a,934,178,53
b,153,486,515
c,758,893,113
d,582,51,336


# Exercise: Data frames

1. Create a data frame with random integers from 0-1,000, with 5 rows and 4 columns. Give the rows names of a-e and the columns w-z.
2. Find the mean + std for columns w and y.
3. Find the mean + std + max for rows b, d, and e.
4. Get all rows where y > y.mean().

In [39]:
df = DataFrame(np.random.randint(0, 1000, [5, 4]),
              index=list('abcde'),
              columns=list('wxyz'))
df

Unnamed: 0,w,x,y,z
a,610,14,753,338
b,686,188,174,411
c,214,711,385,998
d,706,713,311,403
e,249,253,263,903


In [41]:
# 2. Find the mean + std for columns w and y.

df[['w', 'y']].mean()

w    493.0
y    377.2
dtype: float64

In [42]:
df[['w', 'y']].std()

w    241.704365
y    223.607245
dtype: float64

In [46]:
# method chaining

(
    df[['w', 'y']]
    .describe()
    .loc[['mean', 'std']]
)

Unnamed: 0,w,y
mean,493.0,377.2
std,241.704365,223.607245


In [None]:
# 3. Find the mean + std + max for rows b, d, and e.
# 4. Get all rows where y > y.mean().