# Agenda

1. Creating a new data frame
    - Manually
    - From CSV and other files
2. Methods to run on a data frame
3. `.loc` and `.iloc`
4. `nan` and other values in data frames
5. Mask/boolean index

In [1]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [2]:
s = Series([10, 20, 30, 40, 50])
s

0    10
1    20
2    30
3    40
4    50
dtype: int64

In [3]:
# Build a data frame from a list of lists: each inner list becomes one row.
# No index or columns are passed, so rows and columns both get the
# default integer labels 0, 1, 2, ...
df = DataFrame([[10, 20, 30],
               [40, 50, 60],
               [70, 80, 90],
               [100, 110, 120]])
df

Unnamed: 0,0,1,2
0,10,20,30
1,40,50,60
2,70,80,90
3,100,110,120


In [4]:
df.dtypes

0    int64
1    int64
2    int64
dtype: object

In [5]:
# Same data, but with a single float (50.5) in column 1. A column has one
# dtype, so all of column 1 becomes float64; the other columns stay int64.
df = DataFrame([[10, 20, 30],
               [40, 50.5, 60],
               [70, 80, 90],
               [100, 110, 120]])
df

Unnamed: 0,0,1,2
0,10,20.0,30
1,40,50.5,60
2,70,80.0,90
3,100,110.0,120


In [6]:
df.dtypes

0      int64
1    float64
2      int64
dtype: object

In [7]:
# index= sets the row labels and columns= sets the column labels.
# list('abcd') is a short way of writing ['a', 'b', 'c', 'd'].
df = DataFrame([[10, 20, 30],
               [40, 50, 60],
               [70, 80, 90],
               [100, 110, 120]],
              index=list('abcd'),
              columns=list('xyz'))
df

Unnamed: 0,x,y,z
a,10,20,30
b,40,50,60
c,70,80,90
d,100,110,120


In [8]:
# .loc retrieves a row by its index label; the row comes back as a series
df.loc['a']

x    10
y    20
z    30
Name: a, dtype: int64

In [9]:
# .iloc retrieves a row by integer position (starting at 0), so 2 is row 'c'
df.iloc[2]

x    70
y    80
z    90
Name: c, dtype: int64

In [10]:
df.loc[['a', 'c']]

Unnamed: 0,x,y,z
a,10,20,30
c,70,80,90


In [11]:
# a label slice with .loc includes the end label: row 'c' is in the result
df.loc['a':'c']

Unnamed: 0,x,y,z
a,10,20,30
b,40,50,60
c,70,80,90


In [12]:
# for columns, just use []

df['x']

a     10
b     40
c     70
d    100
Name: x, dtype: int64

In [13]:
# attribute ("dot") access is a shortcut for df['x']
df.x

a     10
b     40
c     70
d    100
Name: x, dtype: int64

In [14]:
df.x.a

np.int64(10)

In [15]:
df.q

AttributeError: 'DataFrame' object has no attribute 'q'

In [16]:
df

Unnamed: 0,x,y,z
a,10,20,30
b,40,50,60
c,70,80,90
d,100,110,120


In [17]:
df[['x', 'y']]

Unnamed: 0,x,y
a,10,20
b,40,50
c,70,80
d,100,110


In [18]:
df

Unnamed: 0,x,y,z
a,10,20,30
b,40,50,60
c,70,80,90
d,100,110,120


In [19]:
df['x'].mean()

np.float64(55.0)

In [20]:
df['x'].std()

np.float64(38.72983346207417)

In [21]:
df.mean()

x    55.0
y    65.0
z    75.0
dtype: float64

In [22]:
df.std()

x    38.729833
y    38.729833
z    38.729833
dtype: float64

In [23]:
df['x'].describe()

count      4.000000
mean      55.000000
std       38.729833
min       10.000000
25%       32.500000
50%       55.000000
75%       77.500000
max      100.000000
Name: x, dtype: float64

In [24]:
df.describe()

Unnamed: 0,x,y,z
count,4.0,4.0,4.0
mean,55.0,65.0,75.0
std,38.729833,38.729833,38.729833
min,10.0,20.0,30.0
25%,32.5,42.5,52.5
50%,55.0,65.0,75.0
75%,77.5,87.5,97.5
max,100.0,110.0,120.0


In [25]:
# Assigning to a column name that doesn't exist yet adds a new column.
# The split produces four words, one for each of the four rows.
df['s'] = 'this is a test'.split()
df

Unnamed: 0,x,y,z,s
a,10,20,30,this
b,40,50,60,is
c,70,80,90,a
d,100,110,120,test


In [26]:
df.dtypes

x     int64
y     int64
z     int64
s    object
dtype: object

In [27]:
# by default, describe() summarizes only the numeric columns,
# so the string column s is left out
df.describe()

Unnamed: 0,x,y,z
count,4.0,4.0,4.0
mean,55.0,65.0,75.0
std,38.729833,38.729833,38.729833
min,10.0,20.0,30.0
25%,32.5,42.5,52.5
50%,55.0,65.0,75.0
75%,77.5,87.5,97.5
max,100.0,110.0,120.0


In [29]:
df['s'].describe()

count        4
unique       4
top       this
freq         1
Name: s, dtype: object

In [30]:
# include='all' summarizes every column; statistics that don't apply
# to a column's type are left empty (NaN)
df.describe(include='all')

Unnamed: 0,x,y,z,s
count,4.0,4.0,4.0,4
unique,,,,4
top,,,,this
freq,,,,1
mean,55.0,65.0,75.0,
std,38.729833,38.729833,38.729833,
min,10.0,20.0,30.0,
25%,32.5,42.5,52.5,
50%,55.0,65.0,75.0,
75%,77.5,87.5,97.5,
max,100.0,110.0,120.0,


In [31]:
df

Unnamed: 0,x,y,z,s
a,10,20,30,this
b,40,50,60,is
c,70,80,90,a
d,100,110,120,test


In [33]:
df['x'] > df['x'].mean()

a    False
b    False
c     True
d     True
Name: x, dtype: bool

In [34]:
# I want the values in x
# where x is bigger than the mean

df['x'].loc[   df['x'] > df['x'].mean()  ]

c     70
d    100
Name: x, dtype: int64

In [35]:
# I want the values in y
# where x is bigger than the mean of x

df['y'].loc[   df['x'] > df['x'].mean()  ]

c     80
d    110
Name: y, dtype: int64

In [37]:
# I want all of the rows in df
# where x is bigger than the mean of x

df.loc[   df['x'] > df['x'].mean()  ]

Unnamed: 0,x,y,z,s
c,70,80,90,a
d,100,110,120,test


In [38]:
df.index

Index(['a', 'b', 'c', 'd'], dtype='object')

In [39]:
df.index = ['first', 'second', 'third', 'fourth']
df

Unnamed: 0,x,y,z,s
first,10,20,30,this
second,40,50,60,is
third,70,80,90,a
fourth,100,110,120,test


In [40]:

df.columns = list('tuvw')
df

Unnamed: 0,t,u,v,w
first,10,20,30,this
second,40,50,60,is
third,70,80,90,a
fourth,100,110,120,test


In [41]:
df.dtypes

t     int64
u     int64
v     int64
w    object
dtype: object

In [42]:
df

Unnamed: 0,t,u,v,w
first,10,20,30,this
second,40,50,60,is
third,70,80,90,a
fourth,100,110,120,test


In [43]:
# NOTE: this is chained indexing (df['t'] first, then .loc on the result),
# which is why pandas prints the "set on a copy of a slice" warning below.
# Selecting the rows and the column in a single .loc call avoids it:
#     df.loc[df['t'] > df['t'].mean(), 't'] = 999
df['t'].loc[  df['t'] > df['t'].mean()] = 999
df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['t'].loc[  df['t'] > df['t'].mean()] = 999


Unnamed: 0,t,u,v,w
first,10,20,30,this
second,40,50,60,is
third,999,80,90,a
fourth,999,110,120,test


In [44]:
# With only a row selector, the assignment applies to every column of the
# matching rows, so the text column w is overwritten with 888 as well.
df.loc[ df['t'] < df['t'].mean()] = 888
df

Unnamed: 0,t,u,v,w
first,888,888,888,888
second,888,888,888,888
third,999,80,90,a
fourth,999,110,120,test


In [46]:
df = DataFrame([[10, 20, 30],
               [40, 50, 60],
               [70, 80, 90],
               [100, 110, 120]],
              index=list('abcd'),
              columns=list('xyz'))
df

Unnamed: 0,x,y,z
a,10,20,30
b,40,50,60
c,70,80,90
d,100,110,120


In [48]:
# a flat list with one value per column is assigned to each matching row
df.loc[ df['x'] < df['x'].mean() ] = [1,2,3]

In [49]:
df

Unnamed: 0,x,y,z
a,1,2,3
b,1,2,3
c,70,80,90
d,100,110,120


In [50]:
df = DataFrame([[10, 20, 30],
               [40, 50, 60],
               [70, 80, 90],
               [100, 110, 120]],
              index=list('abcd'),
              columns=list('xyz'))
df

Unnamed: 0,x,y,z
a,10,20,30
b,40,50,60
c,70,80,90
d,100,110,120


In [51]:
# a list of lists assigns one inner list to each matching row, in order
df.loc[ df['x'] < df['x'].mean() ] = [[1,2,3],[4,5,6]]

In [52]:
df

Unnamed: 0,x,y,z
a,1,2,3
b,4,5,6
c,70,80,90
d,100,110,120


In [53]:
# seed NumPy's random number generator so the same numbers come out each run
np.random.seed(0)
# 10 random integers, from 0 up to (but not including) 100
np.random.randint(0, 100, 10)

array([44, 47, 64, 67, 67,  9, 83, 21, 36, 87])

In [54]:
# passing a shape ([3, 4]) instead of a count returns a 3-row, 4-column array
np.random.randint(0, 100, [3, 4])

array([[70, 88, 88, 12],
       [58, 65, 39, 87],
       [46, 88, 81, 37]])

# Exercise: Data frames

1. Define a data frame with random integers from 0-1,000, with 5 rows and 4 columns. Name the rows a-e, and the columns w-z.
2. Give the mean and std for both columns w and y.
3. Give the mean, std, and max for rows b, d, and e.
4. Show all rows where y is bigger than the mean of z.

In [58]:
df.loc[ df['x'] > df['x'].mean()  ]

Unnamed: 0,x,y,z
c,70,80,90
d,100,110,120


In [63]:
df['x'] > df['x'].mean()

a    False
b    False
c     True
d     True
Name: x, dtype: bool