# Agenda

1. Strings in a series
2. Data frame
    - Creating
    - Retrieving from a data frame
    - Methods on a data frame

In [2]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [3]:
s = Series([10, 20, 30])
s

0    10
1    20
2    30
dtype: int64

In [4]:
s = Series([10, 20, 30], dtype=np.int8)
s

0    10
1    20
2    30
dtype: int8

In [5]:
s.dtype

dtype('int8')

In [6]:
s.dtype = np.float16

AttributeError: property 'dtype' of 'Series' object has no setter

In [7]:
s.astype(np.float16)

0    10.0
1    20.0
2    30.0
dtype: float16

In [8]:
a = np.array('hello out there'.split())
a

array(['hello', 'out', 'there'], dtype='<U5')

In [9]:
s = Series('hello out there'.split())

In [10]:
s

0    hello
1      out
2    there
dtype: object

In [12]:
# I want the length of each word

# don't ever do this! A Python-level loop over a series bypasses pandas'
# string methods; the `.str` accessor (s.str.len()), shown in the cells
# that follow, returns the same lengths as a series in one call.
for one_word in s:
    print(len(one_word))

5
3
5


In [13]:
s.str       # str accessor

<pandas.core.strings.accessor.StringMethods at 0x12adb01d0>

In [14]:
s.str.len()

0    5
1    3
2    5
dtype: int64

In [15]:
# which words have 'e' in them?

s.str.contains('e')

0     True
1    False
2     True
dtype: bool

In [16]:
s.loc[s.str.contains('e')]

0    hello
2    there
dtype: object

In [17]:
s

0    hello
1      out
2    there
dtype: object

In [18]:
# same three words, but labelled x, y, z instead of the default 0, 1, 2
words = ['hello', 'out', 'there']
s = Series(words, index=['x', 'y', 'z'])
s

x    hello
y      out
z    there
dtype: object

In [19]:
s.loc['x']

'hello'

In [20]:
s.iloc[0]

'hello'

# Exercise: String practice

1. Define a series with 10 words.
2. Find all of the words with odd lengths.
3. Find all of the words with below-average lengths.
4. Find all words containing 'a' or 'e'.

In [21]:
# https://RegexpCrashCourse.com/ 

In [23]:
s = Series('this is a fantastic test sentence for my course today'.split())

In [24]:
s

0         this
1           is
2            a
3    fantastic
4         test
5     sentence
6          for
7           my
8       course
9        today
dtype: object

In [30]:
s.loc[s.str.len() % 2 == 1]

2            a
3    fantastic
6          for
9        today
dtype: object

In [33]:
s.loc[s.str.len() < s.str.len().mean()]

0    this
1      is
2       a
4    test
6     for
7      my
dtype: object

In [34]:
# .iloc does not accept this boolean series as a mask -- it raises the
# NotImplementedError shown below. Use .loc for boolean selection instead.
s.iloc[s.str.len() < s.str.len().mean()]

NotImplementedError: iLocation based boolean indexing on an integer type is not available

In [37]:
# words containing 'a' or 'e': build one boolean mask per letter, then OR them
has_a = s.str.contains('a')
has_e = s.str.contains('e')
s.loc[has_a | has_e]

2            a
3    fantastic
4         test
5     sentence
8       course
9        today
dtype: object

In [38]:
# regexp: the character class [ae] matches a single 'a' or 'e', so this
# selects the same words as the two-mask version above
s.loc[s.str.contains('[ae]')]

2            a
3    fantastic
4         test
5     sentence
8       course
9        today
dtype: object

In [39]:
s.loc[s.str.contains('a|e')]

2            a
3    fantastic
4         test
5     sentence
8       course
9        today
dtype: object

In [40]:
s = Series('10 20 30 40 50'.split())
s

0    10
1    20
2    30
3    40
4    50
dtype: object

In [41]:
s.sum()

'1020304050'

In [42]:
s.mean()

204060810.0

In [43]:
s.astype(np.int8)

0    10
1    20
2    30
3    40
4    50
dtype: int8

In [44]:
s = Series('10 20 30 abcd 40 50'.split())
s

0      10
1      20
2      30
3    abcd
4      40
5      50
dtype: object

In [45]:
s.astype(np.int8)

ValueError: invalid literal for int() with base 10: 'abcd'

In [49]:
# keep only the all-digit strings (this drops 'abcd'), then convert them to int8
# NOTE: this rebinds s to an integer series, so re-running this cell on its
# own fails -- the .str accessor only works on string values
s = s.loc[s.str.isdigit()].astype(np.int8)
s.mean()

30.0

In [50]:
s = Series('10 20 30 1a 2b 3c'.split())
s

0    10
1    20
2    30
3    1a
4    2b
5    3c
dtype: object

In [51]:
s.astype(np.int8)

ValueError: invalid literal for int() with base 10: '1a'

In [52]:
int('ab', 16)

171

In [55]:
# parse every string as a base-16 number; apply() forwards the extra
# keyword argument to int() for each element
s.apply(int, base=16)

0    16
1    32
2    48
3    26
4    43
5    60
dtype: int64

# Data frame

Data frames are 2D

- Index / rows
- Columns 

In [56]:
# A data frame can be built from row-oriented data: a list of lists
# (used here) or a 2D NumPy array. Row labels go in `index`,
# column labels in `columns`.
rows = [
    [10, 20, 30, 40],
    [50, 60, 70, 80],
    [90, 100, 110, 120],
]
df = DataFrame(rows, index=['a', 'b', 'c'], columns=['w', 'x', 'y', 'z'])
df

Unnamed: 0,w,x,y,z
a,10,20,30,40
b,50,60,70,80
c,90,100,110,120


In [57]:
# Build the frame from one dict per row. Keys become columns; a key that a
# row lacks ('d' in the first two rows, 'c' in the last) is filled with NaN.
records = [
    dict(a=10, b=20, c=30),
    dict(a=100, b=200, c=300),
    dict(a=1000, b=2000, d=4000),
]
df = DataFrame(records)
df

Unnamed: 0,a,b,c,d
0,10,20,30.0,
1,100,200,300.0,
2,1000,2000,,4000.0


In [58]:
s.dtype

dtype('O')

In [59]:
df.dtypes

a      int64
b      int64
c    float64
d    float64
dtype: object

In [60]:
# defining a data frame as a dict of lists

df = DataFrame({'a':[10, 100, 1000],
               'b':[20, 200, 2000],
               'c':[30, 300, 3000]})
df

Unnamed: 0,a,b,c
0,10,20,30
1,100,200,300
2,1000,2000,3000


In [62]:
# dict of lists: each key is a column; `index` supplies the row labels
columns_data = {
    'a': [10, 100, 1000, 10000, 10001],
    'b': [20, 200, 2000, 20000, 200002],
    'c': [30, 300, 3000, 30000, 300003],
}
df = DataFrame(columns_data, index=['v', 'w', 'x', 'y', 'z'])

df

Unnamed: 0,a,b,c
v,10,20,30
w,100,200,300
x,1000,2000,3000
y,10000,20000,30000
z,10001,200002,300003


In [63]:
# get a column with []
df['a']

v       10
w      100
x     1000
y    10000
z    10001
Name: a, dtype: int64

In [64]:
# get two columns with [[]]
df[['a', 'b']]

Unnamed: 0,a,b
v,10,20
w,100,200
x,1000,2000
y,10000,20000
z,10001,200002


In [65]:
# what about the rows?
# .loc or .iloc

df.loc['w']

a    100
b    200
c    300
Name: w, dtype: int64

In [66]:
df.loc[['w', 'x']]

Unnamed: 0,a,b,c
w,100,200,300
x,1000,2000,3000


In [67]:
df.loc['a']

KeyError: 'a'

In [68]:
df

Unnamed: 0,a,b,c
v,10,20,30
w,100,200,300
x,1000,2000,3000
y,10000,20000,30000
z,10001,200002,300003


In [69]:
# retrieve via .loc[row, column]

df.loc['y', 'b']

20000

In [70]:
# when we use .loc, we can actually say
# .loc[row_selector, column_selector]

# each selector can be
# - string
# - list of strings
# - boolean series


In [71]:
df.loc['x', ['a', 'c']]

a    1000
c    3000
Name: x, dtype: int64

In [75]:
df.loc[
    ['v', 'y'],   # row selector
      ['a', 'b']  # column selector
]

Unnamed: 0,a,b
v,10,20
y,10000,20000


In [76]:
df

Unnamed: 0,a,b,c
v,10,20,30
w,100,200,300
x,1000,2000,3000
y,10000,20000,30000
z,10001,200002,300003


In [77]:
# we can assign via .loc
# (writing a float here turns the whole of column 'c' into floats,
# as the output below shows)

df.loc['v', 'c'] = 12.34
df

Unnamed: 0,a,b,c
v,10,20,12.34
w,100,200,300.0
x,1000,2000,3000.0
y,10000,20000,30000.0
z,10001,200002,300003.0


In [78]:
df.loc['v']

a    10.00
b    20.00
c    12.34
Name: v, dtype: float64

# Exercise: Data frames and `.loc`

1. Create a 5x5 data frame with random integers from 0-1,000.  Have an index of a-e, and columns of v-z.
2. Retrieve row b.
3. Retrieve rows b and d
4. Retrieve rows b, c, and d
5. Retrieve column w
6. Retrieve columns w and y
7. Retrieve columns w, y, and z


In [79]:
# fixed seed, so the "random" values are the same on every run
np.random.seed(0)

# 5x5 frame of random integers; randint's upper bound (1000) is exclusive,
# so the values fall in 0-999
df = DataFrame(np.random.randint(0, 1000, [5,5]),
              index=list('abcde'),
              columns=list('vwxyz'))
df

Unnamed: 0,v,w,x,y,z
a,684,559,629,192,835
b,763,707,359,9,723
c,277,754,804,599,70
d,472,600,396,314,705
e,486,551,87,174,600


In [80]:
# row b, option 1
df.loc['b']

v    763
w    707
x    359
y      9
z    723
Name: b, dtype: int64

In [81]:
# row b, option 2
df.iloc[1]

v    763
w    707
x    359
y      9
z    723
Name: b, dtype: int64

In [82]:
df.loc[['b']]

Unnamed: 0,v,w,x,y,z
b,763,707,359,9,723


In [83]:
# 3. Retrieve rows b and d

df.loc[['b', 'd']]

Unnamed: 0,v,w,x,y,z
b,763,707,359,9,723
d,472,600,396,314,705


In [85]:
# 3, option 2 -- use a slice
# (a .loc label slice includes its end label, and the step of 2 skips 'c')
df.loc['b':'d':2]

Unnamed: 0,v,w,x,y,z
b,763,707,359,9,723
d,472,600,396,314,705


In [86]:
# 4. Retrieve rows b, c, and d

df.loc['b':'d']

Unnamed: 0,v,w,x,y,z
b,763,707,359,9,723
c,277,754,804,599,70
d,472,600,396,314,705


In [87]:
# iloc also works

df.iloc[1:4]

Unnamed: 0,v,w,x,y,z
b,763,707,359,9,723
c,277,754,804,599,70
d,472,600,396,314,705


In [88]:
df.iloc[[1,2,3]]

Unnamed: 0,v,w,x,y,z
b,763,707,359,9,723
c,277,754,804,599,70
d,472,600,396,314,705


In [89]:
# 5. Retrieve column w
df['w']

a    559
b    707
c    754
d    600
e    551
Name: w, dtype: int64

In [90]:
df.w

a    559
b    707
c    754
d    600
e    551
Name: w, dtype: int64

In [91]:
# 6. Retrieve columns w and y

df[['w', 'y']]

Unnamed: 0,w,y
a,559,192
b,707,9
c,754,599
d,600,314
e,551,174


In [92]:
df.loc[:, 'w':'y':2]

Unnamed: 0,w,y
a,559,192
b,707,9
c,754,599
d,600,314
e,551,174


In [93]:
# 7. Retrieve columns w, y, and z

df[['w', 'y', 'z']]

Unnamed: 0,w,y,z
a,559,192,835
b,707,9,723
c,754,599,70
d,600,314,705
e,551,174,600


# Next up

1. Assignment
2. Double square brackets warning!
3. Methods
4. Queries


In [94]:
df

Unnamed: 0,v,w,x,y,z
a,684,559,629,192,835
b,763,707,359,9,723
c,277,754,804,599,70
d,472,600,396,314,705
e,486,551,87,174,600


In [95]:
# assign with .loc, specifying row and column

df.loc['c', 'x'] = 123
df

Unnamed: 0,v,w,x,y,z
a,684,559,629,192,835
b,763,707,359,9,723
c,277,754,123,599,70
d,472,600,396,314,705
e,486,551,87,174,600


In [96]:
df.loc['c', 'x'] = 'hello'
df

Unnamed: 0,v,w,x,y,z
a,684,559,629,192,835
b,763,707,359,9,723
c,277,754,hello,599,70
d,472,600,396,314,705
e,486,551,87,174,600


In [97]:
df.dtypes

v     int64
w     int64
x    object
y     int64
z     int64
dtype: object

In [98]:
df.loc['c', 'x'] = 123
df

Unnamed: 0,v,w,x,y,z
a,684,559,629,192,835
b,763,707,359,9,723
c,277,754,123,599,70
d,472,600,396,314,705
e,486,551,87,174,600


In [99]:
df.dtypes

v     int64
w     int64
x    object
y     int64
z     int64
dtype: object

In [100]:
df['x'] = df['x'].astype(np.int64)

In [101]:
df

Unnamed: 0,v,w,x,y,z
a,684,559,629,192,835
b,763,707,359,9,723
c,277,754,123,599,70
d,472,600,396,314,705
e,486,551,87,174,600


In [102]:
# assigning multiple values

df.loc[['c', 'd'], 'x']

c    123
d    396
Name: x, dtype: int64

In [103]:
df.loc[['c', 'd'], 'x'] = [999, 888]
df

Unnamed: 0,v,w,x,y,z
a,684,559,629,192,835
b,763,707,359,9,723
c,277,754,999,599,70
d,472,600,888,314,705
e,486,551,87,174,600


In [104]:
# what if I assign just one thing?
df.loc[['c', 'd'], 'x'] = 777
df

Unnamed: 0,v,w,x,y,z
a,684,559,629,192,835
b,763,707,359,9,723
c,277,754,777,599,70
d,472,600,777,314,705
e,486,551,87,174,600


In [105]:
df.loc[:, 'x'] = 12.34
df

Unnamed: 0,v,w,x,y,z
a,684,559,12.34,192,835
b,763,707,12.34,9,723
c,277,754,12.34,599,70
d,472,600,12.34,314,705
e,486,551,12.34,174,600


In [106]:
df.loc['c']

v    277.00
w    754.00
x     12.34
y    599.00
z     70.00
Name: c, dtype: float64

In [107]:
# let's change 'y' in row 'c'
# what if we do this...
# (chained indexing: df.loc['c'] produces an intermediate object and ['y']
# assigns into that, so pandas warns that the value may be set on a copy
# rather than on df itself; the next cell uses a single .loc[row, column])

df.loc['c']['y'] = 24.68


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc['c']['y'] = 24.68


In [108]:
df.loc['c', 'y'] = 24.68

In [111]:
df['m'] = 10

In [112]:
df

Unnamed: 0,v,w,x,y,z,m
a,684,559,12.34,192.0,835,10
b,763,707,12.34,9.0,723,10
c,277,754,12.34,24.68,70,10
d,472,600,12.34,314.0,705,10
e,486,551,12.34,174.0,600,10


In [113]:
df['m'] = 20

In [114]:
df

Unnamed: 0,v,w,x,y,z,m
a,684,559,12.34,192.0,835,20
b,763,707,12.34,9.0,723,20
c,277,754,12.34,24.68,70,20
d,472,600,12.34,314.0,705,20
e,486,551,12.34,174.0,600,20


In [121]:
# assigning to a label that does not exist yet adds row 'f'; the single
# value fills every column (and the integer columns become floats, see below)
df.loc['f'] = 99.1

In [122]:
df

Unnamed: 0,v,w,x,y,z,m
a,684.0,559.0,12.34,192.0,835.0,20.0
b,763.0,707.0,12.34,9.0,723.0,20.0
c,277.0,754.0,12.34,24.68,70.0,20.0
d,472.0,600.0,12.34,314.0,705.0,20.0
e,486.0,551.0,12.34,174.0,600.0,20.0
f,99.1,99.1,99.1,99.1,99.1,99.1


In [123]:
df

Unnamed: 0,v,w,x,y,z,m
a,684.0,559.0,12.34,192.0,835.0,20.0
b,763.0,707.0,12.34,9.0,723.0,20.0
c,277.0,754.0,12.34,24.68,70.0,20.0
d,472.0,600.0,12.34,314.0,705.0,20.0
e,486.0,551.0,12.34,174.0,600.0,20.0
f,99.1,99.1,99.1,99.1,99.1,99.1


In [124]:
df['v'].mean()

463.51666666666665

In [125]:
# get all values in column v > the average
df.loc[
    df['v'] > df['v'].mean(),   # row selector
    'v'    # column selector
]

a     True
b     True
c    False
d     True
e     True
f    False
Name: v, dtype: bool

In [127]:
# get all values in y where v > mean

df.loc[
    df['v'] > df['v'].mean(),   # row selector
    'y'    # column selector
]

a    192.0
b      9.0
d    314.0
e    174.0
Name: y, dtype: float64

In [128]:
df['y'].count()

6

In [129]:
df.loc[
    df['v'] > df['v'].mean(),   # row selector
    'y'    # column selector
].describe()

count      4.000000
mean     172.250000
std      125.348514
min        9.000000
25%      132.750000
50%      183.000000
75%      222.500000
max      314.000000
Name: y, dtype: float64

# Exercise: Using `.loc`

1. Create a 5x5 random data frame, index a-e, column v-z, random integers 0-1,000.
2. Which elements of w are even and greater than the mean?
3. Which elements of v and x are in the rows where w is even and greater than its mean?
4. Replace the item at index d, column y with the mean of column y

In [130]:
df.mean()  

v    463.516667
w    545.016667
x     26.800000
y    135.463333
z    505.350000
m     33.183333
dtype: float64

In [134]:
%timeit df.mean().mean()

557 µs ± 12.9 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [135]:
%timeit df.values.mean()

20.5 µs ± 222 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [137]:
%timeit df.mean(axis=None)

57 µs ± 2.52 µs per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [138]:
# 1. Create a 5x5 random data frame, index a-e,
# column v-z, random integers 0-1,000.

np.random.seed(0)
df = DataFrame(np.random.randint(0, 1000, [5,5]),
              index=list('abcde'),
              columns=list('vwxyz'))
df

Unnamed: 0,v,w,x,y,z
a,684,559,629,192,835
b,763,707,359,9,723
c,277,754,804,599,70
d,472,600,396,314,705
e,486,551,87,174,600


In [141]:
# 2. Which elements of w are even and greater than the mean?

# name each condition, then combine the two boolean masks with &
w = df['w']
is_even = w % 2 == 0
above_mean = w > w.mean()
w[is_even & above_mean]

c    754
Name: w, dtype: int64

In [142]:
# using .loc:

df.loc[
    (df['w'] % 2 == 0) & (df['w'] > df['w'].mean())  # row selector
    ,
    'w'   # column selector
]

c    754
Name: w, dtype: int64

In [143]:
# 3. Which elements of v and x match where w is even and > mean?

# using .loc:

df.loc[
    (df['w'] % 2 == 0) & (df['w'] > df['w'].mean())  # row selector
    ,
    ['v', 'x']  # column selector
]

Unnamed: 0,v,x
c,277,804


In [149]:
# 4. Replace the item at index d, column y 
# with the mean of column y

df.loc['d', 'y'] = df['y'].mean()

In [150]:
df

Unnamed: 0,v,w,x,y,z
a,684,559,629,192.0,835
b,763,707,359,9.0,723
c,277,754,804,599.0,70
d,472,600,396,257.6,705
e,486,551,87,174.0,600


In [151]:
df.mean()

v    536.40
w    634.20
x    455.00
y    246.32
z    586.60
dtype: float64

In [152]:
df.min()

v    277.0
w    551.0
x     87.0
y      9.0
z     70.0
dtype: float64

In [153]:
df.max()

v    763.0
w    754.0
x    804.0
y    599.0
z    835.0
dtype: float64

In [154]:
df.describe()

Unnamed: 0,v,w,x,y,z
count,5.0,5.0,5.0,5.0,5.0
mean,536.4,634.2,455.0,246.32,586.6
std,191.774086,91.376693,273.951638,217.377579,300.574949
min,277.0,551.0,87.0,9.0,70.0
25%,472.0,559.0,359.0,174.0,600.0
50%,486.0,600.0,396.0,192.0,705.0
75%,684.0,707.0,629.0,257.6,723.0
max,763.0,754.0,804.0,599.0,835.0


In [156]:
df['s'] = 'hello out there everyone today'.split()

In [157]:
df

Unnamed: 0,v,w,x,y,z,s
a,684,559,629,192.0,835,hello
b,763,707,359,9.0,723,out
c,277,754,804,599.0,70,there
d,472,600,396,257.6,705,everyone
e,486,551,87,174.0,600,today


In [158]:
df.describe()

Unnamed: 0,v,w,x,y,z
count,5.0,5.0,5.0,5.0,5.0
mean,536.4,634.2,455.0,246.32,586.6
std,191.774086,91.376693,273.951638,217.377579,300.574949
min,277.0,551.0,87.0,9.0,70.0
25%,472.0,559.0,359.0,174.0,600.0
50%,486.0,600.0,396.0,192.0,705.0
75%,684.0,707.0,629.0,257.6,723.0
max,763.0,754.0,804.0,599.0,835.0


In [159]:
df['s'].describe()

count         5
unique        5
top       hello
freq          1
Name: s, dtype: object

In [160]:
df.T

Unnamed: 0,a,b,c,d,e
v,684,763,277,472,486
w,559,707,754,600,551
x,629,359,804,396,87
y,192.0,9.0,599.0,257.6,174.0
z,835,723,70,705,600
s,hello,out,there,everyone,today


In [161]:
df.T.describe()

Unnamed: 0,a,b,c,d,e
count,6,6,6,6,6
unique,6,6,6,6,6
top,684,763,277,472,486
freq,1,1,1,1,1


In [163]:
df.describe(include='all')

Unnamed: 0,v,w,x,y,z,s
count,5.0,5.0,5.0,5.0,5.0,5
unique,,,,,,5
top,,,,,,hello
freq,,,,,,1
mean,536.4,634.2,455.0,246.32,586.6,
std,191.774086,91.376693,273.951638,217.377579,300.574949,
min,277.0,551.0,87.0,9.0,70.0,
25%,472.0,559.0,359.0,174.0,600.0,
50%,486.0,600.0,396.0,192.0,705.0,
75%,684.0,707.0,629.0,257.6,723.0,


In [165]:
df.T

Unnamed: 0,a,b,c,d,e
v,684,763,277,472,486
w,559,707,754,600,551
x,629,359,804,396,87
y,192.0,9.0,599.0,257.6,174.0
z,835,723,70,705,600
s,hello,out,there,everyone,today


In [166]:
df

Unnamed: 0,v,w,x,y,z,s
a,684,559,629,192.0,835,hello
b,763,707,359,9.0,723,out
c,277,754,804,599.0,70,there
d,472,600,396,257.6,705,everyone
e,486,551,87,174.0,600,today


In [167]:
# blank out a few cells with NaN: column x in rows a and d, column y in row c
df.loc[['a', 'd'], 'x'] = np.nan
df.loc['c', 'y'] = np.nan


In [168]:
df

Unnamed: 0,v,w,x,y,z,s
a,684,559,,192.0,835,hello
b,763,707,359.0,9.0,723,out
c,277,754,804.0,,70,there
d,472,600,,257.6,705,everyone
e,486,551,87.0,174.0,600,today


In [169]:
df.loc['d', 'v'] = np.nan

In [170]:
df

Unnamed: 0,v,w,x,y,z,s
a,684.0,559,,192.0,835,hello
b,763.0,707,359.0,9.0,723,out
c,277.0,754,804.0,,70,there
d,,600,,257.6,705,everyone
e,486.0,551,87.0,174.0,600,today


In [171]:
df.dropna() 

Unnamed: 0,v,w,x,y,z,s
b,763.0,707,359.0,9.0,723,out
e,486.0,551,87.0,174.0,600,today


In [174]:
# option 1: use thresh = (number of non-NaN values)
# i.e. keep every row that has at least 4 non-NaN values; row d has
# exactly 4, so it is kept

df.dropna(thresh=4)

Unnamed: 0,v,w,x,y,z,s
a,684.0,559,,192.0,835,hello
b,763.0,707,359.0,9.0,723,out
c,277.0,754,804.0,,70,there
d,,600,,257.6,705,everyone
e,486.0,551,87.0,174.0,600,today


In [175]:
# option 2: restrict the filter to certain columns
# subset=[list_of_column_names]

df.dropna(subset=['z', 's'])

Unnamed: 0,v,w,x,y,z,s
a,684.0,559,,192.0,835,hello
b,763.0,707,359.0,9.0,723,out
c,277.0,754,804.0,,70,there
d,,600,,257.6,705,everyone
e,486.0,551,87.0,174.0,600,today


In [176]:
df.shape

(5, 6)

In [177]:
# fillna -- with a value

df.fillna(999)

Unnamed: 0,v,w,x,y,z,s
a,684.0,559,999.0,192.0,835,hello
b,763.0,707,359.0,9.0,723,out
c,277.0,754,804.0,999.0,70,there
d,999.0,600,999.0,257.6,705,everyone
e,486.0,551,87.0,174.0,600,today


In [179]:
# df.drop -- removes a row
df.drop('e')

Unnamed: 0,v,w,x,y,z,s
a,684.0,559,,192.0,835,hello
b,763.0,707,359.0,9.0,723,out
c,277.0,754,804.0,,70,there
d,,600,,257.6,705,everyone


In [180]:
df.drop(['d', 'e'])

Unnamed: 0,v,w,x,y,z,s
a,684.0,559,,192.0,835,hello
b,763.0,707,359.0,9.0,723,out
c,277.0,754,804.0,,70,there


In [182]:
# removing a column -- axis='columns'
# This cell reassigns df, so once 's' is gone a second run would raise
# KeyError; errors='ignore' makes the drop a no-op in that case and keeps
# the cell safe to re-run.

df = df.drop('s', axis='columns', errors='ignore')
df

Unnamed: 0,v,w,x,y,z
a,684.0,559,,192.0,835
b,763.0,707,359.0,9.0,723
c,277.0,754,804.0,,70
d,,600,,257.6,705
e,486.0,551,87.0,174.0,600


In [183]:
# fillna -- with a series

df.fillna(df.mean())

Unnamed: 0,v,w,x,y,z
a,684.0,559,416.666667,192.0,835
b,763.0,707,359.0,9.0,723
c,277.0,754,804.0,158.15,70
d,552.5,600,416.666667,257.6,705
e,486.0,551,87.0,174.0,600


In [184]:
# interpolate

df.interpolate()

Unnamed: 0,v,w,x,y,z
a,684.0,559,,192.0,835
b,763.0,707,359.0,9.0,723
c,277.0,754,804.0,133.3,70
d,381.5,600,445.5,257.6,705
e,486.0,551,87.0,174.0,600


In [185]:
help(df.interpolate)

Help on method interpolate in module pandas.core.frame:

interpolate(method: 'str' = 'linear', *, axis: 'Axis' = 0, limit: 'int | None' = None, inplace: 'bool' = False, limit_direction: 'str | None' = None, limit_area: 'str | None' = None, downcast: 'str | None' = None, **kwargs) -> 'DataFrame | None' method of pandas.core.frame.DataFrame instance
    Fill NaN values using an interpolation method.
    
    Please note that only ``method='linear'`` is supported for
    DataFrame/Series with a MultiIndex.
    
    Parameters
    ----------
    method : str, default 'linear'
        Interpolation technique to use. One of:
    
        * 'linear': Ignore the index and treat the values as equally
          spaced. This is the only method supported on MultiIndexes.
        * 'time': Works on daily and higher resolution data to interpolate
          given length of interval.
        * 'index', 'values': use the actual numerical values of the index.
        * 'pad': Fill in NaNs using existing

In [187]:
df.loc['a', 'v'] = np.nan

In [188]:
df.interpolate()

Unnamed: 0,v,w,x,y,z
a,,559,,192.0,835
b,763.0,707,359.0,9.0,723
c,277.0,754,804.0,133.3,70
d,381.5,600,445.5,257.6,705
e,486.0,551,87.0,174.0,600


In [193]:
df.interpolate(limit_direction='both')

Unnamed: 0,v,w,x,y,z
a,763.0,559,359.0,192.0,835
b,763.0,707,359.0,9.0,723
c,277.0,754,804.0,133.3,70
d,381.5,600,445.5,257.6,705
e,486.0,551,87.0,174.0,600


In [194]:
df.head(2)

Unnamed: 0,v,w,x,y,z
a,,559,,192.0,835
b,763.0,707,359.0,9.0,723


In [195]:
df.tail(2)

Unnamed: 0,v,w,x,y,z
d,,600,,257.6,705
e,486.0,551,87.0,174.0,600


In [196]:
df.sort_index()

Unnamed: 0,v,w,x,y,z
a,,559,,192.0,835
b,763.0,707,359.0,9.0,723
c,277.0,754,804.0,,70
d,,600,,257.6,705
e,486.0,551,87.0,174.0,600


In [197]:
df.sort_values('v')

Unnamed: 0,v,w,x,y,z
c,277.0,754,804.0,,70
e,486.0,551,87.0,174.0,600
b,763.0,707,359.0,9.0,723
a,,559,,192.0,835
d,,600,,257.6,705


In [198]:
df['v'].sort_values()

c    277.0
e    486.0
b    763.0
a      NaN
d      NaN
Name: v, dtype: float64

In [199]:
df.index

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

In [200]:
df.columns

Index(['v', 'w', 'x', 'y', 'z'], dtype='object')

In [203]:
# one sorted series per column, in column order (NaN values sort last)
[column_values.sort_values()
 for _, column_values in df.items()]

[c    277.0
 e    486.0
 b    763.0
 a      NaN
 d      NaN
 Name: v, dtype: float64,
 e    551
 a    559
 d    600
 b    707
 c    754
 Name: w, dtype: int64,
 e     87.0
 b    359.0
 c    804.0
 a      NaN
 d      NaN
 Name: x, dtype: float64,
 b      9.0
 e    174.0
 a    192.0
 d    257.6
 c      NaN
 Name: y, dtype: float64,
 c     70
 e    600
 d    705
 b    723
 a    835
 Name: z, dtype: int64]

# Exercise: Store

1. Create a data frame for a store, with the following columns:
    - Name
    - Price
    - Number of sales
2. Add 5 items to your store.
3. How much did we earn, total?
4. What two products brought in the most income?

In [None]:
# Exercise steps 1-2: a store with five products, one dict per item giving
# its name, unit price, and number of units sold (example data).
df = DataFrame([{'name': 'apple', 'price': 0.50, 'sales': 120},
                {'name': 'bread', 'price': 3.25, 'sales': 40},
                {'name': 'cheese', 'price': 7.00, 'sales': 15},
                {'name': 'coffee', 'price': 9.50, 'sales': 30},
                {'name': 'milk', 'price': 1.75, 'sales': 80}])
df