# Agenda

1. Strings in a series
2. Data frames
    - Creating
    - Retrieving from a data frame
    - Methods on a data frame

In [2]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

In [3]:
# three integers, with the dtype left for pandas to choose
s = Series(data=[10, 20, 30])
s

0    10
1    20
2    30
dtype: int64

In [4]:
# same values, but explicitly stored as 8-bit integers
s = Series(data=[10, 20, 30], dtype='int8')
s

0    10
1    20
2    30
dtype: int8

In [5]:
s.dtype

dtype('int8')

In [6]:
# dtype is read-only: assigning to it raises AttributeError.
# Converting is done with astype(), shown in the next cell.
s.dtype = np.float16

AttributeError: property 'dtype' of 'Series' object has no setter

In [7]:
s.astype(np.float16)

0    10.0
1    20.0
2    30.0
dtype: float16

In [8]:
# a NumPy array of three words
a = np.array(['hello', 'out', 'there'])
a

array(['hello', 'out', 'there'], dtype='<U5')

In [9]:
# the same three words, this time as a pandas series
s = Series(['hello', 'out', 'there'])

In [10]:
s

0    hello
1      out
2    there
dtype: object

In [12]:
# I want the length of each word

# don't ever do this! A Python-level loop only prints the lengths one at a
# time; the .str accessor (used in the cells below) returns them as a series.
for one_word in s:
    print(len(one_word))

5
3
5


In [13]:
s.str       # str accessor

<pandas.core.strings.accessor.StringMethods at 0x12adb01d0>

In [14]:
s.str.len()

0    5
1    3
2    5
dtype: int64

In [15]:
# which words have 'e' in them?

s.str.contains('e')

0     True
1    False
2     True
dtype: bool

In [16]:
s.loc[s.str.contains('e')]

0    hello
2    there
dtype: object

In [17]:
s

0    hello
1      out
2    there
dtype: object

In [18]:
# same words, now labelled 'x', 'y', 'z' rather than by default integers
s = Series(['hello', 'out', 'there'], index=['x', 'y', 'z'])
s

x    hello
y      out
z    there
dtype: object

In [19]:
# .loc retrieves by index label
s.loc['x']

'hello'

In [20]:
# .iloc retrieves by position, counting from 0
s.iloc[0]

'hello'

# Exercise: String practice

1. Define a series with 10 words.
2. Find all of the words with odd lengths.
3. Find all of the words with below-average lengths.
4. Find all words containing 'a' or 'e'.

In [21]:
# https://RegexpCrashCourse.com/ 

In [23]:
# ten words to practise on -- split() breaks the sentence at whitespace
s = pd.Series(
    'this is a fantastic test sentence for my course today'.split()
)

In [24]:
s

0         this
1           is
2            a
3    fantastic
4         test
5     sentence
6          for
7           my
8       course
9        today
dtype: object

In [30]:
# exercise 2: a length is odd when dividing it by 2 leaves a remainder of 1
s.loc[s.str.len() % 2 == 1]

2            a
3    fantastic
6          for
9        today
dtype: object

In [33]:
# exercise 3: compare each word's length with the mean of all the lengths
s.loc[s.str.len() < s.str.len().mean()]

0    this
1      is
2       a
4    test
6     for
7      my
dtype: object

In [34]:
# .iloc rejects a boolean series as a mask (NotImplementedError);
# the same mask works with .loc, as in the previous cell
s.iloc[s.str.len() < s.str.len().mean()]

NotImplementedError: iLocation based boolean indexing on an integer type is not available

In [37]:
# exercise 4: combine two boolean series with | (element-wise "or")
s.loc[s.str.contains('a') | s.str.contains('e')]

2            a
3    fantastic
4         test
5     sentence
8       course
9        today
dtype: object

In [38]:
# regexp: the character class [ae] matches a single 'a' or 'e'
s.loc[s.str.contains('[ae]')]

2            a
3    fantastic
4         test
5     sentence
8       course
9        today
dtype: object

In [39]:
# regexp alternation: the pattern matches either 'a' or 'e'
s.loc[s.str.contains('a|e')]

2            a
3    fantastic
4         test
5     sentence
8       course
9        today
dtype: object

In [40]:
# the numbers 10..50 stored as text, not as integers
s = Series([str(n) for n in range(10, 60, 10)])
s

0    10
1    20
2    30
3    40
4    50
dtype: object

In [41]:
# the elements are strings, so "adding" them concatenates the text
s.sum()

'1020304050'

In [42]:
# surprising result: it equals the concatenated text '1020304050' read as a
# number and divided by the 5 elements -- not the mean of 10..50
s.mean()

204060810.0

In [43]:
s.astype(np.int8)

0    10
1    20
2    30
3    40
4    50
dtype: int8

In [44]:
# numeric strings with one non-numeric intruder ('abcd')
s = Series(['10', '20', '30', 'abcd', '40', '50'])
s

0      10
1      20
2      30
3    abcd
4      40
5      50
dtype: object

In [45]:
# 'abcd' is not a valid integer, so the conversion raises ValueError
s.astype(np.int8)

ValueError: invalid literal for int() with base 10: 'abcd'

In [49]:
# Keep only the all-digit strings, convert them to int8, and take their mean.
# The result is deliberately not assigned back to s: once s holds integers,
# s.str is no longer available and re-running this cell would fail.
s.loc[s.str.isdigit()].astype(np.int8).mean()

30.0

In [50]:
# strings that are only valid numbers when read as hexadecimal
s = Series(['10', '20', '30', '1a', '2b', '3c'])
s

0    10
1    20
2    30
3    1a
4    2b
5    3c
dtype: object

In [51]:
# the conversion parses each string in base 10, so '1a' raises ValueError
s.astype(np.int8)

ValueError: invalid literal for int() with base 10: '1a'

In [52]:
int('ab', 16)

171

In [55]:
# parse every string as a base-16 (hexadecimal) integer;
# apply() forwards the extra keyword argument to int()
s.apply(int, base=16)

0    16
1    32
2    48
3    26
4    43
5    60
dtype: int64

# Data frame

Data frames are two-dimensional:

- Index (row labels)
- Columns

In [56]:
# simplest construction: a list of lists (or a 2D NumPy array),
# one inner list per row, plus labels for the rows and the columns
df = DataFrame(
    data=[
        [10, 20, 30, 40],
        [50, 60, 70, 80],
        [90, 100, 110, 120],
    ],
    index=['a', 'b', 'c'],
    columns=['w', 'x', 'y', 'z'],
)
df

Unnamed: 0,w,x,y,z
a,10,20,30,40
b,50,60,70,80
c,90,100,110,120


In [57]:
# list of dicts: every dict is one row and the keys become the columns;
# a key that a row lacks shows up as NaN in that row
df = DataFrame([
    dict(a=10, b=20, c=30),
    dict(a=100, b=200, c=300),
    dict(a=1000, b=2000, d=4000),
])
df

Unnamed: 0,a,b,c,d
0,10,20,30.0,
1,100,200,300.0,
2,1000,2000,,4000.0


In [58]:
# a series has a single dtype for all of its values
s.dtype

dtype('O')

In [59]:
# a data frame has one dtype per column, so dtypes returns a series of them
df.dtypes

a      int64
b      int64
c    float64
d    float64
dtype: object

In [60]:
# dict of lists: each key names a column, each list holds that column's values
df = DataFrame(dict(
    a=[10, 100, 1000],
    b=[20, 200, 2000],
    c=[30, 300, 3000],
))
df

Unnamed: 0,a,b,c
0,10,20,30
1,100,200,300
2,1000,2000,3000


In [62]:
# dict of lists again, this time with five rows labelled 'v' through 'z'
df = DataFrame(
    dict(
        a=[10, 100, 1000, 10000, 10001],
        b=[20, 200, 2000, 20000, 200002],
        c=[30, 300, 3000, 30000, 300003],
    ),
    index=['v', 'w', 'x', 'y', 'z'],
)

df

Unnamed: 0,a,b,c
v,10,20,30
w,100,200,300
x,1000,2000,3000
y,10000,20000,30000
z,10001,200002,300003


In [63]:
# get a column with []
df['a']

v       10
w      100
x     1000
y    10000
z    10001
Name: a, dtype: int64

In [64]:
# get two columns with [[]]
df[['a', 'b']]

Unnamed: 0,a,b
v,10,20
w,100,200
x,1000,2000
y,10000,20000
z,10001,200002


In [None]:
# what about the rows?
# .l