## Handling Missing Data

In [1]:
import numpy as np
import pandas as pd

In [2]:
# One-dimensional float array that contains a single missing value (NaN).
vals2 = np.asarray([1, np.nan, 3, 4])
# A plain sum propagates the NaN, so the total is nan.
np.sum(vals2)

nan

In [3]:
# sum/min/max all propagate NaN, so every aggregate comes back as nan.
vals2.sum(), vals2.min(), vals2.max()

(nan, nan, nan)

In [4]:
# Arithmetic with NaN yields NaN.
1 + np.nan

nan

In [5]:
# Even multiplying by zero yields NaN.
0 *  np.nan

nan

In [6]:
# Same aggregates as shown earlier: all nan because of the missing value.
vals2.sum(), vals2.min(), vals2.max()

(nan, nan, nan)

In [7]:
# NaN handling: the nan* variants skip missing values when aggregating.
np.nansum(vals2), np.nanmin(vals2), np.nanmax(vals2)

(8.0, 1.0, 4.0)

### NaN and None in Pandas

- upcasting & type conversion

In [9]:
# Numeric case: None is treated like NaN and the column is upcast to float64.
df = pd.DataFrame({'value' : [1, np.nan, 2, None]})
df

Unnamed: 0,value
0,1.0
1,
2,2.0
3,


In [10]:
# Column dtype after the upcast (float64).
df.dtypes

value    float64
dtype: object

In [11]:
# Boolean mixed with a number and missing values: the column becomes object dtype.
df = pd.DataFrame({'value' : [True, np.nan, 2, None]})
df

Unnamed: 0,value
0,True
1,
2,2
3,


In [12]:
# Column dtype for the mixed boolean data (object).
df.dtypes

value    object
dtype: object

In [13]:
# String mixed with a number and missing values: also stored as object dtype.
df = pd.DataFrame({'value' : ['abc', np.nan, 2, None]})
df

Unnamed: 0,value
0,abc
1,
2,2
3,


In [14]:
# Column dtype for the mixed string data (object).
df.dtypes

value    object
dtype: object

In [15]:
# Compare dtypes: all-integer data stays int64, while a single NaN forces the
# column up to float64. `df` ends up bound to the second (float) frame.
samples = ([1, 3, 2, 4], [np.nan, 3, 2, 4])
for position, values in enumerate(samples):
    if position:
        print('------------')
    df = pd.DataFrame({'value': values})
    print(df.dtypes)

value    int64
dtype: object
------------
value    float64
dtype: object


## Operating on Null Values

- ``isnull()``: Generate a boolean mask indicating missing values
- ``notnull()``: Opposite of ``isnull()``
- ``dropna()``: Return a filtered version of the data
- ``fillna()``: Return a copy of the data with missing values filled or imputed

### Detecting null values

In [17]:
# Sample frame with two missing entries (one np.nan, one None).
df = pd.DataFrame({'value' : [1, np.nan, 2, None]})
df

Unnamed: 0,value
0,1.0
1,
2,2.0
3,


In [18]:
# Boolean mask: True where a value is missing.
df.isnull()

Unnamed: 0,value
0,False
1,True
2,False
3,True


In [19]:
# Inverse mask: True where a value is present.
df.notnull()

Unnamed: 0,value
0,True
1,False
2,True
3,False


In [20]:
# Indexing with a boolean DataFrame keeps cells where the mask is True and
# blanks the rest; the kept cells are themselves NaN, so every cell is NaN.
df[df.isnull()]

Unnamed: 0,value
0,
1,
2,
3,


In [21]:
# Same masking with notnull(): present values are kept, missing ones stay NaN.
df[df.notnull()]

Unnamed: 0,value
0,1.0
1,
2,2.0
3,


### Dropping null values

In [22]:
# 3x3 frame with one NaN in column 0 and one in column 1; column 2 is complete.
df = pd.DataFrame([[1,      np.nan, 2],
                   [2,      3,      5],
                   [np.nan, 4,      6]])
df

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [23]:
# With no arguments, every row containing a NaN is dropped; only row 1 is left.
df.dropna()

Unnamed: 0,0,1,2
1,2.0,3.0,5


In [24]:
# Drop every column containing a NaN; only column 2 remains.
df.dropna(axis='columns')

Unnamed: 0,2
0,2
1,5
2,6


In [25]:
# column drop: axis=1 gives the same result as axis='columns'
df.dropna(axis=1)

Unnamed: 0,2
0,2
1,5
2,6


In [26]:
# row drop: axis=0 removes rows that contain a NaN
df.dropna(axis=0) # df.dropna(axis='rows')

Unnamed: 0,0,1,2
1,2.0,3.0,5


In [27]:
# how='all' drops only rows in which every value is NaN; no row qualifies here.
df.dropna(how='all')

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [28]:
# how='any' drops rows with at least one NaN (same result as dropna() above).
df.dropna(how='any')

Unnamed: 0,0,1,2
1,2.0,3.0,5


In [29]:
# Threshold: keep rows that have at least 1 non-null value.
df.dropna(axis='rows', thresh=1)

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [30]:
# Keep rows with at least 2 non-null values; every row here has two or more.
df.dropna(axis='rows', thresh=2)

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


### Filling null values

In [31]:
# Work on an independent copy: df[:] is only a shallow slice that shares its
# underlying data with df, whereas copy() guarantees df cannot be affected.
data = df.copy()
data

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,,4.0,6


In [32]:
# fill NA: replace every missing value with 0 (returns a new frame)
data.fillna(0)

Unnamed: 0,0,1,2
0,1.0,0.0,2
1,2.0,3.0,5
2,0.0,4.0,6


In [33]:
# forward-fill: propagate the previous row's value downward.
# ffill() replaces fillna(method='ffill'), whose `method` argument is
# deprecated in pandas >= 2.1.
data.ffill()

Unnamed: 0,0,1,2
0,1.0,,2
1,2.0,3.0,5
2,2.0,4.0,6


In [34]:
# forward-fill along each row (left to right).
# ffill() replaces the deprecated fillna(method='ffill').
data.ffill(axis=1)

Unnamed: 0,0,1,2
0,1.0,1.0,2.0
1,2.0,3.0,5.0
2,,4.0,6.0


In [35]:
# back-fill: propagate the next row's value upward.
# bfill() replaces fillna(method='bfill'), whose `method` argument is
# deprecated in pandas >= 2.1.
data.bfill()

Unnamed: 0,0,1,2
0,1.0,3.0,2
1,2.0,3.0,5
2,,4.0,6


In [34]:
# back-fill along each row (right to left).
# bfill() replaces the deprecated fillna(method='bfill').
data.bfill(axis=1)

Unnamed: 0,0,1,2
0,1.0,2.0,2.0
1,2.0,3.0,5.0
2,4.0,4.0,6.0


## Pivot Tables

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
# Load the 'titanic' dataset through seaborn's load_dataset helper.
titanic = sns.load_dataset('titanic')

In [2]:
# Preview the first five rows.
titanic.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


## Pivot Tables by Hand

In [4]:
# Build a survival-rate table by sex and class step by step.
# Only the final expression is displayed as the cell output; the earlier
# expressions show the intermediate forms and are evaluated but not shown.

# numeric_only=True restricts the mean to numeric columns; the frame also
# holds text columns (e.g. 'embarked', 'who'), and pandas >= 2.0 raises a
# TypeError instead of silently dropping them.
titanic.groupby('sex').mean(numeric_only=True)

# Survival rate per sex, as a Series.
titanic.groupby('sex')['survived'].mean()

# Same values, kept as a one-column DataFrame.
titanic.groupby('sex')[['survived']].mean()

# Survival rate per (sex, class) pair.
titanic.groupby(['sex', 'class'])['survived'].aggregate('mean')

# Move the 'class' level into columns to get the two-dimensional table.
titanic.groupby(['sex', 'class'])['survived'].aggregate('mean').unstack()

class,First,Second,Third
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0.968085,0.921053,0.5
male,0.368852,0.157407,0.135447


## Pivot Table Syntax

In [5]:
# Mean survival by sex (rows) and class (columns), using the default 'mean'.
titanic.pivot_table(values='survived', index='sex', columns='class')

class,First,Second,Third
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0.968085,0.921053,0.5
male,0.368852,0.157407,0.135447


### Multi-level pivot tables

In [6]:
# Bin ages into two intervals, (0, 18] and (18, 80]; missing ages stay NaN.
age = pd.cut(titanic['age'], [0, 18, 80])
age.head(10)

0    (18.0, 80.0]
1    (18.0, 80.0]
2    (18.0, 80.0]
3    (18.0, 80.0]
4    (18.0, 80.0]
5             NaN
6    (18.0, 80.0]
7     (0.0, 18.0]
8    (18.0, 80.0]
9     (0.0, 18.0]
Name: age, dtype: category
Categories (2, interval[int64]): [(0, 18] < (18, 80]]

In [7]:
# Two-level row index: sex, then the binned age computed earlier.
titanic.pivot_table(values='survived', index=['sex', age], columns='class')

Unnamed: 0_level_0,class,First,Second,Third
sex,age,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,"(0, 18]",0.909091,1.0,0.511628
female,"(18, 80]",0.972973,0.9,0.423729
male,"(0, 18]",0.8,0.6,0.215686
male,"(18, 80]",0.375,0.071429,0.133663


In [8]:
# Split fares into four quantile-based bins.
fare = pd.qcut(titanic['fare'], 4)
fare.unique()

[(-0.001, 7.91], (31.0, 512.329], (7.91, 14.454], (14.454, 31.0]]
Categories (4, interval[float64]): [(-0.001, 7.91] < (7.91, 14.454] < (14.454, 31.0] < (31.0, 512.329]]

In [9]:
# Multi-level on both axes: (sex, age bin) rows by (fare bin, class) columns.
titanic.pivot_table(values='survived',
                    index=['sex', age],
                    columns=[fare, 'class'])

Unnamed: 0_level_0,fare,"(-0.001, 7.91]","(-0.001, 7.91]","(7.91, 14.454]","(7.91, 14.454]","(14.454, 31.0]","(14.454, 31.0]","(14.454, 31.0]","(31.0, 512.329]","(31.0, 512.329]","(31.0, 512.329]"
Unnamed: 0_level_1,class,First,Third,Second,Third,First,Second,Third,First,Second,Third
sex,age,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2
female,"(0, 18]",,0.625,1.0,0.769231,,1.0,0.4,0.909091,1.0,0.142857
female,"(18, 80]",,0.6,0.88,0.333333,0.875,0.9,0.470588,0.984848,1.0,0.166667
male,"(0, 18]",,0.0,0.0,0.461538,,0.857143,0.266667,0.8,0.75,0.076923
male,"(18, 80]",0.0,0.103774,0.098039,0.157143,0.5,0.041667,0.095238,0.333333,0.0,0.6


### Additional pivot table options

```python
# call signature as of Pandas >= 0.18
DataFrame.pivot_table(values=None, index=None, columns=None,
                      aggfunc='mean', fill_value=None, margins=False,
                      dropna=True, margins_name='All')
```

In [10]:
# Per-column aggregation: a dict maps each value column to its own function
# (total survivors and mean fare for each sex/class pair).
titanic.pivot_table(index='sex', columns='class',
                    aggfunc={'survived':'sum', 'fare': 'mean'})

Unnamed: 0_level_0,fare,fare,fare,survived,survived,survived
class,First,Second,Third,First,Second,Third
sex,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
female,106.125798,21.970121,16.11881,91,70,72
male,67.226127,19.741782,12.661633,45,17,47


### String Vector 연산
|             |                  |                  |                  |
|-------------|------------------|------------------|------------------|
|``len()``    | ``lower()``      | ``translate()``  | ``islower()``    | 
|``ljust()``  | ``upper()``      | ``startswith()`` | ``isupper()``    | 
|``rjust()``  | ``find()``       | ``endswith()``   | ``isnumeric()``  | 
|``center()`` | ``rfind()``      | ``isalnum()``    | ``isdecimal()``  | 
|``zfill()``  | ``index()``      | ``isalpha()``    | ``split()``      | 
|``strip()``  | ``rindex()``     | ``isdigit()``    | ``rsplit()``     | 
|``rstrip()`` | ``capitalize()`` | ``isspace()``    | ``partition()``  | 
|``lstrip()`` |  ``swapcase()``  |  ``istitle()``   | ``rpartition()`` |

In [11]:
# Series of names used by the string-method examples that follow.
monte = pd.Series(['Graham Chapman', 'John Cleese', 'Terry Gilliam',
                   'Eric Idle', 'Terry Jones', 'Michael Palin'])

In [12]:
# Lowercase every element.
monte.str.lower()

0    graham chapman
1       john cleese
2     terry gilliam
3         eric idle
4       terry jones
5     michael palin
dtype: object

In [13]:
# Character count of every element (returns integers).
monte.str.len()

0    14
1    11
2    13
3     9
4    11
5    13
dtype: int64

In [14]:
# Boolean mask: True for names that begin with 'T'.
monte.str.startswith('T')

0    False
1    False
2     True
3    False
4     True
5    False
dtype: bool

In [15]:
# Split each name on whitespace; every element becomes a list of words.
monte.str.split()

0    [Graham, Chapman]
1       [John, Cleese]
2     [Terry, Gilliam]
3         [Eric, Idle]
4       [Terry, Jones]
5     [Michael, Palin]
dtype: object

#### -  정규표현식 (regular expressions)

| Method | Description |
|--------|-------------|
| ``match()`` | Call ``re.match()`` on each element, returning a boolean. |
| ``extract()`` | Call ``re.search()`` on each element, returning the groups of the first match as strings.|
| ``findall()`` | Call ``re.findall()`` on each element |
| ``replace()`` | Replace occurrences of pattern with some other string|
| ``contains()`` | Call ``re.search()`` on each element, returning a boolean |
| ``count()`` | Count occurrences of pattern|
| ``split()``   | Equivalent to ``str.split()``, but accepts regexps |
| ``rsplit()`` | Equivalent to ``str.rsplit()``, but accepts regexps |

In [18]:
# Extract the first run of ASCII letters from each element (the first name
# here); expand=False returns a Series for the single capture group.
monte.str.extract('([A-Za-z]+)', expand=False)

0     Graham
1       John
2      Terry
3       Eric
4      Terry
5    Michael
dtype: object

In [21]:
# Match whole strings whose first character is not one of AEIOU and whose
# last character is not one of aeiou; non-matching names give an empty list.
monte.str.findall(r'^[^AEIOU].*[^aeiou]$')

0    [Graham Chapman]
1                  []
2     [Terry Gilliam]
3                  []
4       [Terry Jones]
5     [Michael Palin]
dtype: object

#### - 기타 함수 

| Method | Description |
|--------|-------------|
| ``get()`` | Index each element |
| ``slice()`` | Slice each element|
| ``slice_replace()`` | Replace slice in each element with passed value|
| ``cat()``      | Concatenate strings|
| ``repeat()`` | Repeat values |
| ``normalize()`` | Return Unicode form of string |
| ``pad()`` | Add whitespace to left, right, or both sides of strings|
| ``wrap()`` | Split long strings into lines with length less than a given width|
| ``join()`` | Join strings in each element of the Series with passed separator|
| ``get_dummies()`` | extract dummy variables as a dataframe |

In [22]:
# Slice the first three characters of each element.
monte.str[0:3]

0    Gra
1    Joh
2    Ter
3    Eri
4    Ter
5    Mic
dtype: object

In [23]:
# Split on whitespace, then take the last piece of each list (the surname here).
monte.str.split().str.get(-1)

0    Chapman
1     Cleese
2    Gilliam
3       Idle
4      Jones
5      Palin
dtype: object

In [24]:
# Pair each name with a pipe-delimited string of single-letter codes.
full_monte = pd.DataFrame({'name': monte,
                           'info': ['B|C|D', 'B|D', 'A|C',
                                    'B|D', 'B|C', 'B|C|D']})
full_monte

Unnamed: 0,name,info
0,Graham Chapman,B|C|D
1,John Cleese,B|D
2,Terry Gilliam,A|C
3,Eric Idle,B|D
4,Terry Jones,B|C
5,Michael Palin,B|C|D


#### - ``get_dummies()`` : encoding 

In [25]:
# One 0/1 indicator column per code found in the '|'-separated strings.
full_monte['info'].str.get_dummies('|')

Unnamed: 0,A,B,C,D
0,0,1,1,1
1,0,1,0,1
2,1,0,1,0
3,0,1,0,1
4,0,1,1,0
5,0,1,1,1


In [27]:
from datetime import datetime
# October 1st, 2019; the time-of-day fields are left at their zero defaults.
x = datetime(2019, 10, 1)

In [40]:
# Inspect the individual datetime fields. Only the last expression (x.second)
# is displayed as the cell output; the other lines are evaluated but not shown.
x.year
x.month
x.day
x.hour
x.minute
x.second

0

In [41]:
from dateutil import parser
# Parse a free-form date string into a datetime.
date = parser.parse("1 Oct, 2015")
date

datetime.datetime(2015, 10, 1, 0, 0)

In [42]:
# A different textual layout that parses to the same datetime.
date = parser.parse("2015 10 1")
date

datetime.datetime(2015, 10, 1, 0, 0)

In [43]:
# Format the date as its full weekday name.
date.strftime('%A')

'Thursday'

### NumPy's ``datetime64``

In [44]:
import numpy as np
# Zero-dimensional datetime64 array; the unit is inferred as days ([D]).
date = np.array('2021-05-01', dtype=np.datetime64)
date

array('2021-05-01', dtype='datetime64[D]')

In [45]:
# vector 연산
date + np.arange(12)

array(['2021-05-01', '2021-05-02', '2021-05-03', '2021-05-04',
       '2021-05-05', '2021-05-06', '2021-05-07', '2021-05-08',
       '2021-05-09', '2021-05-10', '2021-05-11', '2021-05-12'],
      dtype='datetime64[D]')

In [48]:
# Scalar datetime64 built from a date-only string.
np.datetime64('2021-05-01')

numpy.datetime64('2021-05-01')

In [51]:
# Request nanosecond precision explicitly with the 'ns' unit code.
np.datetime64('2021-05-01 12:59:59.50', 'ns')

numpy.datetime64('2021-05-01T12:59:59.500000000')

## Numpy Date type
|Code    | Meaning     | Time span (relative) | Time span (absolute)   |
|--------|-------------|----------------------|------------------------|
| ``Y``  | Year        | ± 9.2e18 years       | [9.2e18 BC, 9.2e18 AD] |
| ``M``  | Month       | ± 7.6e17 years       | [7.6e17 BC, 7.6e17 AD] |
| ``W``  | Week        | ± 1.7e17 years       | [1.7e17 BC, 1.7e17 AD] |
| ``D``  | Day         | ± 2.5e16 years       | [2.5e16 BC, 2.5e16 AD] |
| ``h``  | Hour        | ± 1.0e15 years       | [1.0e15 BC, 1.0e15 AD] |
| ``m``  | Minute      | ± 1.7e13 years       | [1.7e13 BC, 1.7e13 AD] |
| ``s``  | Second      | ± 2.9e11 years       | [2.9e11 BC, 2.9e11 AD] |
| ``ms`` | Millisecond | ± 2.9e8 years        | [ 2.9e8 BC, 2.9e8 AD]  |
| ``us`` | Microsecond | ± 2.9e5 years        | [290301 BC, 294241 AD] |
| ``ns`` | Nanosecond  | ± 292 years          | [ 1678 AD, 2262 AD]    |
| ``ps`` | Picosecond  | ± 106 days           | [ 1969 AD, 1970 AD]    |
| ``fs`` | Femtosecond | ± 2.6 hours          | [ 1969 AD, 1970 AD]    |
| ``as`` | Attosecond  | ± 9.2 seconds        | [ 1969 AD, 1970 AD]    |

### Dates and times in pandas

In [53]:
import pandas as pd
# Parse a free-form date string into a pandas Timestamp.
date = pd.to_datetime("1 may, 2021")
date

Timestamp('2021-05-01 00:00:00')

In [54]:
# Format the Timestamp as its full weekday name.
date.strftime('%A')

'Saturday'

In [55]:
# Adding timedeltas of 0..11 days yields a DatetimeIndex of 12 consecutive dates.
date + pd.to_timedelta(np.arange(12), 'D')

DatetimeIndex(['2021-05-01', '2021-05-02', '2021-05-03', '2021-05-04',
               '2021-05-05', '2021-05-06', '2021-05-07', '2021-05-08',
               '2021-05-09', '2021-05-10', '2021-05-11', '2021-05-12'],
              dtype='datetime64[ns]', freq=None)

In [56]:
# Series of 0..3 keyed by four dates that are not in chronological order.
index = pd.DatetimeIndex(
    ['2018-10-01', '2019-10-01', '2018-11-01', '2019-11-01']
)
data = pd.Series(range(4), index=index)
data

2018-10-01    0
2019-10-01    1
2018-11-01    2
2019-11-01    3
dtype: int64

In [57]:
# Time range: select every entry that falls within 2018.
# The index is not in chronological order, and label slicing a non-monotonic
# DatetimeIndex with bounds that are not present in it is deprecated (a
# KeyError on pandas >= 2.0), so sort the index before slicing.
data.sort_index().loc['2018-01-01':'2018-12-31']

  data['2018-01-01':'2018-12-31']


2018-10-01    0
2018-11-01    2
dtype: int64

In [58]:
# Partial-string label lookup: every entry whose timestamp falls in 2019.
data.loc['2019']

2019-10-01    1
2019-11-01    3
dtype: int64

In [59]:
# Parse a list mixing a datetime object with several string layouts into a
# single DatetimeIndex.
# NOTE(review): '10-04-2015' is the only 2015 entry, which is why the later
# difference output shows -1458 days; confirm the year is intended.
dates = pd.to_datetime([datetime(2019, 10, 1), '2 Oct, 2019',
                       '2019-OCt-3', '10-04-2015', '20191005'])
dates

DatetimeIndex(['2019-10-01', '2019-10-02', '2019-10-03', '2015-10-04',
               '2019-10-05'],
              dtype='datetime64[ns]', freq=None)

In [60]:
# Specify a period frequency: convert the timestamps to daily periods.
dates.to_period('D')

PeriodIndex(['2019-10-01', '2019-10-02', '2019-10-03', '2015-10-04',
             '2019-10-05'],
            dtype='period[D]', freq='D')

In [61]:
# Time difference: offset of each date from the first element.
dates - dates[0]

TimedeltaIndex(['0 days', '1 days', '2 days', '-1458 days', '4 days'], dtype='timedelta64[ns]', freq=None)

### ``pd.date_range()``

In [62]:
# Daily range between two endpoints; both endpoints are included.
pd.date_range('2019-10-01', '2019-10-31')

DatetimeIndex(['2019-10-01', '2019-10-02', '2019-10-03', '2019-10-04',
               '2019-10-05', '2019-10-06', '2019-10-07', '2019-10-08',
               '2019-10-09', '2019-10-10', '2019-10-11', '2019-10-12',
               '2019-10-13', '2019-10-14', '2019-10-15', '2019-10-16',
               '2019-10-17', '2019-10-18', '2019-10-19', '2019-10-20',
               '2019-10-21', '2019-10-22', '2019-10-23', '2019-10-24',
               '2019-10-25', '2019-10-26', '2019-10-27', '2019-10-28',
               '2019-10-29', '2019-10-30', '2019-10-31'],
              dtype='datetime64[ns]', freq='D')

In [63]:
# error (invalid date): September has only 30 days, so '2019-09-31' cannot be
# converted to a Timestamp. The ValueError is caught and printed so that the
# demonstration does not stop a "Restart & Run All".
try:
    pd.date_range('2019-09-01', '2019-09-31')
except ValueError as err:
    print(f"{type(err).__name__}: {err}")

ValueError: could not convert string to Timestamp

In [64]:
# Start date plus a number of periods; the frequency is daily here.
pd.date_range('2019-09-29', periods=3)

DatetimeIndex(['2019-09-29', '2019-09-30', '2019-10-01'], dtype='datetime64[ns]', freq='D')

In [65]:
# Eight hourly timestamps. The lowercase 'h' alias is used because the
# uppercase 'H' spelling is deprecated in pandas >= 2.2.
pd.date_range('2019-10-01', periods=8, freq='h')

DatetimeIndex(['2019-10-01 00:00:00', '2019-10-01 01:00:00',
               '2019-10-01 02:00:00', '2019-10-01 03:00:00',
               '2019-10-01 04:00:00', '2019-10-01 05:00:00',
               '2019-10-01 06:00:00', '2019-10-01 07:00:00'],
              dtype='datetime64[ns]', freq='H')

In [66]:
# Eight consecutive monthly periods starting at 2019-10.
pd.period_range('2019-10', periods=8, freq='M')

PeriodIndex(['2019-10', '2019-11', '2019-12', '2020-01', '2020-02', '2020-03',
             '2020-04', '2020-05'],
            dtype='period[M]', freq='M')

In [67]:
# Ten durations one hour apart, starting at zero. The lowercase 'h' alias is
# used because the uppercase 'H' spelling is deprecated in pandas >= 2.2.
pd.timedelta_range(0, periods=10, freq='h')

TimedeltaIndex(['0 days 00:00:00', '0 days 01:00:00', '0 days 02:00:00',
                '0 days 03:00:00', '0 days 04:00:00', '0 days 05:00:00',
                '0 days 06:00:00', '0 days 07:00:00', '0 days 08:00:00',
                '0 days 09:00:00'],
               dtype='timedelta64[ns]', freq='H')

#### - Frequencies and Offsets

| Code   | Description         | Code   | Description          |
|--------|---------------------|--------|----------------------|
| ``D``  | Calendar day        | ``B``  | Business day         |
| ``W``  | Weekly              |        |                      |
| ``M``  | Month end           | ``BM`` | Business month end   |
| ``Q``  | Quarter end         | ``BQ`` | Business quarter end |
| ``A``  | Year end            | ``BA`` | Business year end    |
| ``H``  | Hours               | ``BH`` | Business hours       |
| ``T``  | Minutes             |        |                      |
| ``S``  | Seconds             |        |                      |
| ``L``  | Milliseconds        |        |                      |
| ``U``  | Microseconds        |        |                      |
| ``N``  | Nanoseconds         |        |                      |


| Code    | Description            | Code    | Description            |
|---------|------------------------|---------|------------------------|
| ``MS``  | Month start            | ``BMS`` | Business month start   |
| ``QS``  | Quarter start          | ``BQS`` | Business quarter start |
| ``AS``  | Year start             | ``BAS`` | Business year start    |

In [69]:
# Codes can be combined with multipliers: nine steps of 2 hours 30 minutes.
# 'h' and 'min' replace the 'H' and 'T' aliases deprecated in pandas >= 2.2.
pd.timedelta_range(0, periods=9, freq="2h30min")

TimedeltaIndex(['0 days 00:00:00', '0 days 02:30:00', '0 days 05:00:00',
                '0 days 07:30:00', '0 days 10:00:00', '0 days 12:30:00',
                '0 days 15:00:00', '0 days 17:30:00', '0 days 20:00:00'],
               dtype='timedelta64[ns]', freq='150T')