## Other Forms for Mutate

This section is a WIP

In [26]:
import pandas as pd
from dfply import *
import matplotlib.pylab as plt
%matplotlib inline

In [90]:
from pyspark.sql import SparkSession
from more_pyspark import get_spark_types, to_pandas

spark = SparkSession.builder.appName('Ops').getOrCreate()

## Hiding stack traceback

We hide the exception traceback for didactic reasons (code source: [see this post](https://stackoverflow.com/questions/46222753/how-do-i-suppress-tracebacks-in-jupyter)).  Don't run this cell if you want to see a full traceback.

In [85]:
import sys
ipython = get_ipython()

def hide_traceback(exc_tuple=None, filename=None, tb_offset=None,
                   exception_only=False, running_compiled_code=False):
    etype, value, tb = sys.exc_info()
    return ipython._showtraceback(etype, value, ipython.InteractiveTB.get_exception_only(etype, value))

ipython.showtraceback = hide_traceback

## Data set

We will be using two of the data sets provided by the Museam of Modern Art (MoMA) in this lecture.  Make sure that you have downloaded each repository.  [Download Instructions](./get_MOMA_data.ipynb)

## The exhibition file gives encoding errors by default

In [86]:
exhibitions = pd.read_csv('./data/MoMA_exhibitions/MoMAExhibitions1929to1989.csv')

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe9 in position 1: invalid continuation byte

## Switching encodings fixes the problem

* See [this Stack Overflow question](https://stackoverflow.com/questions/18171739/unicodedecodeerror-when-reading-csv-file-in-pandas-with-python)
* More details on [ISO-8859-1](https://en.wikipedia.org/wiki/ISO/IEC_8859-1)

In [87]:
dat_cols = ['ExhibitionBeginDate', 'ExhibitionEndDate', 'ConstituentBeginDate' ,'ConstituentEndDate']
exhibitions = pd.read_csv('./data/MoMA_exhibitions/MoMAExhibitions1929to1989.csv', 
                          encoding="ISO-8859-1",
                          parse_dates=dat_cols)
exhibitions.head(2)

Unnamed: 0,ExhibitionID,ExhibitionNumber,ExhibitionTitle,ExhibitionCitationDate,ExhibitionBeginDate,ExhibitionEndDate,ExhibitionSortOrder,ExhibitionURL,ExhibitionRole,ExhibitionRoleinPressRelease,...,Institution,Nationality,ConstituentBeginDate,ConstituentEndDate,ArtistBio,Gender,VIAFID,WikidataID,ULANID,ConstituentURL
0,2557.0,1,"Cézanne, Gauguin, Seurat, Van Gogh","[MoMA Exh. #1, November 7-December 7, 1929]",1929-11-07,1929-12-07,1.0,moma.org/calendar/exhibitions/1767,Curator,Director,...,,American,1902,1981,"American, 19021981",Male,109252853.0,Q711362,500241556.0,moma.org/artists/9168
1,2557.0,1,"Cézanne, Gauguin, Seurat, Van Gogh","[MoMA Exh. #1, November 7-December 7, 1929]",1929-11-07,1929-12-07,1.0,moma.org/calendar/exhibitions/1767,Artist,Artist,...,,French,1839,1906,"French, 18391906",Male,39374836.0,Q35548,500004793.0,moma.org/artists/1053


#### MoMA Artists

In [88]:
artists = pd.read_csv("./data/Artists.csv")
artists.head(2)

Unnamed: 0,ConstituentID,DisplayName,ArtistBio,Nationality,Gender,BeginDate,EndDate,Wiki QID,ULAN
0,1,Robert Arneson,"American, 1930–1992",American,Male,1930,1992,,
1,2,Doroteo Arnaiz,"Spanish, born 1936",Spanish,Male,1936,0,,


In [91]:
artists_schema = get_spark_types(artists, keys=['ConstituentID'])

artists_spark = spark.createDataFrame(artists, schema=artists_schema)
(artists_spark
 .take(5)) >> to_pandas

Unnamed: 0,ConstituentID,DisplayName,ArtistBio,Nationality,Gender,BeginDate,EndDate,Wiki QID,ULAN
0,1,Robert Arneson,"American, 1930–1992",American,Male,1930,1992,,
1,2,Doroteo Arnaiz,"Spanish, born 1936",Spanish,Male,1936,0,,
2,3,Bill Arnold,"American, born 1941",American,Male,1941,0,,
3,4,Charles Arnoldi,"American, born 1946",American,Male,1946,0,Q1063584,500028000.0
4,5,Per Arnoldi,"Danish, born 1941",Danish,Male,1941,0,,


#### MoMA Artwork

In [92]:
from more_dfply import fix_names

artwork = (pd.read_csv("./data/Artworks.csv")
           >> fix_names
           >> mutate(id = X.index + 1)
          )
artwork.head(2)

Unnamed: 0,Title,Artist,ConstituentID,ArtistBio,Nationality,BeginDate,EndDate,Gender,Date,Medium,...,Circumference_cm,Depth_cm,Diameter_cm,Height_cm,Length_cm,Weight_kg,Width_cm,Seat_Height_cm,Duration_sec,id
0,"Ferdinandsbrücke Project, Vienna, Austria, Ele...",Otto Wagner,6210,"(Austrian, 1841–1918)",(Austrian),(1841),(1918),(Male),1896,Ink and cut-and-pasted painted pages on paper,...,,,,48.6,,,168.9,,,1
1,"City of Music, National Superior Conservatory ...",Christian de Portzamparc,7470,"(French, born 1944)",(French),(1944),(0),(Male),1987,Paint and colored pencil on print,...,,,,40.6401,,,29.8451,,,2


In [93]:
artwork_schema = get_spark_types(artwork, keys=['id'])

artwork_spark = spark.createDataFrame(artwork, schema=artwork_schema)
(artwork_spark
 .take(2)) >> to_pandas

Unnamed: 0,Title,Artist,ConstituentID,ArtistBio,Nationality,BeginDate,EndDate,Gender,Date,Medium,...,Circumference_cm,Depth_cm,Diameter_cm,Height_cm,Length_cm,Weight_kg,Width_cm,Seat_Height_cm,Duration_sec,id
0,"Ferdinandsbrücke Project, Vienna, Austria, Ele...",Otto Wagner,6210,"(Austrian, 1841–1918)",(Austrian),(1841),(1918),(Male),1896,Ink and cut-and-pasted painted pages on paper,...,,,,48.599998,,,168.899994,,,1
1,"City of Music, National Superior Conservatory ...",Christian de Portzamparc,7470,"(French, born 1944)",(French),(1944),(0),(Male),1987,Paint and colored pencil on print,...,,,,40.640099,,,29.8451,,,2


# Other forms of `mutate`

* Selective column mutation
    * `mutate_at` (TODO)
    * `mutate_all` (TODO)
    * `mutate_if`
* Selective row mutation
    * `mutate_only_if`
    * `mutate_if_else`

## Selectively mutating columns

* `mutate_if(df, pred, fun)` 
* `pred` asks questions about the column 
* `fun` will be applied to any column that passes `pred`
      

## Example - Switching all integer columns to `Int64` and removing `0`

In [33]:
from pandas import Int64Dtype
from numpy import nan

(artists
 >> mutate_if(lambda col: col.dtype.kind == 'i', lambda col: col.apply(lambda v: nan if v == 0 else v).astype('Int64'))
 >> head(2))

Unnamed: 0,ConstituentID,DisplayName,ArtistBio,Nationality,Gender,BeginDate,EndDate,Wiki QID,ULAN
0,1,Robert Arneson,"American, 1930–1992",American,Male,1930,1992.0,,
1,2,Doroteo Arnaiz,"Spanish, born 1936",Spanish,Male,1936,,,


## Refactoring the last example

In [34]:
is_int_col = lambda col: col.dtype.kind == 'i'
make_Int_and_remove_zero = lambda col: col.apply(lambda v: nan if v == 0 else v).astype('Int64')

(artists
 >> mutate_if(is_int_col, make_Int_and_remove_zero)
 >> head(2))

Unnamed: 0,ConstituentID,DisplayName,ArtistBio,Nationality,Gender,BeginDate,EndDate,Wiki QID,ULAN
0,1,Robert Arneson,"American, 1930–1992",American,Male,1930,1992.0,,
1,2,Doroteo Arnaiz,"Spanish, born 1936",Spanish,Male,1936,,,


In [58]:
artists.BeginDate.mask(artists.BeginDate == 0, nan).astype('Int64').head()

0    1930
1    1936
2    1941
3    1946
4    1941
Name: BeginDate, dtype: Int64

In [45]:
e = X.BeginDate.mask(X.BeginDate == 0, nan).astype('Int64')
e

<dfply.base.Intention at 0x1283b5d68>

In [47]:
e.evaluate(artists).head()

0    1930
1    1936
2    1941
3    1946
4    1941
Name: BeginDate, dtype: Int64

In [45]:
t = (X.BeginDate, X.BeginDate == 0, nan)
e = X.BeginDate.mask(X.BeginDate == 0, nan).astype('Int64')
e

<dfply.base.Intention at 0x1283b5d68>

In [55]:
from collections import namedtuple

Onlyif = namedtuple('Onlyif', ['col', 'pred', 'expr'])
tup = Onlyif(col = X.BeginDate, 
             pred = X.BeginDate == 0, 
             expr = nan)
tup.col, tup.pred, tup.expr

(<dfply.base.Intention at 0x1283c9128>,
 <dfply.base.Intention at 0x1289a0c50>,
 nan)

In [91]:
@make_symbolic
def onlyif(pred, col, expr):
    return col.mask(pred, expr)
e = onlyif(col = X.BeginDate, 
           pred = X.BeginDate == 0, 
           expr = nan).astype('Int64')
e.evaluate(artists).head()

0    1930
1    1936
2    1941
3    1946
4    1941
Name: BeginDate, dtype: Int64

In [150]:
@dfpipe
def mutate_only_if(df, **kwargs):
    masks = {name:tup_to_mask(tup) for name, tup in kwargs.items()}
    return df >> mutate(**masks)

In [101]:
@make_symbolic
def takeuntil(pred, seq):
    iseq = iter(seq)
    seen = False
    n = next(iseq)
    while not pred(n):
        yield n
        n = next(iseq)
    yield n

In [102]:
g = takeuntil(lambda x: x, [False, False, True, False, True])
g

<generator object takeuntil at 0x119cfc7c8>

In [103]:
list(g)

[False, False, True]

In [104]:
g = takeuntil(lambda x: x, [False, False, False, False, False])
g

<generator object takeuntil at 0x119cfcf48>

In [105]:
list(g)

RuntimeError: generator raised StopIteration

In [193]:
df = pd.DataFrame(np.random.randint(0, 10, size=(10, 3)), columns=list('abc'))
df.loc[::2, 'a'] = np.nan
df.loc[::3, 'b'] = np.nan
df

Unnamed: 0,a,b,c
0,,,0
1,7.0,9.0,3
2,,7.0,3
3,0.0,,3
4,,8.0,0
5,1.0,5.0,2
6,,,4
7,0.0,5.0,8
8,,2.0,1
9,5.0,,3


In [194]:
from functools import reduce

@make_symbolic
def coalesce(*args):
    args = list(takeuntil(lambda c: c.isna().all(), args))
    return reduce(lambda acc, n: acc.combine_first(n), args)

In [195]:
coalesce(df.a, df.b)

  """


0    NaN
1    7.0
2    7.0
3    0.0
4    8.0
5    1.0
6    NaN
7    0.0
8    2.0
9    5.0
Name: a, dtype: float64

In [197]:
coalesce(df.a, df.b, df.c)

  """


0    0.0
1    7.0
2    7.0
3    0.0
4    8.0
5    1.0
6    4.0
7    0.0
8    2.0
9    5.0
Name: a, dtype: float64

In [200]:
coalesce(X.a, X.b).evaluate(df)

  """


0    NaN
1    7.0
2    7.0
3    0.0
4    8.0
5    1.0
6    NaN
7    0.0
8    2.0
9    5.0
Name: a, dtype: float64

In [199]:
coalesce(X.a, X.b, X.c).evaluate(df)

  """


0    0.0
1    7.0
2    7.0
3    0.0
4    8.0
5    1.0
6    4.0
7    0.0
8    2.0
9    5.0
Name: a, dtype: float64

In [187]:
df.a.isna().all()

False

In [188]:
np.where(True, artists.BeginDate, artists.EndDate)

array([1930, 1936, 1941, ..., 1978, 1938, 1834])

In [112]:
class PredIntention(Intention):
    def __init__(self, pred, then):
        self.pred = pred
        self.then = then
        
        
    def all(self):
        return self.pred.all()
    
    
    def any(self):
        return self.pred.any()
    
    
    def mask(self, other=nan):
        return self.then.mask(self.pred, other=other)
    
    
    def where(self, other=nan):
        return self.then.where(self.pred, other=other)

In [137]:
args = [(X.Nationality == 'American',
         )]

@make_symbolic
def case_when(*args):
    

In [141]:
?artists.BeginDate.combine_first

In [201]:
p.where().evaluate(artists)

0        NaN
1        NaN
2        NaN
3        NaN
4        NaN
5        NaN
6        NaN
7        NaN
8        NaN
9        NaN
10       NaN
11       NaN
12       NaN
13       NaN
14       5.0
15       NaN
16       NaN
17       5.0
18       NaN
19       NaN
20       NaN
21       NaN
22       NaN
23       NaN
24       NaN
25       NaN
26       5.0
27       5.0
28       NaN
29       NaN
        ... 
15787    NaN
15788    NaN
15789    NaN
15790    NaN
15791    NaN
15792    5.0
15793    NaN
15794    NaN
15795    NaN
15796    5.0
15797    NaN
15798    NaN
15799    NaN
15800    NaN
15801    NaN
15802    NaN
15803    NaN
15804    NaN
15805    NaN
15806    NaN
15807    NaN
15808    NaN
15809    NaN
15810    NaN
15811    NaN
15812    NaN
15813    5.0
15814    NaN
15815    NaN
15816    NaN
Name: BeginDate, Length: 15817, dtype: float64

In [126]:
(Y.BeginDate == 0) @ 2

TypeError: unsupported operand type(s) for @: 'Intention' and 'int'

In [120]:
(Y.BeginDate == 0) @ Y.BeginDate.apply(lambda v: nan)

TypeError: unsupported operand type(s) for @: 'Intention' and 'Intention'

In [50]:
exhibitions.head(2)

Unnamed: 0,ExhibitionID,ExhibitionNumber,ExhibitionTitle,ExhibitionCitationDate,ExhibitionBeginDate,ExhibitionEndDate,ExhibitionSortOrder,ExhibitionURL,ExhibitionRole,ExhibitionRoleinPressRelease,...,Institution,Nationality,ConstituentBeginDate,ConstituentEndDate,ArtistBio,Gender,VIAFID,WikidataID,ULANID,ConstituentURL
0,2557.0,1,"Cézanne, Gauguin, Seurat, Van Gogh","[MoMA Exh. #1, November 7-December 7, 1929]",1929-11-07,1929-12-07,1.0,moma.org/calendar/exhibitions/1767,Curator,Director,...,,American,1902,1981,"American, 19021981",Male,109252853.0,Q711362,500241556.0,moma.org/artists/9168
1,2557.0,1,"Cézanne, Gauguin, Seurat, Van Gogh","[MoMA Exh. #1, November 7-December 7, 1929]",1929-11-07,1929-12-07,1.0,moma.org/calendar/exhibitions/1767,Artist,Artist,...,,French,1839,1906,"French, 18391906",Male,39374836.0,Q35548,500004793.0,moma.org/artists/1053


In [63]:
exhibitions.ExhibitionEndDate - exhibitions.ExhibitionBeginDate

0        30 days
1        30 days
2        30 days
3        30 days
4        30 days
5        31 days
6        31 days
7        31 days
8        31 days
9        31 days
10       31 days
11       31 days
12       31 days
13       31 days
14       31 days
15       31 days
16       31 days
17       31 days
18       31 days
19       31 days
20       31 days
21       31 days
22       31 days
23       31 days
24       42 days
25       42 days
26       42 days
27       42 days
28       42 days
29       42 days
          ...   
34528   117 days
34529   117 days
34530   117 days
34531   117 days
34532   117 days
34533   117 days
34534    34 days
34535   128 days
34536   128 days
34537   128 days
34538   128 days
34539   128 days
34540   128 days
34541   128 days
34542   128 days
34543   128 days
34544   128 days
34545   128 days
34546   128 days
34547   128 days
34548   128 days
34549   128 days
34550   128 days
34551   128 days
34552   128 days
34553   128 days
34554   128 days
34555   128 da

In [79]:
pd.Timedelta(pd.offsets.Day(5))

Timedelta('5 days 00:00:00')

In [81]:
from more_dfply import ifelse

(exhibitions 
 >> select(X.ExhibitionBeginDate, X.ExhibitionEndDate)
 >> mutate(ExhibitionDuration = X.ExhibitionEndDate - X.ExhibitionBeginDate)
 >> filter_by(X.ExhibitionDuration > pd.Timedelta(pd.offsets.Day(365)))
 >> head)

Unnamed: 0,ExhibitionBeginDate,ExhibitionEndDate,ExhibitionDuration
4790,1941-09-30,1943-07-28,666 days
4791,1941-09-30,1943-07-28,666 days
4797,1941-10-21,1944-04-30,922 days
4798,1941-10-21,1944-04-30,922 days
4799,1941-10-21,1944-04-30,922 days


In [58]:
pd.Inter

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [100]:
(artists 
 >> mutate(BeginDate = (onlyif(pred=X.BeginDate == 0, 
                               col=X.BeginDate, 
                               expr=X.BeginDate.apply(lambda v: nan))
                        .astype('Int64')),
           EndDate   = (onlyif(pred=X.EndDate == 0, 
                               col=X.EndDate, 
                               expr=X.EndDate.apply(lambda v: nan))
                        .astype('Int64'))
          )
 >> head)

Unnamed: 0,ConstituentID,DisplayName,ArtistBio,Nationality,Gender,BeginDate,EndDate,Wiki QID,ULAN
0,1,Robert Arneson,"American, 1930–1992",American,Male,1930,1992.0,,
1,2,Doroteo Arnaiz,"Spanish, born 1936",Spanish,Male,1936,,,
2,3,Bill Arnold,"American, born 1941",American,Male,1941,,,
3,4,Charles Arnoldi,"American, born 1946",American,Male,1946,,Q1063584,500027998.0
4,5,Per Arnoldi,"Danish, born 1941",Danish,Male,1941,,,


In [81]:
mutate_only_if(artists, new_begin_date = Onlyif(col=X.BeginDate,
                                           pred=X.BeginDate ==0,
                                           expr=nan))

<dfply.base.pipe at 0x1287f0278>