In [1]:
import ibis
import matplotlib.pyplot as plt
import os
import pandas as pd
ibis.options.interactive = True
pd.options.display.max_rows=1000



#### Server details

In [2]:
host = 'blah_blah.somecompany.com'
port = '5432'
db = 'SC_Health_DB'
user = os.environ['sc_health_user']
pwd = os.environ['sc_health_pwd']

#### Define ibis connection object

In [3]:
conn = ibis.postgres.connect(
    url=f'postgresql://{user}:{pwd}@{host}:{port}/{db}'
)

#### `conn` object has useful methods

In [4]:
conn.list_tables()

['HOP_Trailer_Track_All_Feeds',
 'associate_assets',
 'associate_desks',
 'associate_devices',
 'associate_locker',
 'associate_master',
 'window_master',
 'window_master_excel',
 'wm_carriers',
 'wm_changepoints',
 'wm_destinations',
 'wm_return_routes',
 'wm_shuttles']

#### Let's create an ibis table object containing associates info

In [5]:
associates = conn.table('associate_master')

In [6]:
type(associates)

ibis.expr.types.TableExpr

#### Let's find out the data type or schema of the columns

In [7]:
associates.schema()

ibis.Schema {  
  EMPLID              string
  EMPL_RCD#           int64
  EFFDT               timestamp
  EFFSEQ              int64
  NAME                string
  PREFERRED_NAME      string
  COUNTY              string
  STATE               string
  COUNTRY             string
  PER_ORG             string
  PER_ORG_DESCR       string
  PER_ORG_ABBRV       string
  ORIG_HIRE_DT        timestamp
  HIRE_DT             timestamp
  WORK_PHONE          string
  WORK_EXT            string
  DEPTID              string
  JOBCODE             string
  EMPL_STATUS         string
  EMPL_ST_DESCR       string
  EMPL_ST_ABBRV       string
  EMPL_CLASS          string
  EMPL_CLS_DESCR      string
  EMPL_CLS_ABBRV      string
  LOCATION            string
  JOB_ENTRY_DT        timestamp
  DEPT_ENTRY_DT       timestamp
  SHIFT               string
  WORK_SHIFT_CODE     string
  REG_TEMP            string
  FULL_PART_TIME      string
  COMPANY             string
  BUSINESS_UNIT       string
  BUS_UNIT_NAM

#### Let's create a pandas dataframe so we can compare performance between the two later on

In [8]:
associates_df = associates.execute(limit=50000)

In [9]:
type(associates_df)

pandas.core.frame.DataFrame

### Let's do some simple aggregations

#### Count of associates by division

In [10]:
# group associates table by HAM_TEAM_NAME_H column,
# then aggregate on HAM_TEAM_NAME_H column
# by doing a count on that column
# then sort by 'count' column in descending order
(associates.group_by('HAM_TEAM_NAME_H')
           .aggregate(associates['HAM_TEAM_NAME_H']
           .count())
           .sort_by(('count', False))
)

                   HAM_TEAM_NAME_H  count
0                       MAP Line 1   6587
1       AEP Manufacturing Division   5259
2                       ELP Line 3   5098
3                        NOT FOUND   4364
4                       MAP Line 2   2061
5             HAM Quality Division   1397
6                       Purchasing   1375
7      Med Inactive Transition Div    851
8   Human Resource & Corp Services    606
9          Administration Division    495
10                         Default    488
11     Inactive HAM Purchasing Div    483
12     AEP Mfg, Planning & Control    429
13         Supply Chain Management    347
14    Planning & Strategy Division    290
15             NA Quality Division    257
16     MAP Mfg, Planning & Control    220
17     ELP Mfg, Planning & Control    193
18     Manufacturing Tech Division    179
19              New Model Strategy    175
20  Performance Manufacturing Cntr    128
21  Inactive Contract Services Div     76
22                      Accounting

#### You may have noticed that `count` column was automatically created, but what if you want to call that column something else?

We can just add `.name('new_column_name')` right after the `.count()` invocation

In [11]:
# group associates table by HAM_TEAM_NAME_H column,
# then aggregate on HAM_TEAM_NAME_H column
# doing a count on that column
# name the count column 'Count of Associates' instead
# then sort by 'count' column in descending order
(associates.group_by('HAM_TEAM_NAME_H')
           .aggregate(associates['HAM_TEAM_NAME_H']
           .count().name('Count of Associates'))
           .sort_by(('Count of Associates', False))
)

                   HAM_TEAM_NAME_H  Count of Associates
0                       MAP Line 1                 6587
1       AEP Manufacturing Division                 5259
2                       ELP Line 3                 5098
3                        NOT FOUND                 4364
4                       MAP Line 2                 2061
5             HAM Quality Division                 1397
6                       Purchasing                 1375
7      Med Inactive Transition Div                  851
8   Human Resource & Corp Services                  606
9          Administration Division                  495
10                         Default                  488
11     Inactive HAM Purchasing Div                  483
12     AEP Mfg, Planning & Control                  429
13         Supply Chain Management                  347
14    Planning & Strategy Division                  290
15             NA Quality Division                  257
16     MAP Mfg, Planning & Control              

#### Same thing as above, but using `value_counts()` inspired by pandas

The disadvantage of using `value_counts()` is you don't have the ability to make your own custom column name for count, you are stuck with the column name `count`

In [12]:
associates['HAM_TEAM_NAME_H'].value_counts().sort_by(('count', False))

                   HAM_TEAM_NAME_H  count
0                       MAP Line 1   6587
1       AEP Manufacturing Division   5259
2                       ELP Line 3   5098
3                        NOT FOUND   4364
4                       MAP Line 2   2061
5             HAM Quality Division   1397
6                       Purchasing   1375
7      Med Inactive Transition Div    851
8   Human Resource & Corp Services    606
9          Administration Division    495
10                         Default    488
11     Inactive HAM Purchasing Div    483
12     AEP Mfg, Planning & Control    429
13         Supply Chain Management    347
14    Planning & Strategy Division    290
15             NA Quality Division    257
16     MAP Mfg, Planning & Control    220
17     ELP Mfg, Planning & Control    193
18     Manufacturing Tech Division    179
19              New Model Strategy    175
20  Performance Manufacturing Cntr    128
21  Inactive Contract Services Div     76
22                      Accounting

#### Let's compare timings between ibis and pandas

```ibis```

In [13]:
%%timeit
associates['HAM_TEAM_NAME_H'].value_counts().sort_by(('count', False))

564 µs ± 51.3 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


```pandas```

In [14]:
%%timeit
associates_df.groupby('HAM_TEAM_NAME_H').agg(Count=('HAM_TEAM_NAME_H', 'count'))

17.9 ms ± 682 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


**Amazing, even though the pandas dataframe is in memory, it is still slower than ibis, which is computing the value counts at the database.**

#### Let's compare timings of their group by aggregation

In [15]:
%%timeit
(associates.group_by('HAM_TEAM_NAME_H')
           .aggregate(associates['HAM_TEAM_NAME_H']
           .count().name('Count'))
           .sort_by(('Count', False))
)

607 µs ± 41 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [16]:
%%timeit
(associates_df.groupby('HAM_TEAM_NAME_H')
              .agg(Count=('HAM_TEAM_NAME_H', 'count'))
              .sort_values('Count', ascending=False)
)

17.6 ms ± 679 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


#### Again, ibis is substantially outperforming pandas

## Data Munging with ibis

Going back to our associates data, let's do some more advance filtering

#### Boolean Filtering

Our associates table consists not just active associates, but also associates who have retired or were terminated.

Let's confirm using the `EMPL_ST_DESC` column

In [17]:
associates.EMPL_ST_DESCR.distinct()

0        Active
1      Deceased
2       Retired
3    Terminated
Name: EMPL_ST_DESCR, dtype: object

In [18]:
associates.EMPL_ST_DESCR.value_counts()

  EMPL_ST_DESCR  count
0        Active   9958
1      Deceased    480
2       Retired   6164
3    Terminated  14908

#### Let's build criteria to include only active Honda associates and exclude those in 'Default' or 'NOT FOUND' divisions

To compose boolean expressions with `AND` or `OR`, use the respective `&` and `|` operators.  For set operations, we can use `.isin()` or `.notin()` methods

In [19]:
criteria1 = associates['EMPL_ST_DESCR'] == 'Active'
criteria2 = associates['HAM_TEAM_NAME_H'].notin(['Default','NOT FOUND'])

In [20]:
associates.filter(criteria1 & criteria2)['HAM_TEAM_NAME_H'].value_counts().sort_by(('count', False))

                   HAM_TEAM_NAME_H  count
0       AEP Manufacturing Division   2284
1                       ELP Line 3   2112
2                       MAP Line 2   1513
3                       MAP Line 1   1475
4             HAM Quality Division    543
5                       Purchasing    425
6          Supply Chain Management    272
7   Human Resource & Corp Services    245
8      AEP Mfg, Planning & Control    196
9              NA Quality Division    172
10     MAP Mfg, Planning & Control    171
11     Manufacturing Tech Division    111
12     ELP Mfg, Planning & Control     96
13              New Model Strategy     82
14  Performance Manufacturing Cntr     75
15                      Accounting     49
16    Planning & Strategy Division     47
17     Med Inactive Transition Div     41
18  Alternative Job Assignment Div     10
19          Company Staff Division     10

#### ibis has `cumsum()` function that we can use

In [22]:
counts = (associates.group_by('HAM_TEAM_NAME_H')
           .aggregate(associates['HAM_TEAM_NAME_H']
           .count().name('Count'))
           .sort_by(('Count', False))
)

In [78]:
counts[['HAM_TEAM_NAME_H']]

                   HAM_TEAM_NAME_H
0                       MAP Line 1
1       AEP Manufacturing Division
2                       ELP Line 3
3                        NOT FOUND
4                       MAP Line 2
5             HAM Quality Division
6                       Purchasing
7      Med Inactive Transition Div
8   Human Resource & Corp Services
9          Administration Division
10                         Default
11     Inactive HAM Purchasing Div
12     AEP Mfg, Planning & Control
13         Supply Chain Management
14    Planning & Strategy Division
15             NA Quality Division
16     MAP Mfg, Planning & Control
17     ELP Mfg, Planning & Control
18     Manufacturing Tech Division
19              New Model Strategy
20  Performance Manufacturing Cntr
21  Inactive Contract Services Div
22                      Accounting
23          Company Staff Division
24  Alternative Job Assignment Div
25  Gov, Risk, Compliance & Ethics
26   Inactive HNAS - Company Staff

In [72]:
counts.mutate(CUM_COUNT=counts['Count'].cumsum())

                   HAM_TEAM_NAME_H  Count  CUM_COUNT
0                       MAP Line 1   6587       6587
1       AEP Manufacturing Division   5259      11846
2                       ELP Line 3   5098      16944
3                        NOT FOUND   4364      21308
4                       MAP Line 2   2061      23369
5             HAM Quality Division   1397      24766
6                       Purchasing   1375      26141
7      Med Inactive Transition Div    851      26992
8   Human Resource & Corp Services    606      27598
9          Administration Division    495      28093
10                         Default    488      28581
11     Inactive HAM Purchasing Div    483      29064
12     AEP Mfg, Planning & Control    429      29493
13         Supply Chain Management    347      29840
14    Planning & Strategy Division    290      30130
15             NA Quality Division    257      30387
16     MAP Mfg, Planning & Control    220      30607
17     ELP Mfg, Planning & Control    193     

#### Now, let's create a table of just active associates

In [46]:
active = associates.filter(criteria1 & criteria2)

In [47]:
active.count()

9929

#### Let's join `active` table with table containing associates locker info

In [48]:
lockers = conn.table('associate_locker')

In [49]:
lockers

       OPRID MMP Locker
0    MA15790     598 MU
1    MA05124     724 MU
2   VC044874     626 MU
3   VC090507      586MU
4   VC028053     578 MU
5   VC044020    79 MU-F
6   VC044888     659 MU
7    MA13771     582 MU
8   VC035337     775 MU
9    MA15332     656 MU
10  VC041321   161 MU-F
11  VC021160     628 MU
12   MM00873     646 MU
13  VC036556     619 MU
14   MA17558     641 MU
15  VC046908     959 MU
16  VC043333     700 MU
17  VC045094     728 MU
18  VC044114    26 MU-F
19   MM15158    41 MU-F
20  VC041630   151 MU-F
21  VC041632   150 MU-F
22  VC037914    16 MU-F
23  VC044120     676 MU
24  VC041877     695 MU
25  VC044177    no need
26  VC037294     640 MU
27   MA13808    46 MU-F
28   MT17423     618 MU
29  VC044264     577 MU
30  VC043334    no need
31  VC043568    81 MU-F
32   MA17411     649 MU
33  VC043336    no need
34  VC044806    no need
35  VC044696     608 MU
36  VC046734     958 MU
37   MA16397     699 MU
38  VC039909    88 MU-F
39  VC006078     MU 621
40   MT16369    

#### Let's perform an INNER join

In [50]:
inner_joined = active.inner_join(lockers, predicates=active['OPRID']==lockers['OPRID'])

#### Let's define what columns to return

In [None]:
inner_joined[lockers, active['NAME']]

#### To create a new column or modify an existing column, you would use the `mutate()` method

Let's make the `name` column all upper case with the `upper()` method.  Here is a list of all available string [methods](http://ibis-project.org/docs/api.html#string-methods) in ibis.

In [None]:
inner_joined[lockers, active['NAME']].mutate(NAME=active['NAME'].upper())

#### Using regex to extract text from one column to create a new column

In [53]:
import re

In [54]:
pattern = "^[^,]+"  # match one or more characters from beginning of string until the comma, but exclude comma

Extract last name from `NAME` column and create new column called `LAST_NAME` with it

In [55]:
lastnames = associates.mutate(LAST_NAME=associates['NAME'].re_extract(pattern, 0))['LAST_NAME','NAME']

In [None]:
lastnames

#### Using SQL's `LIKE`

In [None]:
associates.filter(associates['NAME'].like('Smouse%'))

#### Obtaining date objects

In [67]:
associates['EFFDT'].day()

0        9
1       17
2       30
3        2
4        4
        ..
9995    28
9996    15
9997     1
9998     1
9999    15
Name: tmp, Length: 10000, dtype: int32

In [68]:
associates['EFFDT'].month()

0        5
1        8
2       11
3       12
4        1
        ..
9995     5
9996     1
9997     3
9998     3
9999     6
Name: tmp, Length: 10000, dtype: int32

In [69]:
associates['EFFDT'].year()

0       1980
1       1979
2       1992
3       2002
4       1993
        ... 
9995    2004
9996    2014
9997    2019
9998    2014
9999    1990
Name: tmp, Length: 10000, dtype: int32

In [66]:
associates['EFFDT'].day_of_week.full_name()

0          Friday
1          Friday
2          Monday
3          Monday
4          Monday
          ...    
9995       Friday
9996    Wednesday
9997       Friday
9998     Saturday
9999       Friday
Name: tmp, Length: 10000, dtype: object