In [None]:
import ibis
import matplotlib.pyplot as plt
import os
import pandas as pd
# Interactive mode: an ibis expression is run and its result shown when it is
# displayed in the notebook, instead of waiting for an explicit .execute().
ibis.options.interactive = True
# Allow pandas to print up to 1000 rows before truncating the display.
pd.options.display.max_rows=1000

#### Server details

In [None]:
# Connection settings used below to build the database URL.
host = 'some_host'
port = '5432'
db = 'some_database'
# Credentials come from environment variables rather than being hard-coded;
# a missing variable raises KeyError here.
user = os.environ['some_user']
pwd = os.environ['some_pwd']

#### Define ibis connection object

In [None]:
# Open the ibis PostgreSQL connection from a URL of the form
#   postgresql://user:password@host:port/database
# The user name and password are percent-encoded first: characters such as
# '@', ':' or '/' inside a credential would otherwise be read as URL
# delimiters and the connection string would be parsed incorrectly.
# Credentials without special characters are passed through unchanged.
from urllib.parse import quote

conn = ibis.postgres.connect(
    url=(
        f"postgresql://{quote(user, safe='')}:{quote(pwd, safe='')}"
        f"@{host}:{port}/{db}"
    )
)

#### `conn` object has useful methods

In [None]:
# Names of the tables available through this connection.
conn.list_tables()

#### Let's create an ibis table object containing associates info

In [None]:
# ibis table expression for the 'associate_master' table: a reference to the
# database table, not a local copy of its rows.
associates = conn.table('associate_master')

In [None]:
# Show the Python class of the ibis table object.
type(associates)

#### Let's find out the data type or schema of the columns

In [None]:
# Column names and their data types.
associates.schema()

#### Let's create a pandas dataframe so we can compare performance between the two later on

In [None]:
# Run the table expression and keep the result locally for the pandas
# comparisons further down.
# NOTE(review): limit=50000 caps the number of rows fetched; if the table is
# larger, the pandas results/timings below cover only a subset of the data
# while the ibis expressions run against the whole table.
associates_df = associates.execute(limit=50000)

In [None]:
# Show the Python class of the executed result, for comparison with the
# ibis table object above.
type(associates_df)

### Let's do some simple aggregations

#### Count of associates by division

In [None]:
# Count associates per team, largest teams first:
#   1. group the associates table by the TEAM_NAME_H column
#   2. aggregate each group with a count of TEAM_NAME_H
#      (the resulting column is named 'count' automatically)
#   3. sort on that 'count' column; False in the tuple means descending
(
    associates
    .group_by('TEAM_NAME_H')
    .aggregate(associates['TEAM_NAME_H'].count())
    .sort_by(('count', False))
)

#### You may have noticed that `count` column was automatically created, but what if you want to call that column something else?

We can just add `.name('new_column_name')` right after the `.count()` invocation

In [None]:
# Same aggregation as before, but with a custom name for the count column:
#   1. group the associates table by the TEAM_NAME_H column
#   2. count TEAM_NAME_H per group and call the result 'Count of Associates'
#   3. sort on 'Count of Associates'; False in the tuple means descending
(
    associates
    .group_by('TEAM_NAME_H')
    .aggregate(
        associates['TEAM_NAME_H'].count().name('Count of Associates')
    )
    .sort_by(('Count of Associates', False))
)

#### Same thing as above, but using `value_counts()` inspired by pandas

The disadvantage of using `value_counts()` is that you cannot give the count column a custom name; you are stuck with the column name `count`.

In [None]:
# value_counts() does the group-and-count in one call; its count column is
# named 'count'. The tuple ('count', False) sorts that column descending.
associates['TEAM_NAME_H'].value_counts().sort_by(('count', False))

#### Let's compare timings between ibis and pandas

```ibis```

In [None]:
%%timeit
associates['TEAM_NAME_H'].value_counts().sort_by(('count', False))

```pandas```

In [None]:
%%timeit
# pandas counterpart on the in-memory frame: count TEAM_NAME_H per group and
# name the result column 'Count'.
# NOTE(review): unlike the ibis cell being compared, this result is not sorted.
associates_df.groupby('TEAM_NAME_H').agg(Count=('TEAM_NAME_H', 'count'))

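**Amazing, even though the pandas dataframe is in memory, it is still slower than ibis, which is computing the value counts at the database.** (Caveat: ibis expressions are lazy and `%%timeit` does not display the cell's result, so the ibis timing reflects real query time only if the expression is explicitly run, e.g. with `.execute()`.)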

#### Let's compare timings of their group by aggregation

In [None]:
%%timeit
(associates.group_by('TEAM_NAME_H')
           .aggregate(associates['TEAM_NAME_H']
           .count().name('Count'))
           .sort_by(('Count', False))
)

In [None]:
%%timeit
(associates_df.groupby('TEAM_NAME_H')
              .agg(Count=('TEAM_NAME_H', 'count'))
              .sort_values('Count', ascending=False)
)

#### Again, ibis is substantially outperforming pandas

## Data Munging with ibis

Going back to our associates data, let's do some more advanced filtering

#### Boolean Filtering

Our associates table consists of not just active associates, but also associates who have retired or were terminated.

Let's confirm using the `EMPL_ST_DESCR` column

In [None]:
# Distinct values found in the EMPL_ST_DESCR column.
associates.EMPL_ST_DESCR.distinct()

In [None]:
# Number of rows for each EMPL_ST_DESCR value.
associates.EMPL_ST_DESCR.value_counts()

#### Let's build criteria to include only active Honda associates and exclude those in 'Default' or 'NOT FOUND' divisions

To compose boolean expressions with `AND` or `OR`, use the respective `&` and `|` operators.  For set operations, we can use `.isin()` or `.notin()` methods

In [None]:
# Boolean column expressions to be combined and passed to filter() later.
# criteria1: rows whose EMPL_ST_DESCR is exactly 'Active'.
criteria1 = associates['EMPL_ST_DESCR'] == 'Active'
# criteria2: rows whose TEAM_NAME_H is neither 'Default' nor 'NOT FOUND'.
criteria2 = associates['TEAM_NAME_H'].notin(['Default','NOT FOUND'])

In [None]:
# Apply both criteria (AND), then count the remaining rows per team and
# list the teams from largest to smallest count.
(
    associates
    .filter(criteria1 & criteria2)['TEAM_NAME_H']
    .value_counts()
    .sort_by(('count', False))
)

#### ibis has a `cumsum()` function that we can use

In [None]:
# Per-team row counts in a column named 'Count', sorted descending; kept in
# `counts` so the following cells can build on it.
counts = (
    associates
    .group_by('TEAM_NAME_H')
    .aggregate(associates['TEAM_NAME_H'].count().name('Count'))
    .sort_by(('Count', False))
)

In [None]:
# Select only the TEAM_NAME_H column of the aggregated result.
counts[['TEAM_NAME_H']]

In [None]:
# Add a CUM_COUNT column holding the cumulative sum of 'Count'.
counts.mutate(CUM_COUNT=counts['Count'].cumsum())

#### Now, let's create a table of just active associates

In [None]:
# Rows satisfying both criteria: status 'Active' and a team other than
# 'Default' / 'NOT FOUND'.
active = associates.filter(criteria1 & criteria2)

In [None]:
# Number of rows in the filtered table.
active.count()

#### Let's join the `active` table with the table containing associates' locker info

In [None]:
# ibis table expression for the 'associate_locker' table.
lockers = conn.table('associate_locker')

In [None]:
# Display the lockers table.
lockers

#### Let's perform an INNER join

In [None]:
# Inner join of active associates with lockers, matching rows on OPRID.
inner_joined = active.inner_join(
    lockers,
    predicates=(active['OPRID'] == lockers['OPRID']),
)

#### Let's define what columns to return

In [None]:
# From the join, keep every column of lockers plus NAME from active.
inner_joined[lockers, active['NAME']]

#### To create a new column or modify an existing column, you would use the `mutate()` method

Let's make the `NAME` column all upper case with the `upper()` method. Here is a list of all available string [methods](http://ibis-project.org/docs/api.html#string-methods) in ibis.

In [None]:
# Same projection as before, then mutate() replaces NAME with its
# upper-cased value.
inner_joined[lockers, active['NAME']].mutate(NAME=active['NAME'].upper())

#### Using regex to extract text from one column to create a new column

In [None]:
import re

In [None]:
# Regex: anchored at the start of the string (^), match one or more
# characters that are not a comma ([^,]+) -- i.e. everything before the
# first comma, with the comma itself excluded.
pattern = "^[^,]+"

Extract last name from `NAME` column and create new column called `LAST_NAME` with it

In [None]:
# Add LAST_NAME, extracted from NAME with the regex in `pattern`
# (index 0 = the whole match), and keep only LAST_NAME and NAME.
lastnames = (
    associates
    .mutate(LAST_NAME=associates['NAME'].re_extract(pattern, 0))
    ['LAST_NAME', 'NAME']
)

In [None]:
# Display the LAST_NAME / NAME columns.
lastnames

#### Using SQL's `LIKE`

In [None]:
# LIKE with a trailing % wildcard: rows whose NAME starts with 'Smith'.
associates.filter(associates['NAME'].like('Smith%'))

#### Extracting date components

In [None]:
# Day component of each EFFDT value.
associates['EFFDT'].day()

In [None]:
# Month component of each EFFDT value.
associates['EFFDT'].month()

In [None]:
# Year component of each EFFDT value.
associates['EFFDT'].year()

In [None]:
# Full name of the weekday on which each EFFDT value falls.
associates['EFFDT'].day_of_week.full_name()