# Idiomatic Pandas

© MetaSnake 2022, CC BY-NC

In [2]:
import glob

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [None]:
# %pip install pandas matplotlib

In [3]:
pd.__version__

'1.4.3'

In [4]:
pd.show_versions()




INSTALLED VERSIONS
------------------
commit           : e8093ba372f9adfe79439d90fe74b0b5b6dea9d6
python           : 3.8.13.final.0
python-bits      : 64
OS               : Windows
OS-release       : 10
Version          : 10.0.19043
machine          : AMD64
processor        : Intel64 Family 6 Model 142 Stepping 9, GenuineIntel
byteorder        : little
LC_ALL           : None
LANG             : None
LOCALE           : English_United States.1252

pandas           : 1.4.3
numpy            : 1.22.3
pytz             : 2022.1
dateutil         : 2.8.2
setuptools       : 61.2.0
pip              : 22.1.2
Cython           : 0.29.24
pytest           : None
hypothesis       : None
sphinx           : None
blosc            : None
feather          : None
xlsxwriter       : None
lxml.etree       : 4.9.1
html5lib         : None
pymysql          : None
psycopg2         : None
jinja2           : 3.0.3
IPython          : 8.4.0
pandas_datareader: None
bs4              : 4.11.1
bottleneck       : 1.3.5
br

## Loading Data

In [1]:
!ls *.csv

ls: cannot access '*.csv': No such file or directory


In [None]:
# One frame per exported activity file; '-' marks a missing value in the exports
data = [
    pd.read_csv(csv_path, parse_dates=['time'], na_values='-')
    for csv_path in glob.glob('tweet_activity_metrics___mharrison___*')
]
# Stack the pieces under a fresh index, then order the tweets by timestamp
df = (
    pd.concat(data, ignore_index=True)
    .sort_values('time')
)
df

In [None]:
df.to_csv('__mharrison__2020-2021.csv', index=False)

In [None]:
pd.read_csv('__mharrison__2020-2021.csv')

## Load data from Web

In [5]:
# Combined 2020-2021 export hosted on GitHub; 'time' is parsed into datetimes on load
url = ('https://github.com/mattharrison/datasets/raw/master/data/'
       '__mharrison__2020-2021.csv')
df = pd.read_csv(url, parse_dates=['time'])

In [6]:
df

Unnamed: 0,Tweet id,Tweet permalink,Tweet text,time,impressions,engagements,engagement rate,retweets,replies,likes,...,promoted hashtag clicks,promoted detail expands,promoted permalink clicks,promoted app opens,promoted app installs,promoted follows,promoted email tweet,promoted dial phone,promoted media views,promoted media engagements
0,1212580517905780737,https://twitter.com/__mharrison__/status/12125...,Sounds like a great topic! https://t.co/f8bZbA...,2020-01-02 03:44:00+00:00,1465.0,7.0,0.004778,0.0,0.0,3.0,...,,,,,,,,,,
1,1212582494828036097,https://twitter.com/__mharrison__/status/12125...,@FogleBird Looks like SLC. I can see my 🏠,2020-01-02 03:52:00+00:00,154.0,3.0,0.019481,0.0,0.0,1.0,...,,,,,,,,,,
2,1212613735698690049,https://twitter.com/__mharrison__/status/12126...,@afilina That's really amount and frustrating....,2020-01-02 05:56:00+00:00,1024.0,6.0,0.005859,0.0,0.0,1.0,...,,,,,,,,,,
3,1212911749617242113,https://twitter.com/__mharrison__/status/12129...,"@randal_olson I use anaconda when teaching, bu...",2020-01-03 01:41:00+00:00,1419.0,14.0,0.009866,0.0,1.0,5.0,...,,,,,,,,,,
4,1212920556028252160,https://twitter.com/__mharrison__/status/12129...,@AlSweigart Sometimes the students aren't moti...,2020-01-03 02:16:00+00:00,198.0,1.0,0.005051,0.0,0.0,1.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5786,1475300661851934721,https://twitter.com/__mharrison__/status/14753...,@allison_horst That's awesome!,2021-12-27 03:01:00+00:00,986.0,1.0,0.001014,0.0,0.0,0.0,...,,,,,,,,,,
5787,1475518143690801156,https://twitter.com/__mharrison__/status/14755...,@willmcgugan You need to find out what works f...,2021-12-27 17:25:00+00:00,1790.0,7.0,0.003911,0.0,0.0,3.0,...,,,,,,,,,,
5788,1475891441243025408,https://twitter.com/__mharrison__/status/14758...,@posco Visiting Hawaii for the holidays. Lots ...,2021-12-28 18:08:00+00:00,1611.0,12.0,0.007449,0.0,0.0,4.0,...,,,,,,,,,,
5789,1476453819751878656,https://twitter.com/__mharrison__/status/14764...,@johndsaunders My son just built this.,2021-12-30 07:23:00+00:00,1354.0,8.0,0.005908,0.0,0.0,2.0,...,,,,,,,,,,


## Load Data Exercise

* Load the data using the cell above.
* If you can't do this, please let the instructor know!

## Exploring

Definitions

* *Impressions* - Number of times people saw the tweet
* *Engagements* - Number of "interactions" (clicks, replies, retweets, likes)
* *Engagement rate* - Engagements divided by impressions

In [7]:
df.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,5781,5782,5783,5784,5785,5786,5787,5788,5789,5790
Tweet id,1212580517905780737,1212582494828036097,1212613735698690049,1212911749617242113,1212920556028252160,1212921038356434947,1212921192287330304,1212969747186208768,1212971634019033088,1213194379793293312,...,1475023008620154882,1475299210291343362,1475299401681682434,1475299848093978626,1475300370138087426,1475300661851934721,1475518143690801156,1475891441243025408,1476453819751878656,1477024722051158018
Tweet permalink,https://twitter.com/__mharrison__/status/12125...,https://twitter.com/__mharrison__/status/12125...,https://twitter.com/__mharrison__/status/12126...,https://twitter.com/__mharrison__/status/12129...,https://twitter.com/__mharrison__/status/12129...,https://twitter.com/__mharrison__/status/12129...,https://twitter.com/__mharrison__/status/12129...,https://twitter.com/__mharrison__/status/12129...,https://twitter.com/__mharrison__/status/12129...,https://twitter.com/__mharrison__/status/12131...,...,https://twitter.com/__mharrison__/status/14750...,https://twitter.com/__mharrison__/status/14752...,https://twitter.com/__mharrison__/status/14752...,https://twitter.com/__mharrison__/status/14752...,https://twitter.com/__mharrison__/status/14753...,https://twitter.com/__mharrison__/status/14753...,https://twitter.com/__mharrison__/status/14755...,https://twitter.com/__mharrison__/status/14758...,https://twitter.com/__mharrison__/status/14764...,https://twitter.com/__mharrison__/status/14770...
Tweet text,Sounds like a great topic! https://t.co/f8bZbA...,@FogleBird Looks like SLC. I can see my 🏠,@afilina That's really amount and frustrating....,"@randal_olson I use anaconda when teaching, bu...",@AlSweigart Sometimes the students aren't moti...,@tmarthal @TedPetrou This. Just get blank keycaps,@gabegundy It's all downhill since the man in ...,@anthonypjshaw @brettsky @codewithanthony That...,@jet_set A little smug? 🤔😭,@juliasilge @rstudio Big news. Congrats! 👏,...,Good looking crew! https://t.co/PqUPr2qVoy,@youtah A little bit warmer in Hawaii. Just sa...,@mathsppblog Single quotes. One less key to ty...,@willmcgugan Problem is is that it covers the ...,@reuvenmlerner I don't do it on jupyter either...,@allison_horst That's awesome!,@willmcgugan You need to find out what works f...,@posco Visiting Hawaii for the holidays. Lots ...,@johndsaunders My son just built this.,@tunguz Xgboost
time,2020-01-02 03:44:00+00:00,2020-01-02 03:52:00+00:00,2020-01-02 05:56:00+00:00,2020-01-03 01:41:00+00:00,2020-01-03 02:16:00+00:00,2020-01-03 02:18:00+00:00,2020-01-03 02:18:00+00:00,2020-01-03 05:31:00+00:00,2020-01-03 05:39:00+00:00,2020-01-03 20:24:00+00:00,...,2021-12-26 08:37:00+00:00,2021-12-27 02:55:00+00:00,2021-12-27 02:56:00+00:00,2021-12-27 02:57:00+00:00,2021-12-27 02:59:00+00:00,2021-12-27 03:01:00+00:00,2021-12-27 17:25:00+00:00,2021-12-28 18:08:00+00:00,2021-12-30 07:23:00+00:00,2021-12-31 21:11:00+00:00
impressions,1465.0,154.0,1024.0,1419.0,198.0,170.0,104.0,189.0,900.0,252.0,...,22557.0,1586.0,4641.0,1969.0,1669.0,986.0,1790.0,1611.0,1354.0,5041.0
engagements,7.0,3.0,6.0,14.0,1.0,3.0,2.0,0.0,4.0,2.0,...,501.0,35.0,35.0,4.0,5.0,1.0,7.0,12.0,8.0,63.0
engagement rate,0.004778,0.019481,0.005859,0.009866,0.005051,0.017647,0.019231,0.0,0.004444,0.007937,...,0.02221,0.022068,0.007541,0.002031,0.002996,0.001014,0.003911,0.007449,0.005908,0.012498
retweets,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
replies,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,9.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0
likes,3.0,1.0,1.0,5.0,1.0,1.0,1.0,0.0,0.0,1.0,...,50.0,2.0,11.0,4.0,3.0,0.0,3.0,4.0,2.0,23.0


In [9]:
df.shape

(5791, 40)

In [10]:
df.dtypes

Tweet id                                      int64
Tweet permalink                              object
Tweet text                                   object
time                            datetime64[ns, UTC]
impressions                                 float64
engagements                                 float64
engagement rate                             float64
retweets                                    float64
replies                                     float64
likes                                       float64
user profile clicks                         float64
url clicks                                  float64
hashtag clicks                              float64
detail expands                              float64
permalink clicks                            float64
app opens                                     int64
app installs                                  int64
follows                                       int64
email tweet                                   int64
dial phone  

In [11]:
pd.options.display.max_columns

20

In [12]:
from IPython.display import display
# The default column cap is 20 (previous cell) while the frame has 40 columns,
# so the normal repr elides the middle ones with '...'. Raise the cap only
# inside this block; option_context restores the previous setting on exit.
with pd.option_context('display.max_columns', 240):
    display(df)

Unnamed: 0,Tweet id,Tweet permalink,Tweet text,time,impressions,engagements,engagement rate,retweets,replies,likes,user profile clicks,url clicks,hashtag clicks,detail expands,permalink clicks,app opens,app installs,follows,email tweet,dial phone,media views,media engagements,promoted impressions,promoted engagements,promoted engagement rate,promoted retweets,promoted replies,promoted likes,promoted user profile clicks,promoted url clicks,promoted hashtag clicks,promoted detail expands,promoted permalink clicks,promoted app opens,promoted app installs,promoted follows,promoted email tweet,promoted dial phone,promoted media views,promoted media engagements
0,1212580517905780737,https://twitter.com/__mharrison__/status/12125...,Sounds like a great topic! https://t.co/f8bZbA...,2020-01-02 03:44:00+00:00,1465.0,7.0,0.004778,0.0,0.0,3.0,3.0,0.0,0.0,1.0,0.0,0,0,0,0,0,0,0,,,,,,,,,,,,,,,,,,
1,1212582494828036097,https://twitter.com/__mharrison__/status/12125...,@FogleBird Looks like SLC. I can see my 🏠,2020-01-02 03:52:00+00:00,154.0,3.0,0.019481,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0,0,0,0,0,0,0,,,,,,,,,,,,,,,,,,
2,1212613735698690049,https://twitter.com/__mharrison__/status/12126...,@afilina That's really amount and frustrating....,2020-01-02 05:56:00+00:00,1024.0,6.0,0.005859,0.0,0.0,1.0,2.0,0.0,0.0,3.0,0.0,0,0,0,0,0,0,0,,,,,,,,,,,,,,,,,,
3,1212911749617242113,https://twitter.com/__mharrison__/status/12129...,"@randal_olson I use anaconda when teaching, bu...",2020-01-03 01:41:00+00:00,1419.0,14.0,0.009866,0.0,1.0,5.0,7.0,0.0,0.0,1.0,0.0,0,0,0,0,0,0,0,,,,,,,,,,,,,,,,,,
4,1212920556028252160,https://twitter.com/__mharrison__/status/12129...,@AlSweigart Sometimes the students aren't moti...,2020-01-03 02:16:00+00:00,198.0,1.0,0.005051,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5786,1475300661851934721,https://twitter.com/__mharrison__/status/14753...,@allison_horst That's awesome!,2021-12-27 03:01:00+00:00,986.0,1.0,0.001014,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0,0,0,0,0,0,0,,,,,,,,,,,,,,,,,,
5787,1475518143690801156,https://twitter.com/__mharrison__/status/14755...,@willmcgugan You need to find out what works f...,2021-12-27 17:25:00+00:00,1790.0,7.0,0.003911,0.0,0.0,3.0,1.0,0.0,0.0,3.0,0.0,0,0,0,0,0,0,0,,,,,,,,,,,,,,,,,,
5788,1475891441243025408,https://twitter.com/__mharrison__/status/14758...,@posco Visiting Hawaii for the holidays. Lots ...,2021-12-28 18:08:00+00:00,1611.0,12.0,0.007449,0.0,0.0,4.0,4.0,0.0,0.0,4.0,0.0,0,0,0,0,0,0,0,,,,,,,,,,,,,,,,,,
5789,1476453819751878656,https://twitter.com/__mharrison__/status/14764...,@johndsaunders My son just built this.,2021-12-30 07:23:00+00:00,1354.0,8.0,0.005908,0.0,0.0,2.0,4.0,0.0,0.0,2.0,0.0,0,0,0,0,0,0,0,,,,,,,,,,,,,,,,,,


In [13]:
df.isna().sum()

Tweet id                           0
Tweet permalink                    0
Tweet text                         0
time                               0
impressions                        0
engagements                        0
engagement rate                    0
retweets                           0
replies                            0
likes                              0
user profile clicks                0
url clicks                         0
hashtag clicks                     0
detail expands                     0
permalink clicks                   0
app opens                          0
app installs                       0
follows                            0
email tweet                        0
dial phone                         0
media views                        0
media engagements                  0
promoted impressions            5791
promoted engagements            5791
promoted engagement rate        5791
promoted retweets               5791
promoted replies                5791
p

## Explore Exercise
* Use `.describe` to view the summary statistics
* Use `.corr` to view column correlations

In [14]:
df.describe()

Unnamed: 0,Tweet id,impressions,engagements,engagement rate,retweets,replies,likes,user profile clicks,url clicks,hashtag clicks,...,promoted hashtag clicks,promoted detail expands,promoted permalink clicks,promoted app opens,promoted app installs,promoted follows,promoted email tweet,promoted dial phone,promoted media views,promoted media engagements
count,5791.0,5791.0,5791.0,5791.0,5791.0,5791.0,5791.0,5791.0,5791.0,5791.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
mean,1.360351e+18,2297.820411,111.400967,0.034748,0.979796,1.124504,9.400622,20.594543,4.502331,0.019686,...,,,,,,,,,,
std,6.850802e+16,16414.560844,976.353689,0.050031,10.903919,6.322059,108.117865,436.521415,32.377223,0.302481,...,,,,,,,,,,
min,1.212581e+18,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
25%,1.314214e+18,175.0,3.0,0.007062,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
50%,1.358945e+18,612.0,7.0,0.016043,0.0,0.0,1.0,1.0,0.0,0.0,...,,,,,,,,,,
75%,1.41552e+18,1614.5,25.0,0.040863,0.0,1.0,4.0,4.0,0.0,0.0,...,,,,,,,,,,
max,1.477025e+18,856749.0,45660.0,0.484127,465.0,207.0,5358.0,22393.0,1272.0,12.0,...,,,,,,,,,,


In [16]:
# Correlate only the numeric columns. Selecting them explicitly (instead of
# relying on corr() to silently discard text/datetime columns) gives the same
# table on this pandas version and keeps the cell working on pandas >= 2.0,
# where DataFrame.corr() raises on non-numeric columns.
df.select_dtypes('number').corr()

Unnamed: 0,Tweet id,impressions,engagements,engagement rate,retweets,replies,likes,user profile clicks,url clicks,hashtag clicks,...,promoted hashtag clicks,promoted detail expands,promoted permalink clicks,promoted app opens,promoted app installs,promoted follows,promoted email tweet,promoted dial phone,promoted media views,promoted media engagements
Tweet id,1.0,0.081849,0.05588,0.021168,0.061684,0.079944,0.054455,0.033756,0.038017,-0.035888,...,,,,,,,,,,
impressions,0.081849,1.0,0.93574,0.038431,0.910633,0.665746,0.973987,0.946862,0.0933,0.004081,...,,,,,,,,,,
engagements,0.05588,0.93574,1.0,0.160656,0.874747,0.648793,0.940175,0.908076,0.097863,0.001718,...,,,,,,,,,,
engagement rate,0.021168,0.038431,0.160656,1.0,0.037604,0.075533,0.048444,0.028816,0.092936,0.004714,...,,,,,,,,,,
retweets,0.061684,0.910633,0.874747,0.037604,1.0,0.607721,0.934395,0.907808,0.191076,0.005043,...,,,,,,,,,,
replies,0.079944,0.665746,0.648793,0.075533,0.607721,1.0,0.611202,0.552002,0.071265,0.011272,...,,,,,,,,,,
likes,0.054455,0.973987,0.940175,0.048444,0.934395,0.611202,1.0,0.983093,0.083502,-0.000991,...,,,,,,,,,,
user profile clicks,0.033756,0.946862,0.908076,0.028816,0.907808,0.552002,0.983093,1.0,0.016863,-0.001542,...,,,,,,,,,,
url clicks,0.038017,0.0933,0.097863,0.092936,0.191076,0.071265,0.083502,0.016863,1.0,-0.001786,...,,,,,,,,,,
hashtag clicks,-0.035888,0.004081,0.001718,0.004714,0.005043,0.011272,-0.000991,-0.001542,-0.001786,1.0,...,,,,,,,,,,


## Types

In [17]:
df.dtypes

Tweet id                                      int64
Tweet permalink                              object
Tweet text                                   object
time                            datetime64[ns, UTC]
impressions                                 float64
engagements                                 float64
engagement rate                             float64
retweets                                    float64
replies                                     float64
likes                                       float64
user profile clicks                         float64
url clicks                                  float64
hashtag clicks                              float64
detail expands                              float64
permalink clicks                            float64
app opens                                     int64
app installs                                  int64
follows                                       int64
email tweet                                   int64
dial phone  

In [18]:
df.memory_usage()

Index                             128
Tweet id                        46328
Tweet permalink                 46328
Tweet text                      46328
time                            46328
impressions                     46328
engagements                     46328
engagement rate                 46328
retweets                        46328
replies                         46328
likes                           46328
user profile clicks             46328
url clicks                      46328
hashtag clicks                  46328
detail expands                  46328
permalink clicks                46328
app opens                       46328
app installs                    46328
follows                         46328
email tweet                     46328
dial phone                      46328
media views                     46328
media engagements               46328
promoted impressions            46328
promoted engagements            46328
promoted engagement rate        46328
promoted ret

In [19]:
df.memory_usage(deep=True)

Index                               128
Tweet id                          46328
Tweet permalink                  677547
Tweet text                      1696848
time                              46328
impressions                       46328
engagements                       46328
engagement rate                   46328
retweets                          46328
replies                           46328
likes                             46328
user profile clicks               46328
url clicks                        46328
hashtag clicks                    46328
detail expands                    46328
permalink clicks                  46328
app opens                         46328
app installs                      46328
follows                           46328
email tweet                       46328
dial phone                        46328
media views                       46328
media engagements                 46328
promoted impressions              46328
promoted engagements              46328


In [20]:
df.memory_usage(deep=True).sum()

4134987

In [22]:
# Summary statistics for the integer-typed columns only
df.select_dtypes(include=int).describe()

Unnamed: 0,Tweet id,app opens,app installs,follows,email tweet,dial phone,media views,media engagements
count,5791.0,5791.0,5791.0,5791.0,5791.0,5791.0,5791.0,5791.0
mean,1.360351e+18,0.001036,0.0,0.135901,0.0,0.0,39.868935,39.712485
std,6.850802e+16,0.045513,0.0,3.870531,0.0,0.0,333.838003,333.753866
min,1.212581e+18,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.314214e+18,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1.358945e+18,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.41552e+18,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.477025e+18,3.0,0.0,191.0,0.0,0.0,16816.0,16816.0


In [23]:
# Summary statistics for the float columns; the selector is the dtype name
# 'float64' (the builtin `float` was the alternative tried in this cell)
df.select_dtypes(include='float64').describe()

Unnamed: 0,impressions,engagements,engagement rate,retweets,replies,likes,user profile clicks,url clicks,hashtag clicks,detail expands,...,promoted hashtag clicks,promoted detail expands,promoted permalink clicks,promoted app opens,promoted app installs,promoted follows,promoted email tweet,promoted dial phone,promoted media views,promoted media engagements
count,5791.0,5791.0,5791.0,5791.0,5791.0,5791.0,5791.0,5791.0,5791.0,5791.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
mean,2297.820411,111.400967,0.034748,0.979796,1.124504,9.400622,20.594543,4.502331,0.019686,34.658436,...,,,,,,,,,,
std,16414.560844,976.353689,0.050031,10.903919,6.322059,108.117865,436.521415,32.377223,0.302481,355.671163,...,,,,,,,,,,
min,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
25%,175.0,3.0,0.007062,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,,,,,,,,,,
50%,612.0,7.0,0.016043,0.0,0.0,1.0,1.0,0.0,0.0,3.0,...,,,,,,,,,,
75%,1614.5,25.0,0.040863,0.0,1.0,4.0,4.0,0.0,0.0,8.0,...,,,,,,,,,,
max,856749.0,45660.0,0.484127,465.0,207.0,5358.0,22393.0,1272.0,12.0,17078.0,...,,,,,,,,,,


In [24]:
(df
 .assign(
     # Ask for an explicit 64-bit integer: a bare `int` follows the platform's
     # default integer, which is 32-bit on Windows with this NumPy version.
     # The lambdas receive the frame flowing through the chain, so the cell
     # does not depend on the global `df` matching the chain's current state.
     impressions=lambda df_: df_.impressions.astype('int64'),
     engagements=lambda df_: df_.engagements.astype('int64'),
     # lots of this here
 )
)

Unnamed: 0,Tweet id,Tweet permalink,Tweet text,time,impressions,engagements,engagement rate,retweets,replies,likes,...,promoted hashtag clicks,promoted detail expands,promoted permalink clicks,promoted app opens,promoted app installs,promoted follows,promoted email tweet,promoted dial phone,promoted media views,promoted media engagements
0,1212580517905780737,https://twitter.com/__mharrison__/status/12125...,Sounds like a great topic! https://t.co/f8bZbA...,2020-01-02 03:44:00+00:00,1465,7,0.004778,0.0,0.0,3.0,...,,,,,,,,,,
1,1212582494828036097,https://twitter.com/__mharrison__/status/12125...,@FogleBird Looks like SLC. I can see my 🏠,2020-01-02 03:52:00+00:00,154,3,0.019481,0.0,0.0,1.0,...,,,,,,,,,,
2,1212613735698690049,https://twitter.com/__mharrison__/status/12126...,@afilina That's really amount and frustrating....,2020-01-02 05:56:00+00:00,1024,6,0.005859,0.0,0.0,1.0,...,,,,,,,,,,
3,1212911749617242113,https://twitter.com/__mharrison__/status/12129...,"@randal_olson I use anaconda when teaching, bu...",2020-01-03 01:41:00+00:00,1419,14,0.009866,0.0,1.0,5.0,...,,,,,,,,,,
4,1212920556028252160,https://twitter.com/__mharrison__/status/12129...,@AlSweigart Sometimes the students aren't moti...,2020-01-03 02:16:00+00:00,198,1,0.005051,0.0,0.0,1.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5786,1475300661851934721,https://twitter.com/__mharrison__/status/14753...,@allison_horst That's awesome!,2021-12-27 03:01:00+00:00,986,1,0.001014,0.0,0.0,0.0,...,,,,,,,,,,
5787,1475518143690801156,https://twitter.com/__mharrison__/status/14755...,@willmcgugan You need to find out what works f...,2021-12-27 17:25:00+00:00,1790,7,0.003911,0.0,0.0,3.0,...,,,,,,,,,,
5788,1475891441243025408,https://twitter.com/__mharrison__/status/14758...,@posco Visiting Hawaii for the holidays. Lots ...,2021-12-28 18:08:00+00:00,1611,12,0.007449,0.0,0.0,4.0,...,,,,,,,,,,
5789,1476453819751878656,https://twitter.com/__mharrison__/status/14764...,@johndsaunders My son just built this.,2021-12-30 07:23:00+00:00,1354,8,0.005908,0.0,0.0,2.0,...,,,,,,,,,,


In [25]:
# NOTE: `int` resolves to the platform's default integer, which is why the
# result below is int32 on this Windows setup
df['impressions'].astype(int)

0       1465
1        154
2       1024
3       1419
4        198
        ... 
5786     986
5787    1790
5788    1611
5789    1354
5790    5041
Name: impressions, Length: 5791, dtype: int32

In [26]:
df.assign?

[1;31mSignature:[0m [0mdf[0m[1;33m.[0m[0massign[0m[1;33m([0m[1;33m**[0m[0mkwargs[0m[1;33m)[0m [1;33m->[0m [1;34m'DataFrame'[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m
Assign new columns to a DataFrame.

Returns a new object with all original columns in addition to new ones.
Existing columns that are re-assigned will be overwritten.

Parameters
----------
**kwargs : dict of {str: callable or Series}
    The column names are keywords. If the values are
    callable, they are computed on the DataFrame and
    assigned to the new columns. The callable must not
    change input DataFrame (though pandas doesn't check it).
    If the values are not callable, (e.g. a Series, scalar, or array),
    they are simply assigned.

Returns
-------
DataFrame
    A new DataFrame with the new columns in addition to
    all the existing columns.

Notes
-----
Assigning multiple columns within the same ``assign`` is possible.
Later items in '\*\*kwargs' may refer to newly created

In [27]:
# also note: .assign takes column names as keyword arguments, so a name that
# contains a space (e.g. 'engagement rate') cannot be written this way.
# This cell is intentionally invalid Python and raises the SyntaxError below.
(df
 .assign(impressions=df.impressions.astype(int),
         engagement rate=df.engagements rate.astype(int)
         # lots of this here
        )
)

SyntaxError: invalid syntax (3555764995.py, line 4)

In [28]:
# fix names: swap every space for an underscore so the labels can be used
# as keyword arguments / attribute names
df.rename(lambda label: label.replace(' ', '_'), axis='columns')

Unnamed: 0,Tweet_id,Tweet_permalink,Tweet_text,time,impressions,engagements,engagement_rate,retweets,replies,likes,...,promoted_hashtag_clicks,promoted_detail_expands,promoted_permalink_clicks,promoted_app_opens,promoted_app_installs,promoted_follows,promoted_email_tweet,promoted_dial_phone,promoted_media_views,promoted_media_engagements
0,1212580517905780737,https://twitter.com/__mharrison__/status/12125...,Sounds like a great topic! https://t.co/f8bZbA...,2020-01-02 03:44:00+00:00,1465.0,7.0,0.004778,0.0,0.0,3.0,...,,,,,,,,,,
1,1212582494828036097,https://twitter.com/__mharrison__/status/12125...,@FogleBird Looks like SLC. I can see my 🏠,2020-01-02 03:52:00+00:00,154.0,3.0,0.019481,0.0,0.0,1.0,...,,,,,,,,,,
2,1212613735698690049,https://twitter.com/__mharrison__/status/12126...,@afilina That's really amount and frustrating....,2020-01-02 05:56:00+00:00,1024.0,6.0,0.005859,0.0,0.0,1.0,...,,,,,,,,,,
3,1212911749617242113,https://twitter.com/__mharrison__/status/12129...,"@randal_olson I use anaconda when teaching, bu...",2020-01-03 01:41:00+00:00,1419.0,14.0,0.009866,0.0,1.0,5.0,...,,,,,,,,,,
4,1212920556028252160,https://twitter.com/__mharrison__/status/12129...,@AlSweigart Sometimes the students aren't moti...,2020-01-03 02:16:00+00:00,198.0,1.0,0.005051,0.0,0.0,1.0,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5786,1475300661851934721,https://twitter.com/__mharrison__/status/14753...,@allison_horst That's awesome!,2021-12-27 03:01:00+00:00,986.0,1.0,0.001014,0.0,0.0,0.0,...,,,,,,,,,,
5787,1475518143690801156,https://twitter.com/__mharrison__/status/14755...,@willmcgugan You need to find out what works f...,2021-12-27 17:25:00+00:00,1790.0,7.0,0.003911,0.0,0.0,3.0,...,,,,,,,,,,
5788,1475891441243025408,https://twitter.com/__mharrison__/status/14758...,@posco Visiting Hawaii for the holidays. Lots ...,2021-12-28 18:08:00+00:00,1611.0,12.0,0.007449,0.0,0.0,4.0,...,,,,,,,,,,
5789,1476453819751878656,https://twitter.com/__mharrison__/status/14764...,@johndsaunders My son just built this.,2021-12-30 07:23:00+00:00,1354.0,8.0,0.005908,0.0,0.0,2.0,...,,,,,,,,,,


In [29]:
# Keep only the columns whose label contains the substring 'promoted'
df.filter(like='promoted')

Unnamed: 0,promoted impressions,promoted engagements,promoted engagement rate,promoted retweets,promoted replies,promoted likes,promoted user profile clicks,promoted url clicks,promoted hashtag clicks,promoted detail expands,promoted permalink clicks,promoted app opens,promoted app installs,promoted follows,promoted email tweet,promoted dial phone,promoted media views,promoted media engagements
0,,,,,,,,,,,,,,,,,,
1,,,,,,,,,,,,,,,,,,
2,,,,,,,,,,,,,,,,,,
3,,,,,,,,,,,,,,,,,,
4,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5786,,,,,,,,,,,,,,,,,,
5787,,,,,,,,,,,,,,,,,,
5788,,,,,,,,,,,,,,,,,,
5789,,,,,,,,,,,,,,,,,,


In [30]:
# Drop the 'promoted' columns while the labels still contain spaces,
# then tidy the remaining names and summarise
(df
 .drop(columns=df.filter(like='promoted').columns)
 .rename(lambda label: label.replace(' ', '_'), axis='columns')
 .describe()
)

Unnamed: 0,Tweet_id,impressions,engagements,engagement_rate,retweets,replies,likes,user_profile_clicks,url_clicks,hashtag_clicks,detail_expands,permalink_clicks,app_opens,app_installs,follows,email_tweet,dial_phone,media_views,media_engagements
count,5791.0,5791.0,5791.0,5791.0,5791.0,5791.0,5791.0,5791.0,5791.0,5791.0,5791.0,5791.0,5791.0,5791.0,5791.0,5791.0,5791.0,5791.0,5791.0
mean,1.360351e+18,2297.820411,111.400967,0.034748,0.979796,1.124504,9.400622,20.594543,4.502331,0.019686,34.658436,0.0,0.001036,0.0,0.135901,0.0,0.0,39.868935,39.712485
std,6.850802e+16,16414.560844,976.353689,0.050031,10.903919,6.322059,108.117865,436.521415,32.377223,0.302481,355.671163,0.0,0.045513,0.0,3.870531,0.0,0.0,333.838003,333.753866
min,1.212581e+18,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.314214e+18,175.0,3.0,0.007062,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1.358945e+18,612.0,7.0,0.016043,0.0,0.0,1.0,1.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.41552e+18,1614.5,25.0,0.040863,0.0,1.0,4.0,4.0,0.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.477025e+18,856749.0,45660.0,0.484127,465.0,207.0,5358.0,22393.0,1272.0,12.0,17078.0,0.0,3.0,0.0,191.0,0.0,0.0,16816.0,16816.0


In [31]:
# be careful with renaming: the list comprehension below reads its labels from
# the original `df` (names with spaces), but by the time .drop runs the chain
# has already renamed the columns with underscores, so none of those labels
# exist any more -> the KeyError shown below.
(df
 .rename(columns=lambda col_name: col_name.replace(' ', '_'))
 .drop(columns=[c for c in df.columns if 'promoted' in c])
)

KeyError: "['promoted impressions', 'promoted engagements', 'promoted engagement rate', 'promoted retweets', 'promoted replies', 'promoted likes', 'promoted user profile clicks', 'promoted url clicks', 'promoted hashtag clicks', 'promoted detail expands', 'promoted permalink clicks', 'promoted app opens', 'promoted app installs', 'promoted follows', 'promoted email tweet', 'promoted dial phone', 'promoted media views', 'promoted media engagements'] not found in axis"

In [32]:
df.drop?

[1;31mSignature:[0m
[0mdf[0m[1;33m.[0m[0mdrop[0m[1;33m([0m[1;33m
[0m    [0mlabels[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0maxis[0m[1;33m:[0m [1;34m'Axis'[0m [1;33m=[0m [1;36m0[0m[1;33m,[0m[1;33m
[0m    [0mindex[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mcolumns[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mlevel[0m[1;33m:[0m [1;34m'Level | None'[0m [1;33m=[0m [1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0minplace[0m[1;33m:[0m [1;34m'bool'[0m [1;33m=[0m [1;32mFalse[0m[1;33m,[0m[1;33m
[0m    [0merrors[0m[1;33m:[0m [1;34m'str'[0m [1;33m=[0m [1;34m'raise'[0m[1;33m,[0m[1;33m
[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m
Drop specified labels from rows or columns.

Remove rows or columns by specifying label names and corresponding
axis, or by specifying directly index or column names. When using a
multi-index, labels on different levels can be removed by spec

In [33]:
def drop_col(df_, pattern):
    """Return ``df_`` without the columns whose name contains ``pattern``.

    Written as a standalone function so it can be handed to ``.pipe`` and
    operate on the columns of the intermediate frame inside a chain.
    """
    matching = [name for name in df_.columns if pattern in name]
    return df_.drop(columns=matching)

(df
 .rename(columns=lambda col_name: col_name.replace(' ', '_'))
 #.pipe(lambda df_: df_.drop(columns=[c for c in df_.columns if 'promoted' in c]))
 .pipe(drop_col, pattern='promoted')
 .drop(columns=['Tweet_id', 'permalink_clicks', 'app_opens', 'app_installs', 'email_tweet', 'dial_phone'])
)

Unnamed: 0,Tweet_permalink,Tweet_text,time,impressions,engagements,engagement_rate,retweets,replies,likes,user_profile_clicks,url_clicks,hashtag_clicks,detail_expands,follows,media_views,media_engagements
0,https://twitter.com/__mharrison__/status/12125...,Sounds like a great topic! https://t.co/f8bZbA...,2020-01-02 03:44:00+00:00,1465.0,7.0,0.004778,0.0,0.0,3.0,3.0,0.0,0.0,1.0,0,0,0
1,https://twitter.com/__mharrison__/status/12125...,@FogleBird Looks like SLC. I can see my 🏠,2020-01-02 03:52:00+00:00,154.0,3.0,0.019481,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0,0,0
2,https://twitter.com/__mharrison__/status/12126...,@afilina That's really amount and frustrating....,2020-01-02 05:56:00+00:00,1024.0,6.0,0.005859,0.0,0.0,1.0,2.0,0.0,0.0,3.0,0,0,0
3,https://twitter.com/__mharrison__/status/12129...,"@randal_olson I use anaconda when teaching, bu...",2020-01-03 01:41:00+00:00,1419.0,14.0,0.009866,0.0,1.0,5.0,7.0,0.0,0.0,1.0,0,0,0
4,https://twitter.com/__mharrison__/status/12129...,@AlSweigart Sometimes the students aren't moti...,2020-01-03 02:16:00+00:00,198.0,1.0,0.005051,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5786,https://twitter.com/__mharrison__/status/14753...,@allison_horst That's awesome!,2021-12-27 03:01:00+00:00,986.0,1.0,0.001014,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0,0
5787,https://twitter.com/__mharrison__/status/14755...,@willmcgugan You need to find out what works f...,2021-12-27 17:25:00+00:00,1790.0,7.0,0.003911,0.0,0.0,3.0,1.0,0.0,0.0,3.0,0,0,0
5788,https://twitter.com/__mharrison__/status/14758...,@posco Visiting Hawaii for the holidays. Lots ...,2021-12-28 18:08:00+00:00,1611.0,12.0,0.007449,0.0,0.0,4.0,4.0,0.0,0.0,4.0,0,0,0
5789,https://twitter.com/__mharrison__/status/14764...,@johndsaunders My son just built this.,2021-12-30 07:23:00+00:00,1354.0,8.0,0.005908,0.0,0.0,2.0,4.0,0.0,0.0,2.0,0,0,0


In [34]:

(df
 .rename(columns=lambda col_name: col_name.replace(' ', '_'))
 .pipe(lambda df_: df_.drop(columns=[c for c in df_.columns if 'promoted' in c]))
 .drop(columns=['Tweet_id', 'permalink_clicks', 'app_opens', 'app_installs', 'email_tweet', 'dial_phone'])
 .memory_usage(deep=True)
 .sum()  # 3 megs
)

3023115

In [35]:
df.pipe?

[1;31mSignature:[0m
[0mdf[0m[1;33m.[0m[0mpipe[0m[1;33m([0m[1;33m
[0m    [0mfunc[0m[1;33m:[0m [1;34m'Callable[..., T] | tuple[Callable[..., T], str]'[0m[1;33m,[0m[1;33m
[0m    [1;33m*[0m[0margs[0m[1;33m,[0m[1;33m
[0m    [1;33m**[0m[0mkwargs[0m[1;33m,[0m[1;33m
[0m[1;33m)[0m [1;33m->[0m [1;34m'T'[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m
Apply chainable functions that expect Series or DataFrames.

Parameters
----------
func : function
    Function to apply to the Series/DataFrame.
    ``args``, and ``kwargs`` are passed into ``func``.
    Alternatively a ``(callable, data_keyword)`` tuple where
    ``data_keyword`` is a string indicating the keyword of
    ``callable`` that expects the Series/DataFrame.
args : iterable, optional
    Positional arguments passed into ``func``.
kwargs : mapping, optional
    A dictionary of keyword arguments passed into ``func``.

Returns
-------
object : the return type of ``func``.

See Also
--------
Dat

## Column Cleanup Exercise
(Please don't mutate here!)

* Use `.loc` to select the *impressions* and *engagements* columns
* Use `.drop` to select the *impressions* and *engagements* columns (by dropping every other column)
* Use `.rename` to rename *impressions* to *imp* and *engagements* to *eng*

## Ok, Types for real

In [None]:

(df
 .rename(columns=lambda col_name: col_name.replace(' ', '_'))
 .pipe(lambda df_: df_.drop(columns=[c for c in df_.columns if 'promoted' in c]))
 .drop(columns=['Tweet_id', 'permalink_clicks', 'app_opens', 'app_installs', 'email_tweet', 'dial_phone'])
 .describe()
)

In [None]:
np.iinfo('int64')

In [None]:
for size in ['uint8', 'uint16', 'uint32', 'int8', 'int16', 'int32', 'int64']:
    print(f'{size=} {np.iinfo(size)}')

In [None]:

(df
 .rename(columns=lambda col_name: col_name.replace(' ', '_'))
 .pipe(lambda df_: df_.drop(columns=[c for c in df_.columns if 'promoted' in c]))
 .drop(columns=['Tweet_id', 'permalink_clicks', 'app_opens', 'app_installs', 'email_tweet', 'dial_phone'])
 .assign(impressions=df.impressions.astype('uint32'),
         engagements=df.engagements.astype('uint16'),
        )
 .describe()
)

In [None]:
kwargs = {}
for col in df.select_dtypes(float).columns:
    print(col)
    kwargs[col] = df[col].astype(int)
kwargs

In [None]:
# use dict comp if you don't want to type every column
# assign w/ dict comp. and lambda
(df
 .rename(columns=lambda col_name: col_name.replace(' ', '_'))
 .pipe(lambda df_: df_.drop(columns=[c for c in df_.columns if 'promoted' in c]))
 .drop(columns=['Tweet_id', 'permalink_clicks', 'app_opens', 'app_installs', 'email_tweet', 'dial_phone'])
 .assign(impressions=df.impressions.astype('uint32'),
         engagements=df.engagements.astype('uint16'),
         **{c:lambda df_, c=c:df_[c].astype('uint8') for c in ['replies', 'hashtag_clicks', 'follows']}  # less than 255
        )
)

In [None]:
# why c=c?
# This cell deliberately omits the c=c default argument. Each lambda then looks
# up `c` only when .assign calls it, after the comprehension has finished, so
# every lambda in a comprehension reads the LAST column of its list ('follows' /
# 'media_engagements') and the other columns are overwritten with that column's data.
(df
 .rename(columns=lambda col_name: col_name.replace(' ', '_'))
 .pipe(lambda df_: df_.drop(columns=[c for c in df_.columns if 'promoted' in c]))
 .drop(columns=['Tweet_id', 'permalink_clicks', 'app_opens', 'app_installs', 'email_tweet', 'dial_phone'])
 .assign(impressions=df.impressions.astype('uint32'),
         engagements=df.engagements.astype('uint16'),
         **{c:lambda df_:df_[c].astype('uint8') for c in ['replies', 'hashtag_clicks', 'follows']},  # less than 255
         **{c:lambda df_:df_[c].astype('uint16') for c in ['retweets', 'likes', 'user_profile_clicks', 'url_clicks', 
                                          'detail_expands', 'media_views', 'media_engagements']}  # less than 65,535
        )
 #.corr()
 .describe()  # compare the per-column statistics with the c=c version to see the duplication
)

In [None]:
# https://docs.python.org/3/faq/programming.html#why-do-lambdas-defined-in-a-loop-with-different-values-all-return-the-same-result
squares = []
for x in range(5):
    squares.append(lambda: x**2)
for s in squares:
    print(s())

In [None]:
# https://docs.python.org/3/faq/programming.html#why-do-lambdas-defined-in-a-loop-with-different-values-all-return-the-same-result
squares = []
for x in range(5):
    squares.append(lambda x=x: x**2)
for s in squares:
    print(s())

In [None]:
(df
 .rename(columns=lambda col_name: col_name.replace(' ', '_'))
 .pipe(lambda df_: df_.drop(columns=[c for c in df_.columns if 'promoted' in c]))
 .drop(columns=['Tweet_id', 'permalink_clicks', 'app_opens', 'app_installs', 'email_tweet', 'dial_phone'])
 .assign(impressions=df.impressions.astype('uint32'),
         engagements=df.engagements.astype('uint16'),
         **{c:lambda df_, c=c:df_[c].astype('uint8') for c in ['replies', 'hashtag_clicks', 'follows']},  # less than 255
         **{c:lambda df_, c=c:df_[c].astype('uint16') for c in ['retweets', 'likes', 'user_profile_clicks', 'url_clicks', 
                                          'detail_expands', 'media_views', 'media_engagements']}  # less than 65,535
        )
 .describe()
)

In [None]:
(df
 .rename(columns=lambda col_name: col_name.replace(' ', '_'))
 .pipe(lambda df_: df_.drop(columns=[c for c in df_.columns if 'promoted' in c]))
 .drop(columns=['Tweet_id', 'permalink_clicks', 'app_opens', 'app_installs', 'email_tweet', 'dial_phone'])
 .assign(impressions=df.impressions.astype('uint32'),
         engagements=df.engagements.astype('uint16'),
         **{c:lambda df_, c=c:df_[c].astype('uint8') for c in ['replies', 'hashtag_clicks', 'follows']},  # less than 255
         **{c:lambda df_, c=c:df_[c].astype('uint16') for c in ['retweets', 'likes', 'user_profile_clicks', 'url_clicks', 
                                          'detail_expands', 'media_views', 'media_engagements']}  # less than 65,535
         
        )
 .memory_usage(deep=True) 
 .sum()  # was 3 megs
)

In [None]:
# most is from text
(df
 .rename(columns=lambda col_name: col_name.replace(' ', '_'))
 .pipe(lambda df_: df_.drop(columns=[c for c in df_.columns if 'promoted' in c]))
 .drop(columns=['Tweet_id', 'permalink_clicks', 'app_opens', 'app_installs', 'email_tweet', 'dial_phone'])
 .assign(impressions=df.impressions.astype('uint32'),
         engagements=df.engagements.astype('uint16'),
         **{c:lambda df_, c=c:df_[c].astype('uint8') for c in ['replies', 'hashtag_clicks', 'follows']},  # less than 255
         **{c:lambda df_, c=c:df_[c].astype('uint16') for c in ['retweets', 'likes', 'user_profile_clicks', 'url_clicks', 
                                          'detail_expands', 'media_views', 'media_engagements']}  # less than 65,535
         
        )
 .memory_usage(deep=True) 
 .pipe(lambda ser: ser/ser.sum()*100)
# .sum()  # was 3 megs
)

In [None]:
# convert first part of permalink to category and add back tweet_id
(df
 .rename(columns=lambda col_name: col_name.replace(' ', '_'))
 .pipe(lambda df_: df_.drop(columns=[c for c in df_.columns if 'promoted' in c]))
 .drop(columns=['permalink_clicks', 'app_opens', 'app_installs', 'email_tweet', 'dial_phone'])
 .assign(impressions=df.impressions.astype('uint32'),
         engagements=df.engagements.astype('uint16'),
         **{c:lambda df_, c=c:df_[c].astype('uint8') for c in ['replies', 'hashtag_clicks', 'follows']},  # less than 255
         **{c:lambda df_, c=c:df_[c].astype('uint16') for c in ['retweets', 'likes', 'user_profile_clicks', 'url_clicks', 
                                          'detail_expands', 'media_views', 'media_engagements']},  # less than 65,535
         Tweet_permalink=lambda df_: pd.Series('https://twitter.com/__mharrison__/status/', dtype='category', 
                                               index=df_.index),
        )
 .memory_usage(deep=True) 
 .sum()  # was 3 megs
)

In [None]:
# convert first part of permalink to category and add back tweet_id
(df
 .rename(columns=lambda col_name: col_name.replace(' ', '_'))
 .pipe(lambda df_: df_.drop(columns=[c for c in df_.columns if 'promoted' in c]))
 .drop(columns=['permalink_clicks', 'app_opens', 'app_installs', 'email_tweet', 'dial_phone'])
 .assign(impressions=df.impressions.astype('uint32'),
         engagements=df.engagements.astype('uint16'),
         **{c:lambda df_, c=c:df_[c].astype('uint8') for c in ['replies', 'hashtag_clicks', 'follows']},  # less than 255
         **{c:lambda df_, c=c:df_[c].astype('uint16') for c in ['retweets', 'likes', 'user_profile_clicks', 'url_clicks', 
                                          'detail_expands', 'media_views', 'media_engagements']},  # less than 65,535
         Tweet_permalink=lambda df_: pd.Series('https://twitter.com/__mharrison__/status/', dtype='category', 
                                               index=df_.index),
        )
 .describe()
 #.memory_usage(deep=True) 
 #.sum()  # was 3 megs
)

## Alternate Integer Conversion Exercise
(Again, no mutation!)

* Use `.select_dtypes` to filter all `int` columns from `df`
* Use `.astype` with above to convert all columns to `uint8`
* Use `.assign` with above to create new dataframe with updated integer columns

## Other Types
We can apply similar logic to floats and strings.

Converting "Tweet_text" to a category doesn't make sense because it has high cardinality: as the next cell shows, the categorical version uses more memory than the plain strings.

In [None]:
# Uses MORE memory if tweet text is a category!
(df
 .rename(columns=lambda col_name: col_name.replace(' ', '_'))
 .pipe(lambda df_: df_.drop(columns=[c for c in df_.columns if 'promoted' in c]))
 .drop(columns=['permalink_clicks', 'app_opens', 'app_installs', 'email_tweet', 'dial_phone'])
 .assign(impressions=df.impressions.astype('uint32'),
         engagements=df.engagements.astype('uint16'),
         **{c:lambda df_, c=c:df_[c].astype('uint8') for c in ['replies', 'hashtag_clicks', 'follows']},  # less than 255
         **{c:lambda df_, c=c:df_[c].astype('uint16') for c in ['retweets', 'likes', 'user_profile_clicks', 'url_clicks', 
                                          'detail_expands', 'media_views', 'media_engagements']},  # less than 65,535
         Tweet_permalink=lambda df_: pd.Series('https://twitter.com/__mharrison__/status/', dtype='category', 
                                               index=df_.index),
         Tweet_text=lambda df_:df_.Tweet_text.astype('category')
        )
 .memory_usage(deep=True) 
 .sum()  # was 3 megs
)

## Other types Exercise
* Use the `%%timeit` cell magic to see how long it takes to run `.str.lower()` on the original *Tweet permalink* column
* Create a new dataframe, `df2`, with our current chain
* Use the `%%timeit` cell magic to see how long it takes to run `.str.lower()` on the *df2.Tweet_permalink* column

## Dates

In [None]:
(df
 .rename(columns=lambda col_name: col_name.replace(' ', '_'))
 .pipe(lambda df_: df_.drop(columns=[c for c in df_.columns if 'promoted' in c]))
 .drop(columns=['permalink_clicks', 'app_opens', 'app_installs', 'email_tweet', 'dial_phone'])
 .assign(impressions=df.impressions.astype('uint32'),
         engagements=df.engagements.astype('uint16'),
         **{c:lambda df_, c=c:df_[c].astype('uint8') for c in ['replies', 'hashtag_clicks', 'follows']},  # less than 255
         **{c:lambda df_, c=c:df_[c].astype('uint16') for c in ['retweets', 'likes', 'user_profile_clicks', 'url_clicks', 
                                          'detail_expands', 'media_views', 'media_engagements']},  # less than 65,535
         Tweet_permalink=lambda df_: pd.Series('https://twitter.com/__mharrison__/status/', dtype='category', 
                                               index=df_.index),
        )
 .time
)

In [None]:
# Convert to Local Time (already in UTC)
(df
 .rename(columns=lambda col_name: col_name.replace(' ', '_'))
 .pipe(lambda df_: df_.drop(columns=[c for c in df_.columns if 'promoted' in c]))
 .drop(columns=['permalink_clicks', 'app_opens', 'app_installs', 'email_tweet', 'dial_phone'])
 # .astype with a {column: dtype} dict is an alternative to the lambda-per-column form in .assign
 .astype({c:'uint8' for c in ['replies', 'hashtag_clicks', 'follows']})  # less than 255
 .assign(impressions=df.impressions.astype('uint32'),
         engagements=df.engagements.astype('uint16'),
         # (the uint8 casts that used to live here are handled by the .astype call above)
         **{c:lambda df_, c=c:df_[c].astype('uint16') for c in ['retweets', 'likes', 'user_profile_clicks', 'url_clicks', 
                                          'detail_expands', 'media_views', 'media_engagements']},  # less than 65,535
         Tweet_permalink=lambda df_: pd.Series('https://twitter.com/__mharrison__/status/', dtype='category', 
                                               index=df_.index),
         # tz_convert shifts the tz-aware timestamps to the target zone
         time=lambda df_: df_.time.dt.tz_convert('America/Denver')
        )
 .time
)

## Dates Exercise
* Create a series with the months of the *time* column
* Convert the *time* column to UTC
* Convert the *time* column to `America/New_York`

## Chain

Chaining is also called "flow" programming. Rather than making intermediate variables, just leverage the fact that most operations return a new object and work on that.

The chain should read like a recipe of ordered steps.

(BTW, this is actually what we did above.)

<div class='alert alert-warning'>
    Hint: Leverage <tt>.pipe</tt> if you can't find a way to chain 😉🐼💪
</div>

In [None]:
# convert to a function
def tweak_twitter(df):
    """Return a tidied, memory-friendly copy of the raw tweet-activity frame.

    Steps, in order: replace spaces in column names with underscores, drop
    every ``promoted`` column plus five unused click/app columns, downcast the
    counters to unsigned integers, replace the permalink with a single
    categorical prefix, and convert ``time`` to America/Denver.

    The input frame is left untouched; a new DataFrame is returned.
    """
    unused_cols = ['permalink_clicks', 'app_opens', 'app_installs', 'email_tweet', 'dial_phone']
    uint8_cols = ['replies', 'hashtag_clicks', 'follows']  # less than 255
    uint16_cols = ['retweets', 'likes', 'user_profile_clicks', 'url_clicks',
                   'detail_expands', 'media_views', 'media_engagements']  # less than 65,535

    def drop_promoted(df_):
        return df_.drop(columns=[name for name in df_.columns if 'promoted' in name])

    def as_type(name, dtype):
        # A factory binds name/dtype per column, avoiding the late-binding closure trap.
        return lambda df_: df_[name].astype(dtype)

    downcasts = {name: as_type(name, 'uint8') for name in uint8_cols}
    downcasts.update({name: as_type(name, 'uint16') for name in uint16_cols})

    return (df
        .rename(columns=lambda col_name: col_name.replace(' ', '_'))
        .pipe(drop_promoted)
        .drop(columns=unused_cols)
        .assign(
            impressions=df.impressions.astype('uint32'),
            engagements=df.engagements.astype('uint16'),
            **downcasts,
            Tweet_permalink=lambda df_: pd.Series(
                'https://twitter.com/__mharrison__/status/', dtype='category', index=df_.index),
            time=lambda df_: df_.time.dt.tz_convert('America/Denver'),
        )
    )

In [None]:
# I would want my notebook to start off like this:
import glob

import numpy as np
import pandas as pd

data = [pd.read_csv(f, parse_dates=['time'], na_values='-') for f in glob.glob('tweet_activity_metrics___mharrison___*')]
df = pd.concat(data, ignore_index=True).sort_values('time')

In [None]:
def tweak_twitter(df):
    """Return a tidied, memory-friendly copy of the raw tweet-activity frame.

    Steps, in order: replace spaces in column names with underscores, drop
    every ``promoted`` column plus five unused click/app columns, downcast the
    counters to unsigned integers, replace the permalink with a single
    categorical prefix, and convert ``time`` to America/Denver.

    The input frame is left untouched; a new DataFrame is returned.
    """
    unused_cols = ['permalink_clicks', 'app_opens', 'app_installs', 'email_tweet', 'dial_phone']
    uint8_cols = ['replies', 'hashtag_clicks', 'follows']  # less than 255
    uint16_cols = ['retweets', 'likes', 'user_profile_clicks', 'url_clicks',
                   'detail_expands', 'media_views', 'media_engagements']  # less than 65,535

    def drop_promoted(df_):
        return df_.drop(columns=[name for name in df_.columns if 'promoted' in name])

    def as_type(name, dtype):
        # A factory binds name/dtype per column, avoiding the late-binding closure trap.
        return lambda df_: df_[name].astype(dtype)

    downcasts = {name: as_type(name, 'uint8') for name in uint8_cols}
    downcasts.update({name: as_type(name, 'uint16') for name in uint16_cols})

    return (df
        .rename(columns=lambda col_name: col_name.replace(' ', '_'))
        .pipe(drop_promoted)
        .drop(columns=unused_cols)
        .assign(
            impressions=df.impressions.astype('uint32'),
            engagements=df.engagements.astype('uint16'),
            **downcasts,
            Tweet_permalink=lambda df_: pd.Series(
                'https://twitter.com/__mharrison__/status/', dtype='category', index=df_.index),
            time=lambda df_: df_.time.dt.tz_convert('America/Denver'),
        )
    )
twit_df = tweak_twitter(df)

In [None]:
# compare with non-chain
df1 = df.rename(columns=lambda col_name: col_name.replace(' ', '_'))
keep = [c for c in df1.columns if 'promoted' not in c]
df2 = df1[keep]
keep2 = [c for c in df2 if c not in ['permalink_clicks', 'app_opens', 'app_installs', 'email_tweet', 'dial_phone']]
df3 = df2[keep2]
imps = df3.impressions.astype('uint32')
df3.impressions = imps
eng = df3.engagements.astype('uint16')
df3['engagements'] = eng
df3['replies'] = df3.replies.astype('uint8')
df3['hashtag_clicks'] = df3.hashtag_clicks.astype('uint8')

In [None]:
# easy to debug
#  - assign to var (renamed_df)
#  - comment out
#  - pipe to display

from IPython.display import display

def get_var(df, var_name):
    """Debugging tap for a chain: stash ``df`` in this module's globals under ``var_name``.

    Returns ``df`` unchanged so it can be dropped into ``.pipe(get_var, 'name')``
    without interrupting the chain.
    """
    module_namespace = globals()
    module_namespace[var_name] = df
    return df

def tweak_twitter(df):
    """Debug-instrumented version of the cleanup chain.

    Same rename / drop / downcast / timezone steps as the plain version, with
    two taps inserted via ``.pipe`` to inspect intermediate frames without
    breaking the chain.
    """
    return (df
     .rename(columns=lambda col_name: col_name.replace(' ', '_'))
     # tap 1: save the frame as it looks right after the rename into a global named renamed_df
     .pipe(get_var, 'renamed_df')
     .pipe(lambda df_: df_.drop(columns=[c for c in df_.columns if 'promoted' in c]))
     .drop(columns=['permalink_clicks', 'app_opens', 'app_installs', 'email_tweet', 'dial_phone'])
     # tap 2: render the intermediate frame; presumably display() returns None here,
     # so `or df_` hands the frame on to the next step
     .pipe(lambda df_:display(df_) or df_)
     .assign(impressions=df.impressions.astype('uint32'),
             engagements=df.engagements.astype('uint16'),
             **{c:lambda df_, c=c:df_[c].astype('uint8') for c in ['replies', 'hashtag_clicks', 'follows']},  # less than 255
             **{c:lambda df_, c=c:df_[c].astype('uint16') for c in ['retweets', 'likes', 'user_profile_clicks', 'url_clicks', 
                                            'detail_expands', 'media_views', 'media_engagements']},  # less than 65,535
             Tweet_permalink=lambda df_: pd.Series('https://twitter.com/__mharrison__/status/', dtype='category', 
                                                   index=df_.index),
            time=lambda df_: df_.time.dt.tz_convert('America/Denver')
            )
    )
twit_df = tweak_twitter(df)

In [None]:
renamed_df

In [None]:
def tweak_twitter(df):
    """Return a tidied, memory-friendly copy of the raw tweet-activity frame.

    Steps, in order: replace spaces in column names with underscores, drop
    every ``promoted`` column plus five unused click/app columns, downcast the
    counters to unsigned integers, replace the permalink with a single
    categorical prefix, and convert ``time`` to America/Denver.

    The input frame is left untouched; a new DataFrame is returned.
    """
    unused_cols = ['permalink_clicks', 'app_opens', 'app_installs', 'email_tweet', 'dial_phone']
    uint8_cols = ['replies', 'hashtag_clicks', 'follows']  # less than 255
    uint16_cols = ['retweets', 'likes', 'user_profile_clicks', 'url_clicks',
                   'detail_expands', 'media_views', 'media_engagements']  # less than 65,535

    def drop_promoted(df_):
        return df_.drop(columns=[name for name in df_.columns if 'promoted' in name])

    def as_type(name, dtype):
        # A factory binds name/dtype per column, avoiding the late-binding closure trap.
        return lambda df_: df_[name].astype(dtype)

    downcasts = {name: as_type(name, 'uint8') for name in uint8_cols}
    downcasts.update({name: as_type(name, 'uint16') for name in uint16_cols})

    return (df
        .rename(columns=lambda col_name: col_name.replace(' ', '_'))
        .pipe(drop_promoted)
        .drop(columns=unused_cols)
        .assign(
            impressions=df.impressions.astype('uint32'),
            engagements=df.engagements.astype('uint16'),
            **downcasts,
            Tweet_permalink=lambda df_: pd.Series(
                'https://twitter.com/__mharrison__/status/', dtype='category', index=df_.index),
            time=lambda df_: df_.time.dt.tz_convert('America/Denver'),
        )
    )
twit_df = tweak_twitter(df)

## Chain Exercise
* Use `.pipe` to print the shape of the dataframe after every step in the chain of the `tweak_twitter` function

## Don't Mutate

> "you are missing the point, inplace rarely actually does something inplace, you are thinking that you are saving memory but you are not."
>
> **jreback** - Pandas core dev



https://github.com/pandas-dev/pandas/issues/16529#issuecomment-676518136

* In general, no performance benefits
* Prohibits chaining
* ``SettingWithCopyWarning`` fun

## Don't Apply (if you can)

In [None]:
def tweak_twitter(df):
    """Return a tidied, memory-friendly copy of the raw tweet-activity frame.

    Steps, in order: replace spaces in column names with underscores, drop
    every ``promoted`` column plus five unused click/app columns, downcast the
    counters to unsigned integers, replace the permalink with a single
    categorical prefix, and convert ``time`` to America/Denver.

    The input frame is left untouched; a new DataFrame is returned.
    """
    unused_cols = ['permalink_clicks', 'app_opens', 'app_installs', 'email_tweet', 'dial_phone']
    uint8_cols = ['replies', 'hashtag_clicks', 'follows']  # less than 255
    uint16_cols = ['retweets', 'likes', 'user_profile_clicks', 'url_clicks',
                   'detail_expands', 'media_views', 'media_engagements']  # less than 65,535

    def drop_promoted(df_):
        return df_.drop(columns=[name for name in df_.columns if 'promoted' in name])

    def as_type(name, dtype):
        # A factory binds name/dtype per column, avoiding the late-binding closure trap.
        return lambda df_: df_[name].astype(dtype)

    downcasts = {name: as_type(name, 'uint8') for name in uint8_cols}
    downcasts.update({name: as_type(name, 'uint16') for name in uint16_cols})

    return (df
        .rename(columns=lambda col_name: col_name.replace(' ', '_'))
        .pipe(drop_promoted)
        .drop(columns=unused_cols)
        .assign(
            impressions=df.impressions.astype('uint32'),
            engagements=df.engagements.astype('uint16'),
            **downcasts,
            Tweet_permalink=lambda df_: pd.Series(
                'https://twitter.com/__mharrison__/status/', dtype='category', index=df_.index),
            time=lambda df_: df_.time.dt.tz_convert('America/Denver'),
        )
    )
twit_df = tweak_twitter(df)

In [None]:
twit_df

In [None]:
def to_percent(val):
    """Scale a fraction to a percentage (``0.25`` -> ``25.0``)."""
    percent = val * 100
    return percent
twit_df.engagement_rate.apply(to_percent)

In [None]:
# same result
twit_df.engagement_rate * 100

In [None]:
%%timeit
# however ...
twit_df.engagement_rate.apply(to_percent)

In [None]:
%%timeit
twit_df.engagement_rate * 100

In [None]:
# 14X slower!
1008 / 71

In [None]:
# How would we check if text had unicode?
'Hello \U0001f600'.encode('ascii', errors='replace').decode('ascii')

In [None]:
'Hello \U0001f600'.encode('utf8', errors='replace').decode('utf8')

In [None]:
# story is a little different with text

def is_unicode(val):
    """Return True when ``val`` contains at least one non-ASCII character.

    The text is round-tripped through ASCII with ``errors='replace'``; any
    character that cannot be encoded comes back as ``?``, so the round-tripped
    text differs from the original exactly when something was replaced.
    """
    ascii_round_trip = val.encode('ascii', errors='replace').decode('ascii')
    return ascii_round_trip != val

In [None]:
%lsmagic

In [None]:
%%timeit?

In [None]:
%%timeit
twit_df.Tweet_text.apply(is_unicode)

In [None]:
%%timeit
twit_df.Tweet_text.str.encode('ascii', errors='replace').str.decode('ascii') == twit_df.Tweet_text

In [None]:
%%timeit
twit_df.Tweet_text.str.startswith('@')

In [None]:
def startswith_at(txt):
    """Return True if ``txt`` begins with ``@`` (scalar twin of ``.str.startswith('@')``)."""
    at_sign = '@'
    return txt.startswith(at_sign)

In [None]:
%%timeit
twit_df.Tweet_text.apply(startswith_at)

In [None]:
def tweak_twitter(df):
    """Clean the raw tweet-activity export and add text/date features.

    Cleaning: underscores replace spaces in column names, ``promoted`` and
    unused click/app columns are dropped, counters are downcast to unsigned
    ints, the permalink becomes one categorical prefix, and ``time`` is
    converted to America/Denver.

    Added columns (all derived with vectorized ``.str`` / ``.dt`` accessors):
    ``is_reply``, ``length``, ``num_words``, ``is_unicode``, ``hour``,
    ``dom``, ``dow``, ``at_tweet``, ``has_newlines``, ``num_lines``,
    ``num_mentions`` and ``has_hashtag``.

    Returns a new DataFrame; ``df`` is not mutated.
    """
    return (df
     .rename(columns=lambda col_name: col_name.replace(' ', '_'))
     .pipe(lambda df_: df_.drop(columns=[c for c in df_.columns if 'promoted' in c]))
     .drop(columns=['permalink_clicks', 'app_opens', 'app_installs', 'email_tweet', 'dial_phone'])
     .assign(impressions=df.impressions.astype('uint32'),
             engagements=df.engagements.astype('uint16'),
             **{c:lambda df_, c=c:df_[c].astype('uint8') for c in ['replies', 'hashtag_clicks', 'follows']},  # less than 255
             **{c:lambda df_, c=c:df_[c].astype('uint16') for c in ['retweets', 'likes', 'user_profile_clicks', 'url_clicks',
                                              'detail_expands', 'media_views', 'media_engagements']},  # less than 65,535
             Tweet_permalink=lambda df_: pd.Series('https://twitter.com/__mharrison__/status/', dtype='category',
                                                   index=df_.index),
             time=lambda df_: df_.time.dt.tz_convert('America/Denver'),
             is_reply=lambda df_: df_.Tweet_text.str.startswith('@'),
             length=lambda df_: df_.Tweet_text.str.len(),
             # .str.len() also works on the lists produced by .str.split(), so no .apply is needed
             num_words=lambda df_: df_.Tweet_text.str.split().str.len(),
             is_unicode=lambda df_: df_.Tweet_text.str.encode('ascii', errors='replace').str.decode('ascii') != df_.Tweet_text,
             # the date parts below are taken from the already-converted (Denver) time column
             hour=lambda df_: df_.time.dt.hour,
             dom=lambda df_: df_.time.dt.day,  # day of month
             dow=lambda df_: df_.time.dt.dayofweek,  # day of week
             at_tweet=lambda df_: df_.Tweet_text.str.contains('@'),
             has_newlines=lambda df_: df_.Tweet_text.str.contains('\n'),
             num_lines=lambda df_: df_.Tweet_text.str.count('\n'),  # number of '\n' characters (0 for a one-line tweet)
             num_mentions=lambda df_: df_.Tweet_text.str.count('@'),
             # boolean flag, matching the has_* name (previously a '#' count)
             has_hashtag=lambda df_: df_.Tweet_text.str.contains('#'),
            )
    )
twit_df = tweak_twitter(df)

In [None]:
twit_df

## Apply Exercise
* Calculate engagement ratio by dividing *engagements* by *impressions*
* Calculate engagement ratio 2 by dividing the sum of *replies*, *retweets*, *likes*, *user_profile_clicks*, and *detail_expands* by *impressions*

## Master Aggregation

In [None]:
# Average every numeric column per year. numeric_only=True makes the exclusion of
# the text/category/datetime columns explicit instead of relying on pandas
# silently dropping them (newer pandas raises on non-numeric columns).
(twit_df
 .groupby(twit_df.time.dt.year)
 .mean(numeric_only=True)
)

In [None]:
# same aggregation written on one line; numeric_only=True keeps non-numeric columns out explicitly
twit_df.groupby(twit_df.time.dt.year).mean(numeric_only=True)

In [None]:
(twit_df
 .groupby(twit_df.time.dt.year)
 .impressions
 .mean()
)

In [None]:
%%timeit
# Slow variant: aggregate every numeric column, then throw most of the result away.
# numeric_only=True is explicit so the text/category columns never reach .mean().
(twit_df
 .groupby(twit_df.time.dt.year)
 .mean(numeric_only=True)
 [['impressions', 'replies']]  # index operation with a list inside
)

In [None]:
%%timeit
(twit_df
 .groupby(twit_df.time.dt.year)
 [['impressions', 'replies']]  # index operation with a list inside 
  .mean()
)

In [None]:
# In a live session, type `twit_df.Tweet_text.str.` and press <TAB> to browse the
# string accessor. A bare trailing dot is a SyntaxError when the cell is run, so
# list the public accessor methods programmatically instead.
[name for name in dir(twit_df.Tweet_text.str) if not name.startswith('_')]

In [None]:
twit_df.time.dt.year.rename('year')

In [None]:
pd.options.display.float_format

In [None]:
(twit_df
 .groupby([twit_df.time.dt.year.rename('year'), twit_df.time.dt.month.rename('month')])
 [['impressions', 'replies']]
 .mean()
 #.round(2)
 .style
 .format({'replies': '{:.3f}', 'impressions': '{:e}'})
 
)

In [None]:
(twit_df
 .groupby([twit_df.time.dt.year, twit_df.time.dt.month])
 [['impressions', 'replies']]
 #.mean()
 .median()
 .plot()
)

In [None]:
(twit_df
 #.groupby([twit_df.time.dt.year, twit_df.time.dt.month])
 .groupby(pd.Grouper(key='time', freq='2M'))
 [['impressions', 'replies']]
 #.mean()
 .median()
 .plot()
)

In [None]:
(twit_df
 #.groupby([twit_df.time.dt.year, twit_df.time.dt.month])
 .groupby(pd.Grouper(key='time', freq='2w'))
 [['impressions', 'replies']]
 .mean()
 .plot()
)

In [None]:
(twit_df
 #.groupby([twit_df.time.dt.year, twit_df.time.dt.month])
 .groupby(pd.Grouper(key='time', freq='7d5h'))
 [['impressions', 'replies']]
 .mean()
 #.plot()
)

In [None]:
(twit_df
 #.groupby([twit_df.time.dt.year, twit_df.time.dt.month])
 .groupby([pd.Grouper(key='time', freq='7d5h'), 'is_unicode'])
 [['impressions', 'replies']]
 .mean()
 #.plot()
)

In [None]:
# multiple aggregates
def second_to_last(ser):
    """Aggregate a group to its second-to-last value, or ``0`` when it has fewer than two rows."""
    if len(ser) < 2:
        return 0
    return ser.iloc[-2]

(twit_df
 .groupby([pd.Grouper(key='time', freq='7d5h'), 'is_unicode'])
 [['impressions', 'replies']]
 .agg(['mean', 'median', second_to_last])
)

In [None]:
# multiple aggregates

(twit_df
 .groupby([pd.Grouper(key='time', freq='7d5h'), 'is_unicode'])
 [['impressions', 'replies']]
 .agg(['mean', 'median', second_to_last])
 .plot()
)

In [None]:
# multiple aggregates

(twit_df
 .groupby([pd.Grouper(key='time', freq='7d'), 'is_unicode'])
 [['impressions', 'replies']]
 .agg(['mean', 'median', second_to_last])
 .unstack()
)

In [None]:
# multiple aggregates

(twit_df
 .groupby([pd.Grouper(key='time', freq='7d'), 'is_unicode'])
 [['impressions', 'replies']]
 .agg(['mean', 'median', second_to_last])
 .unstack()
 .impressions
)

In [None]:
# multiple aggregates
(twit_df
 .groupby([pd.Grouper(key='time', freq='7d'), 'is_unicode'])
 [['impressions', 'replies']]
 .agg(['mean', 'median', second_to_last])
 .unstack()
 .impressions
 ['mean']  # note have to use index syntax here
)

In [None]:
# multiple aggregates
(twit_df
 .groupby([pd.Grouper(key='time', freq='7d'), 'is_unicode'])
 [['impressions', 'replies']]
 .agg(['mean', 'median', second_to_last])
 .unstack()
 .impressions
 .mean  # note have to use index syntax here
)

In [None]:
# multiple aggregates
(twit_df
 .groupby([pd.Grouper(key='time', freq='7d'), 'is_unicode'])
 [['impressions', 'replies']]
 .agg(['mean', 'median', second_to_last])
 .unstack()
 .impressions
 ['mean']
 .plot()
)

In [None]:
# multiple aggregates
# dealing with missing values
(twit_df
 .groupby([pd.Grouper(key='time', freq='7d'), 'is_unicode'])
 [['impressions', 'replies']]
 .agg(['mean', 'median', second_to_last])
 .unstack()
 .impressions
 ['mean']
 #.fillna(0)
 #.interpolate()
 #.bfill()
 #.dropna()
 .loc['2021/07':'2021/08']
 #.plot()
)

In [None]:
# multiple aggregates
(twit_df
 .groupby([pd.Grouper(key='time', freq='3d'), 'is_unicode'])
 [['impressions', 'replies']]
 .agg(['mean', 'median', second_to_last])
 .unstack()
 .impressions
 ['mean']
 .interpolate()
 .rolling(7)
 .mean()
 .plot()
)

In [None]:
# named aggregation

(twit_df
 .groupby([pd.Grouper(key='time', freq='M'), 'is_unicode'])
 .agg(total_views=('impressions', 'sum'),
     mean_views=('impressions', 'mean'),
     profile_clicks=('user_profile_clicks', lambda ser: ser.sum()))
)

In [None]:
# named aggregation - fails with resample

(twit_df
 #.groupby([pd.Grouper(key='time', freq='M'), 'is_unicode'])
 .set_index('time')
 .resample('M')
 .agg(total_views=('impressions', 'sum'),
     mean_views=('impressions', 'mean'),
     profile_clicks=('user_profile_clicks', lambda ser: ser.sum()))
)

In [None]:
# named aggregation

(twit_df
 .groupby([pd.Grouper(key='time', freq='M'), 'is_unicode'])
 .agg(total_views=('impressions', 'sum'),
     mean_views=('impressions', 'mean'),
     profile_clicks=('user_profile_clicks', lambda ser: ser.sum()))
 .unstack()
 .profile_clicks
 .plot()
)

## Aggregation Exercise
* What were the total impressions for each year?
* What were the total impressions for each month?
* Plot the previous
* What were the total impressions for unicode and non-unicode tweets for each month?
* Plot the previous
* What were the total impressions for reply and non-reply tweets for each month?
* Plot the previous

## Summary

* Correct types save space and enable convenient math, string, and date functionality
* Chaining operations will:
   * Make code more readable
   * Remove bugs
   * Make code easier to debug
* Don't mutate (there's no point). Embrace chaining.
* ``.apply`` is slow for math
* Aggregations are powerful. Play with them until they make sense

Connect with me on LinkedIn or Twitter (@\_\_mharrison\_\_)