In [1]:
import pandas as pd

In [2]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [3]:
# Load the 'crime' table from the HDF5 store, index it by report date, and sort by that index.
crime_sort = (
    pd.read_hdf('data/crime.h5', key='crime')
    .set_index('REPORTED_DATE')
    .sort_index()
)

In [4]:
# Inspect the index class produced by set_index('REPORTED_DATE').
type(crime_sort.index)

pandas.core.indexes.datetimes.DatetimeIndex

#### Checking which attributes and methods a `DatetimeIndex` shares with `Timestamp`

In [5]:
# Names (public and private) exposed by both the index object and the Timestamp class.
index_attrs = set(dir(crime_sort.index))
common_attrs = index_attrs.intersection(dir(pd.Timestamp))

In [6]:
# Show only the public names. Sorting keeps the output stable across kernel
# restarts, because iteration order of a set of strings is not reproducible.
print(sorted(attr for attr in common_attrs if not attr.startswith('_')))

['date', 'timetz', 'dayofyear', 'quarter', 'freq', 'microsecond', 'to_julian_date', 'round', 'time', 'month_name', 'is_quarter_start', 'nanosecond', 'to_pydatetime', 'week', 'day', 'second', 'tzinfo', 'days_in_month', 'month', 'is_month_start', 'to_numpy', 'weekofyear', 'normalize', 'tz', 'is_quarter_end', 'year', 'dayofweek', 'ceil', 'resolution', 'tz_localize', 'day_of_year', 'to_period', 'tz_convert', 'floor', 'hour', 'weekday', 'minute', 'is_year_start', 'max', 'min', 'day_of_week', 'day_name', 'daysinmonth', 'is_leap_year', 'isocalendar', 'strftime', 'freqstr', 'is_year_end', 'is_month_end']


In [7]:
# Count rows per weekday name, computed directly on the DatetimeIndex.
crime_sort.index.day_name().value_counts()

Monday       70024
Friday       69621
Wednesday    69538
Thursday     69287
Tuesday      68394
Saturday     58834
Sunday       55213
Name: REPORTED_DATE, dtype: int64

#### Using `Timestamp` methods as `groupby` keys to group by weekday, week, year, etc.

In [8]:
def _weekday_name(ts):
    """Return the weekday name (e.g. 'Monday') of a single index timestamp."""
    return ts.day_name()


# groupby calls the function on each index label and groups rows by the returned value.
cr_grp_by_wk = crime_sort.groupby(_weekday_name)

In [9]:
# Select several columns from a GroupBy with a list (double brackets); the bare
# tuple form `grp['A', 'B']` is deprecated and raises in pandas 2.x.
cr_grp_by_wk[['IS_CRIME', 'IS_TRAFFIC']].sum()

Unnamed: 0,IS_CRIME,IS_TRAFFIC
Friday,48833,20814
Monday,52158,17895
Saturday,43363,15516
Sunday,42315,12968
Thursday,49470,19845
Tuesday,49658,18755
Wednesday,50054,19508


#### Grouping by more than one function (two-hour bucket and year)

In [10]:
def _two_hour_bucket(ts):
    """Return the hour of `ts` after rounding it to the nearest two-hour mark."""
    return ts.round('2h').hour


def _year(ts):
    """Return the calendar year of the original (unrounded) timestamp `ts`."""
    return ts.year


# Order matters: the first key becomes the outer group level, the second the inner one.
funcs = [_two_hour_bucket, _year]

In [11]:
# Group by both key functions at once; each one is applied to the index labels.
cr_grp_by_2h_and_year = crime_sort.groupby(funcs)

In [12]:
# Select the columns with a list (double brackets); the bare tuple form
# `grp['A', 'B']` is deprecated and raises in pandas 2.x.
cr_grp_by_2h_and_year[['IS_CRIME', 'IS_TRAFFIC']].sum()

Unnamed: 0,Unnamed: 1,IS_CRIME,IS_TRAFFIC
0,2012,2422,919
0,2013,4040,792
0,2014,5649,978
0,2015,5649,1136
0,2016,5377,980
...,...,...,...
22,2013,4318,1330
22,2014,5496,1532
22,2015,5626,1671
22,2016,5637,1472


In [13]:
# Build the table straight from `crime_sort` so this cell can be re-run safely:
# the name below is rebound from a GroupBy object to a DataFrame, and aggregating
# that DataFrame a second time would fail. Columns are selected with a list because
# the tuple form is deprecated. unstack() moves the inner (year) level into columns.
cr_grp_by_2h_and_year = (
    crime_sort.groupby(funcs)[['IS_CRIME', 'IS_TRAFFIC']]
    .sum()
    .unstack()
)

In [14]:
# Preview the first four rows of the unstacked table.
cr_grp_by_2h_and_year.head(4)

Unnamed: 0_level_0,IS_CRIME,IS_CRIME,IS_CRIME,IS_CRIME,IS_CRIME,IS_CRIME,IS_TRAFFIC,IS_TRAFFIC,IS_TRAFFIC,IS_TRAFFIC,IS_TRAFFIC,IS_TRAFFIC
Unnamed: 0_level_1,2012,2013,2014,2015,2016,2017,2012,2013,2014,2015,2016,2017
0,2422,4040,5649,5649,5377,3811,919,792,978,1136,980,782
2,1888,3214,4245,4050,4091,3041,718,652,779,773,718,537
4,1472,2181,2956,2959,3044,2255,399,378,424,471,464,313
6,1067,1365,1750,2167,2108,1567,411,399,479,494,593,462


#### Highlighting, for each column, the two-hour bucket (24-hour clock) with the highest count

In [15]:
# highlight_max works column-wise by default (axis=0), so one cell per column is marked.
cr_grp_by_2h_and_year.style.highlight_max(color='red')

Unnamed: 0_level_0,IS_CRIME,IS_CRIME,IS_CRIME,IS_CRIME,IS_CRIME,IS_CRIME,IS_TRAFFIC,IS_TRAFFIC,IS_TRAFFIC,IS_TRAFFIC,IS_TRAFFIC,IS_TRAFFIC
Unnamed: 0_level_1,2012,2013,2014,2015,2016,2017,2012,2013,2014,2015,2016,2017
0,2422,4040,5649,5649,5377,3811,919,792,978,1136,980,782
2,1888,3214,4245,4050,4091,3041,718,652,779,773,718,537
4,1472,2181,2956,2959,3044,2255,399,378,424,471,464,313
6,1067,1365,1750,2167,2108,1567,411,399,479,494,593,462
8,2998,3445,3727,4161,4488,3251,1957,1955,2210,2331,2372,1828
10,4305,5035,5658,6205,6218,4993,1979,1901,2139,2320,2303,1873
12,4496,5524,6434,6841,7226,5463,2200,2138,2379,2631,2760,1986
14,4266,5698,6708,7218,6896,5396,2241,2245,2630,2840,2763,1990
16,4113,5889,7351,7643,7926,6338,2714,2562,3002,3160,3527,2784
18,3660,5094,6586,7015,7407,6157,3118,2704,3217,3412,3608,2718


In [16]:
cr_grp_by_2h_and_year.columns  # after unstack() the columns form a two-level MultiIndex

MultiIndex([(  'IS_CRIME', 2012),
            (  'IS_CRIME', 2013),
            (  'IS_CRIME', 2014),
            (  'IS_CRIME', 2015),
            (  'IS_CRIME', 2016),
            (  'IS_CRIME', 2017),
            ('IS_TRAFFIC', 2012),
            ('IS_TRAFFIC', 2013),
            ('IS_TRAFFIC', 2014),
            ('IS_TRAFFIC', 2015),
            ('IS_TRAFFIC', 2016),
            ('IS_TRAFFIC', 2017)],
           )

#### The columns form a MultiIndex: level 0 is the crime type, level 1 is the year

In [17]:
cr_grp_by_2h_and_year.xs('IS_TRAFFIC', axis='columns', level=0).head()  # cross-section on column level 0: one crime type, every year

Unnamed: 0,2012,2013,2014,2015,2016,2017
0,919,792,978,1136,980,782
2,718,652,779,773,718,537
4,399,378,424,471,464,313
6,411,399,479,494,593,462
8,1957,1955,2210,2331,2372,1828


In [18]:
cr_grp_by_2h_and_year.xs(2016, axis='columns', level=1).head()  # cross-section on column level 1: one year, every crime type

Unnamed: 0,IS_CRIME,IS_TRAFFIC
0,5377,980
2,4091,718
4,3044,464
6,2108,593
8,4488,2372
