In [None]:
import pandas as pd
import numpy as np

In [None]:
# Read the bike-sharing training set and run the 'datetime' column through
# pd.to_datetime in a single chained step.
df = pd.read_csv('data/train_bike_sharing.csv').assign(
    datetime=lambda frame: pd.to_datetime(frame['datetime'])
)

In [None]:
# Arithmetic between columns is element-wise, so the result can be stored
# directly as a new column.
df['atemp_humidity'] = df['atemp'].mul(df['humidity']).div(10)
df.loc[:, ['atemp', 'humidity', 'atemp_humidity']].head(2)

In [None]:
# A comparison on a column yields a boolean column.
# "No wind" means the recorded wind speed is exactly zero, so the test must be
# `== 0`; `!= 0` would flag the windy rows instead.
df['no_wind'] = df['windspeed'] == 0
df.head(2)

In [None]:
# Cast the boolean flag to 0/1; integers are more compatible with plotting
# than True/False. The flag is 1 when windspeed is exactly zero, i.e. when
# there really is no wind (`!= 0` would mark the windy rows instead).
df['no_wind'] = (df['windspeed'] == 0).astype(int)
df.head(2)

In [None]:
# Element-wise boolean logic on columns uses | (or) and & (and); the plain
# Python keywords `or` / `and` do not work on Series. Each comparison needs its
# own parentheses because | and & bind tighter than == and >.
# "No wind" is windspeed == 0 (`!= 0` would select the windy rows instead).
df['no_wind_and_hot'] = ((df['windspeed'] == 0) & (df['atemp'] > 20)).astype(int)
df['no_wind_or_hot'] = ((df['windspeed'] == 0) | (df['atemp'] > 20)).astype(int)
df.head(2)

In [None]:
# Series.shift(n) moves the values n rows forwards, which is handy for
# computing or detecting changes over time: subtracting the shifted column
# from the original gives the difference to the previous row.
# The first row has no predecessor, so its difference is NaN.
df['diff_atemp'] = df['atemp'].sub(df['atemp'].shift(periods=1))
df.loc[:, ['atemp', 'diff_atemp']].head()

In [None]:
# Row-wise mean (axis=1) across the numeric columns only.
# numeric_only=True is needed because df also holds the 'datetime' column,
# which cannot be averaged together with numbers; recent pandas versions no
# longer drop such columns silently.
# Dropping any existing 'mean' column first keeps the cell idempotent:
# re-running it does not fold the previous result into the new average.
df['mean'] = df.drop(columns='mean', errors='ignore').mean(axis=1, numeric_only=True)
df.head()

In [None]:
# we can use map to apply custom functions and pass additional arguments
def weird_func(x, val_1, val_2):
    """Label ``x`` according to where it falls relative to two thresholds.

    Returns 'lalala' when ``x`` is strictly below ``val_1``; otherwise
    'blablabla' when ``x`` is strictly above ``val_2``; and 'wtf' in every
    remaining case. The lower-bound check takes precedence.
    """
    if x < val_1:
        return 'lalala'
    if x > val_2:
        return 'blablabla'
    return 'wtf'
    
# Series.map calls the function once per element; wrapping weird_func in a
# lambda pins its two extra threshold arguments (100 and 1100).
df['status'] = df['atemp_humidity'].map(
    lambda value: weird_func(value, val_1=100, val_2=1100)
)
# Uncomment to inspect the result:
# df[['status', 'atemp_humidity']].head(100)

In [None]:
# DataFrame.apply with axis=1 hands each row to the function, so several
# columns of the same row can be combined: here 'registered' is compared
# against that row's 'casual' (val_1) and 'humidity' (val_2).
# The result is assigned to 'status', replacing any column of that name.
df['status'] = df.apply(
    lambda record: weird_func(
        record['registered'],
        val_1=record['casual'],
        val_2=record['humidity'],
    ),
    axis=1,
)
df.loc[:, ['registered', 'casual', 'humidity', 'status']].head()

In [None]:
# Without an axis argument, apply works column by column, so a function that
# reduces a column to a single number produces a per-column summary.
cols = [
    'season',
    'holiday',
    'workingday',
    'weather',
    'temp',
]
df[cols].apply(lambda column: column.mean())

In [None]:
# Boolean-mask selection: keep only the rows where season equals 1.
# .copy() makes sub_df an independent frame rather than a selection of df.
sub_df = df.loc[df['season'].eq(1)].copy()
# sub_df

In [None]:
# Setting values on a subset of a data frame is done with
# .loc[row_mask, column] = value. The line below is an unfinished,
# commented-out stub of that pattern:
# df.loc[df['season'] == 1, 'blABL']
# Preview the first two rows where season equals 2.
df.loc[df['season'].eq(2)].head(2)

In [None]:
# Rolling windows:
#   df.rolling(window, min_periods=None, center=False, win_type=None, on=None, axis=0, closed=None)
#   series.rolling(...) takes the same arguments.
# Window of 10 rows; min_periods=3 allows a result once 3 observations exist.

# Built-in reductions such as .mean() are the more efficient option.
df['smooth_atemp'] = df['atemp'].rolling(window=10, min_periods=3).mean()

# .apply() accepts any custom function of the window; this one recomputes the
# same mean and overwrites the column assigned just above.
df['smooth_atemp'] = (
    df['atemp']
    .rolling(window=10, min_periods=3)
    .apply(lambda window: window.mean())
)
df.loc[:, ['atemp', 'smooth_atemp']].head(10)

In [None]:
# Centering: center=True labels each window at its middle row instead of at
# its last row.
df['smooth_atemp'] = df['atemp'].rolling(window=10, center=True).mean()
df.loc[:, ['atemp', 'smooth_atemp']].head(10)

In [None]:
# Window type: win_type selects a weighting scheme for the rows in the window
# (here 'triang', a triangular window) instead of weighting them equally.
# Weighted windows rely on SciPy being installed. The weighted mean is one of
# the built-in reductions.
df['smooth_atemp'] = (
    df['atemp']
    .rolling(window=10, min_periods=0, center=True, win_type='triang')
    .mean()
)
df.loc[:, ['atemp', 'smooth_atemp']].head(10)

In [None]:
# Use the 'datetime' column as the index so that time-based windows can be
# used on the frame. Reassigning the result (instead of inplace=True) is the
# recommended pandas style and keeps the step explicit.
df = df.set_index('datetime')
df.head()

In [None]:
# Time-defined window: with a datetime index the window can be given as an
# offset string. '5h' is a trailing window ending at each timestamp (default).
# Lowercase 'h' is used because the uppercase 'H' alias is deprecated in
# recent pandas versions.
df['smooth_atemp'] = df['atemp'].rolling('5h').mean()

# Approximate a centred window by moving the labels back by half the window
# (2.5 hours). shift() documents `periods` as an integer, so the half window
# is expressed as 150 minutes rather than a fractional number of hours.
# NOTE(review): the assignment aligns on the index; if the data is sampled on
# the hour, the shifted half-hour labels will not match any row and this column
# will be all NaN — confirm, or consider rolling('5h', center=True), which is
# supported for time windows from pandas 1.3.
df['smooth_atemp_cent'] = df['atemp'].rolling('5h').mean().shift(-150, freq='min')

df[['atemp', 'smooth_atemp', 'smooth_atemp_cent']].head(10)

In [None]:
# Build a small toy time series: nine values (0..8) at one-minute intervals
# starting at 2018-05-24.
index = pd.date_range(start='2018-05-24', periods=9, freq='min')
s = pd.Series(data=range(9), index=index)
s

In [None]:
# Downsampling: regroup the series into 3-minute bins.
# .asfreq() keeps the value sitting on each new label; use an aggregation such
# as .sum() to combine all the values falling into a bin.
# '3min' replaces the deprecated 'T' alias and matches the 'min' alias used
# when the series is built.
s.resample('3min').asfreq()

In [None]:
# Upsampling: move to a finer 30-second grid. .asfreq() does not invent
# values, so the newly created timestamps hold NaN.
# Lowercase 's' is used because the uppercase 'S' alias is deprecated in
# recent pandas versions.
s.resample('30s').asfreq()

In [None]:
# Upsample to a 30-second grid and fill the resulting blanks by interpolating
# between the known values.
# Lowercase 's' is used because the uppercase 'S' alias is deprecated in
# recent pandas versions.
s.resample('30s').asfreq().interpolate()

In [None]:
# Reload the CSV, replacing the current df, and run the 'datetime' column
# through pd.to_datetime.
df = pd.read_csv('data/train_bike_sharing.csv').assign(
    datetime=lambda frame: pd.to_datetime(frame['datetime'])
)

# groupby: split the rows by their 'season' value. Displaying the result only
# shows the GroupBy object itself.
grouped = df.groupby(by='season')
grouped

In [None]:
# Show the column labels of df.
df.columns

In [None]:
# for name, group in grouped:
#     print('group name:', name, '\n\n\n\n', group)
#     break

In [None]:
# transform returns a result aligned with the original rows, so every row
# receives the mean 'atemp' of its own season. The built-in 'mean' is used
# instead of a Python lambda because built-in reductions are more efficient.
df['mean_atemp'] = df.groupby('season')['atemp'].transform('mean')
df[['season', 'mean_atemp']]

In [None]:
# Group by season and aggregate several columns at once to get a new frame:
# the mapping assigns each column its own aggregation function.
func_mapping = dict(windspeed='mean', atemp='median', holiday='sum')

grouped_2 = df.groupby(by='season').agg(func_mapping)
grouped_2

In [None]:
# Group by time: pd.Grouper buckets the 'datetime' column at a '7D' frequency,
# and each bucket is aggregated column by column according to the mapping.
func_mapping = dict(windspeed='mean', atemp='median', count='sum')

grouped_3 = df.groupby(pd.Grouper(freq='7D', key='datetime')).agg(func_mapping)
grouped_3