# Learning material

This notebook is for handy code snippets.

In [1]:
import pandas as pd
import numpy as np

### Error handling

#### Try, except

In [2]:
total_marks = int(input('Enter total marks'))
num_sections = int(input('Enter number of sections'))
try:
    marks_per_section = total_marks / num_sections
except ZeroDivisionError:
    print('The number of sections cannot equal 0')
else:
    print(marks_per_section)

The number of sections cannot equal 0


#### Assert

In [3]:
total_marks = int(input('Enter total marks'))
num_sections = int(input('Enter number of sections'))
assert(num_sections != 0), 'The number of sections cannot equal 0' 
marks_per_section = total_marks / num_sections

AssertionError: The number of sections cannot equal 0

### Pandas

#### String formatting

In [None]:
a = 'Rachel'
b = 'Georgia'
d = 'Charlotte'
"The story of {0}, {1}, and {c}".format(a, b, c=d)

#### Shift

In [None]:
df = pd.DataFrame({'balance': [15, 21, 23, 24], 
                  'value': [5, 6, 2, 1],})

cols = ['value', 'balance']
df = df[cols]

df['manual balance'] = df['balance'].shift() + df['value']
df['check balance'] = df['balance'] == df['manual balance']
df

#### Cumulative sum and groupby

In [None]:
test = pd.DataFrame({'period': [1,1,1,2,2,3,3],
              'cost': [10,20,15,10,5,20,5]})

test = test[['period', 'cost']]
test['cumulative_sum'] = test.groupby('period')['cost'].cumsum()
test

#### Index of minimum/maximum

In [None]:
df = pd.DataFrame(np.random.random(size=(5, 10)), columns=list('abcdefghij'))
df

In [None]:
df.sum(axis=0).idxmin()

#### Creating random numbers

In [None]:
df = pd.DataFrame(data = {'A': list(range(1, 101)), 
                     'B': np.random.randint(low = 0, high = 100, size = 100)})
df.head(10)

In [None]:
pd.DataFrame(data = np.random.random(size = (10, 5)), 
            columns = ['oh','laddergoat', 'you', 'so', 'random'])

In [None]:
# Keep same random numbers
np.random.seed(99)
df = pd.DataFrame(data = np.random.random(size = (10, 5)),
             columns = list('abcde'))
df

#### Assigning values in a dataframe to bins

In [57]:
df = pd.DataFrame(data = {'A': np.random.randint(low = 0, high = 100, size = 15), 
                     'B': np.random.randint(low = 0, high = 100, size = 15)})
df

Unnamed: 0,A,B
0,60,21
1,23,26
2,47,61
3,55,84
4,84,21
5,16,90
6,49,34
7,81,19
8,41,2
9,74,18


In [58]:
my_range = np.arange(0,101,10)
my_range

array([  0,  10,  20,  30,  40,  50,  60,  70,  80,  90, 100])

In [59]:
pd.cut(df['A'],my_range)

0      (50, 60]
1      (20, 30]
2      (40, 50]
3      (50, 60]
4      (80, 90]
5      (10, 20]
6      (40, 50]
7      (80, 90]
8      (40, 50]
9      (70, 80]
10      (0, 10]
11     (60, 70]
12    (90, 100]
13     (40, 50]
14    (90, 100]
Name: A, dtype: category
Categories (10, interval[int64]): [(0, 10] < (10, 20] < (20, 30] < (30, 40] ... (60, 70] < (70, 80] < (80, 90] < (90, 100]]

#### Conditional formatting

##### Highlight null values

In [None]:
df = pd.DataFrame(data = np.random.randint(low = 0, high = 9, size = (10, 5)), 
                  columns = list('abcde'))
df = df.replace(0, np.nan)
df.style.highlight_null(null_color='#ccccff')

##### Create heatmap

In [None]:
import seaborn as sns
df = pd.DataFrame(data = np.random.random(size = (10, 5)), 
                  columns = list('abcde'))
cm = sns.light_palette("green", as_cmap=True)
s = df.style.background_gradient(cmap=cm)
s

##### Highlight min and max

In [None]:
df.style.highlight_max(axis=0, color = 'pink')
# df.style.highlight_min(axis=0, color = 'blue')

#### Stack and unstack

In [None]:
df = pd.DataFrame({'a': (1.0, 2.0),
                  'b': (3.0, 4.0)})
df

In [None]:
s = df.stack()
s

In [None]:
s.unstack()

##### Creating multi-index dataframes

In [None]:
index = pd.MultiIndex.from_tuples([('one', 'a'), ('one', 'b'),
                                    ('two', 'a'), ('two', 'b')])
s = pd.Series(np.arange(1.0, 5.0), index=index)
s

#### Lambda expressions vs custom functions

In [54]:
df = pd.DataFrame({'a': range(1, 4),
                   'b': range(5, 8)})
df

Unnamed: 0,a,b
0,1,5
1,2,6
2,3,7


In [55]:
# Apply/transform using lambda function
df.apply(lambda x: x + 1)
df.transform(lambda x: x + 1)

# Note - both of these seem to behave as if inplace = False were selected

Unnamed: 0,a,b
0,2,6
1,3,7
2,4,8


In [56]:
# Applying transform using custom function
def add_one(s):
    s += 1
    return s

df.apply(add_one)
df.transform(add_one)

# Note - both of these seem to behave as if inplace = True were selected 

Unnamed: 0,a,b
0,3,7
1,4,8
2,5,9


#### Apply vs transform

#### Rolling

In [62]:
df = pd.DataFrame({'period': [1,1,1,2,2,3,3],
              'cost': [10,20,15,10,5,20,5]})
df

Unnamed: 0,cost,period
0,10,1
1,20,1
2,15,1
3,10,2
4,5,2
5,20,3
6,5,3


In [67]:
df['cost'].rolling(3).mean()

0          NaN
1          NaN
2    15.000000
3    15.000000
4    10.000000
5    11.666667
6    10.000000
Name: cost, dtype: float64

In [68]:
df['cost'].rolling(2).sum()

0     NaN
1    30.0
2    35.0
3    25.0
4    15.0
5    25.0
6    25.0
Name: cost, dtype: float64

#### Resampling time series

In [70]:
index = pd.date_range('1/1/2000', periods=9, freq='T')
s = pd.Series(range(9), index=index)
s

2000-01-01 00:00:00    0
2000-01-01 00:01:00    1
2000-01-01 00:02:00    2
2000-01-01 00:03:00    3
2000-01-01 00:04:00    4
2000-01-01 00:05:00    5
2000-01-01 00:06:00    6
2000-01-01 00:07:00    7
2000-01-01 00:08:00    8
Freq: T, dtype: int64

In [73]:
# Downsampling into three minute intervals
s.resample('3T').sum()

2000-01-01 00:00:00     3
2000-01-01 00:03:00    12
2000-01-01 00:06:00    21
Freq: 3T, dtype: int64

In [76]:
# Upsampling into thirty second intervals
s.resample('30S').asfreq().head(5)

2000-01-01 00:00:00    0.0
2000-01-01 00:00:30    NaN
2000-01-01 00:01:00    1.0
2000-01-01 00:01:30    NaN
2000-01-01 00:02:00    2.0
Freq: 30S, dtype: float64