In [1]:
import numpy as np
import pandas as pd

### This notebook contains the following topics:
1. Window Functions
2. Aggregations
3. Missing data

#### Window Functions:
- Window functions are mainly used to reveal trends in data by smoothing the plotted curve.
- When day-to-day values vary a lot and many data points are available, one option is to sample the data and plot the samples; another is to apply a window computation and plot its results.
- Either method smooths the curve so that the underlying trend is easier to see.

#### .rolling() Function:
- Specify the window = n argument and apply the appropriate statistical function on top of it.

In [15]:
# Five daily observations (2000-01-01 .. 2000-01-05) for the four series A-D.
df = pd.DataFrame(
    data=[
        [1, 2, 3, 4],
        [5, 6, 7, 8],
        [2, 3, 7, 3],
        [6, 3, 8, 4],
        [2, 6, 3, 2],
    ],
    index=pd.date_range(start='1/1/2000', periods=5),
    columns=['A', 'B', 'C', 'D'],
)
df

Unnamed: 0,A,B,C,D
2000-01-01,1,2,3,4
2000-01-02,5,6,7,8
2000-01-03,2,3,7,3
2000-01-04,6,3,8,4
2000-01-05,2,6,3,2


In [28]:
# Sum over a sliding 3-row window; the first two rows have no full window yet, so they are NaN.
df.rolling(3).sum()

Unnamed: 0,A,B,C,D
2000-01-01,,,,
2000-01-02,,,,
2000-01-03,8.0,11.0,17.0,15.0
2000-01-04,13.0,12.0,22.0,15.0
2000-01-05,10.0,12.0,18.0,9.0


#### .expanding() function:
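- An expanding window includes every row from the start of the data up to the current row. Specify the min_periods=n argument (the minimum number of observations required before a result is produced) and apply the appropriate statistical function on top of it.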

In [27]:
# Re-display the 5-row frame so the expanding-window result can be compared against the raw values.
df

Unnamed: 0,A,B,C,D
2000-01-01,1,2,3,4
2000-01-02,5,6,7,8
2000-01-03,2,3,7,3
2000-01-04,6,3,8,4
2000-01-05,2,6,3,2


In [26]:
# Cumulative (expanding) sum that only starts reporting once 3 observations are available.
df.expanding(3).sum()

Unnamed: 0,A,B,C,D
2000-01-01,,,,
2000-01-02,,,,
2000-01-03,8.0,11.0,17.0,15.0
2000-01-04,14.0,14.0,25.0,19.0
2000-01-05,16.0,20.0,28.0,21.0


#### Aggregations:

In [10]:
# Eight daily rows x four columns, used as input for the aggregation examples.
df = pd.DataFrame(
    data=[
        [1, 2, 3, 4],
        [4, 3, 5, 7],
        [9, 7, 5, 3],
        [6, 8, 1, 6],
        [7, 2, 8, 5],
        [4, 7, 2, 9],
        [2, 5, 7, 3],
        [1, 4, 6, 2],
    ],
    index=pd.date_range(start='1/1/2000', periods=8),
    columns=list('ABCD'),
)
print(df)

            A  B  C  D
2000-01-01  1  2  3  4
2000-01-02  4  3  5  7
2000-01-03  9  7  5  3
2000-01-04  6  8  1  6
2000-01-05  7  2  8  5
2000-01-06  4  7  2  9
2000-01-07  2  5  7  3
2000-01-08  1  4  6  2


In [17]:
# Rolling window of 3 rows that produces a value as soon as 2 observations are present.
r = df.rolling(3, min_periods=2)
r

Rolling [window=3,min_periods=2,center=False,axis=0]

In [18]:
# Sum every column over each rolling window.
# The string alias 'sum' replaces np.sum: recent pandas releases (2.1+) emit a
# FutureWarning when a NumPy reducer is passed to agg; the result is the same.
r.aggregate('sum')

Unnamed: 0,A,B,C,D
2000-01-01,,,,
2000-01-02,5.0,5.0,8.0,11.0
2000-01-03,14.0,12.0,13.0,14.0
2000-01-04,19.0,18.0,11.0,16.0
2000-01-05,22.0,17.0,14.0,14.0
2000-01-06,17.0,17.0,11.0,20.0
2000-01-07,13.0,14.0,17.0,17.0
2000-01-08,7.0,16.0,15.0,14.0


In [20]:
# Aggregate a single column: select 'A' from the rolling object, then sum each window.
# The string alias 'sum' is used instead of np.sum, which recent pandas releases
# (2.1+) flag with a FutureWarning when passed to agg; the result is unchanged.

r = df.rolling(window=3, min_periods=2)
print(r['A'].aggregate('sum'))

2000-01-01     NaN
2000-01-02     5.0
2000-01-03    14.0
2000-01-04    19.0
2000-01-05    22.0
2000-01-06    17.0
2000-01-07    13.0
2000-01-08     7.0
Freq: D, Name: A, dtype: float64


In [28]:
# Aggregate several columns at once: select ['A', 'B'], then sum each window.
# The string alias 'sum' is used instead of np.sum, which recent pandas releases
# (2.1+) flag with a FutureWarning when passed to agg; the result is unchanged.

r = df.rolling(window=3, min_periods=2)
print(r[['A', 'B']].aggregate('sum'))

               A     B
2000-01-01   NaN   NaN
2000-01-02   5.0   5.0
2000-01-03  14.0  12.0
2000-01-04  19.0  18.0
2000-01-05  22.0  17.0
2000-01-06  17.0  17.0
2000-01-07  13.0  14.0
2000-01-08   7.0  16.0


In [34]:
# Show the input, then reduce each column differently over a full 3-row window:
# 'A' is summed and 'B' takes the window minimum (rows without 3 observations are NaN).
print(df)
df.rolling(window=3).aggregate(dict(A='sum', B='min'))

            A  B  C  D
2000-01-01  1  2  3  4
2000-01-02  4  3  5  7
2000-01-03  9  7  5  3
2000-01-04  6  8  1  6
2000-01-05  7  2  8  5
2000-01-06  4  7  2  9
2000-01-07  2  5  7  3
2000-01-08  1  4  6  2


Unnamed: 0,A,B
2000-01-01,,
2000-01-02,,
2000-01-03,14.0,2.0
2000-01-04,19.0,3.0
2000-01-05,22.0,2.0
2000-01-06,17.0,2.0
2000-01-07,13.0,2.0
2000-01-08,7.0,4.0


In [30]:
# Apply multiple functions to a single column of a DataFrame.
# min_periods=1 means even the first row yields a value. The string aliases give
# the same 'sum' / 'mean' column labels as np.sum / np.mean without the
# FutureWarning that recent pandas releases (2.1+) raise for NumPy callables.

r = df.rolling(window=3, min_periods=1)
print(r['A'].aggregate(['sum', 'mean']))

             sum      mean
2000-01-01   1.0  1.000000
2000-01-02   5.0  2.500000
2000-01-03  14.0  4.666667
2000-01-04  19.0  6.333333
2000-01-05  22.0  7.333333
2000-01-06  17.0  5.666667
2000-01-07  13.0  4.333333
2000-01-08   7.0  2.333333


In [32]:
# Apply multiple functions to multiple columns of a DataFrame.
# The result has a two-level column index: (column, function). String aliases are
# used instead of np.sum / np.mean, which recent pandas releases (2.1+) flag with
# a FutureWarning when passed to agg; labels and values are unchanged.

r = df.rolling(window=3, min_periods=1)
print(r[['A', 'B']].aggregate(['sum', 'mean']))

               A               B          
             sum      mean   sum      mean
2000-01-01   1.0  1.000000   2.0  2.000000
2000-01-02   5.0  2.500000   5.0  2.500000
2000-01-03  14.0  4.666667  12.0  4.000000
2000-01-04  19.0  6.333333  18.0  6.000000
2000-01-05  22.0  7.333333  17.0  5.666667
2000-01-06  17.0  5.666667  17.0  5.666667
2000-01-07  13.0  4.333333  14.0  4.666667
2000-01-08   7.0  2.333333  16.0  5.333333


In [35]:
# Apply a different function to each column: rolling sum for 'A', rolling mean for 'B'.
# String aliases are used instead of np.sum / np.mean, which recent pandas releases
# (2.1+) flag with a FutureWarning when passed to agg; the result is unchanged.

r = df.rolling(window=3, min_periods=1)
print(r.aggregate({'A': 'sum', 'B': 'mean'}))

               A         B
2000-01-01   1.0  2.000000
2000-01-02   5.0  2.500000
2000-01-03  14.0  4.000000
2000-01-04  19.0  6.000000
2000-01-05  22.0  5.666667
2000-01-06  17.0  5.666667
2000-01-07  13.0  4.666667
2000-01-08   7.0  5.333333


#### Missing data:

In [2]:
# Random 5x3 frame on a sparse label index ('b', 'd' and 'g' are deliberately absent).
# A locally seeded RandomState makes the values identical on every Restart & Run All;
# the unseeded np.random.randn call produced different numbers on each run.
df = pd.DataFrame(
    np.random.RandomState(0).randn(5, 3),
    index=['a', 'c', 'e', 'f', 'h'],
    columns=['one', 'two', 'three'],
)
print(df)

        one       two     three
a -0.715688  0.434351 -0.148183
c  0.313286  0.628856 -0.835151
e  0.454992 -0.633485 -0.605200
f  0.402142  1.297945  0.043444
h  0.583053  0.683130 -0.428688


In [5]:
# Re-label onto the full a..h index; labels that were absent become all-NaN rows.
df = df.reindex(index=list('abcdefgh'))
print(df)

        one       two     three
a -0.715688  0.434351 -0.148183
b       NaN       NaN       NaN
c  0.313286  0.628856 -0.835151
d       NaN       NaN       NaN
e  0.454992 -0.633485 -0.605200
f  0.402142  1.297945  0.043444
g       NaN       NaN       NaN
h  0.583053  0.683130 -0.428688


In [6]:
# Boolean mask for the entire DataFrame: True wherever a value is missing.
df.isna()

Unnamed: 0,one,two,three
a,False,False,False
b,True,True,True
c,False,False,False
d,True,True,True
e,False,False,False
f,False,False,False
g,True,True,True
h,False,False,False


In [10]:
# Same check restricted to a single column: True where 'one' is missing.
df.loc[:, 'one'].isnull()

a    False
b     True
c    False
d     True
e    False
f    False
g     True
h    False
Name: one, dtype: bool

In [11]:
# Inverse check: True where 'one' holds an actual value.
df['one'].notna()

a     True
b    False
c     True
d    False
e     True
f     True
g    False
h     True
Name: one, dtype: bool

#### Calculations with Missing Data:
- When summing data, NA values are skipped, i.e. treated as zero.
- If the data are all NA, the sum is 0 (pandas 0.22 and later); pass `min_count=1` to `sum()` to get NA instead.

In [12]:
# Summing the boolean mask counts the True entries: 'one' has 3 missing values.
df['one'].isna().sum()

3

#### Cleaning / Filling Missing Data:
- Pandas provides various methods for cleaning the missing values. 
- The fillna function can “fill in” NA values with non-null data in a couple of ways.

In [13]:
# Random 3x3 frame labelled a/c/e, then re-labelled to a/b/c: row 'b' did not
# exist and becomes all-NaN, and row 'e' is dropped.
# A locally seeded RandomState keeps the values identical on every Restart & Run All;
# the unseeded np.random.randn call produced different numbers on each run.
df = pd.DataFrame(
    np.random.RandomState(1).randn(3, 3),
    index=['a', 'c', 'e'],
    columns=['one', 'two', 'three'],
)
df = df.reindex(['a', 'b', 'c'])
print(df)

        one       two     three
a  0.301379  1.704607 -0.926052
b       NaN       NaN       NaN
c -1.821765  1.124366  1.654936


In [17]:
# Replace every NaN with the scalar 0 (returns a new frame; df itself is untouched).

print(df.fillna(value=0))

        one       two     three
a  0.301379  1.704607 -0.926052
b  0.000000  0.000000  0.000000
c -1.821765  1.124366  1.654936


In [20]:
# Forward fill: propagate the last valid observation down into each NaN
# (use df.bfill() to fill backward from the next valid observation instead).
# DataFrame.ffill() replaces fillna(method='ffill'); the `method` keyword of
# fillna is deprecated since pandas 2.1.

df.ffill()

Unnamed: 0,one,two,three
a,0.301379,1.704607,-0.926052
b,0.301379,1.704607,-0.926052
c,-1.821765,1.124366,1.654936


In [21]:
# 'pad' is only another name for forward fill, so fillna(method='pad') and
# fillna(method='ffill') give the same frame. The `method` keyword is deprecated
# since pandas 2.1; DataFrame.ffill() is the supported spelling for both.
df.ffill()

Unnamed: 0,one,two,three
a,0.301379,1.704607,-0.926052
b,0.301379,1.704607,-0.926052
c,-1.821765,1.124366,1.654936


In [26]:
# Frame with a different NaN pattern per column: 'C' is entirely NaN, 'D' has none.
df1 = pd.DataFrame(
    data=[
        [np.nan, 2, np.nan, 0],
        [3, 4, np.nan, 1],
        [np.nan, np.nan, np.nan, 5],
        [np.nan, 3, np.nan, 4],
    ],
    columns=['A', 'B', 'C', 'D'],
)
print(df1)

     A    B   C  D
0  NaN  2.0 NaN  0
1  3.0  4.0 NaN  1
2  NaN  NaN NaN  5
3  NaN  3.0 NaN  4


In [28]:
# Per-column fill values: NaN in 'A' -> 0, 'B' -> 1, 'C' -> 2, 'D' -> 3.

values = dict(A=0, B=1, C=2, D=3)
df1.fillna(values)

Unnamed: 0,A,B,C,D
0,0.0,2.0,2.0,0
1,3.0,4.0,2.0,1
2,0.0,1.0,2.0,5
3,0.0,3.0,2.0,4


In [31]:
# limit=1 caps the fill at one NaN per column (the first one in that column),
# not one NaN in the whole frame; later NaNs in the same column stay missing.

print(df1)
df1.fillna(values, limit=1)

     A    B   C  D
0  NaN  2.0 NaN  0
1  3.0  4.0 NaN  1
2  NaN  NaN NaN  5
3  NaN  3.0 NaN  4


Unnamed: 0,A,B,C,D
0,0.0,2.0,2.0,0
1,3.0,4.0,,1
2,,1.0,,5
3,,3.0,,4


#### Dropping missing values:
- Use the 'dropna' function to drop missing values; its axis argument selects whether rows or columns are dropped.
- By default, axis=0, i.e., row-wise: if any value within a row is NA, the whole row is excluded.

In [32]:
# Small frame with a missing string (NaN) in 'toy' and missing dates (NaT) in 'born'.
df = pd.DataFrame(
    {
        "name": ['Alfred', 'Batman', 'Catwoman'],
        "toy": [np.nan, 'Batmobile', 'Bullwhip'],
        "born": [pd.NaT, pd.Timestamp("1940-04-25"), pd.NaT],
    }
)
df

Unnamed: 0,name,toy,born
0,Alfred,,NaT
1,Batman,Batmobile,1940-04-25
2,Catwoman,Bullwhip,NaT


In [33]:
# Row-wise drop with the defaults spelled out: remove every row that has any missing value.
df.dropna(axis=0, how='any')

Unnamed: 0,name,toy,born
1,Batman,Batmobile,1940-04-25


In [34]:
# Column-wise drop: remove every column that has any missing value (axis=1 is 'columns').
df.dropna(axis=1)

Unnamed: 0,name
0,Alfred
1,Batman
2,Catwoman


In [38]:
# how='all' drops a row only when every element in it is missing (no such row here).
print(df.dropna(axis=0, how='all'), '\n')
# how='any' drops a row as soon as a single element in it is missing.
print(df.dropna(axis=0, how='any'))

       name        toy       born
0    Alfred        NaN        NaT
1    Batman  Batmobile 1940-04-25
2  Catwoman   Bullwhip        NaT 

     name        toy       born
1  Batman  Batmobile 1940-04-25


In [39]:
# thresh=2 keeps a row only if it holds at least 2 non-NA values.

df.dropna(axis='index', thresh=2)

Unnamed: 0,name,toy,born
1,Batman,Batmobile,1940-04-25
2,Catwoman,Bullwhip,NaT


In [43]:
# Only 'name' and 'born' are inspected for missing values; a NaN in 'toy' is ignored.

df.dropna(axis=0, subset=['name', 'born'])

Unnamed: 0,name,toy,born
1,Batman,Batmobile,1940-04-25


#### Replace Missing (or) Generic Values:

- Many times, we have to replace a generic value with some specific value. We can achieve this by applying the 'replace' method.
- Replacing NA with a scalar value using 'replace' is equivalent to using the fillna() function.

In [69]:
# Integer Series used for the scalar `to_replace` / `value` example.

s = pd.Series(data=[0, 2, 3, 4, 5])
s

0    0
1    2
2    3
3    4
4    5
dtype: int64

In [70]:
# Every 0 becomes 1; all other values are left as they are.
s.replace(0, 1)

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [72]:
# Mixed int/str Series (object dtype) used to contrast dict vs. scalar `to_replace`.
s1 = pd.Series(data=[10, 'a', 'a', 'b', 'a'])
s1

0    10
1     a
2     a
3     b
4     a
dtype: object

In [75]:
# Dict form: the key 'a' is the value to find, and None is what replaces it.
s1.replace(to_replace={'a': None})

0      10
1    None
2    None
3       b
4    None
dtype: object

When a dict is passed as `to_replace`, its keys are the values to look for and its values are the replacements, so the dict's values play the role of the `value` parameter.
- ``s.replace({'a': None})`` is equivalent to
- ``s.replace(to_replace={'a': None}, value=None, method=None)``

In [74]:
# Legacy `s1.replace('a', None)` (pandas < 1.4) ignored the explicit None and
# padded instead: each 'a' took the previous value. From pandas 1.4 on, the same
# call puts None in those rows, so it no longer shows the padding behaviour.
# Masking the 'a' entries and forward-filling gives the padded result on every version.
s1.mask(s1 == 'a').ffill()

0    10
1    10
2    10
3     b
4     b
dtype: object

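In pandas versions before 1.4, an explicit ``value=None`` was silently ignored: when `to_replace` is a scalar, list or tuple and no `value` is given, `replace` uses the method parameter (default 'pad') to do the
replacement. So this is why the 'a' values are being replaced by 10 in rows 1 and 2 and 'b' in row 4 in this case.
- In those versions, the command ``s.replace('a', None)`` is actually equivalent to
- ``s.replace(to_replace='a', value=None, method='pad')``
- From pandas 1.4 onward an explicit ``value=None`` is respected, so ``s.replace('a', None)`` puts ``None`` in rows 1, 2 and 4 instead; the `method` keyword itself is deprecated since pandas 2.1.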

In [50]:
# Two integer columns and one string column for the DataFrame.replace examples.
df = pd.DataFrame(
    {
        'A': [0, 1, 2, 3, 4],
        'B': [0, 6, 7, 8, 9],
        'C': ['a', 'b', 'c', 'd', 'e'],
    }
)
df

Unnamed: 0,A,B,C
0,0,0,a
1,1,6,b
2,2,7,c
3,3,8,d
4,4,9,e


In [51]:
# Scalar -> scalar: every 0 anywhere in the frame becomes 100.
df.replace(to_replace=0, value=100)

     A    B  C
0  100  100  a
1    1    6  b
2    2    7  c
3    3    8  d
4    4    9  e


In [52]:
# List-like `to_replace` with a scalar `value`: each of 1, 2, 3 and 4 becomes 200.
df.replace(to_replace=[1, 2, 3, 4], value=200)

Unnamed: 0,A,B,C
0,0,0,a
1,200,6,b
2,200,7,c
3,200,8,d
4,200,9,e


In [54]:
# Two equal-length lists are paired by position: 1 -> 100, 2 -> 200, 3 -> 300, 4 -> 400.
df.replace(to_replace=[1, 2, 3, 4], value=[100, 200, 300, 400])

Unnamed: 0,A,B,C
0,0,0,a
1,100,6,b
2,200,7,c
3,300,8,d
4,400,9,e


In [55]:
# Flat dict of {old: new} applied across the whole frame: 0 -> 10 and 1 -> 100.
df.replace(to_replace={0: 10, 1: 100})

Unnamed: 0,A,B,C
0,10,10,a
1,100,6,b
2,2,7,c
3,3,8,d
4,4,9,e


In [58]:
# {column: old} with a scalar `value`: replace 0 in column 'A' and 9 in column 'B' with 100.
df.replace(to_replace={'A': 0, 'B': 9}, value=100)

Unnamed: 0,A,B,C
0,100,0,a
1,1,6,b
2,2,7,c
3,3,8,d
4,4,100,e


In [62]:
# Nested dict {column: {old: new}}: each column gets its own replacement mapping.
df.replace(
    to_replace={
        'A': {0: 100, 4: 400},
        'B': {7: 700, 8: 800},
    }
)

Unnamed: 0,A,B,C
0,100,0,a
1,1,6,b
2,2,700,c
3,3,800,d
4,400,9,e
