In [2]:
import pandas as pd
import numpy as np
# Load the weather CSV, parse the 'day' column as datetimes, and use it as the index.
df = pd.read_csv('data/weather_data.csv', parse_dates=['day']).set_index('day')

In [124]:
# Display the loaded frame (indexed by 'day') before any missing-value handling.
df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2022-01-01,32.0,6.0,Rain
2022-01-04,,9.0,Sunny
2022-01-05,28.0,,Snow
2022-01-06,,7.0,
2022-01-07,32.0,,Rain
2022-01-08,,,Sunny
2022-01-09,,,
2022-01-10,34.0,8.0,Cloudy
2022-01-11,40.0,12.0,Sunny


In [125]:
# fillna() accepts a per-column mapping: each key names a column and each
# value is the filler used for that column's missing entries.
new_df = df.fillna(value=dict(temperature=0, windspeed=0, event='No event'))
new_df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2022-01-01,32.0,6.0,Rain
2022-01-04,0.0,9.0,Sunny
2022-01-05,28.0,0.0,Snow
2022-01-06,0.0,7.0,No event
2022-01-07,32.0,0.0,Rain
2022-01-08,0.0,0.0,Sunny
2022-01-09,0.0,0.0,No event
2022-01-10,34.0,8.0,Cloudy
2022-01-11,40.0,12.0,Sunny


In [126]:
# https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.ffill.html
# ffill carries the last valid observation forward to fill NaNs/NAs.
# fillna(method="ffill") is deprecated (pandas >= 2.1); DataFrame.ffill() is
# the supported spelling and gives the same result.
new_df = df.ffill()

new_df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2022-01-01,32.0,6.0,Rain
2022-01-04,32.0,9.0,Sunny
2022-01-05,28.0,9.0,Snow
2022-01-06,28.0,7.0,Snow
2022-01-07,32.0,7.0,Rain
2022-01-08,32.0,7.0,Sunny
2022-01-09,32.0,7.0,Sunny
2022-01-10,34.0,8.0,Cloudy
2022-01-11,40.0,12.0,Sunny


In [127]:
# Standard deviation of the temperature over the first three rows.
new_df['temperature'].iloc[:3].std()

2.3094010767585034

In [128]:
# Fill NaN values using an interpolation method.
# method='time' weights by the real gap between index timestamps, so missing
# calendar days are taken into account: if the temperature changes linearly,
# the missing temp on 2022-01-04 should be closer to 2022-01-05 than to
# 2022-01-01 and NOT equally spaced between the two.
#
# Only the numeric columns are interpolated: calling interpolate() on a frame
# that still contains an object column ('event') is deprecated in recent
# pandas releases, and text cannot be interpolated anyway, so 'event' is
# copied through untouched.
numeric_cols = df.select_dtypes(include='number').columns
new_df = df.copy()
new_df[numeric_cols] = df[numeric_cols].interpolate(method='time')

new_df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2022-01-01,32.0,6.0,Rain
2022-01-04,29.0,9.0,Sunny
2022-01-05,28.0,8.0,Snow
2022-01-06,30.0,7.0,
2022-01-07,32.0,7.25,Rain
2022-01-08,32.666667,7.5,Sunny
2022-01-09,33.333333,7.75,
2022-01-10,34.0,8.0,Cloudy
2022-01-11,40.0,12.0,Sunny


In [129]:
# dropna works along a chosen axis; axis=0 (the default) drops every row
# that has at least one missing value.
new_df = df.dropna(axis=0, how='any')
new_df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2022-01-01,32.0,6.0,Rain
2022-01-10,34.0,8.0,Cloudy
2022-01-11,40.0,12.0,Sunny


In [130]:
# dropna along the column axis (axis=1 / 'columns') removes every column
# that contains a missing value -- here only 'windspeed' has a NaN.
example = dict(
    data=['3/1/2022', '3/2/2022', '3/3/2022'],
    temperature=[65, 71, 72],
    windspeed=[12, 8, np.nan],
    event=['Sunny', 'Sunny', 'Sunny'],
)
example_df = pd.DataFrame(example).dropna(axis='columns')
example_df

Unnamed: 0,data,temperature,event
0,3/1/2022,65,Sunny
1,3/2/2022,71,Sunny
2,3/3/2022,72,Sunny


In [131]:
df.dropna(thresh=1) # thresh = minimum number of non-NA values a row needs in order to be kept
# note: dropna returns a new frame and leaves df itself unchanged; assign the
# result (or pass inplace=True) if the drop should persist

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2022-01-01,32.0,6.0,Rain
2022-01-04,,9.0,Sunny
2022-01-05,28.0,,Snow
2022-01-06,,7.0,
2022-01-07,32.0,,Rain
2022-01-08,,,Sunny
2022-01-10,34.0,8.0,Cloudy
2022-01-11,40.0,12.0,Sunny


In [154]:
# How do we insert the missing dates?
# First create a DatetimeIndex using pd.date_range. ISO 8601 (YYYY-MM-DD)
# strings are used because '01-11-2022' is ambiguous to a reader
# (11 January vs 1 November).
dt = pd.date_range(start='2022-01-01', end='2022-01-11')
# dt will be a range of equally spaced time points, type DatetimeIndex

# reindex() conforms Series/DataFrame to new index with optional filling logic.
# Places NA/NaN in locations having no value in the previous index:
new_df = df.reindex(dt)

In [143]:
# Display the reindexed frame.
# NOTE(review): the saved output below looks stale -- it shows filled-in
# values (e.g. 29.0 on 2022-01-04) and stops at 2022-01-10, which reindexing
# the raw df over 2022-01-01..2022-01-11 would not produce. Re-run from a
# fresh kernel to confirm.
new_df

Unnamed: 0,temperature,windspeed,event
2022-01-01,32.0,6.0,Rain
2022-01-02,,,
2022-01-03,,,
2022-01-04,29.0,9.0,Sunny
2022-01-05,28.0,8.0,Snow
2022-01-06,30.0,7.0,
2022-01-07,32.0,7.25,Rain
2022-01-08,32.666667,7.5,Sunny
2022-01-09,33.333333,7.75,
2022-01-10,34.0,8.0,Cloudy


In [156]:
# DataFrame.replace(to_replace=, value=, ...)
# When a value is supplied, a dict given as to_replace maps a column name to
# the values to look for inside that column.
# ex: find 'Sunny', 'Rain', and np.nan in column 'event' and turn them into 'Cloudy'
new_df = new_df.replace(
    to_replace={'event': ['Sunny', 'Rain', np.nan]},
    value='Cloudy',
)
new_df

Unnamed: 0,temperature,windspeed,event
2022-01-01,32.0,6.0,Cloudy
2022-01-02,,,Cloudy
2022-01-03,,,Cloudy
2022-01-04,29.0,9.0,Cloudy
2022-01-05,28.0,8.0,Snow
2022-01-06,30.0,7.0,Cloudy
2022-01-07,32.0,7.25,Cloudy
2022-01-08,32.666667,7.5,Cloudy
2022-01-09,33.333333,7.75,Cloudy
2022-01-10,34.0,8.0,Cloudy


In [157]:
# Without a value argument, the dict is read as a find -> replace_with
# mapping: every occurrence of a key is swapped for its associated value.
new_df = new_df.replace(to_replace={32: 'thirty two', np.nan: 'Not a number'})
new_df

Unnamed: 0,temperature,windspeed,event
2022-01-01,thirty two,6.0,Cloudy
2022-01-02,Not a number,Not a number,Cloudy
2022-01-03,Not a number,Not a number,Cloudy
2022-01-04,29.0,9.0,Cloudy
2022-01-05,28.0,8.0,Snow
2022-01-06,30.0,7.0,Cloudy
2022-01-07,thirty two,7.25,Cloudy
2022-01-08,32.666667,7.5,Cloudy
2022-01-09,33.333333,7.75,Cloudy
2022-01-10,34.0,8.0,Cloudy


In [163]:
# Suppose the values carry units that need to be trimmed off.
# A regex can strip them, but it must only be applied to certain columns.
new_df = pd.read_csv(
    'data/weather_data_with_units.csv',
    parse_dates=['day'],
).set_index('day')
new_df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2022-01-01,32 F,6 mph,Snow
2022-01-02,27 F,11 mph,Sunny
2022-01-03,20 F,14 mph,Sunny


In [164]:
# Because a "value" arg ('') IS provided, the dictionary specifies which
# pattern to look for in which column -- it is NOT a find:replace_with mapping.
# The pattern also consumes the whitespace in front of the unit; matching the
# letters alone would turn '32 F' into '32 ' (trailing space left behind).
unit_pattern = r'\s*[A-Za-z]+'
new_df = new_df.replace(
    {'temperature': unit_pattern, 'windspeed': unit_pattern},
    '',
    regex=True,
)
new_df # Notice event column is intact but units have been removed

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2022-01-01,32,6,Snow
2022-01-02,27,11,Sunny
2022-01-03,20,14,Sunny


In [166]:
# Small frame pairing each student with a textual score.
df = pd.DataFrame(
    data=dict(
        score=['great', 'good', 'neutral', 'bad', 'neutral'],
        student=['diana', 'conrad', 'edison', 'bird', 'kelly'],
    )
)
df

Unnamed: 0,score,student
0,great,diana
1,good,conrad
2,neutral,edison
3,bad,bird
4,neutral,kelly


In [168]:
# How do we replace the scores with their respective numeric values?
# A nested dict {column: {old: new}} restricts the replacement to the 'score'
# column; a bare list-to-list replace scans every column, so a student who
# happened to be named 'good' or 'bad' would be overwritten too.
df = df.replace({'score': {'bad': 1, 'neutral': 2, 'good': 3, 'great': 4}})
df

Unnamed: 0,score,student
0,4,diana
1,3,conrad
2,2,edison
3,1,bird
4,2,kelly


In [4]:
s = pd.Series(data=[3, 1, 2, 3, 4, np.nan])

# value_counts() tallies occurrences, much like Counter in python;
# dropna=False keeps NaN as its own entry in the tally.
s.value_counts(dropna=False)

3.0    2
1.0    1
2.0    1
4.0    1
NaN    1
dtype: int64

In [5]:
# Small animal frame, indexed by animal name.
df = pd.DataFrame(
    data={'num_legs': [2, 4, 4, 6], 'num_wings': [2, 0, 0, 0]},
    index=['falcon', 'dog', 'cat', 'ant'],
)

df

Unnamed: 0,num_legs,num_wings
falcon,2,2
dog,4,0
cat,4,0
ant,6,0


In [11]:
# DataFrame.value_counts() reports how many times each distinct row occurs.
# For example, two rows of the frame have num_legs == 4 and num_wings == 0,
# so the combination (4, 0) is listed with a count of 2.
df.value_counts()

num_legs  num_wings
4         0            2
2         2            1
6         0            1
dtype: int64

In [12]:
# note - the result is a Series (its index is a MultiIndex, one level per column)
print(type(df.value_counts()))

<class 'pandas.core.series.Series'>


In [13]:
# Print the index of the counts Series to inspect its levels.
print(df.value_counts().index)

MultiIndex([(4, 0),
            (2, 2),
            (6, 0)],
           names=['num_legs', 'num_wings'])
