In [1]:
#Download libraries
import pandas as pd
import altair as alt
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [2]:
#Download the data
df = pd.read_csv('../data/train_data.zip')
df.head()

Unnamed: 0,external_id,month,year,monthly_number_of_sessions,monthly_unique_sessions,monthly_repeated_sessions,monthly_avg_length_of_session,monthly_avg_light_activity,monthly_avg_moderate_activity,monthly_avg_vigorous_activity,...,avg_wind_9_10,avg_wind_10_11,avg_wind_11_12,avg_wind_12_above,perfect_days,unacast_session_count,hpi,state_and_local_amount_per_capita,state_amount_per_capita,local_amount_per_capita
0,1900203,3,2019,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,78.0,323.61,0.132207,0.018519,0.113688
1,1900203,6,2018,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,4.0,111.0,323.61,0.132207,0.018519,0.113688
2,1900203,8,2018,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,110.0,323.61,0.132207,0.018519,0.113688
3,MR00101775,1,2019,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,10.0,110.38,0.076247,0.011966,0.064281
4,MR00101775,8,2019,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,11.0,110.38,0.076247,0.011966,0.064281


## Preprocessing script

In [181]:
def preprocess_weather(input_data):
    """
    Given the original dataframe, preprocess the columns
    related to weather information (`Democrats_08_Votes` to
    the end + `climate`). Impute NaN of `Number_of_holidays` 
    by using the values the we have for the same month,
    impute Nan of `Green_2016` by using values found online, or 0, 
    and replace remaining NaN values with 0.
    
    Parameters
    ----------
    input_data : pandas.core.frame.DataFrame
    
    Returns
    -------
    output_data : pandas.core.frame.DataFrame
    
    """
    
    #drop the rows with NaNs in the 'unacast_session_count` column
    input_data = input_data.dropna(subset=['unacast_session_count'])
    
    df_weather = input_data.loc[:, 'Democrats_08_Votes':]
    df_weather['state'] = input_data['state']
    df_weather['climate'] = input_data['climate']
    df_weather['external_id'] = input_data['external_id']
    df_weather['month'] = input_data['month']
    df_weather['year'] = input_data['year']
    
    
    #fill up NaNs for `Number_of_holidays` column
    #I sorted the values so that the values are ordered by time, and the NaNs are at the end fo each time period
    df_weather = df_weather.sort_values(['month', 'year', 'Number_of_holidays'])
    df_weather['Number_of_holidays'] = df_weather['Number_of_holidays'].fillna(method='ffill')
    
    #fill up NaNs for the `Green_2016` column
    #I only found values for Alaska and North Carolina, so I just put 0 for the other states
    df_weather['Green_2016'] = np.where(
     df_weather['state'] == 'Alaska', 5735, 
         np.where(
            df_weather['state'] == 'North Carolina', 12105,  
             np.where(
                df_weather['Green_2016'].isnull(), 0, df_weather['Green_2016'] 
             )
         )
    )
    
    #Substitute every remaining NaNs by 0
    df_weather = df_weather.fillna(value=0)
    
    output_data = input_data.copy()
    
    return df_weather

In [182]:
clean_df = clean_weather(df)

In [184]:
clean_df.shape

(49503, 73)

In [161]:
clean_weather(df_weather)['Number_of_holidays'].isnull().sum()/50120

0.0

In [162]:
for month in range(12):
    for year in [2018, 2019]:
        df_jan_19 = clean_df[(clean_df['month'] == month+1) & (clean_df['year'] == year)]
        print(df_jan_19['Number_of_holidays'].unique())

[]
[2.]
[1.]
[1.]
[1.]
[1.]
[1.]
[1.]
[1.]
[1.]
[1.]
[1.]
[1.]
[1.]
[1.]
[1.]
[1.]
[1.]
[1.]
[]
[2.]
[]
[1.]
[]


In [163]:
count_missing_green_state = clean_df[['state', 'Green_2016']][clean_df.isnull()['Green_2016']]['state'].value_counts()

In [164]:
count_missing_green_state

Series([], Name: state, dtype: int64)

In [169]:
for state in ['Indiana']:
    print(clean_df[clean_df['state'] == state]['Green_2016'])

253      0.0
579      0.0
1727     0.0
2802     0.0
2927     0.0
        ... 
42804    0.0
42904    0.0
44680    0.0
46744    0.0
47546    0.0
Name: Green_2016, Length: 720, dtype: float64


In [166]:
Alaska : 5,735
Oklahoma : 0
South Dakota : 0
Nevada : 0
Indiana : 0
North Carolina : 12,105
Georgia : 0

SyntaxError: invalid syntax (<ipython-input-166-4742b228b739>, line 1)

In [167]:
clean_df[['state', 'Green_2016']]

Unnamed: 0,state,Green_2016
3,Mississippi,0.255869
48,Florida,0.642501
89,Kentucky,0.856801
111,Texas,0.895386
183,Texas,0.604434
...,...,...
43630,Washington,1.792781
46263,Hawaii,3.869375
47728,North Carolina,12105.000000
47948,Florida,0.602045


In [168]:
df_weather[['state', 'Green_2016']]

Unnamed: 0,state,Green_2016
0,Virginia,0.966469
1,Virginia,0.966469
2,Virginia,0.966469
3,Mississippi,0.255869
4,Mississippi,0.255869
...,...,...
50115,Texas,0.684419
50116,Texas,0.814837
50117,Texas,1.131838
50118,Louisiana,0.347299


In [176]:
clean_df.shape

(50120, 73)