# SVR model

## Import libraries and load the data

In [30]:
#Import libraries
import pandas as pd
import altair as alt
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

#Turn off Altair's max-rows limit so charts can be built from the full dataset
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [4]:
#Load the training data from the zipped CSV and preview the first rows
TRAIN_DATA_PATH = '../data/train_data.zip'
df = pd.read_csv(TRAIN_DATA_PATH)
df.head()

Unnamed: 0,external_id,month,year,monthly_number_of_sessions,monthly_unique_sessions,monthly_repeated_sessions,monthly_avg_length_of_session,monthly_avg_light_activity,monthly_avg_moderate_activity,monthly_avg_vigorous_activity,...,avg_wind_9_10,avg_wind_10_11,avg_wind_11_12,avg_wind_12_above,perfect_days,unacast_session_count,hpi,state_and_local_amount_per_capita,state_amount_per_capita,local_amount_per_capita
0,1900203,3,2019,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,78.0,323.61,0.132207,0.018519,0.113688
1,1900203,6,2018,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,4.0,111.0,323.61,0.132207,0.018519,0.113688
2,1900203,8,2018,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,110.0,323.61,0.132207,0.018519,0.113688
3,MR00101775,1,2019,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,10.0,110.38,0.076247,0.011966,0.064281
4,MR00101775,8,2019,0,0,0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,11.0,110.38,0.076247,0.011966,0.064281


## Preprocessing function

In [5]:
# Green Party 2016 figures looked up by hand for the two states that
# `preprocess_weather` overrides explicitly.
# NOTE(review): these look like raw vote counts, whereas other rows of
# `Green_2016` shown in this notebook hold fractional values (e.g. 0.255869)
# -- confirm that the units match.
_GREEN_2016_BY_STATE = {'Alaska': 5735, 'North Carolina': 12105}


def preprocess_weather(input_data):
    """
    Clean the trailing block of columns (`Democrats_08_Votes` to the
    last column) together with `climate`.

    The steps are applied in this order:

    1. Rows whose `unacast_session_count` is NaN are dropped.
    2. NaN in `Number_of_holidays` is imputed with the largest value
       known for the same (`month`, `year`) period. A period with no
       known value is left for step 4 instead of inheriting the value
       of a neighbouring period.
    3. `Green_2016` is set to the hard-coded figure for Alaska and
       North Carolina; NaN for every other state becomes 0.
    4. Every remaining NaN in the column block and in `climate` is
       replaced with 0.

    The input dataframe is not modified; rows keep their original index
    labels and order.

    Parameters
    ----------
    input_data : pandas.core.frame.DataFrame
        Must contain `unacast_session_count`, `month`, `year`, `state`,
        `climate`, `Number_of_holidays`, `Green_2016` and
        `Democrats_08_Votes`.

    Returns
    -------
    output_data : pandas.core.frame.DataFrame
        A new dataframe with the same columns as `input_data`.

    """

    #drop the rows with NaNs in the `unacast_session_count` column
    #(the explicit copy keeps the assignments below from touching the caller's frame)
    output_data = input_data.dropna(subset=['unacast_session_count']).copy()

    #fill up NaNs for the `Number_of_holidays` column with the value known
    #for the same (month, year); grouping keeps values from leaking across periods
    holidays_by_period = (
        output_data
        .groupby(['month', 'year'])['Number_of_holidays']
        .transform('max')
    )
    output_data['Number_of_holidays'] = (
        output_data['Number_of_holidays'].fillna(holidays_by_period)
    )

    #fill up NaNs for the `Green_2016` column
    #values were only found for Alaska and North Carolina, the other states get 0
    green_2016 = output_data['Green_2016'].fillna(0)
    for state_name, green_value in _GREEN_2016_BY_STATE.items():
        green_2016 = green_2016.mask(output_data['state'] == state_name, green_value)
    output_data['Green_2016'] = green_2016

    #substitute every remaining NaN by 0, only in the column block and `climate`
    columns_to_fill = list(output_data.loc[:, 'Democrats_08_Votes':].columns)
    if 'climate' not in columns_to_fill:
        columns_to_fill.append('climate')
    output_data[columns_to_fill] = output_data[columns_to_fill].fillna(value=0)

    return output_data

The `Green_2016` value for North Carolina comes from this [Wikipedia page](https://en.wikipedia.org/wiki/2016_United_States_presidential_election_in_North_Carolina), and the value for Alaska comes from these [New York Times election results](https://www.nytimes.com/elections/2016/results/alaska).

## Test for the preprocessing function

In [7]:
#Apply the preprocessing function to the raw training data
clean_df = preprocess_weather(df)

In [8]:
#Number of rows and columns of the preprocessed frame
clean_df.shape

(49503, 861)

In [20]:
#Count the NaNs left in `Number_of_holidays` after preprocessing (expected: 0)
clean_df['Number_of_holidays'].isna().sum()

0

In [22]:
#Count the (month, year) periods of 2018-2019 that hold more than one
#distinct `Number_of_holidays` value (expected: 0)
number_of_error = sum(
    len(
        clean_df.loc[
            (clean_df['month'] == month) & (clean_df['year'] == year),
            'Number_of_holidays',
        ].unique()
    ) > 1
    for month in range(1, 13)
    for year in (2018, 2019)
)
print(number_of_error)

0


In [26]:
#Show the first `Green_2016` entry of each state whose value is hard-coded
#in the preprocessing (North Carolina and Alaska)
for state in ('North Carolina', 'Alaska'):
    first_green_value = clean_df.loc[clean_df['state'] == state, 'Green_2016'].head(1)
    print(first_green_value)

60    12105.0
Name: Green_2016, dtype: float64
3121    5735.0
Name: Green_2016, dtype: float64


In [27]:
#Check that a value that was not NaN in the `Green_2016` column remained the same.
#The row is selected by index label rather than by position: preprocessing drops
#rows, so position 3 of `clean_df` is not guaranteed to be the same row as in `df`.
clean_df.loc[3, ['state', 'Green_2016']]

state         Mississippi
Green_2016       0.255869
Name: 3, dtype: object

In [29]:
#Same row taken from the raw frame, to compare against the preprocessed value
df[['state', 'Green_2016']].iloc[3]

state         Mississippi
Green_2016       0.255869
Name: 3, dtype: object