<a href="https://colab.research.google.com/github/presidentlines/AvalancheVol3/blob/main/resampled_df.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Team AVYULAUNCH
#### E R N K L
#### v o  a  a e
#### e c  t   n e
#### r k  h   e
#### e f  a
#### t o  n
#### t r
####   d

# 11/13/2021
### Combine weather and avalanche data!

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Cap how many rows pandas prints when a frame is displayed
pd.set_option('display.max_rows', 20)

# Load the weather export (NOAA database) and keep only the core features
df_weather = pd.read_csv('weather with provo.csv').loc[
    :, ['NAME', 'DATE', 'PRCP', 'SNWD', 'TMAX', 'TMIN']
]

# Parse the dates, then store each row's calendar month as a string so it
# behaves as a categorical label. MONTH is one-hot encoded later, after the
# data has been resampled, rather than here.
df_weather['DATE'] = pd.to_datetime(df_weather['DATE'])
df_weather['MONTH'] = df_weather['DATE'].dt.month.astype(str)


# Create Region variable so we can merge with Avalanche dataset
_STATION_REGIONS = {
    "BEN LOMOND PEAK, UT US": "Ogden",
    "ALTA, UT US": "Salt Lake",
    "BEN LOMOND TRAIL, UT US": "Ogden",
    "MONTE CRISTO, UT US": "Logan",
    "BUES CANYON UTAH, UT US": "Ogden",
    "RAY S VALLEY UTAH, UT US": "Uintas",
    "SNOWBIRD, UT US": "Salt Lake",
    "PROVO BYU, UT US": "Provo",
}


def assign_region(name):
    """Return the region label for a weather station name.

    Args:
        name: Station name exactly as it appears in the weather data's
            NAME column (e.g. "ALTA, UT US").

    Returns:
        The region string used to join against the avalanche data, or
        None when the station name is not one of the known stations.
    """
    return _STATION_REGIONS.get(name)

# Build lagged / differenced features separately for each weather station so
# that a shift never pulls a value across from another station's records.
df_list = []
by_location = df_weather.groupby('NAME')
for name, group in by_location:
    # The shifts below are positional, so the rows must be in date order.
    # sort_values also returns a new frame, which keeps the column
    # assignments below from writing into the object handed out by groupby.
    # NOTE(review): a shift of 1 / 7 rows only means "1 day / 7 days ago" if
    # each station has exactly one row per calendar day with no gaps - confirm.
    group = group.sort_values('DATE', kind='mergesort').copy()

    # Assign each weather station its region
    group['Region'] = assign_region(name)

    # Current day's snow depth minus the previous day's (positive = gain)
    group['snow_diff_day'] = group['SNWD'].diff(1)
    # Change in snow depth compared with 7 rows (one week) earlier
    group['snow_diff_week'] = group['SNWD'].diff(7)

    # Binary: 1 if snow depth increased since the previous day, else 0
    # (a missing difference compares as False and therefore becomes 0)
    group['got_snow'] = (group['snow_diff_day'] > 0).astype(int)

    # Previous day's TMAX and TMIN
    group['prev_day_TMIN'] = group['TMIN'].shift(1)
    group['prev_day_TMAX'] = group['TMAX'].shift(1)

    df_list.append(group)
df_weather = pd.concat(df_list)

# Indicator: the daily minimum was below 32 (freezing) that day
df_weather['min_below_freezing'] = (df_weather['TMIN'] < 32).astype(int)
# Indicator: the daily maximum was above 32 (freezing) that day
df_weather['max_above_freezing'] = (df_weather['TMAX'] > 32).astype(int)

# min * max means:
# 1 if min below freezing and max above freezing
# 0 otherwise
# This is potentially significant if we cross the freezing point of water in a day
df_weather['min*max'] = df_weather['min_below_freezing'] * df_weather['max_above_freezing']



print(df_weather.columns)

Index(['NAME', 'DATE', 'PRCP', 'SNWD', 'TMAX', 'TMIN', 'MONTH', 'Region',
       'snow_diff_day', 'snow_diff_week', 'got_snow', 'prev_day_TMIN',
       'prev_day_TMAX', 'min_below_freezing', 'max_above_freezing', 'min*max'],
      dtype='object')


In [None]:
# Load the avalanche records, keeping only the date and region and
# discarding any record that is missing either of them
df_avalanche = (
    pd.read_csv('avalanches, 11-13-2021.csv')
    .loc[:, ['Date', 'Region']]
    .dropna()
)

# Add a parsed DATE column (the merge key shared with the weather data) and a
# constant Avalanche flag of 1 that can be summed per day after merging
df_avalanche = df_avalanche.assign(
    DATE=pd.to_datetime(df_avalanche['Date']),
    Avalanche=1,
)

print(df_avalanche.columns)

Index(['Date', 'Region', 'DATE', 'Avalanche'], dtype='object')


In [None]:
# Left-join the avalanche records onto the weather rows by region and date:
# every weather row is kept, and a weather row is repeated once for each
# avalanche record that matches its region and date.
df_combined = df_weather.merge(df_avalanche, on=['Region', 'DATE'], how='left')

In [None]:
# Count avalanches per weather station (NAME) and day. The Avalanche flag is
# 1 on matched rows and missing otherwise, so the sum is 0 on days without
# any avalanche record.
summed = (
    df_combined.groupby(['NAME', 'DATE'])['Avalanche']
    .sum()
    .reset_index()
    .rename(columns={'Avalanche': 'avalanche_sum'})
)

# Collapse the merged frame back to its weather columns only. Dropping the
# avalanche-only columns *before* de-duplicating means a day with several
# avalanche records always collapses to a single row, even when the raw
# 'Date' text of those records differs.
weather_rows = df_combined.drop(columns=['Avalanche', 'Date']).drop_duplicates()

# Attach the per-day avalanche count to the weather features
reassembled = pd.merge(summed, weather_rows, how='left', on=['NAME', 'DATE'])

# Create new column: binary indicator if there was an avalanche that day or not
reassembled['avalanche_binary'] = reassembled['avalanche_sum'] > 0

# Final clean up: the station name is no longer needed once Region is set
df_final = reassembled.drop(columns=['NAME'])

# Create region dummy variables
df_final = pd.get_dummies(df_final, columns=['Region'], drop_first=True)

# Drop rows with nan variables
df_final = df_final.dropna()
print(df_final.columns)

# Keep only observations dated strictly after 2010-01-01
df_final = df_final[df_final['DATE'] > '2010-01-01']

df_final.to_csv("FINAL_DF.csv")

Index(['DATE', 'avalanche_sum', 'PRCP', 'SNWD', 'TMAX', 'TMIN', 'MONTH',
       'snow_diff_day', 'snow_diff_week', 'got_snow', 'prev_day_TMIN',
       'prev_day_TMAX', 'min_below_freezing', 'max_above_freezing', 'min*max',
       'avalanche_binary', 'Region_Ogden', 'Region_Provo', 'Region_Salt Lake',
       'Region_Uintas'],
      dtype='object')


In [None]:
################## UPSAMPLING / DOWNSAMPLING ##################

# If there are 3,000 weather observations and 15 avalanche dates, 
# we will have 99.5% accuracy with a model that just classifies
# everything as No Avalanche. So, we'll downsample our observations
# so we have as many weather observations for non-avalanche days
# as we have for avalanche days. 
# If a month has fewer than 100 avalanche-day observations, we will
# upsample those by taking a random draw (with replacement) of 100 from them.

# Potential here for augmenting those data points slightly. 
# We'd only want to slightly perturb the following columns:
# 'PRCP', 'SNWD', 'TMAX', 'TMIN', 'snow_diff_day', 'snow_diff_week'
# We'd just want to be careful of TMAX or TMIN being around 32 since
# that could throw off our data. 
# Another idea: only changing the region of the observations, seeing
# if that affects anything. 

# Build a class-balanced sample month by month.
# NOTE(review): no random seed is set, so every run draws a different sample.
group_list = []
for _, group in df_final.groupby('MONTH'):
    # Separate avalanche days from non-avalanche days
    avi_days = group[group['avalanche_binary'] == 1]
    non_avi_days = group[group['avalanche_binary'] == 0]

    # Months without any avalanche are left out entirely; otherwise we
    # would add rows from the summer months where we never have avalanches
    if avi_days.empty:
        continue

    # Draw max(100, # of avalanche days) rows from each class
    num_samples = max(len(avi_days), 100)

    # Avalanche days are drawn with replacement, which upsamples months
    # that have fewer than 100 of them.
    # NOTE(review): months with 100+ avalanche days are also drawn with
    # replacement rather than kept as-is - confirm this is intended.
    group_list.append(avi_days.sample(n=num_samples, replace=True))

    # Non-avalanche days are drawn without replacement whenever there are
    # enough of them; fall back to drawing with replacement instead of
    # raising when a month has fewer than num_samples such days, and skip
    # the draw when there are none at all.
    if not non_avi_days.empty:
        group_list.append(
            non_avi_days.sample(
                n=num_samples,
                replace=len(non_avi_days) < num_samples,
            )
        )

# Compile new dataframe from each month's sample
df_resampled = pd.concat(group_list)

In [None]:
# One-hot encode MONTH on the resampled data (first level dropped), then
# save the model-ready frame to disk
df = pd.get_dummies(data=df_resampled, drop_first=True, columns=['MONTH'])
df.to_csv(path_or_buf="RESAMPLED_DF.csv")

In [None]:
# LOGISTIC REGRESSION - baseline on the full (unbalanced) dataset
# NOTE(review): train_test_split and LogisticRegression are not imported in
# any cell shown here; presumably they come from scikit-learn - confirm the
# import exists before running this cell on a fresh kernel.
# NOTE(review): df_final still carries MONTH as a string column (it is only
# one-hot encoded for the resampled frame), so this baseline does not use
# month dummies - confirm that is intended.
X = df_final.drop(['avalanche_sum', 'avalanche_binary', 'DATE'], axis=1)
y = df_final['avalanche_binary']
print("% observations w/o avalanches:", round((len(X) - y.sum()) * 100 / len(X), 1))

# Hold out 20% of the rows for evaluation; fixed seed for a repeatable split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# max_iter must be an integer (a float such as 1e4 is rejected by
# scikit-learn's parameter validation in current releases)
clf = LogisticRegression(random_state=0, max_iter=10_000)
clf.fit(X_train, y_train)
y_hat = clf.predict(X_test)

print("% predictions of an avalanche:", round(y_hat.sum() * 100 / len(y_hat), 1))
print("% correct predictions:", round((y_hat == y_test).sum() * 100 / len(y_hat), 1))
print("# of avalanche predictions:", y_hat.sum())

% observations w/o avalanches: 88.3
% predictions of an avalanche: 6.8
% correct predictions: 90.6
# of avalanche predictions: 335


In [None]:
# LOGISTIC REGRESSION - same model, trained on the resampled (balanced) data
# NOTE(review): train_test_split and LogisticRegression are not imported in
# any cell shown here; presumably they come from scikit-learn - confirm the
# import exists before running this cell on a fresh kernel.
# NOTE(review): the resampled frame was drawn with replacement *before* this
# train/test split, so copies of the same avalanche day can land in both the
# training and the test set; the accuracy printed below is therefore likely
# optimistic. Consider splitting first and resampling only the training rows.
X = df.drop(['avalanche_sum', 'avalanche_binary', 'DATE'], axis=1)
y = df['avalanche_binary']
print("% observations w/o avalanches:", round((len(X) - y.sum()) * 100 / len(X), 1))

# Hold out 20% of the rows for evaluation; fixed seed for a repeatable split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

# max_iter must be an integer (a float such as 1e4 is rejected by
# scikit-learn's parameter validation in current releases)
clf = LogisticRegression(random_state=0, max_iter=10_000)
clf.fit(X_train, y_train)
y_hat = clf.predict(X_test)

print("% predictions of an avalanche:", round(y_hat.sum() * 100 / len(y_hat), 1))
print("% correct predictions:", round((y_hat == y_test).sum() * 100 / len(y_hat), 1))
print("# of avalanche predictions:", y_hat.sum())

% observations w/o avalanches: 52.3
% predictions of an avalanche: 46.1
% correct predictions: 74.1
# of avalanche predictions: 598
