Stage 1: Data Exploration & Cleansing

The following analysis was performed as part of the first stage of a comparison of Temperature data and Snowfall for key ski resorts in North America. The ultimate goal of the analysis was to assess the following research question:

**"Can Temperature trends be used as an indicator to predict upcoming snowfall?"**

The first stage focussed on initial data exploration and cleansing to determine whether the data sets were appropriate to answer the research question. This involved assessing what data was available for analysis, what data types existed and what needed to be converted to enable later analysis.

Stage 2 will then establish a simple linear regression model that utilises temperature data as an independent variable and assess the strength of the model to make predictions of daily snowfall.

In [None]:
# Text encoding passed to open() when the input CSV files are read below.
encoding = 'utf8'

## IMPORT LIBRARIES & DEFINE INPUTS
import csv
import itertools
import collections
import matplotlib.pyplot as plt
import warnings
import numpy as np
import re
from tabulate import tabulate
import math
from sklearn import linear_model
from scipy import stats

# Locations of the input CSV files: one temperature data set
# (GlobalLandTemperaturesByCity.csv) and one daily snowfall file per resort.
TemperatureDataInput = '../input/climate-change-earth-surface-temperature-data/GlobalLandTemperaturesByCity.csv'
JacksonDataInput = '../input/resort-daily-snowfall-20092017/Jackson Hole - Wyoming.csv'
SnowbirdDataInput = '../input/resort-daily-snowfall-20092017/Snowbird - Utah.csv'
TellurideDataInput = '../input/resort-daily-snowfall-20092017/Telluride - Colorado.csv'
WhistlerDataInput = '../input/resort-daily-snowfall-20092017/Whistler Blackcomb - BC Canada.csv'

# Tell the reader up front that the cells below may take a while to run.
print("Note: Longer processing time may be experienced due to the size of the temperature data set\n")

In [None]:
## DATA SIZE
def RecordCounter(filename, encodingformat):
    """Return the number of CSV records stored in *filename*.

    Every record yielded by ``csv.reader`` is counted, so a header row (when
    the file has one) is included in the total.

    Args:
        filename: Path of the CSV file to read.
        encodingformat: Text encoding used to open the file.

    Returns:
        The number of records as an int (0 for an empty file).
    """
    # newline='' leaves newline handling to the csv module, and the context
    # manager guarantees the handle is closed as soon as counting is done.
    with open(filename, encoding=encodingformat, newline='') as source:
        return sum(1 for _ in csv.reader(source))

print('----- DATA SIZE -----')
# One (label, path) pair per input file, reported in this fixed order.
SizeReportInputs = (
    ('Temperature Data Records:', TemperatureDataInput),
    ('Jackson Hole Snowfall Records:', JacksonDataInput),
    ('Snowbird Snowfall Records:', SnowbirdDataInput),
    ('Telluride Snowfall Records:', TellurideDataInput),
    ('Whistler Snowfall Records:', WhistlerDataInput),
)
for SizeLabel, SizePath in SizeReportInputs:
    print(SizeLabel, RecordCounter(SizePath, encoding))


In [None]:
## INITIAL DATA EXPLORATION
#Given the size of the temperature data set, data exploration is required 
#to determine whether all data within this set is relevant to the analysis. 
#The types of records received, and the volume of records will be analysed below.

print('\n----- INITIAL DATA EXPLORATION -----')

print('\n\nDATA TYPES:')
# Print the first two rows of every input file so the raw field formats can
# be inspected. Each file is opened with the shared encoding and closed again
# as soon as its preview has been printed.
PreviewInputs = (
    ('\nTemperature Data:', TemperatureDataInput),
    ('\nSnowfall - Jackson Hole:', JacksonDataInput),
    ('\nSnowfall - Snowbird:', SnowbirdDataInput),
    ('\nSnowfall - Telluride:', TellurideDataInput),
    ('\nSnowfall - Whistler:', WhistlerDataInput),
)
for PreviewLabel, PreviewPath in PreviewInputs:
    print(PreviewLabel)
    with open(PreviewPath, encoding=encoding, newline='') as PreviewFile:
        for row in itertools.islice(csv.reader(PreviewFile), 2):
            print(row)


# The above demonstrates that the following data cleansing will be required:
# - Conversion of date format in Snowfall Data to enable comparison to Temperature data
# - Manipulation of the three snowfall measurements in Snowfall Data to separate reading from unit of measurement
# - Manipulation of the latitude and longitude readings to separate the measurement and direction
# 
# In addition, all records appear to be string data types, yet some fields within the data (e.g. temperature, snowfall) reflect continuous, numeric data. The following will therefore also be required:
# - Conversion of Average Temperature and Average Temperature Uncertainty to float data type
# - Conversion of the snowfall readings subsequent to separation above into either integer or float data type



In [None]:
print("----- FURTHER EXPLORATION - TEMPERATURE RECORDS -----")

print('\n\nTEMPERATURE RECORD DETAILS:')
# Tally records per country (column 4) and per year (first four characters of
# the date in column 0) in a single pass over the file. The header row is
# discarded before counting, so neither tally needs a "minus one" correction
# afterwards and the file is only read once.
RecordSummary = collections.defaultdict(int)
YearSummary = collections.defaultdict(int)
with open(TemperatureDataInput, encoding=encoding, newline='') as TemperatureFile:
    TemperatureReader = csv.reader(TemperatureFile)
    next(TemperatureReader, None)  # skip the header row
    for row in TemperatureReader:
        if not row:
            # csv.reader yields an empty list for a blank line
            continue
        RecordSummary[row[4]] += 1
        YearSummary[row[0][0:4]] += 1
YearSummary = dict(YearSummary)

print("\nNumber of Countries in temperature data set:")
print(len(RecordSummary))

# Countries ordered from most to fewest records
count = sorted(RecordSummary, key=RecordSummary.get, reverse=True)

# Most Records per Country
print("\nTemperature Data - Most Records:")
for a in count[:5]:
    print(a + ':', RecordSummary[a])

# Least Records per Country
print("\nTemperature Data - Least Records:")
for b in count[-5:]:
    print(b + ':', RecordSummary[b])

# Years in Data Set
x_axis = [int(item) for item in YearSummary]
y_axis = list(YearSummary.values())

print("")
print("Number of Years in Temperature Data Set:", len(YearSummary))
print('First Year in Temperature Data Set:', min(x_axis))
print('Last Year in Temperature Data Set:', max(x_axis))
print("")

plt.bar(x_axis, y_axis, color='b')
plt.xlabel('Year')
plt.ylabel('Number of Records')
plt.title('Temperature Records per Year')
plt.show()

# The above exploration of Temperature Data demonstrates that the data set contains records for a large 
# number of locations which will not be examined in this analysis. Those locations also contain a large 
# volume of records which will not be necessary given the focus on US and Canada resorts. Therefore
# temperature data for only the US and Canada will be extracted. Additionally, given the volume of records 
# made available per year, only a subset of years will be considered. The snowfall data appears manageable 
# and hence will be used in its entirety.


In [None]:
## DATA CLEANSING & LOADING

print("----- DATA CLEANSING & LOADING  (SNOWFALL) -----")

# Fallback stored in place of any value that cannot be converted
DEFAULT_VALUE = np.nan


# Function - Amend Data Types (iteration)
def iter_clean(data, column_key, convert_function, default_value):
    """Convert one column of every row, yielding the rows as they are updated.

    Each row is modified in place: ``row[column_key]`` is replaced by
    ``convert_function(row[column_key])``. When the conversion raises
    ``ValueError`` or ``TypeError`` the column is set to ``default_value``
    instead and a warning naming the rejected value is issued.

    Args:
        data: Iterable of rows indexable by ``column_key``.
        column_key: Key (or index) of the column to convert.
        convert_function: Callable applied to the existing value.
        default_value: Replacement used when the conversion fails.

    Yields:
        The same row objects, after their column has been updated.
    """
    for record in data:
        raw = record[column_key]
        try:
            converted = convert_function(raw)
        except (ValueError, TypeError):
            converted = default_value
            warnings.warn('Replacing {} with {} in column {}'.format(
                raw, converted, column_key))
        record[column_key] = converted
        yield record
        
## Function - Split Snowfall Data Readings
def ReadingSplit(source, column):
    """Strip the unit of measurement from a reading column, in place.

    For every row, ``row[column]`` is replaced by the text that precedes the
    first space of the reading, i.e. the part before the unit. Readings that
    do not have the "value unit" shape (blank, no unit, extra tokens) no
    longer raise here: whatever precedes the first space is kept so that the
    later numeric conversion step can decide how to treat it. Values that are
    not strings are left untouched.

    Args:
        source: Iterable of rows indexable by ``column``.
        column: Key of the column holding "value unit" strings.

    Returns:
        None. The rows are modified in place.
    """
    for row in source:
        reading = row[column]
        if isinstance(reading, str):
            # partition never raises, unlike unpacking the result of split(' ')
            row[column] = reading.strip().partition(' ')[0]
        
## Function - Convert Snowfall Data Dates to appropriate format
def DateConvert(source, column='\ufeffDate'):
    """Rewrite 'day-Mon-year' dates as 'YYYY-MM-DD' strings, in place.

    Args:
        source: Iterable of rows indexable by ``column``.
        column: Key of the date column. The default keeps the byte-order-mark
            prefix that the existing callers rely on.

    Returns:
        None. The rows are modified in place.

    Raises:
        ValueError: If a date does not consist of three '-' separated parts.
        KeyError: If the month is not a three-letter abbreviation such as 'Jan'.
    """
    months = {'Jan': '01', 'Feb': '02', 'Mar': '03', 'Apr': '04', 'May': '05', 'Jun': '06',
              'Jul': '07', 'Aug': '08', 'Sep': '09', 'Oct': '10', 'Nov': '11', 'Dec': '12'}
    for row in source:
        day, month, year = row[column].split('-')
        # Two-digit years are taken to be 20xx; a four-digit year is kept as is.
        full_year = year if len(year) == 4 else '20' + year
        # Zero-pad the day so the result is a fixed-width YYYY-MM-DD string.
        row[column] = full_year + '-' + months[month] + '-' + day.zfill(2)

# Snowfall Data Cleansing
def _LoadSnowfallRecords(path):
    """Read one snowfall CSV into a list of dict rows and close the file."""
    # The file-wide encoding is kept as is, so the column keys produced by
    # DictReader are the same ones the rest of the notebook looks up.
    with open(path, encoding=encoding, newline='') as source:
        return list(csv.DictReader(source))


data_JacksonHole = _LoadSnowfallRecords(JacksonDataInput)
data_Snowbird = _LoadSnowfallRecords(SnowbirdDataInput)
data_Telluride = _LoadSnowfallRecords(TellurideDataInput)
data_Whistler = _LoadSnowfallRecords(WhistlerDataInput)

SnowfallDataSets = (data_JacksonHole, data_Snowbird, data_Telluride, data_Whistler)
SnowfallReadingColumns = ('24 hr New Snow', 'Season Snowfall Total', 'Base Depth')

# Convert Date Fields in Snowfall Data for Loading
for SnowfallRecords in SnowfallDataSets:
    DateConvert(SnowfallRecords)

# Split Snowfall Readings to obtain numeric value
for ReadingColumn in SnowfallReadingColumns:
    for SnowfallRecords in SnowfallDataSets:
        ReadingSplit(SnowfallRecords, ReadingColumn)

# Convert Snowfall Readings to Float
for ReadingColumn in SnowfallReadingColumns:
    for SnowfallRecords in SnowfallDataSets:
        # iter_clean updates each row dict in place and yields it back, so the
        # drained generator holds the same rows; storing them through a slice
        # keeps the data_* names bound to the cleaned lists.
        SnowfallRecords[:] = list(iter_clean(SnowfallRecords, ReadingColumn, float, DEFAULT_VALUE))

print('\n\nSnowfall Data Cleansing Complete')
print('\nJacksonHole:', data_JacksonHole[0:1])
print('\nSnowbird:', data_Snowbird[0:1])
print('\nTelluride:', data_Telluride[0:1])
print('\nWhistler:', data_Whistler[0:1])


In [None]:
# Temperature Data Cleansing

# Function - Amend Data Types (pipeline)
DEFAULT_VALUE = np.nan
def piping_clean(value, convert_function, default_value):
    """Return ``convert_function(value)``, or ``default_value`` if that fails.

    A failed conversion (``ValueError`` or ``TypeError``) is not re-raised; a
    warning naming the rejected value and its replacement is issued instead.

    Args:
        value: The value to convert.
        convert_function: Callable applied to ``value``.
        default_value: Result used when the conversion fails.

    Returns:
        The converted value, or ``default_value``.
    """
    try:
        return convert_function(value)
    except (ValueError, TypeError):
        warnings.warn('Replacing {} with {}'.format(
            value, default_value))
        return default_value

## Establish pattern for regex to capture date range, and define countries in scope
# Raw string: '\A' is the regex start-of-string anchor, not a Python escape.
pattern = r'\A195|\A196|\A197|\A198|\A199|\A20'
country = ['United States', 'Canada']

# Load in-scope records, applying the data conversions to each one
data_Temperature = list()
InScopeDate = re.compile(pattern)
with open(TemperatureDataInput, encoding=encoding, newline='') as TemperatureFile:
    for row in csv.reader(TemperatureFile):
        # Filter before converting: rows that will be discarded (including the
        # header row) are skipped without conversion work or warnings.
        if len(row) < 7 or InScopeDate.match(row[0]) is None or row[4] not in country:
            continue
        # Convert Temperature Readings to Float
        row[1] = piping_clean(row[1], float, DEFAULT_VALUE)
        row[2] = piping_clean(row[2], float, DEFAULT_VALUE)
        # Separate Latitude Measure and Direction ([-1:] tolerates an empty field)
        row.append(row[5][-1:])
        row[5] = piping_clean(row[5][0:-1], float, DEFAULT_VALUE)
        # Separate Longitude Measure and Direction
        row.append(row[6][-1:])
        row[6] = piping_clean(row[6][0:-1], float, DEFAULT_VALUE)
        data_Temperature.append(row)

print('\n\nTemperature Data Cleansing Complete')
print('\nTemperatures:', data_Temperature[0:1])


In [None]:
## DATA VISUALISATIONS

print("----- DATA VISUALISATIONS (TEMPERATURE) -----")

# Function - Create Line Graphs
def linePlot(data, xlabel, ylabel, title):
    """Draw a labelled line graph of a mapping and display it.

    Args:
        data: Mapping whose keys are plotted on the x axis, in iteration
            order, against their values on the y axis.
        xlabel: Label for the x axis.
        ylabel: Label for the y axis.
        title: Title of the figure.

    Returns:
        Whatever ``plt.show()`` returns.
    """
    keys = list(data)
    values = [data[key] for key in keys]
    plt.plot(keys, values)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title(title)
    return plt.show()

# Canada Average Temperature Trend
# Group the temperature readings (column 1) of Canadian records by year,
# taken from the first four characters of the date in column 0.
Canada_YTemp = dict()
for row in data_Temperature:
    if row[4] == 'Canada':
        Canada_YTemp.setdefault(row[0][0:4], []).append(row[1])

# Readings that failed conversion were stored as NaN; nanmean ignores them so
# that one missing reading does not turn the whole year's average into NaN.
Canada_AVGTemp = {year: np.nanmean(readings) for year, readings in Canada_YTemp.items()}

linePlot(Canada_AVGTemp, 'Year', 'Temperature(C)', 'Average Temperature: Canada')

In [None]:
# United States Average Temperature Trend
# Group the temperature readings (column 1) of United States records by year,
# taken from the first four characters of the date in column 0.
USA_YTemp = dict()
for row in data_Temperature:
    if row[4] == 'United States':
        USA_YTemp.setdefault(row[0][0:4], []).append(row[1])

# Readings that failed conversion were stored as NaN; nanmean ignores them so
# that one missing reading does not turn the whole year's average into NaN.
USA_AVGTemp = {year: np.nanmean(readings) for year, readings in USA_YTemp.items()}

linePlot(USA_AVGTemp, 'Year', 'Temperature(C)', 'Average Temperature: United States')

In [None]:
# Combined Average Temperature Trend
def _YearSeries(averages):
    """Return (years as ints, values) of a year-keyed dict in chronological order."""
    years = sorted(averages, key=int)
    return [int(year) for year in years], [averages[year] for year in years]


# Each country is plotted against its own years, so the two dicts no longer
# have to contain the same years in the same order.
x_axis, Canada_Measures = _YearSeries(Canada_AVGTemp)
USA_Years, USA_Measures = _YearSeries(USA_AVGTemp)
plt.plot(x_axis, Canada_Measures, 'b-', label='Canada')
plt.plot(USA_Years, USA_Measures, 'r-', label='USA')
plt.xlabel('Year')
plt.ylabel('Temperature(C)')
plt.title('Average Temperatures per Country')
plt.legend(loc=5)
plot = plt.show()

In [None]:
# Retrieve snowfall data
def TotalAnnualSnowfall(source, date_key='\ufeffDate', value_key='24 hr New Snow'):
    """Sum the snowfall readings of each year.

    The year is the first four characters of ``row[date_key]``. NaN readings
    (values that could not be converted earlier) are skipped, so a single
    missing reading does not turn the year's total into NaN; a year whose
    readings are all NaN is reported with a total of 0.0.

    Args:
        source: Iterable of rows indexable by ``date_key`` and ``value_key``.
        date_key: Key of the date column; defaults to the key used by the
            existing callers.
        value_key: Key of the numeric reading to total.

    Returns:
        A dict mapping each year string to its total, in first-seen order.
    """
    output = dict()
    for row in source:
        year = row[date_key][0:4]
        running_total = output.setdefault(year, 0.0)
        reading = row[value_key]
        if not math.isnan(reading):
            output[year] = running_total + reading
    return output

Jackson_Snow = TotalAnnualSnowfall(data_JacksonHole)
Snowbird_Snow = TotalAnnualSnowfall(data_Snowbird)
Telluride_Snow = TotalAnnualSnowfall(data_Telluride)
Whistler_Snow = TotalAnnualSnowfall(data_Whistler)


def _SnowSeries(totals):
    """Return (years as ints, totals) of a year-keyed dict in chronological order."""
    years = sorted(totals, key=int)
    return [int(year) for year in years], [totals[year] for year in years]


# Every resort is plotted against its own years: the resorts are not required
# to cover the same years, or to list them in the same order.
x_axis, Jackson_Measures = _SnowSeries(Jackson_Snow)
Snowbird_Years, Snowbird_Measures = _SnowSeries(Snowbird_Snow)
Telluride_Years, Telluride_Measures = _SnowSeries(Telluride_Snow)
Whistler_Years, Whistler_Measures = _SnowSeries(Whistler_Snow)
plt.plot(x_axis, Jackson_Measures, 'b-', label='Jackson Hole')
plt.plot(Snowbird_Years, Snowbird_Measures, 'g-', label='Snowbird')
plt.plot(Telluride_Years, Telluride_Measures, 'r-', label='Telluride')
plt.plot(Whistler_Years, Whistler_Measures, 'y-', label='Whistler')
plt.xlabel('Year')
plt.ylabel('Total Snowfall (cm)')
plt.title('Snowfall')
plt.legend(loc=2)
plot = plt.show()


In [None]:
def AnnualSnowfallRecords(source):
    """Group the '24 hr New Snow' readings of *source* by year.

    Args:
        source: Iterable of rows holding a '\\ufeffDate' value, whose first
            four characters are used as the year, and a '24 hr New Snow' value.

    Returns:
        A dict mapping each year string to the list of its readings, in the
        order the rows were encountered.
    """
    readings_by_year = {}
    for record in source:
        year = record['\ufeffDate'][0:4]
        readings_by_year.setdefault(year, []).append(record['24 hr New Snow'])
    return readings_by_year
    
def BasicStats(data, year):
    """Return descriptive statistics for the readings recorded under *year*.

    NaN readings are ignored. When *year* has no entry in *data*, or all of
    its readings are NaN, every statistic is reported as NaN instead of an
    exception being raised.

    Args:
        data: Dict mapping a year to a list of numeric readings.
        year: The key to summarise.

    Returns:
        A list ``[year, min, max, range, mean, std, median, Q1, Q3, IQR]``.
    """
    # Direct lookup instead of scanning every key of the dict for a match.
    readings = np.asarray(data[year] if year in data else [], dtype=float)
    if readings.size == 0 or np.isnan(readings).all():
        return [year] + [np.nan] * 9
    # Each reduction is computed once and reused for the derived values.
    lowest = np.nanmin(readings)
    highest = np.nanmax(readings)
    q1, q3 = np.nanpercentile(readings, [25, 75])
    return [
        year,
        lowest,
        highest,
        highest - lowest,
        np.nanmean(readings),
        np.nanstd(readings),
        np.nanmedian(readings),
        q1,
        q3,
        q3 - q1,
    ]

# Names of the nine statistics that follow the year in each BasicStats row.
# NOTE(review): the list is one entry shorter than a row; tabulate is expected
# to align a short header list with the last columns, leaving the year column
# unlabelled - confirm against the tabulate documentation.
headers = ['Min', 'Max', 'Range','Mean','StDev','Median','Q1', 'Q3', 'IQR']  

def LocationStatistics(location, data, years=range(2009, 2018)):
    """Print a table of yearly basic statistics for one location.

    Args:
        location: Name printed above the table.
        data: Dict mapping year strings to lists of readings, as consumed by
            ``BasicStats``.
        years: Iterable of years to report, one table row each. Defaults to
            2009 through 2017 inclusive.

    Returns:
        None. The table is printed.
    """
    statistics = [BasicStats(data, str(year)) for year in years]
    print(location+':', 'Basic Statistics\n')
    print(tabulate(statistics, headers=headers) + '\n')

# Print the yearly statistics table of every resort, in a fixed order.
ResortReports = (
    ('Jackson Hole', data_JacksonHole),
    ('Snowbird', data_Snowbird),
    ('Telluride', data_Telluride),
    ('Whistler', data_Whistler),
)
for ResortName, ResortRecords in ResortReports:
    LocationStatistics(ResortName, AnnualSnowfallRecords(ResortRecords))

