In [1]:
#Predict the Sunrise!
#Author: Ryan Kennedy
#Purpose: Get a rudimentary introduction to data science by trying to predict the time of the sunrise.
#Special thanks to John Miner (http://craftydba.com/) for providing the data for this experiment, and Mike Griffin for 
#the direction!

In [2]:
#Imports for all requisite libraries
import datetime
import glob
import os
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from xgboost import XGBClassifier



In [3]:
#Can be toggled on and off based on what you want to display. 
#pd.set_option('display.max_rows', 5000000)

In [4]:
#defining a function to calculate seconds from midnight
def get_sec(time_str):
    """Return the number of seconds since midnight for an 'H:M' string.

    time_str must contain exactly one ':' separating integer hours and
    minutes; anything else raises ValueError.
    """
    hours, minutes = (int(part) for part in time_str.split(':'))
    return hours * 3600 + minutes * 60

def get_time(time_sec):
    """Format a count of seconds since midnight as an 'HH:MM' string."""
    # gmtime is used purely as a seconds -> (hour, minute) splitter, so no
    # local time zone offset is applied.
    return time.strftime('%H:%M', time.gmtime(time_sec))

def get_mins(time_str):
    """Return the number of minutes since midnight for an 'H:M' string."""
    hours, minutes = map(int, time_str.split(':'))
    return 60 * hours + minutes

def get_time_from_mins(time_mins):
    """Format a count of minutes since midnight as an 'HH:MM' string.

    Inverse of get_mins: get_time_from_mins(390) -> '06:30'.
    """
    # One minute is 60 seconds (the previous factor of 3600 treated the
    # input as hours, so every result was wrong unless it was a whole day).
    time_sec = time_mins * 60
    converted_time = time.strftime('%H:%M', time.gmtime(time_sec))
    return converted_time

In [5]:
#Load all data into dataframe from the folder
path =r'C:\Users\rkennedy\projects\Data Science\Predict the Sunrise\SunRiseSet\SunRiseSet\outbound' # use your path
allFiles = glob.glob(path + "/*.csv")
if not allFiles:
    #Fail here with a clear message instead of letting pd.concat complain about an empty list
    raise FileNotFoundError("No CSV files found in " + path)

#Position of the 4-digit year inside each file NAME (not the full path).
#These are the same characters the original full-path slice [100:104] selected for the
#90-character default folder above (100 - 90 - 1 separator = 9), but measured from the
#file name so the year stays correct when `path` is changed.
YEAR_START = 9
YEAR_END = 13

list_ = []
for file_ in allFiles:
    df = pd.read_csv(file_,index_col=None, header=0)
    #add year based on file name substring (kept as a string)
    df['year'] = os.path.basename(file_)[YEAR_START:YEAR_END]
    list_.append(df)
frame = pd.concat(list_)

In [6]:
#Clean-up in one pass:
#  1. discard rows whose sunrise1 value is the junk marker 'Obse'
#  2. discard rows with any missing value
#  3. discard the sunset1 column, which the model does not use
frame = (
    frame.loc[frame['sunrise1'].ne('Obse')]
         .dropna()
         .drop(columns=['sunset1'])
)

Empty DataFrame
Columns: [month1, day1, sunrise1, sunset1, year]
Index: []


In [None]:
#Store each sunrise as seconds since midnight (get_sec parses the 'H:M' text)
frame['sunrise_convert'] = frame['sunrise1'].map(get_sec)

In [None]:
frame.head()

In [None]:
frame.dtypes

In [None]:
frame.describe()

In [None]:
#Make string copies of month1/day1 (as 'month'/'day') so they can be joined into a date
for source_col, text_col in (('month1', 'month'), ('day1', 'day')):
    frame[text_col] = frame[source_col].astype(str)

frame.head()


In [None]:
#build the Date column as 'month/day/year' text.
#All three columns are strings, so plain column concatenation gives the same text as a
#row-by-row join, without a Python call per row, and it still yields a single column when
#the frame is empty (a row-wise apply returns a whole DataFrame in that case).
frame['Date'] = frame['month'] + '/' + frame['day'] + '/' + frame['year']
frame.head()

In [None]:
#plot one year of data to check out how it looks.
#.copy() makes df independent of frame, so the column conversions below do not
#raise SettingWithCopyWarning and cannot leak back into frame.
df = frame.loc[frame["year"] == '1993'].copy()

#convert to proper datetime format
df['Date'] = pd.to_datetime(df['Date'])
df['sunrise1'] = pd.to_datetime(df['sunrise1'], format='%H:%M').dt.time

#plot
plt.plot(df["Date"],df["sunrise1"])
plt.gcf().autofmt_xdate()
plt.xlabel('Date')
plt.ylabel('Sunrise Time')
plt.title('Date by Sunrise Time')
plt.show()



In [None]:
#Parse the text dates into real datetimes, derive a time-of-day column,
#then chart every row at once as a sanity check on the loaded data
frame['Date'] = pd.to_datetime(frame['Date'])
sunrise_stamps = pd.to_datetime(frame['sunrise1'], format='%H:%M')
frame['sunrise_time'] = sunrise_stamps.dt.time

fig, ax = plt.subplots()
ax.plot(frame["Date"], frame["sunrise_time"])
fig.autofmt_xdate()
ax.set_xlabel('Date')
ax.set_ylabel('Sunrise Time')
ax.set_title('Date by Sunrise Time')
plt.show()


In [None]:
#take another look at our data frame to see what we are dealing with
frame.head()

In [None]:
#frame.dtypes
#train_size = int(len(X) * 0.8)
#x_train, x_test = X[0:train_size], X[train_size:len(X)]
#y_train, y_test = Y[0:train_size], Y[train_size:len(Y)]
#Convert columns back to int so they can be passed into 
#frame['month']=frame['month'].astype('int')
#frame['day']=frame['day'].astype('int')

In [None]:
#Split dataframe into two, one for training and one for testing.
#We want to use 70% of our data for training, 30% for testing.
#The cut is positional (no shuffling): the first 70% of rows train, the rest test.
#Plain iloc slicing is used instead of np.split, which on a DataFrame goes through the
#deprecated DataFrame.swapaxes.
train_rows = int(.7*len(frame))
df_train = frame.iloc[:train_rows]
df_test = frame.iloc[train_rows:]
df_train.head()

In [None]:
#Slice data frame into arrays to pass into the model.
#Columns are picked by name rather than by position, so the selection does not depend on
#the order in which earlier cells happened to add columns.
feature_cols = ['month1', 'day1']   #inputs: month and day
target_col = 'sunrise_convert'      #label: sunrise as seconds since midnight

x_train = df_train[feature_cols].values
x_test  = df_test[feature_cols].values
y_train = df_train[target_col].values
y_test = df_test[target_col].values




In [None]:
#Make sure things look correct
x_train
y_train

In [None]:
df_test.head()



In [None]:
##Train the model!

#Fit an XGBoost classifier with its default settings on the training arrays
#(fit returns the fitted estimator), then print its configuration
model = XGBClassifier().fit(x_train, y_train)
print(model)

In [None]:
#Test the model on our test dataset!
y_pred = model.predict(x_test)

In [None]:
#Show raw accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))


In [None]:
df_test.head()

In [None]:

#Build a dataframe from the predictions; its default 0..n-1 index follows prediction order
df_y_pred = pd.DataFrame(y_pred, columns=['y_pred'])

#Give the test rows a fresh 0..n-1 index so the assignment below lines up row-for-row.
#drop=True discards the old index instead of inserting it as an extra 'index' column,
#which keeps this cell safe to re-run (re-inserting the old index piles up
#'index'/'level_0' columns and eventually raises).
df_test = df_test.reset_index(drop=True)

#Add predictions column to test dataframe
df_test['prediction'] = df_y_pred['y_pred']

df_test.head()


In [None]:
df_test.head()
df_test.dtypes

In [None]:
#Turn the predicted seconds-since-midnight back into time-of-day values:
#seconds -> 'HH:MM' text (get_time) -> datetime.time objects
predicted_hhmm = df_test['prediction'].apply(get_time)
df_test['prediction_time'] = pd.to_datetime(predicted_hhmm, format='%H:%M').dt.time
df_test.head()


In [None]:

#Overlay actual and predicted sunrise times for the whole test set
fig, ax = plt.subplots()
ax.plot(df_test["Date"], df_test["sunrise_time"], df_test["Date"], df_test["prediction_time"])

fig.autofmt_xdate()
ax.set_xlabel('Date')
ax.set_ylabel('Sunrise Time')
ax.set_title('Date by Sunrise Time - Predictions VS Actuals')
plt.show()

In [None]:
#Zoom in to take a closer look at just one year
in_2015 = df_test['year'] == '2015'
subset_df = df_test[in_2015]

fig, ax = plt.subplots()
ax.plot(subset_df["Date"], subset_df["sunrise_time"], subset_df["Date"], subset_df["prediction_time"])

fig.autofmt_xdate()
ax.set_xlabel('Date')
ax.set_ylabel('Sunrise Time')
ax.set_title('Date by Sunrise Time - Predictions VS Actuals')
plt.show()

In [None]:
#Zoom in even closer to see a month (July 2015; year and month are stored as strings)
in_2015 = df_test['year'] == '2015'
in_july = df_test['month'] == '7'
subset_df = df_test[in_2015 & in_july]

fig, ax = plt.subplots()
ax.plot(subset_df["Date"], subset_df["sunrise_time"], subset_df["Date"], subset_df["prediction_time"])
ax.set_xlabel('Date')
ax.set_ylabel('Sunrise Time')
ax.set_title('Date by Sunrise Time - Predictions VS Actuals')
fig.autofmt_xdate()
plt.show()


Lessons Learned:
1. Pay attention to object types / datatypes. Machine learning models require specific data types in specific formats, so
make sure you read the model's documentation. Even something as simple as plotting a time series graph will be thrown off if your
x axis is not in the proper datetime format.
2. Indexing when merging / concatenating dataframes.  If you are trying to combine data frames, or add columns from one to
another, the index is key if you want to keep the rows in a specific order.
3. Keep in mind the context of what you are predicting, and what can be considered a 'good' model.  My raw accuracy with this model
is ~71%, but when you look at the predicted times vs the actual times, the predictions were EXTREMELY close.  In the context of
predicting the sunrise, this model would work fine.
4. Data munging... you have heard it is most of the work, and it is true.  I had the model built and tested within an hour,
but dealing with data types / objects / munging / visualizations took a LOT longer.
5. Make sure your data is accurate - this shows the importance of the discovery phase of data science.  My first model was abysmal, and it was
because my source data was wrong.

About 70% of the time, my prediction is within .5 seconds of the actual sunrise time.



Next Steps:

1. Add code to Azure Machine Learning and set it up as an API call
2. Convert the code to PySpark and get it running in Databricks on Azure


Key Links:

Pandas cheat sheet: https://www.datacamp.com/community/blog/python-pandas-cheat-sheet
Microsoft Machine Learning Algorithm cheat sheet: https://docs.microsoft.com/en-us/azure/machine-learning/studio/algorithm-cheat-sheet
Anaconda: https://www.anaconda.com/download/