In [1]:
import json, os, sys
import random
from pprint import pprint as pp

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

# train_test_split moved to sklearn.model_selection in scikit-learn 0.18 and
# sklearn.cross_validation was removed in 0.20; keep the old path as a fallback.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split



In [2]:
random.seed(0)
%matplotlib inline

df = pd.read_csv('weather_data/training_data/input_data_nov.csv')

# sns.heatmap(df.isnull(),yticklabels=False,cbar=False,cmap='viridis')
df = df.iloc[:,1:-2]
df = df.dropna(axis=0,how='any')
df[['Day','Hour','Humidity']] = df[['Day','Hour','Humidity']].astype(int)
df[['Temp','DewPt','Pressure']] = df[['Temp','DewPt','Pressure']].astype(float)
# sns.heatmap(df.isnull(),yticklabels=False,cbar=False,cmap='viridis')
df.insert(1, 'Position', '')
df.rename(columns={'City': 'Location','Day':'Date'}, inplace=True)

geo = { 
'Sydney': [-33.86,151.20,19],
'Melbourne': [-37.66,144.84,124],
'Brisbane': [-27.47,153.02,28],
'Gold_Coast': [-28.01,153.42,3.9],
'Adelaide': [-34.92,138.59,44.7],
'Darwin': [-12.46,130.84,37],
'Wollongong': [-34.42,150.89,19],
'Canberra': [-35.28,149.12,576.7],
'Newcastle': [-32.92,151.77,12.81],
'Hobart': [-42.83,147.50,6]
}

df['Position'] = [geo[city]  for city in df['Location']]
# Coarse weather classes and the detailed condition strings that belong to each.
weather_conditions = {
    'Rain': [
        'Light Rain',
        'Light Rain Showers',
        'Light Drizzle',
        'Light Thunderstorms and Rain',
        'Heavy Rain Showers',
        'Unknown Precipitation',
        'Thunderstorms and Rain',
        'Rain',
        'Rain Showers',
        'Thunderstorm',
        'Heavy Thunderstorms and Rain',
        'Heavy Rain',
        'Drizzle',
        'Heavy Drizzle',
    ],
    'Cloudy': [
        'Mostly Cloudy',
        'Partly Cloudy',
        'Overcast',
        'Scattered Clouds',
    ],
    'Clear': ['Clear'],
    'Snow': ['Snow'],
}


def simplify(cond):
    """Map a detailed condition string onto one of the coarse classes.

    Returns 'Rain', 'Clear', 'Snow' or 'Cloudy' when ``cond`` is listed under
    that class in ``weather_conditions`` (checked in that order), and
    'Unknown' for anything else. Matching is exact and case-sensitive.
    """
    for category in ('Rain', 'Clear', 'Snow', 'Cloudy'):
        if cond in weather_conditions[category]:
            return category
    return 'Unknown'

# Collapse the detailed condition strings into the coarse classes and discard
# observations that do not fall into any of them.
df['Condition'] = df['Condition'].apply(simplify)
df = df[df['Condition'] != 'Unknown']
df = df[['Date','Hour','Location','Position','Condition','Pressure','Temp','Humidity','DewPt']]
df = df.sort_values(['Location','Date','Hour'])

# Synthesize a timestamp per row: the real day and hour plus a random minute
# and second.
#  * random.randint is inclusive on both ends, so the upper bound must be 59;
#    60 would produce invalid times such as '13:60:60'.
#  * tmp must carry df's index. After dropna/filter/sort df's labels are no
#    longer 0..n-1, and pandas aligns on labels when a Series is stored in a
#    column, so a default RangeIndex would attach timestamps to the wrong rows
#    (or leave NaN).
tmp = pd.DataFrame({'Time':['{:02}:{:02}:{:02}'.format(hr,random.randint(0,59),random.randint(0,59)) for hr in df['Hour'] ],
                    'Date':['2017-11-{:02}'.format(day) for day in df['Date']]
                    },
                   index=df.index)

# Place LocalTime as the fifth column, right after 'Position'.
df.insert(4, 'LocalTime', tmp['Date'] + ' ' + tmp['Time'])

In [3]:
# scikit-learn expects numeric targets, so give every distinct condition an
# integer code (numbered in order of first appearance in df).
labels = {
    condition: code
    for code, condition in enumerate(df['Condition'].unique())
}

df['label'] = df['Condition'].apply(labels.get)

# Reverse lookup: integer code -> condition name, used to decode predictions.
label_map = dict((code, condition) for condition, code in labels.items())


In [4]:
def learn(trainingData, random_state=0):
    """Fit a degree-2 polynomial SVC on one frame of observations.

    Parameters
    ----------
    trainingData : pandas.DataFrame
        Must contain the target column 'label' plus the columns 'Location',
        'Position', 'LocalTime', 'Condition' and 'DewPt', which are excluded
        from the features. Every remaining column is used as a feature.
        The frame is left unmodified.
    random_state : int or None, optional
        Seed forwarded to ``train_test_split`` so the split, and therefore the
        printed accuracies, are reproducible. Pass None for a fresh random
        split on every call.

    Returns
    -------
    The fitted ``SVC`` classifier. Training and testing accuracy are printed
    as a side effect.
    """
    # trainingY = pd.get_dummies(trainingData['Condition'], 'Condition')
    trainingY = trainingData['label']

    # Build the feature matrix on a new frame instead of dropping in place,
    # so the caller's DataFrame keeps all of its columns.
    features = trainingData.drop(
        ['Location','Position','LocalTime','Condition','DewPt','label'], axis=1)

    # Hold out part of the data so overfitting shows up as a gap between the
    # training and testing accuracy.
    X_train, X_test, Y_train, Y_test = train_test_split(
        features, trainingY, random_state=random_state)

    # Support Vector Machine classifier.
    #     note: I tried with a Logistic Regression but I only got 68% accuracy
    classifier = SVC(kernel='poly',degree=2)
    classifier.fit(X=X_train, y=Y_train)

    # Report accuracy on both partitions.
    train_ac = classifier.score(X=X_train, y=Y_train)
    test_ac = classifier.score(X=X_test, y=Y_test)

    print('Training accuracy: {}'.format(train_ac))
    print('Testing accuracy: {}'.format(test_ac))
    return classifier

In [5]:
# Train one classifier per city, keyed by the city's name. Each call receives
# its own copy of that city's rows.
city_predictor = {}
for city in df['Location'].unique():
    city_rows = df[df['Location'] == city].copy()
    city_predictor[city] = learn(city_rows)


Training accuracy: 0.8579088471849866
Testing accuracy: 0.816
Training accuracy: 0.775
Testing accuracy: 0.7142857142857143
Training accuracy: 0.739938080495356
Testing accuracy: 0.7222222222222222
Training accuracy: 0.7547169811320755
Testing accuracy: 0.717741935483871
Training accuracy: 0.8986486486486487
Testing accuracy: 0.8686868686868687
Training accuracy: 0.6612021857923497
Testing accuracy: 0.6885245901639344
Training accuracy: 0.6811989100817438
Testing accuracy: 0.6585365853658537
Training accuracy: 0.9273504273504274
Testing accuracy: 0.9230769230769231
Training accuracy: 0.8253968253968254
Testing accuracy: 0.8571428571428571
Training accuracy: 0.9401709401709402
Testing accuracy: 0.975


In [7]:
# Smoke-test the Adelaide model with an all-zero sample. The five features are
# the columns left after learn() drops the non-features, in this order:
# Date, Hour, Pressure, Temp, Humidity.
city_predictor['Adelaide'].predict([[0,0,0,0,0]])

array([1])

In [8]:
# predict() returns one entry per input row; keep the single numeric label.
predicted_value = city_predictor['Adelaide'].predict([[0,0,0,0,0]])[0]

In [9]:
# Translate the numeric label back into its condition name.
label_map[predicted_value]

'Rain'