In [1]:
from Series import Speed, Direction
from Time import Hours, Date, DateTime
from DataFrame import PriceDemand, Cities, City, DataFrame
from config import COLUMNS

# Directory prefix for the per-city weather CSV files; CitiesDataFrame.get_all_cities
# builds its paths as WEATHER_DIR + "weather" + <city suffix> + ".csv".
WEATHER_DIR = "./weather/"

import math
import pandas as pd
from datetime import date, datetime, time, timedelta
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler, StandardScaler
import numpy as np

In [55]:
class Cities():
    """
        Wrapper around the combined cities weather frame that also tracks
        which cells were imputed ("fake" values).

        NOTE(review): this definition shadows the `Cities` name imported from
        the local `DataFrame` module at the top of the notebook.

        Attributes:
            df: the CitiesDataFrame holding the weather data.
            fake_values: Series whose index holds row labels of imputed cells
                and whose values hold the name of the column that was imputed.
    """

    def __init__(self):
        self.df = CitiesDataFrame()
        # Starts empty; grows each time mean_fill_nans() imputes something.
        self.fake_values = pd.Series(dtype='object')

    def mean_fill_nans(self, columns):
        """
            Fill missing values in `columns` via the frame's fill_nans() and
            append the bookkeeping series it returns to `fake_values`.
        """
        newly_filled = self.df.fill_nans(columns)
        pieces = [self.fake_values] + list(newly_filled)
        self.fake_values = pd.concat(pieces)

    def get_fake_values(self, column):
        """
            Return the values of `column` at every row recorded as imputed
            for that column.
        """
        recorded_for_column = self.fake_values == column
        imputed_rows = self.fake_values.loc[recorded_for_column].index
        selected = self.df.loc[imputed_rows]
        return selected[column]
        
        

class CitiesDataFrame(DataFrame):
    """
        Container for all of the cities' weather data.

        Extends the project's `DataFrame` base class (imported from the local
        `DataFrame` module). The methods below rely on pandas-style `.loc`,
        `.join` and column access, and on "REGION" and "Date" columns being
        present in the combined frame.
    """

    @staticmethod
    def get_all_cities():
        """
            Loads all city weather dataframes and returns a list of
            (city_df, state_code) tuples,
            i.e. [ (city1, statecode1), (city2, statecode2) ... ]
        """
        CITY_SUFFIXES = ["_melbourne", "_sydney", "_adelaide", "_brisbane"]
        STATE_CODES = ["VIC1", "NSW1", "SA1", "QLD1"]
        cities = []
        # The two lists are parallel: the n-th suffix belongs to the n-th state code.
        for suffix, state_code in zip(CITY_SUFFIXES, STATE_CODES):
            csv_path = WEATHER_DIR + "weather" + suffix + ".csv"
            cities.append((pd.read_csv(csv_path), state_code))
        return cities

    def process_df(self, *args):
        """
            Build the combined frame: load every city, wrap each
            (df, state_code) pair in a `City`, and stack them row-wise.

            `*args` is accepted but unused here (presumably required by the
            base class hook -- TODO confirm). `reset_index()` is called
            without `drop=True`, so the previous per-city index is kept as a
            regular column in the result.
        """
        cities_dataframes = CitiesDataFrame.get_all_cities()
        city_frames = [City(*city) for city in cities_dataframes]
        return pd.concat(city_frames, axis=0).reset_index()

    def join_price(self, grouped_price_demand):
        """
            Return a new frame: the right join of this frame with
            `grouped_price_demand`, matching this frame's "REGION" and "Date"
            columns against the index of `grouped_price_demand`.
        """
        return self.join(grouped_price_demand, on=["REGION", "Date"], how='right')

    def nan_processor(self, nan_region, nan_date, column, window_days=10):
        """
            Compute a fill value for a missing `column` entry.

            The value is the mean of `column` over the rows of `nan_region`
            whose "Date" lies strictly within `window_days` days either side
            of `nan_date`. If that mean is NaN (no usable values for the
            region in the window), the mean over all regions in the same
            window is used instead.

            Args:
                nan_region: value of "REGION" for the row being filled.
                nan_date: value of "Date" for the row being filled.
                column: name of the column to average.
                window_days: half-width of the date window, in days.

            Returns:
                The mean described above; may still be NaN if no region has
                data inside the window.
        """
        lower_bound = nan_date - timedelta(days=window_days)
        upper_bound = nan_date + timedelta(days=window_days)

        region = self.loc[self["REGION"] == nan_region]
        in_window = (region["Date"] > lower_bound) & (region["Date"] < upper_bound)
        fill_value = region.loc[in_window, column].mean()

        if pd.isna(fill_value):
            # Nothing usable for this region: fall back to every region.
            in_window_all = (self["Date"] > lower_bound) & (self["Date"] < upper_bound)
            fill_value = self.loc[in_window_all, column].mean()
        return fill_value

    def fill_nans(self, cols):
        """
            Fill the missing values of each column in `cols` in place, using
            nan_processor() for every affected row.

            Previously, emptiness was detected with `index.size > 2`, which
            also skipped columns that had exactly one or two missing values.
            Columns are now skipped only when they have no missing values.

            Args:
                cols: iterable of column names to process.

            Returns:
                A list with one Series per column that had missing values;
                each Series is indexed by the filled row labels and holds the
                column name as its value.
        """
        current_fakes = []
        for col in cols:
            missing = self.loc[self[col].isna(), ["REGION", "Date"]]
            if missing.empty:
                continue

            # Compute every fill value before writing any of them back, so
            # each mean is based on the original (pre-fill) data.
            fill_values = pd.Series(
                [
                    self.nan_processor(region, day, col)
                    for region, day in zip(missing["REGION"], missing["Date"])
                ],
                index=missing.index,
            )

            current_fakes.append(pd.Series(data=col, index=fill_values.index))
            self.loc[fill_values.index, col] = fill_values
        return current_fakes
                

price surge for one day
price surge for 9am and 3pm separately

price_surge_score_3pm = over all price surges $s$ in that day, take $\sum_{s} \frac{1}{|t_s - \text{3pm}|}$, where $t_s$ is the time of surge $s$


fake_value_lookup = {"Evaporation" : [row_index1, row_index2, ... ]}

testing correlations between features


test correlation of features with a label 


1) process features: normalise, fill missing values, etc.

2) test different approaches for choosing a label





In [56]:
# Build the Cities wrapper; its constructor instantiates a CitiesDataFrame.
cities = Cities()

In [57]:
# Fill missing values in the two wind-gust columns; filled rows are recorded in cities.fake_values.
cities.mean_fill_nans(["Time of maximum wind gust", "Speed of maximum wind gust (km/h)"])

In [59]:
# Show the values of the gust-time column at the rows recorded as filled for it.
cities.get_fake_values("Time of maximum wind gust")

156     42765.882353
157     41809.411765
423     49053.333333
605     52056.666667
650     47685.000000
651     48446.250000
652     48536.250000
706     55380.000000
817     51303.333333
847     46480.000000
854     51452.000000
890     53853.333333
913     53813.333333
1019    42638.823529
1020    41265.882353
1104    47853.333333
1223    43500.000000
1271    59226.666667
1404    41516.666667
1453    48854.117647
1459    52867.058824
1478    52775.294118
1479    53647.058824
1505    51582.352941
1506    50664.705882
1558    55362.352941
1559    54494.117647
1624    50423.333333
1695    50386.666667
Name: Time of maximum wind gust, dtype: float64