# Dataset preprocessing

In [1]:
# NOTE: %pylab inline populates the interactive namespace from numpy and
# matplotlib (see the message printed below this cell); the bare `np` used by
# later cells comes from here rather than from an explicit import.
%pylab inline
import pandas as pd
import sys
import warnings
# Suppress all warnings for the rest of the session.
warnings.filterwarnings('ignore')

# Include additional module
include_path = '../include'
# Guard against appending the same entry again when the cell is re-run.
if include_path not in sys.path:
    sys.path.append(include_path)
# Star import: names used later without a qualifier (e.g. TFDataset) are
# presumably provided by this module -- TODO confirm.
from tensorflow_oop import *

Populating the interactive namespace from numpy and matplotlib


## Clean historical data file

In [2]:
def clean_file(filename):
    """
    Clean historical data file and write result to new file.

    Lines starting with '#' are dropped. For every other line the line
    terminators (CR and LF) are removed, one trailing ';' delimiter is
    stripped if present, and the line is written back terminated by a
    single LF. Blank lines are preserved as blank lines.

    The result is written to ``filename + '.cleaned.csv'``; an existing file
    with that name is overwritten.

    Input:
        filename : string
    Output:
        cleaned_filename : string
    """
    # Parse input file
    lines = []
    with open(filename) as f:
        for line in f:
            # Skip comment lines
            if line.startswith('#'):
                continue
            line = line.replace('\n', '').replace('\r', '')
            # endswith() is safe on an empty string, whereas indexing
            # line[-1] raises IndexError for blank input lines.
            if line.endswith(';'):
                # Delete last empty delimiter
                line = line[:-1]
            lines.append(line + '\n')

    # Write result to new file
    cleaned_filename = filename + '.cleaned.csv'
    with open(cleaned_filename, 'w') as f:
        f.writelines(lines)

    return cleaned_filename

In [3]:
# Relative path to the raw historical data export.
FILENAME = '../data/27612.01.01.2005.11.08.2017.1.0.0.en.utf8.00000000.csv'

print 'Cleaning...'
# clean_file writes the cleaned copy to FILENAME + '.cleaned.csv' and returns
# that path; it is reused below as the input of load_dataframe.
cleaned_filename = clean_file(FILENAME)
print 'Cleaned filename:', cleaned_filename

Cleaning...
Cleaned filename: ../data/27612.01.01.2005.11.08.2017.1.0.0.en.utf8.00000000.csv.cleaned.csv


## Load dataframe

In [4]:
def load_dataframe(filename):
    """
    Load dataframe from csv.

    The file is read as ';'-delimited CSV. The first column is parsed as a
    timestamp in 'day.month.year hour:minute' format and becomes the index
    (named 'time'). The frame is then put on a regular 3-hour grid; grid
    points that have no observation become rows filled with NaN.

    The 'RRR' column (amount of precipitation) is converted to numeric:
    'No precipitation' becomes 0.0 and 'Trace of precipitation' becomes NaN.

    Input:
        filename : string
    Output:
        df : pandas.core.frame.DataFrame
    """
    # Read file
    df = pd.read_csv(filename, delimiter=';', quotechar='"')

    # Convert string to datetime format
    df['time'] = pd.to_datetime(df.iloc[:, 0], format='%d.%m.%Y %H:%M')

    # Set time as index
    df = df.set_index('time')

    # Resample with timedelta 3 hours and fill new rows with NaN.
    # A Timedelta is used instead of the '3H' string alias because the
    # upper-case 'H' alias is deprecated in recent pandas releases; both
    # describe the same 3-hour frequency.
    df = df.resample(pd.Timedelta(hours=3)).asfreq()

    # Preprocessing amount of precipitation
    df['RRR'] = df['RRR'].replace('No precipitation', 0.)
    df['RRR'] = df['RRR'].replace('Trace of precipitation', np.nan)
    # Any other non-numeric text left in the column makes to_numeric raise.
    df['RRR'] = pd.to_numeric(df['RRR'])
    return df

In [5]:
print 'Loading...'
# Parse the cleaned CSV into a time-indexed DataFrame.
df = load_dataframe(cleaned_filename)
print 'Dataframe shape:', df.shape

# Last expression of the cell: displayed as a preview of the first rows.
df.head()

Loading...
Dataframe shape: (36599, 29)


Unnamed: 0_level_0,Local time in Moscow,T,Po,P,Pa,U,DD,Ff,ff10,ff3,...,Cm,Ch,VV,Td,RRR,tR,E,Tg,E',sss
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2005-02-01 03:00:00,01.02.2005 03:00,-8.6,743.0,758.1,,89.0,Wind blowing from the south-east,2.0,,,...,Altostratus translucidus.,"Cirrocumulus alone, or Cirrocumulus accompanie...",,-10.1,,,,,,
2005-02-01 06:00:00,01.02.2005 06:00,-8.2,742.8,757.9,,90.0,Wind blowing from the south-east,3.0,,,...,Altostratus translucidus.,"Cirrocumulus alone, or Cirrocumulus accompanie...",,-9.6,2.0,12.0,,,,
2005-02-01 09:00:00,01.02.2005 09:00,-8.6,743.5,758.6,,89.0,Wind blowing from the south-east,3.0,,,...,Altostratus translucidus.,"Cirrocumulus alone, or Cirrocumulus accompanie...",4.0,-10.1,3.0,12.0,,,Even layer of loose dry snow covering ground c...,43.0
2005-02-01 12:00:00,01.02.2005 12:00,-7.1,745.0,760.0,,85.0,Wind blowing from the south-east,3.0,,,...,,,10.0,-9.2,,,,,,
2005-02-01 15:00:00,01.02.2005 15:00,-6.6,746.8,761.8,,83.0,Wind blowing from the south-east,3.0,,,...,Altocumulus castellanus or floccus.,"Cirrocumulus alone, or Cirrocumulus accompanie...",4.0,-9.0,,,,,,


## Preprocessing

### Getting features

In [6]:
# Keep only the three columns used as model features: 'T', 'Po' and 'U'.
# NOTE(review): presumably temperature, station-level pressure and relative
# humidity -- confirm against the data source's column legend.
features = df[['T','Po','U']]
print 'Features shape:', features.shape

Features shape: (36599, 3)


### Interpolate NaN values

In [7]:
# Count missing values per column BEFORE filling them, so the report below
# reflects how many values were interpolated.
nan_count = features.isnull().sum()
# interpolate() is called with its default settings.
# `features` is overwritten here, so re-running this cell alone will no longer
# report the original counts.
features = features.interpolate()
print 'Interpolated NaN values count:\n', nan_count

Interpolated NaN values count:
T     187
Po    290
U     209
dtype: int64


### Normalization

In [8]:
# Per-column statistics of the interpolated features, before normalization.
print 'Features mean:\n', features.mean()
# Bare print statement: emits an empty line between the two reports.
print
print 'Features std:\n', features.std()

Features mean:
T       6.698204
Po    747.578297
U      76.997760
dtype: float64

Features std:
T     11.029082
Po     7.558408
U     17.203492
dtype: float64


In [9]:
# Standardize every column: subtract its mean and divide by its standard
# deviation, both computed over the whole `features` frame.
normed_features = (features - features.mean()) / features.std()
# Sanity check: means should be ~0 (up to floating-point error) and stds 1.
print 'Normed features mean:\n', normed_features.mean()
print
print 'Normed features std:\n', normed_features.std()

Normed features mean:
T    -2.903828e-16
Po    8.321199e-13
U     4.815860e-16
dtype: float64

Normed features std:
T     1.0
Po    1.0
U     1.0
dtype: float64


## Create dataset

In [10]:
def create_sequences(features, sequence_size):
    """
    Create prediction sequences.

    Slides a window of `sequence_size` consecutive rows over `features`.
    Each window becomes one data item and the row immediately following
    the window becomes its label.

    Input:
        features : pandas.core.frame.DataFrame
        sequence_size : int
    Output:
        data : list of sequences (arrays of `sequence_size` rows)
        labels : list of next value (one row per sequence)

    The number of produced pairs is max(0, len(features) - sequence_size - 1).
    """
    # Fetch the underlying array once instead of on every loop iteration.
    values = features.values

    # Use the `sequence_size` argument; the function must not depend on a
    # module-level constant that may be undefined or hold a different value.
    # NOTE(review): with this bound the last possible window (whose label
    # would be the final row) is not emitted. The bound is kept as it was so
    # that dataset sizes stay unchanged.
    count = len(features) - sequence_size - 1

    data = []
    labels = []
    for i in range(count):
        data.append(values[i : i + sequence_size])
        labels.append(values[i + sequence_size])

    return data, labels

### Generate dataset

In [11]:
# Number of consecutive time steps in every input sequence.
SEQUENCE_SIZE = 32

print 'Generating dataset...'
# Build (sequence, next row) pairs from the normalized features.
data, labels = create_sequences(normed_features, SEQUENCE_SIZE)
# TFDataset is presumably provided by the `tensorflow_oop` star import --
# TODO confirm.
dataset = TFDataset(data, labels)
print 'Dataset shape:', dataset.data_shape_, '->', dataset.labels_shape_

Generating dataset...
Dataset shape: [36566, 32, 3] -> [36566, 3]


### Save dump of dataset

In [12]:
# Relative path of the dataset dump file; also read back by the cell below.
DUMP_PATH = '../data/moscow.dump'

print 'Saving dataset...'
# NOTE(review): the on-disk format is decided by TFDataset.save, which is
# not visible in this notebook.
dataset.save(DUMP_PATH)
print 'Dataset saved to:', DUMP_PATH

Saving dataset...
Dataset saved to: ../data/moscow.dump


### Load dump of dataset

In [13]:
print 'Loading dataset...'
# Round-trip check: read the dump back and print the shapes again.
# NOTE(review): the return value of load() is ignored, so it appears to
# populate the existing `dataset` object in place -- confirm against
# tensorflow_oop.
dataset.load(DUMP_PATH)
print 'Loaded dataset shape:', dataset.data_shape_, '->', dataset.labels_shape_

Loading dataset...
Loaded dataset shape: [36566, 32, 3] -> [36566, 3]
