# Preprocessing and standard scaler

In [37]:
import pandas as pd
import math
from sklearn.preprocessing import StandardScaler
from joblib import dump, load

In [2]:
train_weather_df = pd.read_csv('./data/train_data.csv')
test_weather_df = pd.read_csv('./data/test_data.csv')

In [23]:
def preproc(df):
    df.rename(columns={'Max temperature (°C)': 'Max_temp'}, inplace=True)
    df.rename(columns={'Min temperature (°C)': 'Min_temp'}, inplace=True)
    df.rename(columns={'Wind (m/s)': 'Wind_m/s'}, inplace=True)
    df.rename(columns={'Precipitation (mm)': "Precipitation_mm"}, inplace=True)
    df = df.assign(Mean_temp = lambda x: (x['Min_temp'] + x['Max_temp'])/2)
    df = df.drop(['Date'], axis = 1)
    return df

In [28]:
traindf = preproc(train_weather_df)
testdf = preproc(test_weather_df)

In [30]:
scaler = StandardScaler()
train_scaled = scaler.fit_transform(traindf)
test_scaled = scaler.transform(testdf)
train_df = pd.DataFrame(train_scaled, index = traindf.index, columns = traindf.columns)
test_df = pd.DataFrame(test_scaled, index = testdf.index, columns = testdf.columns)

In [33]:
scaler.mean_

array([16.88940346,  9.56091084,  4.60012829,  1.78005131, 13.22515715])

In [34]:
scaler.var_

array([100.81968053,  70.15311373,   3.82111608,  17.52236664,
        82.99070262])

In [35]:
scaler.scale_

array([10.04090038,  8.37574556,  1.95476753,  4.1859726 ,  9.1099233 ])

In [48]:
dump(scaler, './preproc/stdscaler.bin', compress=True)

['./preproc/stdscaler.bin']

# Sarah's functions

In [33]:
# def createTimeShiftedDfFromAllFeatures(numPrevDaysWeNeedInfoFor, df):
#     columns=['Max_temp', 'Min_temp', 'Wind_m/s', 'Precipitation_mm'] # change to df.getAllColumns
#     # Have each row contain weather data from last 7 days
#     for i in range(numPrevDaysWeNeedInfoFor):
#         for colName in columns:
#             # Shift down i+1 (if wanna shift up need to make it (-)) rows and place it in a just-created col all the way to the right 
#             # which gets name 'colName + (i+1)'
#             df[colName+str(i+1)]=df.shift(i+1)[colName]
#     df=df.dropna().reset_index(drop=True)
#     df.head()
#     return df

In [41]:
def createTimeShiftedDfFromFeatures(numPrevDaysWeNeedInfoFor, df, features):
    columns=features 
    # Have each row contain weather data from last 7 days
    for i in range(numPrevDaysWeNeedInfoFor):
        for colName in columns:
            # Shift down i+1 (if wanna shift up need to make it (-)) rows and place it in a just-created col all the way to the right 
            # which gets name 'colName + (i+1)'
            df[colName+str(i+1)]=df.shift(i+1)[colName]
    # print('#### Before drop na' + str(len(df))) # For debugging
    # of dropped columns will be numPrevDaysWeNeedInfoFor, since the first n days won't have all the numPrevDaysWeNeedInfoFor days' info
    df=df.dropna().reset_index(drop=True) 
    # print('#### After drop na' + str(len(df))) # For debugging
    return df

In [42]:
'''Add target to df'''
def addTargetToDf(numDaysAhead, df, raw_df, target):
    
    targetDf1Col = pd.DataFrame()
#     print('Len of raw_df')
#     print(len(raw_df))
    # E.g. for 1-day ahead: the target (mean) for day0 (day on row 0) is day1's mean, target for day1 is day2's mean, etc
    targetDf1Col=(raw_df[target]).iloc[numDaysAhead:]
    # Drop last row bc obvs last row* numDaysAhead doesn't have the next numDaysAhead days to set as its target
#     print('##TargetDf1Col Length:')
#     print(len(targetDf1Col))
    df=df.copy().iloc[:-1 * numDaysAhead,:]
    df['Target: ' + target]=list(targetDf1Col)
    df=df.reset_index(drop=True)
    return df

In [36]:
# '''Add target to df'''
# def addTargetToDf(numDaysAhead, df, target):
    
#     targetDf1Col = pd.DataFrame()
#     # E.g. for 1-day ahead: the target (mean) for day0 (day on row 0) is day1's mean, target for day1 is day2's mean, etc
#     if target == 'Mean_temp':
#         targetDf1Col=((df['Min_temp']+df['Max_temp'])/2).iloc[numDaysAhead:]
#     else:
#         targetDf1Col=(df[target]).iloc[numDaysAhead:]
#     # Drop last row bc obvs last row* numDaysAhead doesn't have the next numDaysAhead days to set as its target
#     dfWTarget=df.copy().iloc[:-1 * numDaysAhead,:]
#     dfWTarget[target]=list(targetDf1Col)
#     dfWTarget=dfWTarget.reset_index(drop=True)
#     return dfWTarget

In [51]:
# @param numDaysAhead: the num of days in the future we want to predict
# @param numPrevDaysWeNeedInfoFor: number of days before curr date which we have info for to predict 
# Mean_temp of temperature 'numDaysAhead' days ahead''' 
def makeDfForTestAndTrain(df, target, features, numDaysAhead, numPrevDaysWeNeedInfoFor):

     
    dfDateAndFeatures = df[features]
    dfWEachRowHavingPrevXDaysData = createTimeShiftedDfFromFeatures(numPrevDaysWeNeedInfoFor, dfDateAndFeatures, features)
    # Need to drop first numPrevDaysWeNeedInfoFor rows since those were dropped in createTimeShiftedDfFromFeatures()
    # n length of target col must match len of dfWEachRowHavingPrevXDaysData for addTargetToDf() to work
    dfWTarget = addTargetToDf(numDaysAhead, dfWEachRowHavingPrevXDaysData, df.iloc[numPrevDaysWeNeedInfoFor:], target)
    #display(dfWTarget)
    # dfWTarget will have a len of len(raw_data) - numDaysAhead (last numDaysAhead rows need to be out)- numPrevDaysWeNeedInfoFor (first numPrevDaysWeNeedInfoFor will be out) 
    return dfWTarget

In [44]:
def split(xPercent, df):
    firstXRowsForTest = math.ceil(len(df)*(xPercent/100))
    #print(firstXRowsForTest)
    #Training data is dates up to right before 1400 row, Test data is dates from 1400th row
    # This can be adjusted/ played with
    feature_train=df.iloc[:firstXRowsForTest + 1, 1:-1]
    target_train=df.iloc[:firstXRowsForTest + 1, -1]
    feature_test=df.iloc[firstXRowsForTest + 1:, 1:-1]
    target_test=df.iloc[firstXRowsForTest + 1:, -1]
    groups = [feature_train, target_train, feature_test, target_test]
    return groups

# Creating train and test dataframes

In [49]:
train_df.columns.values.tolist()

['Max_temp', 'Min_temp', 'Wind_m/s', 'Precipitation_mm', 'Mean_temp']

In [70]:
FeaturesDict = {'all': ['Wind_m/s', 'Precipitation_mm', 'Min_temp', 'Max_temp', 'Mean_temp'], 'wind_rain' : ['Wind_m/s', 'Precipitation_mm'], 'onlytemp': ['Min_temp', 'Max_temp', 'Mean_temp']}
DayList = [1, 6, 29]

In [71]:
train_df.name = 'train'
test_df.name = 'test'

In [72]:
for i in [train_df, test_df]:
    for num in DayList:
        for k in FeaturesDict:
            numDaysAhead = 1
            numPrevDayData = num
            target = 'Mean_temp'
            features = FeaturesDict[k]
            makeDfForTestAndTrain(i, target, features, numDaysAhead, numPrevDayData).to_csv(f'preproc/prev{num}_{k}features_{i.name}.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[colName+str(i+1)]=df.shift(i+1)[colName]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[colName+str(i+1)]=df.shift(i+1)[colName]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[colName+str(i+1)]=df.shift(i+1)[colName]
A value is trying to be set on a copy of a slice from a DataFrame.
Try us