# This script refines the dataframe, providing functions for outputting usable features/target test/train sets

In [927]:
import pandas as pd
import math

In [928]:
raw_weather_df = pd.read_csv('./data/bp_weather_data.csv')

In [929]:
raw_weather_df.rename(columns={'Max temperature (°C)': 'Max_temp'}, inplace=True)
raw_weather_df.rename(columns={'Min temperature (°C)': 'Min_temp'}, inplace=True)
raw_weather_df.rename(columns={'Wind (m/s)': 'Wind_m/s'}, inplace=True)
raw_weather_df.rename(columns={'Precipitation (mm)': "Precipitation_mm"}, inplace=True)
raw_weather_df = raw_weather_df.assign(Mean_temp = lambda x: (x['Min_temp'] + x['Max_temp'])/2)


In [930]:
raw_weather_df

Unnamed: 0,Date,Max_temp,Min_temp,Wind_m/s,Precipitation_mm,Mean_temp
0,2017.07.01,26.0,15.0,6.1,14.0,20.50
1,2017.07.02,25.0,15.0,6.9,0.0,20.00
2,2017.07.03,25.0,16.0,6.7,0.0,20.50
3,2017.07.04,27.0,12.0,5.3,0.0,19.50
4,2017.07.05,30.0,17.0,4.2,0.0,23.50
...,...,...,...,...,...,...
1944,2022.10.27,19.9,13.8,1.7,0.0,16.85
1945,2022.10.28,19.3,13.5,1.6,0.0,16.40
1946,2022.10.29,21.0,14.1,2.2,0.0,17.55
1947,2022.10.30,21.1,14.6,1.2,0.0,17.85


# Methods for creating TimeShifted Df

In [931]:
# def createTimeShiftedDfFromAllFeatures(numPrevDaysWeNeedInfoFor, df):
#     columns=['Max_temp', 'Min_temp', 'Wind_m/s', 'Precipitation_mm'] # change to df.getAllColumns
#     # Have each row contain weather data from last 7 days
#     for i in range(numPrevDaysWeNeedInfoFor):
#         for colName in columns:
#             # Shift down i+1 (if wanna shift up need to make it (-)) rows and place it in a just-created col all the way to the right 
#             # which gets name 'colName + (i+1)'
#             df[colName+str(i+1)]=df.shift(i+1)[colName]
#     df=df.dropna().reset_index(drop=True)
#     df.head()
#     return df

In [932]:
def createTimeShiftedDfFromFeatures(numPrevDaysWeNeedInfoFor, df, features):
    columns=features 
    # Have each row contain weather data from last 7 days
    for i in range(numPrevDaysWeNeedInfoFor):
        for colName in columns:
            # Shift down i+1 (if wanna shift up need to make it (-)) rows and place it in a just-created col all the way to the right 
            # which gets name 'colName + (i+1)'
            df[colName+str(i+1)]=df.shift(i+1)[colName]
    # print('#### Before drop na' + str(len(df))) # For debugging
    # of dropped columns will be numPrevDaysWeNeedInfoFor, since the first n days won't have all the numPrevDaysWeNeedInfoFor days' info
    df=df.dropna().reset_index(drop=True) 
    # print('#### After drop na' + str(len(df))) # For debugging
    return df

In [933]:
'''Add target to df'''
def addTargetToDf(numDaysAhead, df, raw_df, target):
    
    targetDf1Col = pd.DataFrame()
#     print('Len of raw_df')
#     print(len(raw_df))
    # E.g. for 1-day ahead: the target (mean) for day0 (day on row 0) is day1's mean, target for day1 is day2's mean, etc
    targetDf1Col=(raw_df[target]).iloc[numDaysAhead:]
    # Drop last row bc obvs last row* numDaysAhead doesn't have the next numDaysAhead days to set as its target
#     print('##TargetDf1Col Length:')
#     print(len(targetDf1Col))
    df=df.copy().iloc[:-1 * numDaysAhead,:]
    df['Target: ' + target]=list(targetDf1Col)
    df=df.reset_index(drop=True)
    return df

In [934]:
# '''Add target to df'''
# def addTargetToDf(numDaysAhead, df, target):
    
#     targetDf1Col = pd.DataFrame()
#     # E.g. for 1-day ahead: the target (mean) for day0 (day on row 0) is day1's mean, target for day1 is day2's mean, etc
#     if target == 'Mean_temp':
#         targetDf1Col=((df['Min_temp']+df['Max_temp'])/2).iloc[numDaysAhead:]
#     else:
#         targetDf1Col=(df[target]).iloc[numDaysAhead:]
#     # Drop last row bc obvs last row* numDaysAhead doesn't have the next numDaysAhead days to set as its target
#     dfWTarget=df.copy().iloc[:-1 * numDaysAhead,:]
#     dfWTarget[target]=list(targetDf1Col)
#     dfWTarget=dfWTarget.reset_index(drop=True)
#     return dfWTarget

'Add target to df'

In [935]:
# @param numDaysAhead: the num of days in the future we want to predict
# @param numPrevDaysWeNeedInfoFor: number of days before curr date which we have info for to predict 
# Mean_temp of temperature 'numDaysAhead' days ahead''' 
def makeDfForTestAndTrain(df, target, features, numDaysAhead, numPrevDaysWeNeedInfoFor):

    dateAndFeatures = ['Date'] + features 

    
    dfDateAndFeatures = raw_weather_df[dateAndFeatures]
    dfWEachRowHavingPrevXDaysData = createTimeShiftedDfFromFeatures(numPrevDaysWeNeedInfoFor, dfDateAndFeatures, features)
    # Need to drop first numPrevDaysWeNeedInfoFor rows since those were dropped in createTimeShiftedDfFromFeatures()
    # n length of target col must match len of dfWEachRowHavingPrevXDaysData for addTargetToDf() to work
    dfWTarget = addTargetToDf(numDaysAhead, dfWEachRowHavingPrevXDaysData, raw_weather_df.iloc[numPrevDaysWeNeedInfoFor:], target)
    display(dfWTarget)
    # dfWTarget will have a len of len(raw_data) - numDaysAhead (last numDaysAhead rows need to be out)- numPrevDaysWeNeedInfoFor (first numPrevDaysWeNeedInfoFor will be out) 
    return dfWTarget

In [936]:
def split(xPercent, df):
    firstXRowsForTest = math.ceil(len(df)*(xPercent/100))
    #print(firstXRowsForTest)
    #Training data is dates up to right before 1400 row, Test data is dates from 1400th row
    # This can be adjusted/ played with
    feature_train=df.iloc[:firstXRowsForTest + 1, 1:-1]
    target_train=df.iloc[:firstXRowsForTest + 1, -1]
    feature_test=df.iloc[firstXRowsForTest + 1:, 1:-1]
    target_test=df.iloc[firstXRowsForTest + 1:, -1]
    groups = [feature_train, target_train, feature_test, target_test]
    return groups

In [937]:
'''Sample implementation 1'''
# Define the desired Dataset-- this one is for predicting max from only other max values
numDaysAhead = 1
numPrevDayData = 5
target = 'Max_temp'
features = ['Max_temp']
df1dayAhead5DaysBeforeMax = makeDfForTestAndTrain(raw_weather_df, target, features, numDaysAhead, numPrevDayData)
# Split on desired dataframe
percentTrain = 80
x_train1, y_train1, x_test1, y_test1 = split(percentTrain, df1dayAhead5DaysBeforeMax)

Len of raw_df
1944
##TargetDf1Col Length:
1943


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[colName+str(i+1)]=df.shift(i+1)[colName]


Unnamed: 0,Date,Max_temp,Max_temp1,Max_temp2,Max_temp3,Max_temp4,Max_temp5,Target: Max_temp
0,2017.07.06,30.0,30.0,27.0,25.0,25.0,26.0,28.00
1,2017.07.07,28.0,30.0,30.0,27.0,25.0,25.0,32.00
2,2017.07.08,32.0,28.0,30.0,30.0,27.0,25.0,30.00
3,2017.07.09,30.0,32.0,28.0,30.0,30.0,27.0,36.00
4,2017.07.10,36.0,30.0,32.0,28.0,30.0,30.0,31.48
...,...,...,...,...,...,...,...,...
1938,2022.10.26,19.9,19.8,18.8,19.4,18.4,16.0,19.90
1939,2022.10.27,19.9,19.9,19.8,18.8,19.4,18.4,19.30
1940,2022.10.28,19.3,19.9,19.9,19.8,18.8,19.4,21.00
1941,2022.10.29,21.0,19.3,19.9,19.9,19.8,18.8,21.10


In [938]:
x_train1

Unnamed: 0,Max_temp,Max_temp1,Max_temp2,Max_temp3,Max_temp4,Max_temp5
0,30.0,30.0,27.0,25.0,25.0,26.0
1,28.0,30.0,30.0,27.0,25.0,25.0
2,32.0,28.0,30.0,30.0,27.0,25.0
3,30.0,32.0,28.0,30.0,30.0,27.0
4,36.0,30.0,32.0,28.0,30.0,30.0
...,...,...,...,...,...,...
1551,23.8,22.2,20.4,17.1,15.9,19.9
1552,24.3,23.8,22.2,20.4,17.1,15.9
1553,16.7,24.3,23.8,22.2,20.4,17.1
1554,15.6,16.7,24.3,23.8,22.2,20.4


In [944]:
y_train1 # ?? Why no Col name while x_train1 does have?

0       28.00
1       32.00
2       30.00
3       36.00
4       31.48
        ...  
1551    24.30
1552    16.70
1553    15.60
1554    17.00
1555    15.20
Name: Target: Max_temp, Length: 1556, dtype: float64

In [939]:
''' Sample Implementation 2'''
numDaysAhead = 1
numPrevDayData = 5
target = 'Mean_temp'
features = raw_weather_df.columns.values.tolist()[1:]
df1dayAhead5DaysBeforeMean = makeDfForTestAndTrain(raw_weather_df, target, features, numDaysAhead, numPrevDayData)
# Split on desired dataframe
percentTrain = 80
x_train2, y_train2, x_test2, y_test2 = split(percentTrain, df1dayAhead5DaysBeforeMean)

Len of raw_df
1944
##TargetDf1Col Length:
1943


Unnamed: 0,Date,Max_temp,Min_temp,Wind_m/s,Precipitation_mm,Mean_temp,Max_temp1,Min_temp1,Wind_m/s1,Precipitation_mm1,...,Min_temp4,Wind_m/s4,Precipitation_mm4,Mean_temp4,Max_temp5,Min_temp5,Wind_m/s5,Precipitation_mm5,Mean_temp5,Target: Mean_temp
0,2017.07.06,30.0,17.0,5.6,0.0,23.50,30.0,17.0,4.2,0.0,...,15.0,6.9,0.0,20.00,26.0,15.0,6.1,14.0,20.50,22.50
1,2017.07.07,28.0,17.0,4.7,0.0,22.50,30.0,17.0,5.6,0.0,...,16.0,6.7,0.0,20.50,25.0,15.0,6.9,0.0,20.00,24.00
2,2017.07.08,32.0,16.0,6.7,0.0,24.00,28.0,17.0,4.7,0.0,...,12.0,5.3,0.0,19.50,25.0,16.0,6.7,0.0,20.50,24.00
3,2017.07.09,30.0,18.0,2.2,0.0,24.00,32.0,16.0,6.7,0.0,...,17.0,4.2,0.0,23.50,27.0,12.0,5.3,0.0,19.50,28.00
4,2017.07.10,36.0,20.0,6.4,21.0,28.00,30.0,18.0,2.2,0.0,...,17.0,5.6,0.0,23.50,30.0,17.0,4.2,0.0,23.50,26.82
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1938,2022.10.26,19.9,13.9,1.7,0.0,16.90,19.8,14.1,4.4,0.0,...,11.7,4.6,0.1,15.05,16.0,8.6,4.9,0.0,12.30,16.85
1939,2022.10.27,19.9,13.8,1.7,0.0,16.85,19.9,13.9,1.7,0.0,...,11.3,3.8,0.0,15.35,18.4,11.7,4.6,0.1,15.05,16.40
1940,2022.10.28,19.3,13.5,1.6,0.0,16.40,19.9,13.8,1.7,0.0,...,13.7,1.7,0.0,16.25,19.4,11.3,3.8,0.0,15.35,17.55
1941,2022.10.29,21.0,14.1,2.2,0.0,17.55,19.3,13.5,1.6,0.0,...,14.1,4.4,0.0,16.95,18.8,13.7,1.7,0.0,16.25,17.85


In [940]:
x_train2

Unnamed: 0,Max_temp,Min_temp,Wind_m/s,Precipitation_mm,Mean_temp,Max_temp1,Min_temp1,Wind_m/s1,Precipitation_mm1,Mean_temp1,...,Max_temp4,Min_temp4,Wind_m/s4,Precipitation_mm4,Mean_temp4,Max_temp5,Min_temp5,Wind_m/s5,Precipitation_mm5,Mean_temp5
0,30.0,17.0,5.6,0.0,23.50,30.0,17.0,4.2,0.0,23.50,...,25.0,15.0,6.9,0.0,20.00,26.0,15.0,6.1,14.0,20.50
1,28.0,17.0,4.7,0.0,22.50,30.0,17.0,5.6,0.0,23.50,...,25.0,16.0,6.7,0.0,20.50,25.0,15.0,6.9,0.0,20.00
2,32.0,16.0,6.7,0.0,24.00,28.0,17.0,4.7,0.0,22.50,...,27.0,12.0,5.3,0.0,19.50,25.0,16.0,6.7,0.0,20.50
3,30.0,18.0,2.2,0.0,24.00,32.0,16.0,6.7,0.0,24.00,...,30.0,17.0,4.2,0.0,23.50,27.0,12.0,5.3,0.0,19.50
4,36.0,20.0,6.4,21.0,28.00,30.0,18.0,2.2,0.0,24.00,...,30.0,17.0,5.6,0.0,23.50,30.0,17.0,4.2,0.0,23.50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1551,23.8,13.9,3.3,0.0,18.85,22.2,12.5,5.4,0.0,17.35,...,15.9,13.1,3.9,10.1,14.50,19.9,15.6,3.8,0.0,17.75
1552,24.3,15.8,6.0,0.0,20.05,23.8,13.9,3.3,0.0,18.85,...,17.1,13.0,2.9,0.0,15.05,15.9,13.1,3.9,10.1,14.50
1553,16.7,11.9,4.4,12.7,14.30,24.3,15.8,6.0,0.0,20.05,...,20.4,13.1,5.3,0.0,16.75,17.1,13.0,2.9,0.0,15.05
1554,15.6,11.4,5.2,9.8,13.50,16.7,11.9,4.4,12.7,14.30,...,22.2,12.5,5.4,0.0,17.35,20.4,13.1,5.3,0.0,16.75
