# This script refines the dataframe, providing functions for outputting usable features/target test/train sets

In [27]:
import pandas as pd
import math
from sklearn.preprocessing import StandardScaler

In [28]:
raw_weather_df = pd.read_csv('./data/bp_weather_data.csv')

In [29]:
raw_weather_df.rename(columns={'Max temperature (°C)': 'Max_temp'}, inplace=True)
raw_weather_df.rename(columns={'Min temperature (°C)': 'Min_temp'}, inplace=True)
raw_weather_df.rename(columns={'Wind (m/s)': 'Wind_m/s'}, inplace=True)
raw_weather_df.rename(columns={'Precipitation (mm)': "Precipitation_mm"}, inplace=True)
raw_weather_df = raw_weather_df.assign(Mean_temp = lambda x: (x['Min_temp'] + x['Max_temp'])/2)


In [30]:
raw_weather_df

Unnamed: 0,Date,Max_temp,Min_temp,Wind_m/s,Precipitation_mm,Mean_temp
0,2017.07.01,26.0,15.0,6.1,14.0,20.50
1,2017.07.02,25.0,15.0,6.9,0.0,20.00
2,2017.07.03,25.0,16.0,6.7,0.0,20.50
3,2017.07.04,27.0,12.0,5.3,0.0,19.50
4,2017.07.05,30.0,17.0,4.2,0.0,23.50
...,...,...,...,...,...,...
1944,2022.10.27,19.9,13.8,1.7,0.0,16.85
1945,2022.10.28,19.3,13.5,1.6,0.0,16.40
1946,2022.10.29,21.0,14.1,2.2,0.0,17.55
1947,2022.10.30,21.1,14.6,1.2,0.0,17.85


In [31]:
raw_weather_df_nodate = raw_weather_df.drop(['Date'], axis = 1)
raw_weather_df_nodate

Unnamed: 0,Max_temp,Min_temp,Wind_m/s,Precipitation_mm,Mean_temp
0,26.0,15.0,6.1,14.0,20.50
1,25.0,15.0,6.9,0.0,20.00
2,25.0,16.0,6.7,0.0,20.50
3,27.0,12.0,5.3,0.0,19.50
4,30.0,17.0,4.2,0.0,23.50
...,...,...,...,...,...
1944,19.9,13.8,1.7,0.0,16.85
1945,19.3,13.5,1.6,0.0,16.40
1946,21.0,14.1,2.2,0.0,17.55
1947,21.1,14.6,1.2,0.0,17.85


In [32]:
scaler = StandardScaler()
scaled_raw = scaler.fit_transform(raw_weather_df_nodate)
scaled_features_df = pd.DataFrame(scaled_raw, index = raw_weather_df_nodate.index, columns = raw_weather_df_nodate.columns)
scaled_features_df

Unnamed: 0,Max_temp,Min_temp,Wind_m/s,Precipitation_mm,Mean_temp
0,0.905276,0.644856,0.815365,3.094278,0.796250
1,0.805399,0.644856,1.227635,-0.410588,0.740746
2,0.805399,0.766945,1.124568,-0.410588,0.796250
3,1.005153,0.278590,0.403095,-0.410588,0.685242
4,1.304783,0.889034,-0.163776,-0.410588,1.129274
...,...,...,...,...,...
1944,0.296028,0.498350,-1.452120,-0.410588,0.391070
1945,0.236102,0.461723,-1.503654,-0.410588,0.341117
1946,0.405893,0.534976,-1.194452,-0.410588,0.468776
1947,0.415880,0.596021,-1.709789,-0.410588,0.502078


# Methods for creating TimeShifted Df

In [33]:
# def createTimeShiftedDfFromAllFeatures(numPrevDaysWeNeedInfoFor, df):
#     columns=['Max_temp', 'Min_temp', 'Wind_m/s', 'Precipitation_mm'] # change to df.getAllColumns
#     # Have each row contain weather data from last 7 days
#     for i in range(numPrevDaysWeNeedInfoFor):
#         for colName in columns:
#             # Shift down i+1 (if wanna shift up need to make it (-)) rows and place it in a just-created col all the way to the right 
#             # which gets name 'colName + (i+1)'
#             df[colName+str(i+1)]=df.shift(i+1)[colName]
#     df=df.dropna().reset_index(drop=True)
#     df.head()
#     return df

In [34]:
def createTimeShiftedDfFromFeatures(numPrevDaysWeNeedInfoFor, df, features):
    columns=features 
    # Have each row contain weather data from last 7 days
    for i in range(numPrevDaysWeNeedInfoFor):
        for colName in columns:
            # Shift down i+1 (if wanna shift up need to make it (-)) rows and place it in a just-created col all the way to the right 
            # which gets name 'colName + (i+1)'
            df[colName+str(i+1)]=df.shift(i+1)[colName]
    # print('#### Before drop na' + str(len(df))) # For debugging
    # of dropped columns will be numPrevDaysWeNeedInfoFor, since the first n days won't have all the numPrevDaysWeNeedInfoFor days' info
    df=df.dropna().reset_index(drop=True) 
    # print('#### After drop na' + str(len(df))) # For debugging
    return df

In [35]:
'''Add target to df'''
def addTargetToDf(numDaysAhead, df, raw_df, target):
    
    targetDf1Col = pd.DataFrame()
#     print('Len of raw_df')
#     print(len(raw_df))
    # E.g. for 1-day ahead: the target (mean) for day0 (day on row 0) is day1's mean, target for day1 is day2's mean, etc
    targetDf1Col=(raw_df[target]).iloc[numDaysAhead:]
    # Drop last row bc obvs last row* numDaysAhead doesn't have the next numDaysAhead days to set as its target
#     print('##TargetDf1Col Length:')
#     print(len(targetDf1Col))
    df=df.copy().iloc[:-1 * numDaysAhead,:]
    df['Target: ' + target]=list(targetDf1Col)
    df=df.reset_index(drop=True)
    return df

In [36]:
# '''Add target to df'''
# def addTargetToDf(numDaysAhead, df, target):
    
#     targetDf1Col = pd.DataFrame()
#     # E.g. for 1-day ahead: the target (mean) for day0 (day on row 0) is day1's mean, target for day1 is day2's mean, etc
#     if target == 'Mean_temp':
#         targetDf1Col=((df['Min_temp']+df['Max_temp'])/2).iloc[numDaysAhead:]
#     else:
#         targetDf1Col=(df[target]).iloc[numDaysAhead:]
#     # Drop last row bc obvs last row* numDaysAhead doesn't have the next numDaysAhead days to set as its target
#     dfWTarget=df.copy().iloc[:-1 * numDaysAhead,:]
#     dfWTarget[target]=list(targetDf1Col)
#     dfWTarget=dfWTarget.reset_index(drop=True)
#     return dfWTarget

In [37]:
# @param numDaysAhead: the num of days in the future we want to predict
# @param numPrevDaysWeNeedInfoFor: number of days before curr date which we have info for to predict 
# Mean_temp of temperature 'numDaysAhead' days ahead''' 
def makeDfForTestAndTrain(df, target, features, numDaysAhead, numPrevDaysWeNeedInfoFor):

     
    dfDateAndFeatures = scaled_features_df[features]
    dfWEachRowHavingPrevXDaysData = createTimeShiftedDfFromFeatures(numPrevDaysWeNeedInfoFor, dfDateAndFeatures, features)
    # Need to drop first numPrevDaysWeNeedInfoFor rows since those were dropped in createTimeShiftedDfFromFeatures()
    # n length of target col must match len of dfWEachRowHavingPrevXDaysData for addTargetToDf() to work
    dfWTarget = addTargetToDf(numDaysAhead, dfWEachRowHavingPrevXDaysData, scaled_features_df.iloc[numPrevDaysWeNeedInfoFor:], target)
    #display(dfWTarget)
    # dfWTarget will have a len of len(raw_data) - numDaysAhead (last numDaysAhead rows need to be out)- numPrevDaysWeNeedInfoFor (first numPrevDaysWeNeedInfoFor will be out) 
    return dfWTarget

In [38]:
def split(xPercent, df):
    firstXRowsForTest = math.ceil(len(df)*(xPercent/100))
    #print(firstXRowsForTest)
    #Training data is dates up to right before 1400 row, Test data is dates from 1400th row
    # This can be adjusted/ played with
    feature_train=df.iloc[:firstXRowsForTest + 1, 1:-1]
    target_train=df.iloc[:firstXRowsForTest + 1, -1]
    feature_test=df.iloc[firstXRowsForTest + 1:, 1:-1]
    target_test=df.iloc[firstXRowsForTest + 1:, -1]
    groups = [feature_train, target_train, feature_test, target_test]
    return groups

In [39]:
'''Sample implementation 1'''
# Define the desired Dataset-- this one is for predicting max from only other max values
numDaysAhead = 1
numPrevDayData = 5
target = 'Max_temp'
features = ['Max_temp']
df1dayAhead5DaysBeforeMax = makeDfForTestAndTrain(raw_weather_df, target, features, numDaysAhead, numPrevDayData)
# Split on desired dataframe
percentTrain = 80
x_train1, y_train1, x_test1, y_test1 = split(percentTrain, df1dayAhead5DaysBeforeMax)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[colName+str(i+1)]=df.shift(i+1)[colName]


In [40]:
x_train1

Unnamed: 0,Max_temp1,Max_temp2,Max_temp3,Max_temp4,Max_temp5
0,1.304783,1.005153,0.805399,0.805399,0.905276
1,1.304783,1.304783,1.005153,0.805399,0.805399
2,1.105029,1.304783,1.304783,1.005153,0.805399
3,1.504536,1.105029,1.304783,1.304783,1.005153
4,1.304783,1.504536,1.105029,1.304783,1.304783
...,...,...,...,...,...
1551,0.525745,0.345967,0.016374,-0.103478,0.296028
1552,0.685547,0.525745,0.345967,0.016374,-0.103478
1553,0.735486,0.685547,0.525745,0.345967,0.016374
1554,-0.023577,0.735486,0.685547,0.525745,0.345967


In [41]:
y_train1 # ?? Why no Col name while x_train1 does have?

0       1.105029
1       1.504536
2       1.304783
3       1.904042
4       1.452600
          ...   
1551    0.735486
1552   -0.023577
1553   -0.133441
1554    0.006386
1555   -0.173392
Name: Target: Max_temp, Length: 1556, dtype: float64

In [42]:
''' Sample Implementation 2'''
numDaysAhead = 1
numPrevDayData = 5
target = 'Mean_temp'
features = raw_weather_df.columns.values.tolist()[1:]
df1dayAhead5DaysBeforeMean = makeDfForTestAndTrain(raw_weather_df, target, features, numDaysAhead, numPrevDayData)
# Split on desired dataframe
percentTrain = 80
x_train2, y_train2, x_test2, y_test2 = split(percentTrain, df1dayAhead5DaysBeforeMean)

In [43]:
x_train2

Unnamed: 0,Min_temp,Wind_m/s,Precipitation_mm,Mean_temp,Max_temp1,Min_temp1,Wind_m/s1,Precipitation_mm1,Mean_temp1,Max_temp2,...,Max_temp4,Min_temp4,Wind_m/s4,Precipitation_mm4,Mean_temp4,Max_temp5,Min_temp5,Wind_m/s5,Precipitation_mm5,Mean_temp5
0,0.889034,0.557696,-0.410588,1.129274,1.304783,0.889034,-0.163776,-0.410588,1.129274,1.005153,...,0.805399,0.644856,1.227635,-0.410588,0.740746,0.905276,0.644856,0.815365,3.094278,0.796250
1,0.889034,0.093892,-0.410588,1.018266,1.304783,0.889034,0.557696,-0.410588,1.129274,1.304783,...,0.805399,0.766945,1.124568,-0.410588,0.796250,0.805399,0.644856,1.227635,-0.410588,0.740746
2,0.766945,1.124568,-0.410588,1.184778,1.105029,0.889034,0.093892,-0.410588,1.018266,1.304783,...,1.005153,0.278590,0.403095,-0.410588,0.685242,0.805399,0.766945,1.124568,-0.410588,0.796250
3,1.011123,-1.194452,-0.410588,1.184778,1.504536,0.766945,1.124568,-0.410588,1.184778,1.105029,...,1.304783,0.889034,-0.163776,-0.410588,1.129274,1.005153,0.278590,0.403095,-0.410588,0.685242
4,1.255301,0.969966,4.846710,1.628811,1.304783,1.011123,-1.194452,-0.410588,1.184778,1.504536,...,1.304783,0.889034,0.557696,-0.410588,1.129274,1.304783,0.889034,-0.163776,-0.410588,1.129274
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1551,0.510558,-0.627580,-0.410588,0.613086,0.525745,0.339634,0.454629,-0.410588,0.446574,0.345967,...,-0.103478,0.412887,-0.318378,2.117922,0.130201,0.296028,0.718110,-0.369911,-0.410588,0.490978
1552,0.742527,0.763831,-0.410588,0.746296,0.685547,0.510558,-0.627580,-0.410588,0.613086,0.525745,...,0.016374,0.400678,-0.833715,-0.410588,0.191256,-0.103478,0.412887,-0.318378,2.117922,0.130201
1553,0.266381,-0.060709,2.768826,0.108000,0.735486,0.742527,0.763831,-0.410588,0.746296,0.685547,...,0.345967,0.412887,0.403095,-0.410588,0.379969,0.016374,0.400678,-0.833715,-0.410588,0.191256
1554,0.205336,0.351561,2.042818,0.019193,-0.023577,0.266381,-0.060709,2.768826,0.108000,0.735486,...,0.525745,0.339634,0.454629,-0.410588,0.446574,0.345967,0.412887,0.403095,-0.410588,0.379969


In [44]:
''' Sample Implementation 3'''
numDaysAhead = 1
numPrevDayData = 5
target = 'Mean_temp'
features = ['Wind_m/s', "Precipitation_mm"]
df1dayAhead5DaysBeforeMean = makeDfForTestAndTrain(raw_weather_df, target, features, numDaysAhead, numPrevDayData)
# Split on desired dataframe
percentTrain = 80
x_train2, y_train2, x_test2, y_test2 = split(percentTrain, df1dayAhead5DaysBeforeMean)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[colName+str(i+1)]=df.shift(i+1)[colName]


In [45]:
raw_weather_df.columns.values.tolist()

['Date', 'Max_temp', 'Min_temp', 'Wind_m/s', 'Precipitation_mm', 'Mean_temp']

In [46]:
TargetsList = ['Min_temp', 'Max_temp', 'Mean_temp']
FeaturesDict = {'all': ['Wind_m/s', 'Precipitation_mm', 'Min_temp', 'Max_temp', 'Mean_temp'], 'wind_rain' : ['Wind_m/s', 'Precipitation_mm'], 'onlytemp': ['Min_temp', 'Max_temp', 'Mean_temp']}

In [47]:
for i in range (8):
    for j in TargetsList:
        for k in FeaturesDict:
            numDaysAhead = 1
            numPrevDayData = i
            target = j
            features = FeaturesDict[k]
            makeDfForTestAndTrain(scaled_features_df, target, features, numDaysAhead, numPrevDayData).to_csv(f'preprocessed/{i}daysprev_{k}features_{j}.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[colName+str(i+1)]=df.shift(i+1)[colName]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[colName+str(i+1)]=df.shift(i+1)[colName]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[colName+str(i+1)]=df.shift(i+1)[colName]
A value is trying to be set on a copy of a slice from a DataFrame.
Try us