In [1]:
#Kernel fastai
import pandas as pd
import tsfresh as ts
import numpy as np
from imblearn.ensemble import BalancedRandomForestClassifier

In [None]:
df = pd.read_csv('E:/Data/OAPMLData/V1.1_CAIC_UAC_NWAC_FeaturesAsTimeSeries20131101To20180430.csv', parse_dates=['__fileDate'], low_memory=False)

In [None]:
df.head()

In [None]:
def identify_season(row):
    """Return the winter-season label for a row's ``__fileDate``.

    A season runs from November of one year through April of the next and is
    labelled with the two-digit start and end years (for example ``'13-14'``).
    Only the seasons starting in 2013 through 2017 are recognised.

    Unlike the earlier version, dates in 2013 before November are no longer
    folded into ``'13-14'``; they are treated like every other out-of-season
    date.

    Parameters
    ----------
    row : mapping-like
        Anything indexable by ``'__fileDate'`` whose value exposes ``.year``
        and ``.month`` (e.g. a DataFrame row passed by ``df.apply(axis=1)``).

    Returns
    -------
    str
        The season label, or ``'unknown season'`` when the date falls in
        May-October or outside the recognised seasons.
    """
    season_by_start_year = {
        2013: '13-14',
        2014: '14-15',
        2015: '15-16',
        2016: '16-17',
        2017: '17-18',
    }

    file_date = row['__fileDate']
    year, month = file_date.year, file_date.month

    if month > 10:
        # November/December belong to the season that starts this year.
        start_year = year
    elif month < 5:
        # January-April belong to the season that started the previous year.
        start_year = year - 1
    else:
        # May-October (and missing dates, whose comparisons are all False).
        return 'unknown season'

    return season_by_start_year.get(start_year, 'unknown season')

In [None]:
# Label every row with its season by calling identify_season once per row (axis=1).
df['Season'] = df.apply(identify_season, axis=1)

In [None]:
# Clean missing values: -9999 is replaced by NaN so it can be interpolated.
mappingMissingValues = {-9999: np.nan}

df = df.replace(mappingMissingValues)

# Interpolate each (Season, Lat, Lon) series on its own so values never bleed
# across locations or seasons. A single groupby pass replaces the previous
# per-key boolean scan of the whole frame (and no longer shadows the builtin
# `filter`). sort=False keeps the groups in first-appearance order, which is
# the order drop_duplicates() produced before.
listOfFrames = [
    group.interpolate(method='linear', axis=0).ffill().bfill()
    for _, group in df.groupby(['Season', 'Lat', 'Lon'], sort=False)
]

interpolated = pd.concat(listOfFrames)

# Fill any remaining NaN with 0, as that datapoint was probably missing for a
# long period of time (all of the season for the lat/lon).
interpolated = interpolated.fillna(0)

df = interpolated


In [None]:
df.head()

In [None]:
# Columns fed to the time-series feature extraction.
input_cols = ['APCPSurface', 'MaxTempSurfaceF',
              'MinTempSurfaceF', 'AvgTempSurfaceF', 'MaxTemp2mAboveGroundF',
              'MinTemp2mAboveGroundF', 'AvgTemp2mAboveGroundF',
              'MaxTemp80mAboveGroundF', 'MinTemp80mAboveGroundF',
              'AvgTemp80mAboveGroundF', 'MaxTempTropF', 'MinTempTropF',
              'AvgTempTropF', 'AvgRH2mAboveGround', 'AvgWindDirection10m',
              'AvgWindDirection80m', 'AvgWindDirectionTrop', 'AvgWindSpeed10m',
              'MaxWindSpeed10m', 'AvgWindSpeed80m', 'MaxWindSpeed80m',
              'AvgWindSpeedTrop', 'MaxWindSpeedTrop',
              'SnowWaterEquivalentIn', 'PrecipIncrementSnowIn',
              'PrecipitationAccumulation', 'SnowDepthIn', 'TempMinF', 'TempMaxF',
              'TempAveF', 'SNOWDAS_SnowDepth_mm', 'SNOWDAS_SWE_mm',
              'SNOWDAS_SnowmeltRunoff_micromm', 'SNOWDAS_Sublimation_micromm',
              'SNOWDAS_SublimationBlowing_micromm',
              'SNOWDAS_SolidPrecip_kgpersquarem',
              'SNOWDAS_LiquidPrecip_kgpersquarem', 'SNOWDAS_SnowpackAveTemp_k',
              'c_IsCoastalSnowpack', 'c_IsContenentalSnowpack']
# Label columns (one danger rating per elevation band).
output_cols = ['o_Day1DangerAboveTreeline', 'o_Day1DangerNearTreeline', 'o_Day1DangerBelowTreeline']
# Columns carried alongside the labels; the two c_Is* flags also appear in input_cols.
metadata_cols = ['Lat', 'Lon', '__fileDate', 'c_IsCoastalSnowpack', 'c_IsContenentalSnowpack', 'UnifiedRegion']

# Take explicit copies: later cells add 'id'/'date' columns to df_X and replace
# the index of df_y, which on a plain slice of df is chained assignment
# (SettingWithCopyWarning) and leaves it unclear whether df is affected.
df_X = df[input_cols].copy()
df_y = df[output_cols + metadata_cols].copy()

In [None]:
# Series id = str(int(Lat*100000)) + str(int(Lon*-100000)) + Season, joined with no
# separator. astype(int) truncates the scaled coordinates rather than rounding them.
# NOTE(review): this id is embedded in the cached file names written further down
# ('ext_<id>.pkl' / 'y_<id>.pkl'), so changing the format would orphan existing caches.
df_X['id'] = (df['Lat']*100000).astype(int).apply(str) + (df['Lon']*-100000).astype(int).apply(str) + df['Season']

In [None]:
df_X['date'] = df['__fileDate']

In [None]:
df_X = df_X.reset_index(drop=True)

In [None]:
df_y = df_y.reset_index(drop=True)

In [None]:
df_y.index = df_X['id']

In [None]:
df_X[df_X['id']=='45325031217681113-14'].head()

In [None]:
df_X.to_pickle('E:/Temp/ts_df_X.pkl')
df_y.to_pickle('E:/Temp/ts_df_y.pkl')

In [None]:
df_X = pd.read_pickle('E:/Temp/ts_df_X.pkl')
df_y = pd.read_pickle('E:/Temp/ts_df_y.pkl')

In [None]:
#debug
unique_ids = df_X['id'].unique()
tmp = ts.utilities.dataframe_functions.roll_time_series(df_X[df_X['id']==unique_ids[0]], column_id='id', column_sort='date', column_kind=None, rolling_direction=1)


In [None]:
#debug
tmp.head()

In [None]:
#debug
tmp_y = df_y[df_y.index==unique_ids[0]]
tmp_y.index = tmp_y['__fileDate'].astype(str) + '-' + unique_ids[0]

In [None]:
tmp_y.head()

In [None]:
tmp.to_csv("testroll.csv")

In [None]:
# For each series id: roll the time series, extract tsfresh features, re-key both the
# features and the labels, and cache them as per-id pickle files.
unique_ids = df_X['id'].unique()
# NOTE(review): result_x / result_y are never filled (the appends below are commented out).
result_x = []
result_y = []
count = 0

# NOTE(review): with count = 0 the slice unique_ids[count:1] covers only the first id.
# Presumably the bounds were edited by hand to resume or limit a run - confirm the
# intended range before relying on this cell to process every id.
for i in unique_ids[count:1]:
    # Progress is printed before the increment, so the first line reads "Iteration: 0".
    print('Iteration: ' + str(count) + ' of ' + str(len(unique_ids)))
    count += 1
    tmp = ts.utilities.dataframe_functions.roll_time_series(df_X[df_X['id']==i], column_id='id', column_sort='date', column_kind=None, rolling_direction=1)
    tmp_ext = ts.extract_features(tmp, column_id='id', column_sort='date', n_jobs=8, chunksize=400, show_warnings=False)
    # Suffix the feature index with the series id. The label index below is built as
    # '<__fileDate>-<id>'; presumably the two are meant to match - TODO confirm the
    # formatted feature index has the same date representation.
    tmp_ext.index = pd.Series(tmp_ext.index.format()) + '-' + i
    # df_y is indexed by series id, so this selects the labels for the current id.
    tmp_y = df_y[df_y.index==i]
    tmp_y.index = tmp_y['__fileDate'].astype(str) + '-' + i
    tmp_ext.to_pickle('E:/Temp/time-series/ext_' + i + '.pkl')
    tmp_y.to_pickle('E:/Temp/time-series/y_' + i + '.pkl')
    #result_x.append(tmp_ext)
    #result_y.append(tmp_y)

In [None]:
def read_and_impute(f, directory='E:/Temp/time-series/'):
    """Load one pickled feature frame and impute its missing values.

    Parameters
    ----------
    f : str
        File name (not a full path) of the pickle to load.
    directory : str, optional
        Directory prefix that ``f`` is appended to. Defaults to the location
        the extraction loop writes to.

    Returns
    -------
    pandas.DataFrame or None
        The frame returned by tsfresh's ``impute``, or ``None`` when loading or
        imputing fails. Failures are reported on stdout and deliberately not
        re-raised so that one bad file does not abort a parallel batch;
        ``pd.concat`` skips ``None`` entries.
    """
    try:
        # Only unpickle files produced by this notebook: pickle can execute
        # arbitrary code when loading untrusted data.
        frame = pd.read_pickle(directory + f)
        return ts.utilities.dataframe_functions.impute(frame)
    except Exception as exc:
        # `except Exception` (rather than a bare except) lets KeyboardInterrupt
        # and SystemExit propagate.
        print('Exception on file ' + f + ': ' + repr(exc))
        return None
        

In [None]:
%%time
from joblib import Parallel, delayed
from os import listdir

# File-name suffixes, one per season; cached file names end with '<season>.pkl'.
file = ['13-14.pkl', '14-15.pkl', '15-16.pkl', '16-17.pkl', '17-18.pkl']
files = []
# Index into `file` selecting the season to load; changed by hand between runs
# (the loop over all seasons is commented out). Later cells reuse `file[i]`.
i = 4
#for i in range(0, len(file)):
# Feature files for the selected season.
ext_files = [f for f in listdir('E:/Temp/time-series/') if f.startswith('ext_') and (f.endswith(file[i]))]
# Load and impute the files in parallel; read_and_impute yields None for files that fail.
files = Parallel(n_jobs=10)(delayed(read_and_impute)(f) for f in ext_files)

In [None]:
ext_df = pd.concat(files)

In [None]:
del files

In [None]:
%%time
# Label files for the season selected by `file[i]`.
y_files = [f for f in listdir('E:/Temp/time-series/') if f.startswith('y_')  and (f.endswith(file[i]))]
yfiles = []
for f in y_files:
    try:
        yfiles.append(pd.read_pickle('E:/Temp/time-series/' + f))
    except Exception as exc:
        # Best effort: skip unreadable files but say why. `except Exception`
        # (not a bare except) keeps Ctrl-C / kernel interrupts working.
        print('Exception on file ' + f + ': ' + repr(exc))

y_df = pd.concat(yfiles)

In [None]:
#%%time
#TODO: convert to DASK
#from os import listdir
#year = ['14', '15', '16', '17', '18']
#file = ['13-14.pkl', '14-15.pkl', '15-16.pkl', '16-17.pkl', '17-18.pkl']

#i = 0
#for i in range(0, len(file)):
#ext_files = [f for f in listdir('E:/Temp/time-series/') if f.startswith('ext_') and (f.endswith(file[i]))] # or f.endswith('16.pkl'))]
#y_files = [f for f in listdir('E:/Temp/time-series/') if f.startswith('y_')  and (f.endswith(file[i]))] # or f.endswith('16.pkl'))]
#files = []
#for f in ext_files:
#    try:        
#        files.append(ts.utilities.dataframe_functions.impute(pd.read_pickle('E:/Temp/time-series/' + f)))
#    except Exception:
#        print('Exception on file ' + f)

#ext_df = pd.concat(files)
#yfiles = []
#for f in y_files:
#    yfiles.append(pd.read_pickle('E:/Temp/time-series/' + f))

#y_df = pd.concat(yfiles)
 

In [None]:
len(ext_files)

In [None]:
y_df.shape

In [None]:
ext_df.head()

In [None]:
y_df.head()

In [None]:
y_df['o_Day1DangerAboveTreeline'].value_counts()

In [None]:
del yfiles

In [None]:
# Put features and labels in the same index order.
ext_df = ext_df.sort_index()
y_df = y_df.sort_index()

# Keep only the above-treeline rating, dropping 'Extreme', 'no-data', 0 and missing values.
danger_above = y_df['o_Day1DangerAboveTreeline']
keep_mask = (
    (danger_above != 'Extreme')
    & (danger_above != 'no-data')
    & (danger_above != 0)
)
y_df = danger_above[keep_mask].dropna()

# Restrict the feature rows to the labels that survived the filter.
ext_df = ext_df.reindex(y_df.index)

In [None]:
y_df.value_counts()

In [None]:
# Forward slashes: '\T' and '\d' in a normal string literal are invalid escape
# sequences (a warning today, an error in future Python versions). The cells
# that read these files back already use 'E:/Temp/...'.
ext_df.to_parquet('E:/Temp/df_rolled_x_17-18.par')
y_df.to_pickle('E:/Temp/df_rolled_y_17-18.pkl')

In [None]:
# Forward slashes avoid the invalid '\T' / '\d' escape sequences and match the
# path style used by the other cells.
ext_df = pd.read_parquet('E:/Temp/df_rolled_x_14-15.par')
y_df = pd.read_pickle('E:/Temp/df_rolled_y_14-15.pkl')

In [None]:
y_df.value_counts()

In [None]:
#y_df_sampled = y_df.sample(frac=.25)

In [None]:
#del ext_df_samples
#del ext_df_filtered

In [None]:
from imblearn.under_sampling import RandomUnderSampler
rus = RandomUnderSampler(random_state=0)
X_resampled, y_resampled = rus.fit_resample(ext_df, y_df)

In [None]:
pd.Series(y_resampled).value_counts()

In [None]:
#ext_df_samples = ext_df.reindex(y_df_sampled.index)

In [None]:
X_resampled = pd.DataFrame(X_resampled, columns=ext_df.columns)
#y_resampled = pd.Series(y_resampled, ec)

In [None]:
ext_df_filtered = ts.select_features(X_resampled, y_resampled, n_jobs=8)

In [None]:
ext_df_filtered.shape

In [None]:
# Save the selected feature names for this season.
# - The file name is spelled 'features' (it was 'featrues'), which is the name the
#   cell that reads the per-season lists back expects.
# - header=False keeps the file as plain "index,name" rows; the reader uses
#   header=None and takes column 1, so a header row would show up as a bogus feature.
pd.Series(ext_df_filtered.columns).to_csv("E:/Temp/17-18tsfreshfeatures.csv", header=False)

In [None]:
# Per-season lists of selected feature names, one CSV per season.
files = ['13-14tsfreshfeatures.csv','14-15tsfreshfeatures.csv','15-16tsfreshfeatures.csv','16-17tsfreshfeatures.csv','17-18tsfreshfeatures.csv']
dfs = []
for f in files:
    # The files are read without a header row; column 1 is taken as the feature names
    # (column 0 is presumably the index written by Series.to_csv - verify against the writer).
    dfs.append(pd.read_csv('E:/Temp/' + f, header=None)[1])



In [None]:
dfs = pd.concat(dfs)

In [None]:
columns = dfs.unique()

In [None]:
files_X = ['df_rolled_x_13-14.par','df_rolled_x_14-15.par','df_rolled_x_15-16.par','df_rolled_x_16-17.par','df_rolled_x_17-18.par']

files_y = ['df_rolled_y_13-14.pkl','df_rolled_y_14-15.pkl','df_rolled_y_15-16.pkl','df_rolled_y_16-17.pkl','df_rolled_y_17-18.pkl']


In [None]:
from imblearn.under_sampling import RandomUnderSampler
# Undersampler with a fixed seed; reused by the following cell.
rus = RandomUnderSampler(random_state=0)
#X_resampled, y_resampled = rus.fit_resample(ext_df, y_df)

# Accumulators for the undersampled features / labels of each season processed.
dfs_X = []
dfs_y = []

#for f_i in range(0,len(files_X)):
# NOTE(review): the loop over all seasons is commented out; f_i is set by hand, and only
# this cell (f_i = 0) and the next one (f_i = 3) append to the accumulators as written.
f_i = 0
# Load only the selected feature columns for this season.
tmp_X = pd.read_parquet('E:/Temp/' + files_X[f_i], columns=columns)
tmp_y = pd.read_pickle('E:/Temp/' + files_y[f_i])
tmp_X_resampled, tmp_y_resampled = rus.fit_resample(tmp_X, tmp_y)
dfs_X.append(tmp_X_resampled)
dfs_y.append(tmp_y_resampled)
# Drop the full-season frames once the undersampled copies have been stored.
del tmp_X
del tmp_y
    

In [None]:
# Same steps as the previous cell, for the season at index 3 of files_X / files_y.
# NOTE(review): this is a hand-edited copy of the cell above; re-running it appends to
# dfs_X / dfs_y again. A helper function plus a loop would make this repeatable.
f_i = 3
tmp_X = pd.read_parquet('E:/Temp/' + files_X[f_i], columns=columns)
tmp_y = pd.read_pickle('E:/Temp/' + files_y[f_i])
tmp_X_resampled, tmp_y_resampled = rus.fit_resample(tmp_X, tmp_y)
dfs_X.append(tmp_X_resampled)
dfs_y.append(tmp_y_resampled)
del tmp_X
del tmp_y

In [None]:
pdfs_X = [pd.DataFrame(i, columns=columns) for i in dfs_X]
pdfs_y = [pd.Series(i) for i in dfs_y]

In [None]:
tmp_filtered = pd.concat(pdfs_X)
df_rolled_y_sampled2 = pd.concat(pdfs_y)

In [None]:
tmp_filtered.shape

In [None]:
df_rolled_y_sampled2.shape

In [None]:
f_i = 4
df_rolled_x_sampled_test = pd.read_parquet('E:/Temp/' + files_X[f_i], columns=columns)
df_rolled_y_sampled2_test = pd.read_pickle('E:/Temp/' + files_y[f_i])

In [None]:
#df_rolled_x.to_pickle('E:\Temp\df_rolled_x2016.pkl')
#df_rolled_y.to_pickle('E:\Temp\df_rolled_y2016.pkl')

In [None]:
#df_rolled_x_sampled = df_rolled_x.sample(frac=.2)

In [None]:
#df_rolled_x_sampled_test = df_rolled_x.sample(frac=.2)

In [None]:
#df_rolled_x_sampled.shape

In [None]:
#df_rolled_y_sampled_test = df_rolled_y.reindex(df_rolled_x_sampled_test.index)

In [None]:
#del df_rolled_x
#del df_rolled_y

In [None]:
#del ext_df
#del y_df

In [None]:
#tmp = []
#num_cols = 1000
#for i in range(0, int(len(df_rolled_x.columns)), num_cols):
#    impute_me = df_rolled_x.iloc[:, i:i+num_cols].copy()
    
#    tmp.append(impute_me.reindex(df_rolled_y_sampled2_test.index))

In [None]:
#del df_rolled_x
#del ext_df

In [None]:
#tmp2 = pd.concat(tmp, axis=1)

In [None]:
#del tmp

In [None]:
#tmp2.shape

In [None]:
# Load per-season pickles: the first four seasons are concatenated as the training
# set and 17-18 is loaded as the test set.
# NOTE(review): this cell rebinds names that earlier cells also assign - files_y,
# df_rolled_y_sampled2, df_rolled_x_sampled_test and df_rolled_y_sampled2_test.
# The training cell below fits tmp_filtered (built from the undersampled parquet data)
# against df_rolled_y_sampled2, which after this cell holds these concatenated pickle
# labels instead of the undersampled ones. Confirm whether this cell should still run.
# NOTE(review): no visible cell writes 'df_rolled_x_*.pkl' files (features are saved as
# '.par'); these may come from an older version of the notebook.
files_x = ['df_rolled_x_13-14.pkl','df_rolled_x_14-15.pkl','df_rolled_x_15-16.pkl', 'df_rolled_x_16-17.pkl']
files_y = ['df_rolled_y_13-14.pkl','df_rolled_y_14-15.pkl','df_rolled_y_15-16.pkl','df_rolled_y_16-17.pkl']
files_array_x = []
files_array_y = []
for f in files_x:
    files_array_x.append(pd.read_pickle('E:/Temp/' + f ))

for f in files_y:
    files_array_y.append(pd.read_pickle('E:/Temp/' + f ))

df_rolled_x_sampled = pd.concat(files_array_x)
df_rolled_y_sampled2 = pd.concat(files_array_y)


df_rolled_x_sampled_test = pd.read_pickle('E:/Temp/df_rolled_x_17-18.pkl')
df_rolled_y_sampled2_test = pd.read_pickle('E:/Temp/df_rolled_y_17-18.pkl')

In [None]:
del files_array_x
del files_array_y

In [None]:
df_rolled_x_sampled.head()

In [None]:
df_rolled_y_sampled2.value_counts()

In [None]:
df_rolled_x_sampled_test.shape

In [None]:
df_rolled_y_sampled2_test.shape

In [None]:
tmp_filtered.head()

In [None]:
del df_rolled_x_sampled

In [None]:
# Forward slashes avoid the invalid '\T' / '\d' escape sequences and match the
# path style used by the other cells.
tmp_filtered.to_pickle('E:/Temp/df_rolled_x_selected_features.pkl')

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Baseline random forest on the selected features.
# random_state is fixed (same seed as the undersampler) so that the fitted forest,
# its scores and the feature importances derived from it are reproducible.
m = RandomForestClassifier(n_jobs = 4,
                           #oob_score=True,
                           n_estimators=50,
                           #max_features="sqrt",
                           min_samples_leaf=100,
                           random_state=0
                           )
m.fit(tmp_filtered.values, df_rolled_y_sampled2.values.ravel())
# Training-set accuracy (displayed as the cell output).
m.score(tmp_filtered.values, df_rolled_y_sampled2.values.ravel())

In [None]:
m.score(df_rolled_x_sampled_test.values, df_rolled_y_sampled2_test.values.ravel())

In [None]:
import error_evaluation
import matplotlib
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
preds = m.predict(df_rolled_x_sampled_test.values)

# Row/column order of the confusion matrix. Without `labels=`, confusion_matrix orders
# classes by sorted label value, which for these names would be alphabetical and would
# not match the tick labels passed to the plots below.
# Assumes the targets are exactly these strings - TODO confirm against y.value_counts().
danger_classes = ['Low', 'Moderate', 'Considerable', 'High']

error_evaluation.evaluateSingleClassShort(df_rolled_y_sampled2_test, preds)
cnf_matrix = confusion_matrix(df_rolled_y_sampled2_test, preds, labels=danger_classes)
plt.figure()
error_evaluation.plot_confusion_matrix(cnf_matrix, classes=danger_classes, normalize=True,
                      title='Normalized confusion matrix')

plt.figure()
error_evaluation.plot_confusion_matrix(cnf_matrix, classes=danger_classes,
                      title='Confusion matrix, without normalization')

In [None]:
preds = m.predict(tmp_filtered.values)

# Row/column order of the confusion matrix. Without `labels=`, confusion_matrix orders
# classes by sorted label value, which for these names would be alphabetical and would
# not match the tick labels passed to the plots below.
# Assumes the targets are exactly these strings - TODO confirm against y.value_counts().
danger_classes = ['Low', 'Moderate', 'Considerable', 'High']

error_evaluation.evaluateSingleClassShort(df_rolled_y_sampled2, preds)
cnf_matrix = confusion_matrix(df_rolled_y_sampled2, preds, labels=danger_classes)
plt.figure()
error_evaluation.plot_confusion_matrix(cnf_matrix, classes=danger_classes, normalize=True,
                      title='Normalized confusion matrix')

plt.figure()
error_evaluation.plot_confusion_matrix(cnf_matrix, classes=danger_classes,
                      title='Confusion matrix, without normalization')

In [None]:
# Two-column table pairing each training column with its importance in the fitted model.
importances = pd.Series(m.feature_importances_)
out = pd.DataFrame({
    'Feature': pd.Series(tmp_filtered.columns),
    'Score': importances,
})

In [None]:
important_features = out.sort_values(by=['Score'], ascending=False)[:1000]['Feature']

In [None]:
pd.Series(important_features).to_csv('E:/Temp/top1000tsfreshfeatures.csv', index=False)

In [None]:
important_features[:20]

In [None]:
# Balanced random forest restricted to the top-ranked features.
# random_state is fixed so that the fitted forest and its scores are reproducible.
m = BalancedRandomForestClassifier(n_jobs = 4,
                           #oob_score=True,
                           n_estimators=50,
                           #max_features="sqrt",
                           min_samples_leaf=100,
                           random_state=0
                           )
m.fit(tmp_filtered[important_features].values, df_rolled_y_sampled2.values.ravel())
# Training-set accuracy (displayed as the cell output).
m.score(tmp_filtered[important_features].values, df_rolled_y_sampled2.values.ravel())

In [None]:
preds = m.predict(df_rolled_x_sampled_test[important_features].values)

# Row/column order of the confusion matrix. Without `labels=`, confusion_matrix orders
# classes by sorted label value, which for these names would be alphabetical and would
# not match the tick labels passed to the plots below.
# Assumes the targets are exactly these strings - TODO confirm against y.value_counts().
danger_classes = ['Low', 'Moderate', 'Considerable', 'High']

error_evaluation.evaluateSingleClassShort(df_rolled_y_sampled2_test, preds)
cnf_matrix = confusion_matrix(df_rolled_y_sampled2_test, preds, labels=danger_classes)
plt.figure()
error_evaluation.plot_confusion_matrix(cnf_matrix, classes=danger_classes, normalize=True,
                      title='Normalized confusion matrix')

plt.figure()
error_evaluation.plot_confusion_matrix(cnf_matrix, classes=danger_classes,
                      title='Confusion matrix, without normalization')

In [3]:
import DataPrep
import error_evaluation
import pandas as pd
# Build the train/test splits for the three elevation bands via the project's DataPrep helper.
# The unpacking order below is exactly the order the call's results were unpacked in before.
dp = DataPrep.DataPrep()
(
    X_Above_test, X_Above_train, y_Above_test, y_Above_train,
    X_Near_test, X_Near_train, y_Near_test, y_Near_train,
    X_Below_test, X_Below_train, y_Below_test, y_Below_train,
) = dp.prep_day1_danger_train_test(
    input_file='E:/Data/OAPMLData/V1.1CIAC_UAC_NWAC_FeaturesWithLabels30Days20131201To20180430.csv',
    ignore_extreme=True,
    only_critical_points=False,
    oversample=False,
    only_precise_points = False,
    label_critical_points=True,
)


ModuleNotFoundError: No module named 'keras'