In [1]:
import pandas as pd
import numpy as np
from datetime import timedelta 
import matplotlib.pyplot as plt



In [6]:
configfile_name = 'Tmap2.conf'
raw_data_path = '../data/raw_data/preprocessed/preprocessed_10288451_.csv'
idx_table_path = '../data/raw_data/tsd_mapping.csv'
model_output_path  = f"../log/result_summary/{configfile_name}/prediction_dir/prediction.npz"

In [7]:
idx_table = pd.read_csv(idx_table_path)
idx_table =  idx_table.set_index('idx')['tsdlink_id'].to_dict()

In [9]:
raw_data = pd.read_csv(raw_data_path)
raw_data = raw_data.loc[raw_data.tsdlinkid == idx_table[0]]

In [10]:
raw_data.shape

(137664, 30)

In [11]:
raw_data[raw_data.real_con==4.0].shape

(51107, 30)

In [5]:
model_output = np.load(model_output_path)

In [6]:
start_index = model_output['time_index'][0][0]

In [7]:
raw_data = raw_data.iloc[start_index:-1][['periodtime_1m','real_con','prod_con','prod_tt','real_tt', 'pat_tt','dt','row_idx']]

In [8]:
model_output['class_output'].shape

(27130, 10)

In [9]:
output = model_output['class_output']
gt = model_output['ground_truth'].squeeze()

In [10]:
raw_data['pred_class'] = output[:,0]
raw_data['gt'] = gt[:,0]

In [11]:
def transform_npz(df):

    df.index = pd.to_datetime(df['periodtime_1m'].astype('str'), format='%Y%m%d%H%M')
    df['year'] = df.index.year
    df['month'] = df.index.month
    df['day'] = df.index.day
    df['hour'] = df.index.hour
    df.loc[:,'hour_cate'] = (df['hour']/4).astype(int)
    

    return df

In [12]:
merged_data = transform_npz(raw_data)

In [13]:
merged_data.index

DatetimeIndex(['2020-02-27 19:05:00', '2020-02-27 19:10:00',
               '2020-02-27 19:15:00', '2020-02-27 19:20:00',
               '2020-02-27 19:25:00', '2020-02-27 19:30:00',
               '2020-02-27 19:35:00', '2020-02-27 19:40:00',
               '2020-02-27 19:45:00', '2020-02-27 19:50:00',
               ...
               '2020-05-31 23:05:00', '2020-05-31 23:10:00',
               '2020-05-31 23:15:00', '2020-05-31 23:20:00',
               '2020-05-31 23:25:00', '2020-05-31 23:30:00',
               '2020-05-31 23:35:00', '2020-05-31 23:40:00',
               '2020-05-31 23:45:00', '2020-05-31 23:50:00'],
              dtype='datetime64[ns]', name='periodtime_1m', length=27130, freq=None)

In [14]:
merged_data['lagged_real_con'] = merged_data.real_con.shift(-1)
merged_data['lagged2_real_con'] = merged_data.real_con.shift(-2)


In [15]:
merged_data

Unnamed: 0_level_0,periodtime_1m,real_con,prod_con,prod_tt,real_tt,pat_tt,dt,row_idx,pred_class,gt,year,month,day,hour,hour_cate,lagged_real_con,lagged2_real_con
periodtime_1m,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
2020-02-27 19:05:00,202002271905,4.0,4.0,247.9200,376.1140,347.373930,20200227,110533,2,2.0,2020,2,27,19,4,4.0,4.0
2020-02-27 19:10:00,202002271910,4.0,4.0,295.6370,356.1670,307.176640,20200227,110534,2,2.0,2020,2,27,19,4,4.0,4.0
2020-02-27 19:15:00,202002271915,4.0,4.0,402.7870,309.8050,278.613830,20200227,110535,2,2.0,2020,2,27,19,4,4.0,4.0
2020-02-27 19:20:00,202002271920,4.0,4.0,342.1580,244.1630,265.741940,20200227,110536,2,2.0,2020,2,27,19,4,4.0,4.0
2020-02-27 19:25:00,202002271925,4.0,4.0,318.2130,216.6560,253.193470,20200227,110537,2,2.0,2020,2,27,19,4,4.0,4.0
2020-02-27 19:30:00,202002271930,4.0,4.0,240.1580,226.3960,229.363460,20200227,110538,2,2.0,2020,2,27,19,4,4.0,2.0
2020-02-27 19:35:00,202002271935,4.0,4.0,198.9640,188.4370,206.172440,20200227,110539,2,2.0,2020,2,27,19,4,2.0,1.0
2020-02-27 19:40:00,202002271940,2.0,4.0,203.2740,148.2180,175.357650,20200227,110540,2,0.0,2020,2,27,19,4,1.0,1.0
2020-02-27 19:45:00,202002271945,1.0,4.0,192.1150,73.1262,158.586530,20200227,110541,2,0.0,2020,2,27,19,4,1.0,1.0
2020-02-27 19:50:00,202002271950,1.0,2.0,105.3200,70.5932,151.636540,20200227,110542,2,0.0,2020,2,27,19,4,1.0,1.0


In [16]:
def find_change_point(df):
    df['change_point'] = np.where((df.real_con ==4.0) & (df.prod_con==1.0),1,0)
    tmp_df = df.loc[df.change_point==1]
    tmp_df['correcto'] = np.where(tmp_df.pred_class == 2,1,0)
    return tmp_df

In [35]:
def find_change_point(df):
    df['change_point'] = np.where((df.real_con !=4.0) & ((df.lagged_real_con == 4.0) | (df.lagged2_real_con == 4.0)),1,0)
    tmp_df = df.loc[df.change_point==1]
    tmp_df['correcto'] = np.where(tmp_df.pred_class == 2,1,0)
    return tmp_df
    
    

In [16]:
def find_change_point2(df):
    df['change_point'] = np.where((df.real_con ==4.0) & ((df.lagged_real_con != 4.0) | (df.lagged2_real_con != 4.0)),1,0)
    tmp_df = df.loc[df.change_point==1]
    tmp_df['correcto'] = np.where(tmp_df.pred_class == 2,1,0)
    return tmp_df

In [17]:
final_data =find_change_point2(merged_data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [18]:
final_data

Unnamed: 0_level_0,periodtime_1m,real_con,prod_con,prod_tt,real_tt,pat_tt,dt,row_idx,pred_class,gt,year,month,day,hour,hour_cate,lagged_real_con,lagged2_real_con,change_point,correcto
periodtime_1m,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2020-02-27 19:30:00,202002271930,4.0,4.0,240.1580,226.396,229.363460,20200227,110538,2,2.0,2020,2,27,19,4,4.0,2.0,1,1
2020-02-27 19:35:00,202002271935,4.0,4.0,198.9640,188.437,206.172440,20200227,110539,2,2.0,2020,2,27,19,4,2.0,1.0,1,1
2020-02-28 01:40:00,202002280140,4.0,1.0,69.8182,169.403,77.810356,20200228,110612,0,2.0,2020,2,28,1,0,1.0,1.0,1,0
2020-02-28 08:50:00,202002280850,4.0,4.0,289.9670,159.165,303.727230,20200228,110698,2,2.0,2020,2,28,8,2,4.0,1.0,1,1
2020-02-28 08:55:00,202002280855,4.0,4.0,210.5670,159.778,286.625400,20200228,110699,2,2.0,2020,2,28,8,2,1.0,1.0,1,1
2020-02-28 19:50:00,202002281950,4.0,4.0,345.7560,241.221,235.072920,20200228,110830,2,2.0,2020,2,28,19,4,4.0,2.0,1,1
2020-02-28 19:55:00,202002281955,4.0,4.0,323.9690,160.870,198.210540,20200228,110831,2,2.0,2020,2,28,19,4,2.0,2.0,1,1
2020-02-28 21:05:00,202002282105,4.0,4.0,274.3060,230.907,227.548680,20200228,110845,2,2.0,2020,2,28,21,5,4.0,2.0,1,1
2020-02-28 21:10:00,202002282110,4.0,4.0,274.3200,201.748,205.091600,20200228,110846,2,2.0,2020,2,28,21,5,2.0,1.0,1,1
2020-03-01 03:30:00,202003010330,4.0,1.0,69.8182,178.128,72.403650,20200301,111210,0,2.0,2020,3,1,3,0,1.0,1.0,1,0


In [19]:
wrong_pred = final_data.loc[final_data.correcto !=1]

In [20]:
right_pred = final_data.loc[final_data.correcto ==1]

In [21]:
wrong_pred.shape

(55, 19)

In [22]:
right_pred.shape

(455, 19)

In [23]:
per1 = 0
for i in range(wrong_pred.shape[0]):
    time = wrong_pred.index[i]+timedelta(minutes = 5)
    if merged_data.loc[time].pred_class ==2:
        per1 +=1
print(per1)

22


In [24]:
per2 = 0
for i in range(wrong_pred.shape[0]):
    time = wrong_pred.index[i]+timedelta(minutes = 10)
    if merged_data.loc[time].pred_class ==2:
        per2 +=1
print(per2)

31


In [26]:
wrong_pred.shape

(55, 19)

In [33]:
right_pred = final_data.loc[final_data.correcto ==1]

In [34]:
right_pred.shape

(139, 19)

In [64]:
raw_data.head()

Unnamed: 0_level_0,periodtime_1m,real_con,prod_con,prod_tt,real_tt,pat_tt,dt,row_idx,pred_class,gt,year,month,day,hour,hour_cate,lagged_real_con,lagged2_real_con,change_point
periodtime_1m,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2020-02-27 19:05:00,202002271905,4.0,4.0,247.92,376.114,347.37393,20200227,110533,2,2.0,2020,2,27,19,4,4.0,4.0,0
2020-02-27 19:10:00,202002271910,4.0,4.0,295.637,356.167,307.17664,20200227,110534,2,2.0,2020,2,27,19,4,4.0,4.0,0
2020-02-27 19:15:00,202002271915,4.0,4.0,402.787,309.805,278.61383,20200227,110535,2,2.0,2020,2,27,19,4,4.0,4.0,0
2020-02-27 19:20:00,202002271920,4.0,4.0,342.158,244.163,265.74194,20200227,110536,2,2.0,2020,2,27,19,4,4.0,4.0,0
2020-02-27 19:25:00,202002271925,4.0,4.0,318.213,216.656,253.19347,20200227,110537,2,2.0,2020,2,27,19,4,4.0,4.0,0


In [29]:
#ax = plt.gca()

for i in range(right_pred.shape[0]):
    
    start_tm = right_pred.index[i] - timedelta(minutes = 30)
    end_tm = right_pred.index[i] + timedelta(minutes = 30)
    year = right_pred.iloc[i].year
    month = right_pred.iloc[i].month
    day =  right_pred.iloc[i].day
    hour =  right_pred.iloc[i].hour
    minutes = str(int(right_pred.iloc[i].periodtime_1m))[-2:]
    #print(minutes)
    file_name = str(int(year)) + '_' + str(int(month))  + '_' + str(int(day)) +  '_' + str(int(hour)) + '-' + str(minutes)
    
    y_lim = raw_data[start_tm : end_tm][['prod_tt','real_tt','pat_tt']].max(axis=1).max(axis=0)
    #print('y_lim',y_lim)
    ax = raw_data[start_tm : end_tm][['pat_tt']].plot(style = '.--',color = 'black',ylim = (0,y_lim))
    raw_data[start_tm : end_tm][['prod_tt']].plot(ax =ax, style = '.--',color = 'red')
    raw_data[start_tm : end_tm][['real_tt']].plot(ax =ax, style = '.--',color = 'blue')
    ax.vlines(right_pred.index[i] ,ymin=0,ymax=100, label='test')
    #label_list = [
    #(wrong_pred.iloc[i].index - timedelta(minutes=5), 'decision_time', 'r')]
   
    #raw_data
    #plt.plot(kind = 'scatter', x = wrong_pred.iloc[i].index, y = wrong_pred.iloc[i].real_tt, ax = ax)
    #wrong_pred.iloc[i].prod_tt.plot(ax= ax, style = '.--',color ='red')
    
    fig = ax.get_figure() 
    #print(file_name)
    fig.savefig('./plot2/right_pred/' + file_name)
    plt.close(fig)
    plt.clf()

<Figure size 432x288 with 0 Axes>

In [13]:
date_range = list(set(raw_data.dt))
idx_mapping = {b:a for a,b in zip(raw_data.index, raw_data.row_idx )}
idx_inv_mapping = {a:b for a,b in zip(raw_data.index, raw_data.row_idx )}


In [None]:

plt.rcParams["figure.dpi"] = 72*3
def plot_data(data,mode):
    assert mode in ['train', 'test', 'val'], 'invalid mode'
    st_tm, ed_tm = extract_st_ed(mode)
    tm = st_tm   
    folder_path = '../plot/' 
    if os.path.exists(folder_path):
        shutil.rmtree(folder_path)
    else:
        pathlib.Path(folder_path).mkdir(parents = True, exist_ok=True)

    year_list = list(set(data.year))
    month_list = list(set(data.month))
    day_list =  list(set(data.day))
    hour_list =  list(set(data.hour))
    hour_cate_list =  list(set(data.hour_cate))

    #data['pat_tt'] =  data[tsd].loc[prediction_dict[tsd].index]['pat_tt']
    #data['prod_tt'] =  data[tsd].loc[prediction_dict[tsd].index]['prod_tt']
    #data['real_tt'] =  data[tsd].loc[prediction_dict[tsd].index]['real_tt']
    for year in year_list:

        for month in month_list:

            for day in day_list:


                    if data[(data.year==year) & (data.month==month) & (data.day==day)].empty:

                        continue

                    max_real = max(data[(data.year==year) & (data.month==month) & (data.day==day)].real_tt)
                    max_prod = max(data[(data.year==year) & (data.month==month) & (data.day==day)].prod_tt)
                    max_pat =  max(data[(data.year==year) & (data.month==month) & (data.day==day)].pat_tt)
                    y_lim =(0, max(max_real, max_prod, max_pat))

                    for hour_cate in hour_cate_list:

                        plot_data = data[(data.year==year) & (data.month==month) & (data.day==day) & (data.hour_cate == hour_cate) ]
                        if plot_data.empty:
                            continue
                        
                        file_name = str(tsd) + '__' + str(year) + '__' + str(month)  + '__' + str(day) +  '__' +  str(hour_cate)


                        #ax = plot_data[['pat_tt','prod_tt','real_tt']].plot(ylim = y_lim, style = '.--' )    
                        ax = plot_data[['pat_tt']].plot(ylim = y_lim, style = '.-', color = 'blue' ,figsize=(15,7))    
                        plot_data[['prod_tt']].plot(ax = ax , style = '.-' , color ='red')
                        plot_data[['real_tt']].plot(ax = ax  , style = '.-' , color ='black')

                        #ax2 = plot_data[['pat_tt','prod_tt','real_tt']].plot(ylim = y_lim, style = '.--' )      
                        #plot_data[['tic_0']].plot(ax= ax, style = '.--', ylim = y_lim)
                        #plot_data[['tic_1']].shift(1).plot(ax= ax, style = '.--', ylim = y_lim)
                        #plot_data[['tic_2']].shift(2).plot(ax= ax, style = '.--', ylim = y_lim)

                        #plot_data[['tic_0', 'tic_1', 'tic_2']].plot(ax= ax, style = '.--', ylim = y_lim)

                        #plot_data[['tic_3', 'tic_4', 'tic_5']].plot(ax= ax2)

                        fig = ax.get_figure()                                           
                        #fig2 = ax2.get_figure()
                        fig.savefig(folder_path + '/{}'.format(file_name + '_' + 'former'))
                        #fig2.savefig('../data/gcn/express/plot/{}'.format(file_name + '_' + 'latter'))
                        plt.close(fig)
                        #plt.close(fig2)
                        plt.clf()
            