In [52]:
import pandas as pd
import numpy as np
import os

## Time series data transformation for use in the 1D CNN model for event type prediction

In [53]:
# Location of the raw CSV, resolved against the current working directory
filename = os.getcwd() + '/data/oil_wells_data.csv'

# Load the file, discard every column that contains only nulls,
# then discard the timestamp column
df = (
    pd.read_csv(filename)
    .dropna(axis='columns', how='all')
    .drop(columns=['timestamp'])
)
df

Unnamed: 0,P-PDG,P-TPT,T-TPT,P-MON-CKP,T-JUS-CKP,class
0,27897490.0,19824230.0,125.685900,4059666.0,97.55283,0
1,27897450.0,19824230.0,125.685900,4059666.0,97.55283,0
2,27897360.0,19824230.0,125.685900,4059666.0,97.55283,0
3,27897430.0,19824230.0,125.685900,4059666.0,97.55282,0
4,27897500.0,19824230.0,125.685900,4059666.0,97.55282,0
...,...,...,...,...,...,...
26994,33700360.0,28074090.0,3.413923,4002707.0,28.00641,8
26995,33700220.0,28074160.0,3.413754,4002710.0,27.97934,8
26996,33700150.0,28074160.0,3.413741,4002710.0,27.95308,8
26997,33700210.0,28074220.0,3.413593,4002719.0,27.92784,8


Use a sliding window to create the new instances (fixed-length windows) on which the convolutions will operate

In [54]:
# timeframe: number of consecutive rows in each sliding window
# stride: step between the start rows of successive windows
timeframe, stride = 40, 1

In [55]:
def _sliding_windows(series, timeframe, stride):
    """Return the complete sliding windows of a Series as rows of a 2-D array.

    Parameters
    ----------
    series : pd.Series
        Values to cut into windows, in row order.
    timeframe : int
        Number of consecutive values in each window.
    stride : int
        Step between the start rows of successive windows.

    Returns
    -------
    np.ndarray
        Array of shape (n_windows, timeframe); row k is
        series[k * stride : k * stride + timeframe].
    """
    # Entry i is the series shifted up by i rows, so after stacking and
    # transposing, row r holds the values at rows r, r + 1, ..., r + timeframe - 1.
    shifted = [series.shift(-i) for i in range(timeframe)]

    # Only the first len(series) - timeframe + 1 rows are complete; the rest
    # run past the end of the series and contain the NaN padding from shift().
    # An explicit stop index is used because the shorthand `-timeframe + 1`
    # evaluates to 0 when timeframe == 1 and would discard every row.
    n_complete = max(len(series) - timeframe + 1, 0)
    return np.vstack(shifted).T[:n_complete:stride]


# One (n_windows, timeframe) array per feature column; the last column of df
# is the label and gets its own windows below.
# With stride < timeframe, consecutive windows overlap by timeframe - stride rows.
new_instances = [
    _sliding_windows(df[col], timeframe, stride) for col in df.columns[:-1]
]

# Label windows, row-aligned with the feature windows (same timeframe and stride)
new_labels = _sliding_windows(df[df.columns[-1]], timeframe, stride)

In [56]:
# Join the per-column 2-D window arrays along a new last axis, giving
# (windows, timeframe, columns); the label windows are used unchanged as the target
X, y = np.stack(new_instances, axis=2), new_labels

print("Dimensions of transformed instances:", np.shape(X))
print("Dimensions of transformed labels :", np.shape(y))

Dimensions des instances transformées : (26960, 40, 5)
Dimensions des labels transformés : (26960, 40)


In [57]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the windows as the test set; the fixed random_state keeps the split repeatable
# NOTE(review): when stride < timeframe, neighbouring windows share rows, so a
# shuffled split may place near-identical windows in both sets — confirm this is acceptable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

for _caption, _part in (
    ("Shape of X_train:", X_train),
    ("Shape of X_test:", X_test),
    ("Shape of y_train:", y_train),
    ("Shape of y_test:", y_test),
):
    print(_caption, _part.shape)


Shape of X_train: (21568, 40, 5)
Shape of X_test: (5392, 40, 5)
Shape of y_train: (21568, 40)
Shape of y_test: (5392, 40)


In [58]:
# Destination files for the training split
X_train_file_path = "./data/X_train_transformed.csv"
y_file_path = "./data/y_train_transformed.csv"

# Flatten every training window into a single row so X_train fits a 2-D table
X_train_df = pd.DataFrame(X_train.reshape(len(X_train), -1))
y_df = pd.DataFrame(y_train)

# Write both tables space-separated, without the index column
X_train_df.to_csv(X_train_file_path, sep=' ', index=False)
y_df.to_csv(y_file_path, sep=' ', index=False)

print("Transformed instances saved to:", X_train_file_path)
print("Transformed labels saved to:", y_file_path)

Transformed instances saved to: ./data/X_train_transformed.csv
Transformed labels saved to: ./data/y_train_transformed.csv


In [59]:
# Destination files for the test split
X_test_file_path = "./data/X_test_transformed.csv"
y_file_path = "./data/y_test_transformed.csv"

# Flatten every test window into a single row so X_test fits a 2-D table
n_test_windows = X_test.shape[0]
X_test_df = pd.DataFrame(X_test.reshape(n_test_windows, -1))
y_df = pd.DataFrame(y_test)

# Write both tables space-separated, without the index column
X_test_df.to_csv(X_test_file_path, sep=' ', index=False)
y_df.to_csv(y_file_path, sep=' ', index=False)

print("Transformed instances saved to:", X_test_file_path)
print("Transformed labels saved to:", y_file_path)

Transformed instances saved to: ./data/X_test_transformed.csv
Transformed labels saved to: ./data/y_test_transformed.csv
