In [12]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit

## Time series data transformation for use in the 1D CNN model for event type prediction

In [28]:
# Read the raw well measurements from the data folder under the working directory.
filename = os.getcwd() + '/data/oil_wells_data.csv'
df = (
    pd.read_csv(filename)
    .dropna(axis=1, how='all')     # discard columns that contain no values at all
    .drop(columns=['timestamp'])   # the timestamp is not used as a model input
)
df

Unnamed: 0,P-PDG,P-TPT,T-TPT,P-MON-CKP,T-JUS-CKP,class
0,27897490.0,19824230.0,125.685900,4059666.0,97.55283,0
1,27897450.0,19824230.0,125.685900,4059666.0,97.55283,0
2,27897360.0,19824230.0,125.685900,4059666.0,97.55283,0
3,27897430.0,19824230.0,125.685900,4059666.0,97.55282,0
4,27897500.0,19824230.0,125.685900,4059666.0,97.55282,0
...,...,...,...,...,...,...
26994,33700360.0,28074090.0,3.413923,4002707.0,28.00641,8
26995,33700220.0,28074160.0,3.413754,4002710.0,27.97934,8
26996,33700150.0,28074160.0,3.413741,4002710.0,27.95308,8
26997,33700210.0,28074220.0,3.413593,4002719.0,27.92784,8


Normalizing the data is important for the 1D CNN model to work properly. We have 5 attributes with totally different scales. Without normalization, the model will be biased towards the attributes with larger scales (will learn larger weights for them).

Before normalization, we first split the data into training and test sets in order to avoid data leakage between them. Then we create new instances of the time series data by sliding a window over each set. Only after that do we normalize the data, fitting the scaler on the training set only.

So that the class proportions are the same in both the training and test sets, we use stratified splitting.

In [29]:
# Stratified 80/20 split on the event label so that both subsets keep the
# class proportions of the full dataset.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# n_splits=1, so the generator yields exactly one (train, test) pair.
train_index, test_index = next(split.split(df, df['class']))

# split() returns *positional* indices in shuffled order. Select with .iloc
# (correct even if the index is not a plain 0..n-1 range) and sort the
# positions so each subset keeps the original row order: the sliding window
# applied later builds every instance from consecutive rows of the subset.
# NOTE(review): rows held out for the other subset still leave gaps inside
# the windows; consider windowing first and splitting whole windows instead.
strat_train_set = df.iloc[np.sort(train_index)]
strat_test_set = df.iloc[np.sort(test_index)]

In [30]:
# Fraction of test-set rows that belong to each event class.
strat_test_set['class'].value_counts().div(len(strat_test_set))

class
8      0.592593
108    0.340741
0      0.066667
Name: count, dtype: float64

In [31]:
# Fraction of rows in the full dataset that belong to each event class.
df['class'].value_counts().div(len(df))

class
8      0.592578
108    0.340753
0      0.066669
Name: count, dtype: float64

Using the sliding window to create new instances that will be used for convolutions

In [32]:
# Window length in rows, and the step in rows between consecutive window starts.
timeframe, stride = 40, 1

In [33]:
def sliding_window(df, timeframe=40, stride=1):
    """Turn a row-ordered DataFrame into fixed-length sliding windows.

    Every column except the last is treated as a feature; the last column is
    treated as the label. Window ``k`` starts at row ``k * stride`` and covers
    ``timeframe`` consecutive rows. Only windows that fit entirely inside
    ``df`` are returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature columns followed by the label column in the last position.
        Rows must already be in the order the windows should follow.
    timeframe : int, default 40
        Number of consecutive rows in each window (must be >= 1).
    stride : int, default 1
        Number of rows between the starts of two consecutive windows
        (must be >= 1). Windows overlap whenever ``stride < timeframe``.

    Returns
    -------
    instances : numpy.ndarray
        Shape ``(n_windows, timeframe, n_features)``.
    labels : numpy.ndarray
        Shape ``(n_windows, timeframe)``: the label of every row in a window.

    Raises
    ------
    ValueError
        If ``timeframe`` or ``stride`` is smaller than 1.

    Notes
    -----
    ``Series.shift`` pads with NaN, so integer columns come back as floats
    when ``timeframe > 1``.
    """
    if timeframe < 1:
        raise ValueError("timeframe must be a positive integer")
    if stride < 1:
        raise ValueError("stride must be a positive integer")

    # Number of start rows for which a complete window fits. An explicit count
    # (instead of the negative end index `-timeframe + 1`) keeps timeframe == 1
    # working: that index would be 0 and would select no rows at all.
    n_complete = max(len(df) - timeframe + 1, 0)

    def _windows(series):
        # Column i of the stacked matrix holds the series shifted up by i rows,
        # so row k contains the values of rows k .. k + timeframe - 1.
        shifted = [series.shift(-i) for i in range(timeframe)]
        return np.vstack(shifted).T[:n_complete:stride]

    feature_columns = df.columns[:-1]  # the label column gets its own windows
    label_column = df.columns[-1]

    new_instances = [_windows(df[col]) for col in feature_columns]
    new_labels = _windows(df[label_column])

    # Put the per-feature windows side by side on a trailing feature axis.
    return np.stack(new_instances, axis=-1), new_labels



In [34]:
# Pass the notebook-level window settings explicitly so that editing
# `timeframe` / `stride` actually changes the generated instances
# (the function defaults would otherwise be used silently).
X_train, y_train = sliding_window(strat_train_set, timeframe=timeframe, stride=stride)
X_test, y_test = sliding_window(strat_test_set, timeframe=timeframe, stride=stride)

print("Dimensions of transformed training instances:", X_train.shape)
print("Dimensions of transformed training labels :", y_train.shape)
print("Dimensions of transformed test instances:", X_test.shape)
print("Dimensions of transformed test labels :", y_test.shape)

Dimensions of transformed training instances: (21560, 40, 5)
Dimensions of transformed training labels : (21560, 40)
Dimensions of transformed test instances: (5361, 40, 5)
Dimensions of transformed test labels : (5361, 40)


In [45]:
# Sanity check: one label per row of every training window.
np.shape(y_train)

(21560, 40)

Normalization

In [54]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scaling per feature; the scaler is fitted on the training windows
# only and then reused unchanged for the test windows.
scaler = MinMaxScaler()

# The scaler expects 2D input, so collapse (instances, timeframe, features)
# to (instances * timeframe, features), scale, and restore the 3D shape.
train_shape = X_train.shape
X_train_reshaped = X_train.reshape(-1, train_shape[-1])
X_train_normalized = scaler.fit_transform(X_train_reshaped).reshape(train_shape)

test_shape = X_test.shape
X_test_reshaped = X_test.reshape(-1, test_shape[-1])
X_test_normalized = scaler.transform(X_test_reshaped).reshape(test_shape)

In [53]:
# Destination files for the transformed training data.
X_train_file_path = "./data/X_train_transformed.csv"
y_file_path = "./data/y_train_transformed.csv"

# Flatten every training window to one row: (instances, timeframe * features).
n_train_windows = X_train.shape[0]
X_train_df = pd.DataFrame(X_train_normalized.reshape(n_train_windows, -1))
y_df = pd.DataFrame(y_train)

# Write instances first, then labels, as space-separated files without the index.
for frame, target in ((X_train_df, X_train_file_path), (y_df, y_file_path)):
    frame.to_csv(target, index=False, sep=' ')

print("Transformed instances saved to:", X_train_file_path)
print("Transformed labels saved to:", y_file_path)

Transformed instances saved to: ./data/X_train_transformed.csv
Transformed labels saved to: ./data/y_train_transformed.csv


In [55]:
# Destination files for the transformed test data.
X_test_file_path = "./data/X_test_transformed.csv"
y_file_path = "./data/y_test_transformed.csv"

# Flatten every test window to one row: (instances, timeframe * features).
n_test_windows = X_test.shape[0]
X_test_df = pd.DataFrame(X_test_normalized.reshape(n_test_windows, -1))
y_df = pd.DataFrame(y_test)

# Write instances first, then labels, as space-separated files without the index.
csv_options = dict(index=False, sep=' ')
X_test_df.to_csv(X_test_file_path, **csv_options)
y_df.to_csv(y_file_path, **csv_options)

print("Transformed instances saved to:", X_test_file_path)
print("Transformed labels saved to:", y_file_path)

Transformed instances saved to: ./data/X_test_transformed.csv
Transformed labels saved to: ./data/y_test_transformed.csv
