# Data Preprocessing
Clean and preprocess the raw data to handle noise, missing values, and time-synchronization issues.

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

In [2]:
# Load the raw dataset from CSV, then preview its first five rows
df = pd.read_csv(filepath_or_buffer='WCR_tws.csv')
df.head(5)

Unnamed: 0,Time,Site Name,Point Machine Name,Direction,A Current,A Voltage,B Current,B Voltage,Type of A,Type of B,Polling of A,Polling of B
0,2025-02-10 08:25:51,ARNETHA,101/102,Reverse,00000000000.0,"0.0,31.5,70.5,90.0,99.0,103.5,106.5,108.0,108....","0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...","0.0,106.5,105.0,103.5,103.5,103.5,103.5,105.0,...",TWS,TWS,100,100
1,2025-02-10 08:25:59,ARNETHA,101/102,Normal,"0,4.7,4.7,3.7,3,2.6,2.2,2,1.9,1.8,1.8,1.9,2,2....","0.0,90.0,88.5,97.5,103.5,106.5,108.0,109.5,111...","0.0,4.2,4.3,3.4,2.8,2.4,2.1,1.9,1.9,1.8,1.8,1....","0.0,30.0,63.0,85.5,96.0,103.5,106.5,109.5,111....",TWS,TWS,100,100
2,2024-08-10 06:51:04,ARNETHA,101/102,Reverse,"0.0,1.4,4.5,3.9,3.2,2.7,2.4,2.2,2.1,2.1,2,2.1,...","0.0,58.5,81.0,93.0,99.0,102.0,103.5,103.5,103....","0,5,4.5,3.5,2.8,2.4,2.1,2,2,2.1,2.3,2.4,2.5,2....","0.0,99.0,97.5,96.0,94.5,94.5,96.0,99.0,100.5,1...",TWS,TWS,100,100
3,2024-08-10 06:51:17,ARNETHA,101/102,Normal,"0,4.2,4.9,4,3.2,2.8,2.4,2.2,2.1,2,1.9,1.9,1.9,...","0.0,88.5,82.5,90.0,96.0,100.5,102.0,103.5,105....","0.0,2.9,4.4,3.6,2.9,2.5,2.2,2,1.9,1.8,1.8,1.8,...","0.0,21.0,51.0,76.5,90.0,97.5,102.0,103.5,105.0...",TWS,TWS,100,100
4,2024-08-11 12:08:18,ARNETHA,101/102,Reverse,"0.0,4.2,4.2,3.4,2.9,2.5,2.2,2.1,2,2,2,2.1,2.1,...","0.0,52.5,78.0,91.5,97.5,100.5,103.5,103.5,103....","0,5,4.1,3.3,2.7,2.3,2.1,2,2,2.1,2.2,2.4,2.5,2....","0.0,100.5,99.0,97.5,96.0,96.0,97.5,100.5,100.5...",TWS,TWS,100,100


In [10]:
# Dimensions of the frame as (rows, columns)
df.shape

(1385, 12)

In [3]:
# Summarise column dtypes and non-null counts to spot columns with missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1385 entries, 0 to 1384
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   Time                1385 non-null   object
 1   Site Name           1385 non-null   object
 2   Point Machine Name  1385 non-null   object
 3   Direction           1385 non-null   object
 4   A Current           1374 non-null   object
 5   A Voltage           1372 non-null   object
 6   B Current           1373 non-null   object
 7   B Voltage           1370 non-null   object
 8   Type of A           1385 non-null   object
 9   Type of B           1385 non-null   object
 10  Polling of A        1385 non-null   int64 
 11  Polling of B        1385 non-null   int64 
dtypes: int64(2), object(10)
memory usage: 130.0+ KB


In [4]:
# Inspect the unparsed contents of the four signal columns before converting them
array_columns = [
    'A Current',
    'A Voltage',
    'B Current',
    'B Voltage',
]

for col in array_columns:
    # Header lines for this column, emitted with a single print call
    print(f"\nColumn: {col}", "Sample values:", sep="\n")
    # Show the Python type next to each of the first five raw values
    for val in df[col].head():
        print(f"Type: {type(val)}, Value: {val}")


Column: A Current
Sample values:
Type: <class 'str'>, Value: 0,0,0,0,0,0,0,0,0,0,0.0
Type: <class 'str'>, Value: 0,4.7,4.7,3.7,3,2.6,2.2,2,1.9,1.8,1.8,1.9,2,2.2,2.4,2.6,2.7,2.8,2.9,2.9,3,2.9,2.6,2.2,2,1.9,1.9,1.9,1.9,1.8,1.5,0.7,0.0
Type: <class 'str'>, Value: 0.0,1.4,4.5,3.9,3.2,2.7,2.4,2.2,2.1,2.1,2,2.1,2.1,2.1,2.4,2.6,2.9,3,3.2,3.4,3.6,3.7,3.8,3.9,3.9,3.9,3.9,3.9,3.9,3.9,4,4,3.9,3.9,3.9,3.8,3.5,3.2,3,2.7,1.7,0.0
Type: <class 'str'>, Value: 0,4.2,4.9,4,3.2,2.8,2.4,2.2,2.1,2,1.9,1.9,1.9,1.9,2,2.1,2.3,2.4,2.4,2.3,2.1,1.9,1.9,2,2.1,2.2,2.3,2.3,2.4,2.3,2.2,1.9,0.9,0.0
Type: <class 'str'>, Value: 0.0,4.2,4.2,3.4,2.9,2.5,2.2,2.1,2,2,2,2.1,2.1,2.3,2.6,2.8,3,3.2,3.4,3.5,3.6,3.7,3.7,3.8,3.8,3.7,3.7,3.7,3.7,3.7,3.7,3.6,3.6,3.7,3.6,3.3,3,2.8,2.6,2.7,0.0

Column: A Voltage
Sample values:
Type: <class 'str'>, Value: 0.0,31.5,70.5,90.0,99.0,103.5,106.5,108.0,108.0,108.0,108.0,0.0
Type: <class 'str'>, Value: 0.0,90.0,88.5,97.5,103.5,106.5,108.0,109.5,111.0,111.0,111.0,111.0,109.5,108.0,106.5,105.0

## Converting comma-separated strings to arrays 

In [7]:
# Columns holding comma-separated numeric readings; re-declared here so this
# cell does not depend on the inspection cell having been run
array_columns = [
    'A Current',
    'A Voltage',
    'B Current',
    'B Voltage',
]

def convert_to_array(x):
    """Parse a comma-separated string of numbers into a 1-D float array.

    Parameters
    ----------
    x : str, numpy.ndarray, list, tuple, or anything else
        Raw cell value. Strings are split on commas; blank tokens are
        skipped and surrounding whitespace is ignored.

    Returns
    -------
    numpy.ndarray
        * str input: the parsed float values.
        * ndarray / list / tuple input: the same values as a float array, so
          applying this function to already-converted data is harmless
          (re-running the conversion cell no longer wipes the column).
        * anything else (NaN, None, numbers): an empty array.
        * input that cannot be parsed as floats: an empty array.

    Notes
    -----
    An empty array therefore stands for "missing or unparseable". It is not
    null, so ``isnull()`` will not flag it; test ``arr.size == 0`` instead.
    """
    # Already-parsed values pass through instead of being discarded
    if isinstance(x, (np.ndarray, list, tuple)):
        try:
            return np.asarray(x, dtype=float)
        except (TypeError, ValueError):
            # Sequence holds something non-numeric: treat as unparseable
            return np.array([])

    # NaN / None / other non-string scalars carry no readings
    if not isinstance(x, str):
        return np.array([])

    try:
        # Split by comma, drop blank tokens, tolerate leading/trailing spaces
        return np.array([float(token.strip()) for token in x.split(',') if token.strip()])
    except ValueError:
        # Any non-numeric token invalidates the whole reading
        return np.array([])

# Replace each raw string column with its parsed float-array counterpart
for col in array_columns:
    parsed = df[col].map(convert_to_array)
    df[col] = parsed

In [8]:
# Preview the frame after conversion to confirm the four signal columns now hold arrays
df.head()

Unnamed: 0,Time,Site Name,Point Machine Name,Direction,A Current,A Voltage,B Current,B Voltage,Type of A,Type of B,Polling of A,Polling of B
0,2025-02-10 08:25:51,ARNETHA,101/102,Reverse,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 31.5, 70.5, 90.0, 99.0, 103.5, 106.5, 10...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 106.5, 105.0, 103.5, 103.5, 103.5, 103.5...",TWS,TWS,100,100
1,2025-02-10 08:25:59,ARNETHA,101/102,Normal,"[0.0, 4.7, 4.7, 3.7, 3.0, 2.6, 2.2, 2.0, 1.9, ...","[0.0, 90.0, 88.5, 97.5, 103.5, 106.5, 108.0, 1...","[0.0, 4.2, 4.3, 3.4, 2.8, 2.4, 2.1, 1.9, 1.9, ...","[0.0, 30.0, 63.0, 85.5, 96.0, 103.5, 106.5, 10...",TWS,TWS,100,100
2,2024-08-10 06:51:04,ARNETHA,101/102,Reverse,"[0.0, 1.4, 4.5, 3.9, 3.2, 2.7, 2.4, 2.2, 2.1, ...","[0.0, 58.5, 81.0, 93.0, 99.0, 102.0, 103.5, 10...","[0.0, 5.0, 4.5, 3.5, 2.8, 2.4, 2.1, 2.0, 2.0, ...","[0.0, 99.0, 97.5, 96.0, 94.5, 94.5, 96.0, 99.0...",TWS,TWS,100,100
3,2024-08-10 06:51:17,ARNETHA,101/102,Normal,"[0.0, 4.2, 4.9, 4.0, 3.2, 2.8, 2.4, 2.2, 2.1, ...","[0.0, 88.5, 82.5, 90.0, 96.0, 100.5, 102.0, 10...","[0.0, 2.9, 4.4, 3.6, 2.9, 2.5, 2.2, 2.0, 1.9, ...","[0.0, 21.0, 51.0, 76.5, 90.0, 97.5, 102.0, 103...",TWS,TWS,100,100
4,2024-08-11 12:08:18,ARNETHA,101/102,Reverse,"[0.0, 4.2, 4.2, 3.4, 2.9, 2.5, 2.2, 2.1, 2.0, ...","[0.0, 52.5, 78.0, 91.5, 97.5, 100.5, 103.5, 10...","[0.0, 5.0, 4.1, 3.3, 2.7, 2.3, 2.1, 2.0, 2.0, ...","[0.0, 100.5, 99.0, 97.5, 96.0, 96.0, 97.5, 100...",TWS,TWS,100,100


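## Checking Null Values

Note: the conversion step above replaces every non-string cell in the four signal columns (including NaN) with an empty array. An empty array is not null, so the counts below are 0 for those columns even though `df.info()` reported missing entries before conversion (for example, 1374 of 1385 non-null in `A Current`).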

In [9]:
# Per-column count of NaN/None cells.
# NOTE: convert_to_array turned non-string cells (including NaN) in the four
# signal columns into empty arrays, which are not null, so those columns
# report 0 here regardless of how many values were originally missing.
df.isna().sum()

Time                  0
Site Name             0
Point Machine Name    0
Direction             0
A Current             0
A Voltage             0
B Current             0
B Voltage             0
Type of A             0
Type of B             0
Polling of A          0
Polling of B          0
dtype: int64