In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os 
import glob

### Preprocessing steps:
- Header rows are repeated between the data samples in the CSV files, so they have to be removed.
- The delay value is the same for each block of 80 time samples.
- First, make sure that every block of time samples in each CSV file contains exactly 80 samples.
- Next, check whether any field is null. If a null is found, remove the block that contains it.
- We remove the headers, the timestamp and the delay information, and save the result as the target_X file.
- We save the single delay information in the target_Y file.

In [None]:
def list_of_files(path):
    """Return the CSV files located directly inside ``path``.

    Parameters
    ----------
    path : str
        Directory to search (not recursive).

    Returns
    -------
    list of str
        Paths matching ``*.csv`` in ``path``, sorted so the order is the same
        on every run. An empty list is returned (after printing a notice)
        when ``path`` does not exist, so callers can always iterate the result.
    """
    if not os.path.exists(path):
        print("Path not found")
        return []
    # glob returns matches in arbitrary, OS-dependent order; sort for reproducibility.
    return sorted(glob.glob(os.path.join(path, "*.csv")))

#### Check that 80 samples are present between consecutive headers

In [25]:
def check_samples(files, n_expected=80, lookup='time_ti'):
    """Report header-delimited groups that do not hold ``n_expected`` lines.

    Each file is scanned line by line. A line containing ``lookup`` is treated
    as a header; every other line (blank ones included) counts as a sample of
    the group opened by the most recent header. Findings are printed, nothing
    is returned.

    Parameters
    ----------
    files : iterable of str or None
        Paths of the files to scan. ``None`` is treated as "no files".
    n_expected : int, default 80
        Number of lines expected after each header.
    lookup : str, default 'time_ti'
        Substring that identifies a header line.

    Printed diagnostics
    -------------------
    - one "less than"/"greater than" message per offending group, emitted when
      the group is closed (at the next header or at end of file);
    - a message for lines that appear before the first header of a file;
    - a message for files that contain no header at all;
    - a message for every blank line;
    - finally the total number of non-header lines over all files.
    """
    def _report(path, header_line, n_samples):
        # Emit at most one message for a finished group.
        if n_samples < n_expected:
            print(f'less than {n_expected} samples at: {path} in line_header:{header_line}')
        elif n_samples > n_expected:
            print(f'greater than {n_expected} samples at: {path} in line_header:{header_line}')

    n_total = 0
    for f in files or []:
        # Reset per file so state never leaks from the previous file.
        header_line = None  # line number of the most recent header, None until one is seen
        n_samples = 0
        with open(f) as file:
            for line_num, line in enumerate(file, 1):
                if lookup in line:
                    if header_line is not None:
                        _report(f, header_line, n_samples)
                    elif n_samples:
                        print(f'samples before first header at: {f}')
                    n_samples = 0
                    header_line = line_num
                else:
                    n_samples += 1
                    n_total += 1
                if line in ['\n', '\r\n']:
                    print(f'empty lines in {f} at line_num:{line_num}')
        # Close the last group of the file (or flag a file without any header).
        if header_line is None:
            print(f'no header found in: {f}')
        else:
            _report(f, header_line, n_samples)
    print(f'total samples is {n_total}')

In [45]:
# Run the samples-per-header check over every CSV found in the raw ACE directory.
# `data_path` and `files` are left in the kernel namespace.
data_path= '../data/raw/Updated_data/ACE_data/'
files =  list_of_files(data_path)
check_samples(files)

total samples is 1800880


In [6]:
### Drop group if na is present in any field

def dropgroup(df, n_samples=80):
    """Drop every block of ``n_samples`` consecutive rows that contains a null.

    Rows are grouped purely by position (rows 0..n_samples-1 form the first
    block, and so on); the index labels of ``df`` are not used for grouping.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose rows are laid out in consecutive blocks.
    n_samples : int, default 80
        Number of rows per block.

    Returns
    -------
    pandas.DataFrame
        The rows of all blocks without any null value, in their original
        order, with a fresh 0..n-1 index.
    """
    block_ids = np.arange(len(df)) // n_samples
    # Keep a block only when none of its cells is null.
    complete = df.groupby(block_ids).filter(lambda g: not g.isnull().values.any())
    return complete.reset_index(drop=True)


### Create mean of Dataframe
def get_meandf(df, n_samples=80):
    """Average every block of ``n_samples`` consecutive rows.

    Rows are grouped by position, not by index label. A trailing block with
    fewer than ``n_samples`` rows is averaged over the rows it has.

    Parameters
    ----------
    df : pandas.DataFrame or pandas.Series
        Data laid out in consecutive blocks.
    n_samples : int, default 80
        Number of rows per block.

    Returns
    -------
    pandas.DataFrame or pandas.Series
        One row (or value) per block, indexed by block number.
    """
    block_ids = np.arange(len(df)) // n_samples
    return df.groupby(block_ids).mean()

#### Load all csv files

In [None]:
def load_all_csv(path):
    """Load every CSV in ``path`` into one cleaned float32 frame.

    The files are concatenated, the header rows that are repeated inside the
    data (rows whose ``time_ti`` field contains the text ``time_ti``) are
    removed, all columns are cast to ``float32``, and blocks containing nulls
    are removed with ``dropgroup``.

    Parameters
    ----------
    path : str
        Directory holding the ``*.csv`` files (not searched recursively).

    Returns
    -------
    pandas.DataFrame
        Cleaned data with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``path`` contains no CSV file.
    """
    # Sorted so the row order of the result does not depend on the OS listing order.
    all_files = sorted(glob.glob(os.path.join(path, "*.csv")))
    if not all_files:
        raise ValueError(f'No CSV files found in {path}')
    df = pd.concat((pd.read_csv(f) for f in all_files), ignore_index=True)
    is_header = df.time_ti.astype(str).str.contains('time_ti')
    df = df[~is_header].astype('float32')
    # NOTE(review): dropgroup chunks the concatenated rows by position, so this
    # presumably relies on every file contributing whole blocks - confirm with
    # check_samples before trusting the result.
    df = dropgroup(df)
    return df.reset_index(drop=True)


def load_csv(filepath):
    """Load one CSV file, drop the embedded header rows and cast to float32.

    Rows whose ``time_ti`` field contains the text ``time_ti`` are the header
    lines repeated inside the data; they are filtered out before the cast.
    The original index labels of the remaining rows are kept.

    Parameters
    ----------
    filepath : str
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        The data rows as ``float32``.

    Raises
    ------
    Exception
        Any error raised while reading or converting the file is re-raised
        after the file name has been printed, so a failure can never yield an
        unbound or half-processed frame.
    """
    try:
        raw = pd.read_csv(filepath)
        is_header = raw.time_ti.astype(str).str.contains('time_ti')
        return raw[~is_header].astype('float32')
    except AttributeError:
        # Typically raised when the file has no `time_ti` column.
        print(f'Attribute error in {filepath}')
        raise
    except Exception:
        print (f'Exception handling file {filepath}')
        raise

def pre_process(df):
    """Separate the ``Delay`` column from the remaining feature columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the ``time_ti`` and ``Delay`` columns.

    Returns
    -------
    tuple
        ``(features, delay)``: ``df`` without the ``time_ti`` and ``Delay``
        columns, and the ``Delay`` column itself. ``df`` is not modified.
    """
    delay = df['Delay']
    features = df.drop(columns=['time_ti', 'Delay'])
    return features, delay


### Check dataframe for null values

In [26]:
def check_df(df):
    """Count the 80-row blocks of ``df`` that contain at least one null.

    Blocks are formed from consecutive rows by position. An ``AssertionError``
    is raised as soon as a block with a row count other than 80 is met.

    Returns
    -------
    int
        Number of blocks holding one or more null values.
    """
    block_len = 80
    block_ids = np.arange(len(df)) // block_len
    null_flags = []
    for _, block in df.groupby(block_ids):
        assert len(block) == block_len
        null_flags.append(bool(block.isnull().values.any()))
    return sum(null_flags)

In [47]:
# Scan every raw ACE file: list the files that contain nulls and count the
# 80-row blocks with nulls once the timestamp and delay columns are removed.
data_path = '../data/raw/Updated_data/ACE_data/'
files = list_of_files(data_path)
nullfiles, n = [], 0
for f in files:
    df = load_csv(f)
    if df.isnull().values.any():
        print(f)
        nullfiles += [f]
    df, target = pre_process(df)
    nu = check_df(df)
    n = n + nu
print( nullfiles)
print(n)

[]
0


In [42]:
# Load one DSCOVR file through load_csv and display its (rows, columns) shape.
# This rebinds the kernel-global `df`.
df = load_csv('../data/raw/Updated_data/DSCOVR_Data/combined_csv_DSCOVR_MMS_May_2023_edited.csv')
df.shape

(863120, 13)

In [49]:
# NOTE: this repeats the pre_process definition from an earlier cell.
def pre_process(df):
    """Return ``(X, y)``: the feature columns of ``df`` and its ``Delay`` column.

    ``X`` is ``df`` with the ``time_ti`` and ``Delay`` columns removed; ``y``
    is the ``Delay`` column. The input frame is left unchanged.
    """
    y = df['Delay']
    non_feature_cols = ['time_ti', 'Delay']
    X = df.drop(columns=non_feature_cols)
    return X, y

In [50]:
# Load and clean all DSCOVR CSVs, then split the result into features and the Delay target.
# This rebinds the kernel-global `df`.
df = load_all_csv('../data/raw/Updated_data/DSCOVR_Data/')
df_Dscovr, target_Dscovr = pre_process(df)  

In [54]:
# Persist the DSCOVR features (X) and the per-row Delay target (Y) as pickles.
df_Dscovr.to_pickle('../data/processed/DSCOVR_X.pkl')
target_Dscovr.to_pickle('../data/processed/DSCOVR_Y.pkl')

In [55]:
# Load and clean all ACE CSVs, split into features and the Delay target,
# and persist both as pickles. This rebinds the kernel-global `df`.
df=  load_all_csv('../data/raw/Updated_data/ACE_data/')
df_ACE, target_ACE =  pre_process(df)
df_ACE.to_pickle('../data/processed/ACE_X.pkl')
target_ACE.to_pickle('../data/processed/ACE_Y.pkl')

In [67]:
# Collapse each 80-row block of the ACE Delay column to the array of its unique values.
# The block ids are derived from target_ACE itself rather than from the global `df`,
# so the cell no longer depends on which frame `df` happens to hold.
target_Y = target_ACE.groupby(np.arange(len(target_ACE))//80).unique()
# NOTE(review): the file name says "mean", but what is stored here is the
# per-block unique values, not a mean.
target_Y.to_pickle('../data/processed/ACE_Y_mean.pkl')


pandas.core.series.Series

In [77]:
# Average the ACE Delay column over each 80-row block and persist the result.
# (The previous `.unique()` assignment was overwritten immediately and has been dropped.)
target_Y = get_meandf(target_ACE)
target_Y.to_pickle('../data/processed/ACE_Y_mean2.pkl')

In [66]:
# NOTE: this repeats the get_meandf definition from an earlier cell.
def get_meandf(df, n_samples=80):
    """Average every block of ``n_samples`` consecutive rows.

    Rows are grouped by position, not by index label. A trailing block with
    fewer than ``n_samples`` rows is averaged over the rows it has.

    Parameters
    ----------
    df : pandas.DataFrame or pandas.Series
        Data laid out in consecutive blocks.
    n_samples : int, default 80
        Number of rows per block.

    Returns
    -------
    pandas.DataFrame or pandas.Series
        One row (or value) per block, indexed by block number.
    """
    block_ids = np.arange(len(df)) // n_samples
    return df.groupby(block_ids).mean()

# Average the ACE feature columns over each 80-row block and persist the result.
mean_ACE =  get_meandf(df_ACE)
mean_ACE.to_pickle('../data/processed/mean_ACE.pkl')

((22300, 11), (22300,))