-
Notifications
You must be signed in to change notification settings - Fork 0
/
preprocess_data.py
31 lines (22 loc) · 990 Bytes
/
preprocess_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
import os
import argparse
import pandas as pd
from tqdm import tqdm
import concurrent.futures
from library import process_for_one_device_at_date
def main() -> None:
    """Run the data pre-processing pipeline for a single customer.

    Reads ``./Merged/data_<customer>.csv``, derives a ``Date`` column from the
    ``ts`` column, splits the frame into one work item per
    ``(Date, ID, Loc)`` group, and dispatches each group to
    ``process_for_one_device_at_date`` via a thread pool, writing output under
    ``./Processed/<customer>/``.

    Command line:
        --workers   max worker threads (default 2)
        --customer  customer name as used in the input/output paths (required)
    """
    parser = argparse.ArgumentParser(description='Execute Data pre-Processing Pipeline')
    parser.add_argument('--workers', type=int, help='max cpu worker to be used', default=2)
    parser.add_argument('--customer', type=str, help='customer name as per path', required=True)
    args = parser.parse_args()
    customer = args.customer
    workers = args.workers

    df = pd.read_csv(f"./Merged/data_{customer}.csv")
    # 'ts' is assumed to be a "<date> <time>" string; keep only the date part.
    # TODO(review): confirm 'ts' never contains NaN — .apply would raise here.
    df['Date'] = df.ts.apply(lambda e: e.split()[0])

    # One work item per (Date, ID, Loc) group: (key_tuple, sub_frame).
    params = list(df.groupby(['Date', 'ID', 'Loc']))

    # Ensure the per-customer output directory exists before workers write to it.
    parent_dir = f"./Processed/{customer}"
    os.makedirs(parent_dir, exist_ok=True)

    # NOTE(review): a ThreadPoolExecutor only overlaps work while the GIL is
    # released (I/O, native pandas/numpy code). If per-group processing is
    # CPU-bound pure Python, a ProcessPoolExecutor would scale better — but
    # that requires a picklable module-level task function, not this lambda.
    with concurrent.futures.ThreadPoolExecutor(max_workers=workers) as executor:
        task_iter = executor.map(
            lambda item: process_for_one_device_at_date(*item[0], item[1], parent_dir),
            params,
        )
        # Drain the iterator through tqdm so the progress bar advances as
        # each group finishes; the return values themselves are unused.
        list(tqdm(task_iter, total=len(params)))

    print(f"Completed for {customer}")


# Guard so importing this module (e.g. for tooling or tests) does not
# trigger the whole pipeline as a side effect.
if __name__ == "__main__":
    main()