In [None]:
import dask
import dask.dataframe as dd
from dask.distributed import LocalCluster, Client
import pyarrow.parquet as pq
import pyarrow as pa

In [None]:
import pandas as pd
# Raise the display limits so wide/long frames are shown without truncation.
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500

In [None]:
# Scratch directory plus the worker memory thresholds, applied in one call.
# The thresholds are ordered target < spill < pause < terminate.
dask.config.set({
    "temporary-directory": "/home/ubuntu/data/dask_tmp",
    'distributed.worker.memory.target': 0.85,
    'distributed.worker.memory.spill': 0.90,
    'distributed.worker.memory.pause': 0.93,
    'distributed.worker.memory.terminate': 0.96,
})

In [None]:
def start_cluster(n_workers, threads_per_worker, memory_limit, processes):
    """Create a LocalCluster and return a Client connected to it.

    All four arguments are forwarded unchanged to ``LocalCluster`` under the
    keyword of the same name.

    Args:
        n_workers: value passed as ``LocalCluster(n_workers=...)``.
        threads_per_worker: value passed as ``LocalCluster(threads_per_worker=...)``.
        memory_limit: value passed as ``LocalCluster(memory_limit=...)``.
        processes: value passed as ``LocalCluster(processes=...)``.

    Returns:
        The ``Client`` attached to the newly created cluster. The client and
        its cluster are also printed as a side effect.
    """
    cluster_options = dict(
        n_workers=n_workers,
        threads_per_worker=threads_per_worker,
        memory_limit=memory_limit,
        processes=processes,
    )
    client = Client(LocalCluster(**cluster_options))

    # Echo the client first, then the cluster it is attached to.
    for summary in (client, client.cluster):
        print(summary)
    return client

# Start the local cluster used by every later cell; `c` is the connected client.
c = start_cluster(
    n_workers=8,
    threads_per_worker=1,
    memory_limit="24GB",
    processes=True,
)

In [None]:
path = "Preprocessed/Valid/FeatureExtraction/All_feature_dataset/Valid_with_TE"

# Build the arrow schema from the first part file as pandas sees it; the same
# schema object is passed to every to_parquet call further down.
# NOTE(review): this loads the whole first partition just to infer the schema;
# pq.read_schema might be cheaper, but confirm it yields the same types first.
schema = pa.Schema.from_pandas(
    pd.read_parquet(f"{path}/part.0.parquet", engine='pyarrow')
)

# Lazy dask view over the full dataset directory.
df = dd.read_parquet(path, engine='pyarrow')
df

In [None]:
# Show every column name as a plain Python list.
df.columns.tolist()

In [None]:
# Rows flagged by `is_from_official_val` form the validation pool;
# everything else is training data.
from_official_val = df['is_from_official_val']
val_df = df[from_official_val]
train_df = df[~from_official_val]

# Seeded 2/3 : 1/3 random split of the validation pool.
val1_df, val2_df = val_df.random_split([2/3, 1/3], random_state=123)

# "notime" keeps all training rows; "time" keeps only rows whose
# tweet_timestamp is above the cutoff.
# NOTE(review): cutoff presumably is Unix seconds (late Feb 2021) — confirm.
train_df_notime = train_df
train_df_time = train_df[train_df['tweet_timestamp'] > 1.614011e+09]



In [None]:
# Training sets without any validation rows: the two training frames as they are.
final_train_noval_notime, final_train_noval_time = train_df_notime, train_df_time

# Training sets with the val1 fold appended row-wise to each base frame.
final_train_val_notime, final_train_val_time = (
    dd.concat([base_df, val1_df], axis=0)
    for base_df in (train_df_notime, train_df_time)
)

# The val2 fold is kept aside as the test set.
test_df = val2_df

In [None]:
# Rebalance to a 200MB partition-size target, then write the result to parquet
# using the shared schema.
final_train_noval_notime = final_train_noval_notime.repartition(
    partition_size="200MB"
)
final_train_noval_notime.to_parquet(
    "new/final_train_noval_notime",
    engine='pyarrow',
    schema=schema,
    overwrite=True,
)

In [None]:
# Rebalance to a 200MB partition-size target, then write the result to parquet
# using the shared schema.
final_train_noval_time = final_train_noval_time.repartition(
    partition_size="200MB"
)
final_train_noval_time.to_parquet(
    "new/final_train_noval_time",
    engine='pyarrow',
    schema=schema,
    overwrite=True,
)

In [None]:
# Rebalance to a 200MB partition-size target, then write the result to parquet
# using the shared schema.
final_train_val_notime = final_train_val_notime.repartition(
    partition_size="200MB"
)
final_train_val_notime.to_parquet(
    "new/final_train_val_notime",
    engine='pyarrow',
    schema=schema,
    overwrite=True,
)

In [None]:
# Rebalance to a 200MB partition-size target, then write the result to parquet
# using the shared schema.
final_train_val_time = final_train_val_time.repartition(
    partition_size="200MB"
)
final_train_val_time.to_parquet(
    "new/final_train_val_time",
    engine='pyarrow',
    schema=schema,
    overwrite=True,
)

In [None]:
# Rebalance to a 200MB partition-size target, then write the result to parquet
# using the shared schema.
test_df = test_df.repartition(
    partition_size="200MB"
)
test_df.to_parquet(
    "new/test",
    engine='pyarrow',
    schema=schema,
    overwrite=True,
)