In [None]:
import ray
import ray.data
import pandas as pd

In [None]:
from hdfs import Config

# Load the local HdfsCLI configuration file, then build the client registered
# under the "dev" name in it.
hdfs_config = Config(path="./hdfs-docker-cluster/hadoop_config/.hdfscli.cfg")
client = hdfs_config.get_client("dev")

In [None]:
# Files to push into HDFS (list taken from populate_hdfs)
files_to_upload = ["transfers.csv"]

for file in files_to_upload:
    local_path = f"./data/{file}"
    remote_path = f"/tmp/{file}"

    # strict=False lets the lookup come back falsy for a missing path instead
    # of raising, so the result can be used directly as an existence check.
    print(f"Checking if {file} exists...")
    remote_status = client.status(remote_path, strict=False)
    if remote_status:
        print(f"{file} exists!")
        # Drop the stale remote copy before re-uploading. The original note
        # suspected a leftover file can't be mapped to the current data
        # nodes -- unconfirmed.
        print(f"Removing {file}...")
        client.delete(remote_path)

    # Stage the local file under /tmp so later cells can process it
    print("Uploading file to /tmp...")
    client.upload(remote_path, local_path)

# Show what ended up in /tmp after the upload(s)
print("/tmp contents: ", client.list("/tmp"))

In [None]:
# Initialize Ray.
# ignore_reinit_error=True keeps this cell safe to re-run: without it, calling
# ray.init() a second time in the same kernel raises because Ray is already
# running.
ray.init(ignore_reinit_error=True)

In [None]:
# Stream the CSV from HDFS straight into a pandas DataFrame.
# Passing `encoding` makes the HDFS client return a text reader, so pandas can
# parse it directly. This avoids reading the whole file into bytes, decoding it
# into a second full-size string, and wrapping that in a StringIO.
with client.read("/tmp/transfers.csv", encoding="utf-8") as reader:
    df = pd.read_csv(reader)

# Convert the pandas DataFrame into a Ray Dataset
dataset = ray.data.from_pandas(df)

# Preview the dataset; take() with no argument applies its default row limit
print(dataset.take())

In [None]:
# Shut down Ray now that the notebook is finished with it (counterpart to the
# ray.init() call made earlier).
ray.shutdown()