In [None]:
# You might need to install the gcloud library for Cloud Storage operations
# !pip install gcloud

In [None]:
# Import necessary libraries & Cloud Storage setup
import pandas as pd
import os
import json
from gcloud import storage

# Name of the destination bucket; 'YOUR_BUCKET_NAME' is a placeholder to replace
BUCKET_NAME = 'YOUR_BUCKET_NAME'

# Cloud Storage client and the bucket handle that exported files are written to
client = storage.Client()
bucket = client.get_bucket(BUCKET_NAME)

#### Data preparation

In [None]:
# List the Cloud Storage objects under bad_quality/ and redirect the listing into bad.csv.
# NOTE(review): presumably one gs:// URI per output line — confirm before parsing it as CSV.
!gcloud storage ls --recursive gs://datasets-c4ds/lemon-quality/bad_quality/** > bad.csv

In [None]:
# List the Cloud Storage objects under good_quality/ and redirect the listing into good.csv.
!gcloud storage ls --recursive gs://datasets-c4ds/lemon-quality/good_quality/** > good.csv

In [None]:
# List the Cloud Storage objects under empty_background/ and redirect the listing into empty.csv.
!gcloud storage ls --recursive gs://datasets-c4ds/lemon-quality/empty_background/** > empty.csv 

In [None]:
# Load each header-less listing file into its own DataFrame
# (columns get pandas' default integer labels because header=None).
df_bad, df_good, df_empty = (
    pd.read_csv(listing_file, header=None)
    for listing_file in ('bad.csv', 'good.csv', 'empty.csv')
)

In [None]:
# Tag every row of each DataFrame with the class it was listed from
df_bad = df_bad.assign(label='bad')
df_good = df_good.assign(label='good')
df_empty = df_empty.assign(label='empty')

In [None]:
# Stack the three labelled DataFrames into one.
# ignore_index=True renumbers the rows 0..N-1; without it each source frame
# keeps its own 0-based index, so index labels repeat and label-based lookups
# on df_full become ambiguous.
df_full = pd.concat([df_bad, df_good, df_empty], ignore_index=True)
df_full

#### Create CSV input file

In [None]:
# Write the merged table to a local CSV with neither an index column nor a header row
df_full.to_csv('input_file.csv', index=False, header=False)

In [None]:
# Upload the local CSV input file to the bucket
csv_object_name = 'lemon-quality/input_file.csv'  # object name inside the bucket
csv_local_path = 'input_file.csv'                 # local file written by the previous cell
blob = bucket.blob(csv_object_name)
blob.upload_from_filename(csv_local_path)

#### Create JSONL input file

In [None]:
# Pull the two columns of the full DataFrame out as plain Python lists.
# First column (position 0): Cloud Storage addresses
path_list = df_full.iloc[:, 0].tolist()

# Second column (position 1): labels
label_list = df_full.iloc[:, 1].tolist()

In [None]:
# Preview the first three Cloud Storage paths
print("Top 3 rows of path list:")
path_list[:3]

In [None]:
# Preview the first three labels
print("Top 3 rows of label list:")
label_list[:3]

In [None]:
# Pair each path with its label and build one record per image:
# the image URI plus a nested classification annotation holding the label.
input_json = [
    {
        "imageGcsUri": gcs_uri,
        "classificationAnnotation": {"displayName": display_name},
    }
    for gcs_uri, display_name in zip(path_list, label_list)
]

In [None]:
# Write the records locally as JSON Lines: one serialized JSON object per line
with open('input_file.jsonl', 'w') as file:
    file.writelines(json.dumps(record) + '\n' for record in input_json)

In [None]:
# Upload the local JSONL input file to the bucket
jsonl_object_name = 'lemon-quality/input_file.jsonl'  # object name inside the bucket
jsonl_local_path = 'input_file.jsonl'                 # local file written by the previous cell
blob = bucket.blob(jsonl_object_name)
blob.upload_from_filename(jsonl_local_path)