In [None]:
import json
import pandas as pd
import itertools
from google.cloud import storage

# Connect to Cloud Storage and fetch the bucket that receives the exported files
BUCKET_NAME = 'c4ds-europe-west4'
client = storage.Client()
bucket = client.get_bucket(BUCKET_NAME)

#### Create text classification input file

Source: https://www.kaggle.com/datasets/saurabhshahane/ecommerce-text-classification

In [None]:
# Read data: column names are supplied explicitly and no row is treated as a header
df = pd.read_csv('data/ecommerceDataset.csv', names=['label', 'text'], header=None)

# Drop rows with a missing label or text: read_csv loads empty fields as NaN,
# and json.dump would later write them as the bare token NaN, which is not valid JSON.
# Then drop duplicates (reassigning instead of mutating in place keeps the cell chainable)
df = df.dropna(subset=['label', 'text']).drop_duplicates()

df.head()

In [None]:
# Pull each column of the DataFrame out into its own plain Python list
# Labels come from the first column ('label')
label_list = df['label'].tolist()

# Texts come from the second column ('text')
text_list = df['text'].tolist()

In [None]:
# Pair every label with its text and build one record per pair,
# laid out according to the JSONL input file requirements
input_json = [
    {
        "classificationAnnotation": {"displayName": category},
        "textContent": description,
    }
    for category, description in zip(label_list, text_list)
]

In [None]:
# Write the records to a local JSON Lines file: one JSON object per line
with open('input_file_text_classification.jsonl', 'w') as jsonl_file:
    jsonl_file.writelines(json.dumps(record) + '\n' for record in input_json)

In [None]:
# Upload the local JSONL file to the bucket under the 'text/' prefix
blob = bucket.blob(blob_name='text/input_file_text_classification.jsonl')
blob.upload_from_filename(filename='input_file_text_classification.jsonl')

#### Create sentiment analysis input file

Source: https://www.kaggle.com/datasets/cosmos98/twitter-and-reddit-sentimental-analysis-dataset

In [None]:
# Read data
df = pd.read_csv('data/Reddit_Data.csv')

# Drop rows where either of the first two columns is missing; later cells read
# them as the text and the sentiment. read_csv loads empty fields as NaN, and
# json.dump would write them as the bare token NaN, which is not valid JSON.
# Then drop duplicates (reassigning instead of mutating in place keeps the cell chainable)
df = df.dropna(subset=list(df.columns[:2])).drop_duplicates()

df.head()

In [None]:
# A sentiment value must be an integer from 0 to 10, so shift the values in the
# category column from [-1,0,1] to [0,1,2]
# NOTE: values that are not keys of the mapping become NaN, so running this cell a
# second time on already shifted data turns every 2 into NaN - reload df before re-running
category_to_sentiment = {-1: 0, 0: 1, 1: 2}
df['category'] = df['category'].map(category_to_sentiment)

In [None]:
# Pull the first two columns of the DataFrame out into plain Python lists
# 1st column is used as the text
text_list = df.iloc[:, 0].tolist()

# 2nd column is used as the sentiment value
sentiment_list = df.iloc[:, 1].tolist()

In [None]:
# Pair every sentiment value with its text and build one record per pair,
# laid out according to the JSONL input file requirements
# sentimentMax is 2 because the sentiment values were mapped onto [0,1,2]
input_json = [
    {
        "sentimentAnnotation": {"sentiment": score, "sentimentMax": 2},
        "textContent": comment,
    }
    for score, comment in zip(sentiment_list, text_list)
]

In [None]:
# Write the records to a local JSON Lines file: one JSON object per line
with open('input_file_sentiment_analysis.jsonl', 'w') as jsonl_file:
    jsonl_file.writelines(json.dumps(record) + '\n' for record in input_json)

In [None]:
# Upload the local JSONL file to the bucket under the 'text/' prefix
blob = bucket.blob(blob_name='text/input_file_sentiment_analysis.jsonl')
blob.upload_from_filename(filename='input_file_sentiment_analysis.jsonl')