In [25]:
import json
import pandas as pd
from google.cloud import storage

# Cloud Storage client plus the bucket handle that the export cells write to.
BUCKET_NAME = 'c4ds-europe-west4'
client = storage.Client()
bucket = client.get_bucket(BUCKET_NAME)

#### Create text classification input file

Source: https://www.kaggle.com/datasets/saurabhshahane/ecommerce-text-classification

In [26]:
# Load the e-commerce dataset. The file is read without a header row, so the
# two columns are named explicitly.
ECOMMERCE_COLUMNS = ['label', 'text']
df = pd.read_csv('ecommerceDataset.csv', header=None, names=ECOMMERCE_COLUMNS)
df.head()

Unnamed: 0,label,text
0,Household,Paper Plane Design Framed Wall Hanging Motivat...
1,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ..."
2,Household,SAF 'UV Textured Modern Art Print Framed' Pain...
3,Household,"SAF Flower Print Framed Painting (Synthetic, 1..."
4,Household,Incredible Gifts India Wooden Happy Birthday U...


In [27]:
# (rows, columns) of the loaded frame.
df.shape

(50425, 2)

In [29]:
# Preview of the frame with exact duplicate rows removed.
# NOTE(review): the de-duplicated frame is only displayed, not assigned, so
# `df` itself is left unchanged by this cell — confirm whether duplicates
# are meant to be dropped before the export.
df.drop_duplicates().head()

Unnamed: 0,label,text
0,Household,Paper Plane Design Framed Wall Hanging Motivat...
1,Household,"SAF 'Floral' Framed Painting (Wood, 30 inch x ..."
2,Household,SAF 'UV Textured Modern Art Print Framed' Pain...
3,Household,"SAF Flower Print Framed Painting (Synthetic, 1..."
4,Household,Incredible Gifts India Wooden Happy Birthday U...


In [3]:
# Pull both columns out of the DataFrame as plain Python lists. The lists keep
# the row order, so element i of one belongs to element i of the other.
label_list = df['label'].tolist()   # category name per row
text_list = df['text'].tolist()     # product text per row

In [4]:
# Preview the first three labels. The heading now names the list that is
# actually displayed (it previously said "path list").
print("Top 3 rows of label list:")
label_list[:3]

Top 3 rows of path list:


['Household', 'Household', 'Household']

In [5]:
# Preview the first three texts. The heading now names the list that is
# actually displayed (it previously said "label list").
print("Top 3 rows of text list:")
text_list[:3]

Top 3 rows of label list:


['Paper Plane Design Framed Wall Hanging Motivational Office Decor Art Prints (8.7 X 8.7 inch) - Set of 4 Painting made up in synthetic frame with uv textured print which gives multi effects and attracts towards it. This is an special series of paintings which makes your wall very beautiful and gives a royal touch. This painting is ready to hang, you would be proud to possess this unique painting that is a niche apart. We use only the most modern and efficient printing technology on our prints, with only the and inks and precision epson, roland and hp printers. This innovative hd printing technique results in durable and spectacular looking prints of the highest that last a lifetime. We print solely with top-notch 100% inks, to achieve brilliant and true colours. Due to their high level of uv resistance, our prints retain their beautiful colours for many years. Add colour and style to your living space with this digitally printed painting. Some are for pleasure and some for eternal bli

In [6]:
# Build one record per row for the text-classification input file: the label
# goes under classificationAnnotation.displayName, the text under textContent.
#
# Rows whose text is not a string are skipped. pandas loads an empty CSV field
# as float NaN, and json.dump would write that as the bare token NaN, which is
# not valid JSON.
# NOTE(review): assumes such rows should be dropped rather than exported —
# confirm; if the data has no missing text this filter changes nothing.
input_json = [
    {"classificationAnnotation": {"displayName": label}, "textContent": text}
    for label, text in zip(label_list, text_list)
    if isinstance(text, str)
]

In [7]:
# Write the records to a local JSON Lines file: one JSON object per line.
with open('input_file_text_classification.jsonl', 'w') as jsonl_file:
    jsonl_file.writelines(json.dumps(record) + '\n' for record in input_json)

In [14]:
# Upload the locally written JSONL file to the bucket.
local_path = 'input_file_text_classification.jsonl'
gcs_object_name = 'text/input_file_text_classification.jsonl'
blob = bucket.blob(gcs_object_name)
blob.upload_from_filename(local_path)

#### Create sentiment analysis input file

Source: https://www.kaggle.com/datasets/cosmos98/twitter-and-reddit-sentimental-analysis-dataset

In [None]:
# Reference only — schema template with placeholders, not executable Python
# (`number` is not a defined name, so running it as code raises NameError).
# Layout of one line of a sentiment-analysis input file: the first form
# carries the text inline, the second points to a text file by its GCS URI.
#
# {
#   "sentimentAnnotation": {
#     "sentiment": number,
#     "sentimentMax": number
#   },
#   "textContent": "inline_text",
#   "dataItemResourceLabels": {
#     "aiplatform.googleapis.com/ml_use": "training|test|validation"
#   }
# }
# {
#   "sentimentAnnotation": {
#     "sentiment": number,
#     "sentimentMax": number
#   },
#   "textGcsUri": "gcs_uri_to_file",
#   "dataItemResourceLabels": {
#     "aiplatform.googleapis.com/ml_use": "training|test|validation"
#   }
# }

In [16]:
# Load the Reddit comments dataset. This rebinds `df`, so from here on the
# name no longer refers to the e-commerce frame.
df = pd.read_csv('Reddit_Data.csv')
df.head()

Unnamed: 0,clean_comment,category
0,family mormon have never tried explain them t...,1
1,buddhism has very much lot compatible with chr...,1
2,seriously don say thing first all they won get...,-1
3,what you have learned yours and only yours wha...,0
4,for your own benefit you may want read living ...,1


In [17]:
# Map values in category column from [-1,0,1] to [0,1,2] - A sentiment value must be an integer from 0 to 10
# NOTE(review): the column is overwritten in place, so this cell is not safe
# to run twice — on a second run the value 2 has no key in the mapping and
# Series.map turns it into NaN. Re-load the CSV before re-running.
df['category'] = df['category'].map({-1: 0, 0: 1, 1: 2})

Unnamed: 0,clean_comment,category
0,family mormon have never tried explain them t...,2
1,buddhism has very much lot compatible with chr...,2
2,seriously don say thing first all they won get...,0
3,what you have learned yours and only yours wha...,1
4,for your own benefit you may want read living ...,2


In [19]:
# Split the DataFrame into two parallel lists that keep the row order.
# (The earlier comments had the two descriptions swapped: the first column
# is the comment text, the second the sentiment value.)

# 1st list contains the comment text
text_list = df['clean_comment'].tolist()

# 2nd list contains the re-mapped sentiment value
sentiment_list = df['category'].tolist()

In [20]:
# Show the first three comment texts under a heading.
print("Top 3 rows of text list:")
text_list[:3]

Top 3 rows of text list:


[' family mormon have never tried explain them they still stare puzzled from time time like some kind strange creature nonetheless they have come admire for the patience calmness equanimity acceptance and compassion have developed all the things buddhism teaches ',
 'buddhism has very much lot compatible with christianity especially considering that sin and suffering are almost the same thing suffering caused wanting things shouldn want going about getting things the wrong way christian this would mean wanting things that don coincide with god will and wanting things that coincide but without the aid jesus buddhism could also seen proof god all mighty will and omnipotence certainly christians are lucky have one such christ there side but what about everyone else well many christians believe god grace salvation and buddhism god way showing grace upon others would also help study the things jesus said and see how buddha has made similar claims such rich man getting into heaven joke basic

In [21]:
# Show the first three sentiment values under a heading.
print("Top 3 rows of sentiment list:")
sentiment_list[:3]

Top 3 rows of sentiment list:


[2, 2, 0]

In [None]:
# Reference only — schema template with placeholders, not executable Python
# (`number` is not a defined name, so running it as code raises NameError).
# Layout of one line of a sentiment-analysis input file: the first form
# carries the text inline, the second points to a text file by its GCS URI.
#
# {
#   "sentimentAnnotation": {
#     "sentiment": number,
#     "sentimentMax": number
#   },
#   "textContent": "inline_text",
#   "dataItemResourceLabels": {
#     "aiplatform.googleapis.com/ml_use": "training|test|validation"
#   }
# }
# {
#   "sentimentAnnotation": {
#     "sentiment": number,
#     "sentimentMax": number
#   },
#   "textGcsUri": "gcs_uri_to_file",
#   "dataItemResourceLabels": {
#     "aiplatform.googleapis.com/ml_use": "training|test|validation"
#   }
# }

In [22]:
# Build one record per row for the sentiment-analysis input file: the
# sentiment value and its maximum go under sentimentAnnotation, the comment
# text under textContent.
#
# Rows whose text is not a string are skipped. pandas loads an empty CSV field
# as float NaN, and json.dump would write that as the bare token NaN, which is
# not valid JSON.
# NOTE(review): assumes such rows should be dropped rather than exported —
# confirm; if the data has no missing text this filter changes nothing.
SENTIMENT_MAX = 2  # highest value produced by the [-1,0,1] -> [0,1,2] re-mapping
input_json = [
    {
        "sentimentAnnotation": {"sentiment": sentiment, "sentimentMax": SENTIMENT_MAX},
        "textContent": text,
    }
    for sentiment, text in zip(sentiment_list, text_list)
    if isinstance(text, str)
]

In [23]:
# Write the records to a local JSON Lines file: one JSON object per line.
with open('input_file_sentiment_analysis.jsonl', 'w') as jsonl_file:
    jsonl_file.writelines(json.dumps(record) + '\n' for record in input_json)

In [24]:
# Upload the locally written JSONL file to the bucket.
local_path = 'input_file_sentiment_analysis.jsonl'
gcs_object_name = 'text/input_file_sentiment_analysis.jsonl'
blob = bucket.blob(gcs_object_name)
blob.upload_from_filename(local_path)