# Step 1: Preparing your dataset

In [1]:
import sagemaker
import boto3
import pandas as pd
import numpy as np

In [2]:
# Create the AWS / SageMaker handles used by the rest of the notebook.
sess = boto3.Session()
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()  # execution role ARN (printed below)

# S3 destination: the session's default bucket, plus the key prefix under
# which the dataset is uploaded later in this notebook.
bucket = sagemaker_session.default_bucket()
prefix = "sagemaker_huggingface_workshop"

# Echo the resolved configuration. The bucket line reuses `bucket` rather than
# calling `default_bucket()` a second time, so the printed value is guaranteed
# to be the one the notebook actually uses.
print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {bucket}")
print(f"sagemaker session region: {sagemaker_session.boto_region_name}")

sagemaker role arn: arn:aws:iam::829802005956:role/service-role/AmazonSageMaker-ExecutionRole-12345
sagemaker bucket: sagemaker-us-east-1-829802005956
sagemaker session region: us-east-1


## Preparing the dataset

In [3]:
# Load the raw reviews and keep only the review text and its rating, renamed
# to the `text` / `label` columns used in the rest of the notebook.
raw_reviews = pd.read_csv('./data/Womens Clothing E-Commerce Reviews.csv')

# Building the frame in one chain (instead of assigning into a column slice)
# avoids pandas' SettingWithCopyWarning.
df = (
    raw_reviews[['Review Text', 'Rating']]
    .rename(columns={'Review Text': 'text', 'Rating': 'label'})
    # Shift ratings down by one so labels are zero-based
    # (presumably 1-5 stars -> 0-4 class ids; confirm against the dataset).
    .assign(label=lambda reviews: reviews['label'] - 1)
    .dropna()  # drop rows with a missing review text or rating
)

# Shuffle once with a fixed seed so the split is reproducible, then cut the
# shuffled frame at the 60% and 80% marks: 60% train / 20% validate / 20% test.
# Plain `.iloc` slicing is used instead of `np.split`, because `np.split` on a
# DataFrame goes through `DataFrame.swapaxes`, which pandas has deprecated.
shuffled = df.sample(frac=1, random_state=42)
train_end = int(.6*len(df))
validate_end = int(.8*len(df))

train = shuffled.iloc[:train_end]
validate = shuffled.iloc[train_end:validate_end]
test = shuffled.iloc[validate_end:]

# The three splits must partition the cleaned data exactly.
assert len(train) + len(validate) + len(test) == len(df)

train.shape, validate.shape, test.shape

((13584, 2), (4528, 2), (4529, 2))

In [4]:
# Preview the first ten rows of the shuffled training split.
train.head(n=10)

Unnamed: 0,text,label
13365,This sweater is so beautiful on. it is thick m...,4
19834,This piece is almost what i want... i tried on...,3
18722,Really like this blouse but am returning for a...,3
10635,These are the perfect light weight relaxing su...,4
7348,These look nothing like the picture! they are ...,1
6321,I ordered this shirt and was happy to see that...,4
2494,"On the hanger, this looks like a great lbd. on...",3
6889,Love this cardigan! i've been searching for a ...,4
18809,I was immediately drawn to the ochre yellow co...,4
5446,I loved this jacket! it's exactly what i was l...,4


In [5]:
# Persist each split as a CSV next to the raw data; the DataFrame index is
# not written to the files.
split_outputs = {
    './data/train.csv': train,
    './data/validate.csv': validate,
    './data/test.csv': test,
}
for csv_path, split_frame in split_outputs.items():
    split_frame.to_csv(csv_path, index=False)

In [6]:
# Upload the local `data` directory to S3 under "<prefix>/data" and report
# the resulting S3 URI.
data_key_prefix = f'{prefix}/data'
dataset_path = sagemaker_session.upload_data(
    path='data',
    key_prefix=data_key_prefix,
)
print(f'Dataset location: {dataset_path}')

Dataset location: s3://sagemaker-us-east-1-829802005956/sagemaker_huggingface_workshop/data
