Base Model Outline Objective



Train a basic text-to-text model using a pre-trained Hugging Face transformer (e.g., T5-small) on tokenized data to establish a baseline for evaluation.

We start by installing and importing all the libraries required for this task.

In [None]:
!pip install -r SIGROPM1/model/sigropm/requirements.txt
!pip install -U sagemaker
!pip install boto3 awscli --upgrade


In [None]:
# Core SageMaker libraries
import sagemaker
from sagemaker import get_execution_role
from sagemaker.session import Session

# For model training and deployment
from sagemaker.huggingface import HuggingFace
from sagemaker.pytorch import PyTorch
from sagemaker.inputs import TrainingInput

# For data preprocessing and handling
import boto3  # AWS SDK for Python
import pandas as pd
import numpy as np

# For managing S3 bucket and files
from sagemaker.s3 import S3Uploader, S3Downloader


Next, we load our data. Because the dataset cannot be uploaded to GitHub, we upload it to S3 and read it from there in this and subsequent projects.

In [None]:

# Create the S3 bucket that holds the raw dataset (safe to re-run).
region = "us-west-1"
bucket_name = "squad-training-data"  # Use a valid bucket name

# Pin the client to the bucket's region: a LocationConstraint that does not
# match the regional endpoint the request is sent to is rejected by S3.
s3 = boto3.client('s3', region_name=region)

try:
    s3.create_bucket(
        Bucket=bucket_name,
        CreateBucketConfiguration={'LocationConstraint': region},
    )
except s3.exceptions.BucketAlreadyOwnedByYou:
    # Re-running the cell: the bucket is already in this account.
    print(f"Bucket '{bucket_name}' already exists.")
except s3.exceptions.BucketAlreadyExists:
    # Bucket names are global; this one belongs to a different account, so
    # later uploads to it would fail.
    print(
        f"Bucket name '{bucket_name}' is already taken by another AWS account; "
        "choose a different name."
    )
except Exception as e:
    print(f"Error creating bucket: {e}")
else:
    print(f"Bucket '{bucket_name}' created successfully.")




In [None]:
# Push the local JSONL dataset to the "squad-training-data" bucket.
s3_data_key = "datasets/training_data.jsonl"  # Path in S3
local_data_path = "/home/sagemaker-user/SIGROPM1/data/expanded_training_data.jsonl"

try:
    s3.upload_file(
        Filename=local_data_path,
        Bucket="squad-training-data",
        Key=s3_data_key,
    )
except Exception as e:
    # Report the failure but let the notebook keep running.
    print(f"Error uploading dataset: {e}")
else:
    print(f"Dataset uploaded to s3://squad-training-data/{s3_data_key}")


In [None]:
!pip install s3fs


In [None]:
import pandas as pd
import s3fs

# S3 location of the dataset uploaded earlier in the notebook.
s3_file_path = "s3://squad-training-data/datasets/training_data.jsonl"

# The file holds one JSON record per line, hence lines=True.
df = pd.read_json(s3_file_path, lines=True)

# Show the first five rows as a quick sanity check.
print(df.head(n=5))


In [None]:
import sagemaker
from sagemaker import get_execution_role

# Look up the execution role once; the estimator cell later passes it as `role`.
sagemaker_role = get_execution_role()
print(f"SageMaker Role: {sagemaker_role}")


In [None]:

import pandas as pd
from datasets import Dataset

# Read the line-delimited JSON records from S3 into a DataFrame and preview them.
df = pd.read_json(s3_file_path, lines=True)
print("Data Sample:", df.head(n=5), sep="\n")

# Wrap the DataFrame in a Hugging Face Dataset and show its summary.
dataset = Dataset.from_pandas(df)
print("Dataset Preview:", dataset, sep="\n")

Next, we proceed to tokenization and data splitting.

In [None]:
import torch
from transformers import T5ForConditionalGeneration

# Use the GPU when PyTorch can see one, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Fetch the pre-trained t5-small checkpoint and place it on the chosen device.
# The final expression is left bare so the notebook displays the model.
model = T5ForConditionalGeneration.from_pretrained("t5-small")
model.to(device)


In [None]:
import torch

# Report which PyTorch build is available in this kernel.
print(f"Pre-installed PyTorch version: {torch.__version__}")


In [None]:
# Absolute path of the training script.
# NOTE(review): the estimator cell in this notebook passes
# entry_point="train.py" together with source_dir instead of this variable,
# so it appears unused there — confirm whether it is still needed.
entry_point = "/home/sagemaker-user/SIGROPM1/model/sigropm/train.py"


In [None]:
from sklearn.model_selection import train_test_split
import json


def _read_jsonl(path):
    """Return the JSON records stored one per line in *path*, skipping blank lines."""
    # Explicit UTF-8 avoids depending on the platform's default encoding;
    # blank/trailing empty lines would otherwise crash json.loads.
    with open(path, "r", encoding="utf-8") as file:
        return [json.loads(line) for line in file if line.strip()]


def _write_jsonl(path, records):
    """Write *records* to *path* as one JSON document per line."""
    with open(path, "w", encoding="utf-8") as file:
        file.writelines(json.dumps(entry) + "\n" for entry in records)


# Load the dataset
data_path = "/home/sagemaker-user/SIGROPM1/data/expanded_training_data.jsonl"
data = _read_jsonl(data_path)

# Split the data into training and validation sets (80/20, fixed seed so the
# split is reproducible across runs)
train_data, validation_data = train_test_split(data, test_size=0.2, random_state=42)

# Save the splits locally
train_data_path = "/home/sagemaker-user/SIGROPM1/data/train_data.jsonl"
validation_data_path = "/home/sagemaker-user/SIGROPM1/data/validation_data.jsonl"

_write_jsonl(train_data_path, train_data)
_write_jsonl(validation_data_path, validation_data)

print(f"Train data saved to {train_data_path}")
print(f"Validation data saved to {validation_data_path}")

In [None]:

# Define the S3 bucket name
bucket_name = "s3-sigrom-model-data-bucket"

# Initialize the S3 client, pinned to the region used for the
# LocationConstraint below (`region` is set in the first bucket-creation cell).
s3_client = boto3.client("s3", region_name=region)

# Create the bucket with the client defined in this cell (safe to re-run).
try:
    s3_client.create_bucket(
        Bucket=bucket_name,
        CreateBucketConfiguration={'LocationConstraint': region},
    )
except s3_client.exceptions.BucketAlreadyOwnedByYou:
    # Re-running the cell: the bucket is already in this account.
    print(f"Bucket '{bucket_name}' already exists.")
except s3_client.exceptions.BucketAlreadyExists:
    # Bucket names are global; this one belongs to a different account, so
    # later uploads to it would fail.
    print(
        f"Bucket name '{bucket_name}' is already taken by another AWS account; "
        "choose a different name."
    )
except Exception as e:
    print(f"Error creating bucket: {e}")
else:
    print(f"Bucket '{bucket_name}' created successfully.")



In [None]:

# Object keys for the two splits inside the bucket. These must be defined
# before the uploads below; without them the cell fails with a NameError.
train_s3_path = "datasets/train_data.jsonl"
validation_s3_path = "datasets/validation_data.jsonl"

# Upload the train dataset to S3
s3_client.upload_file(
    Filename=train_data_path,  # Local path to the train data
    Bucket=bucket_name,        # Name of your S3 bucket
    Key=train_s3_path,         # Path in the S3 bucket
)

# Upload the validation dataset to S3
s3_client.upload_file(
    Filename=validation_data_path,  # Local path to the validation data
    Bucket=bucket_name,             # Name of your S3 bucket
    Key=validation_s3_path,         # Path in the S3 bucket
)

# Generate S3 URIs
train_s3_uri = f"s3://{bucket_name}/{train_s3_path}"
validation_s3_uri = f"s3://{bucket_name}/{validation_s3_path}"

# Print confirmation
print(f"Train data uploaded to: {train_s3_uri}")
print(f"Validation data uploaded to: {validation_s3_uri}")



In [None]:
from sagemaker.inputs import TrainingInput
from sagemaker.pytorch import PyTorch

# Training channel: JSON Lines data read from S3.
# NOTE(review): this rebinds train_s3_uri to the full dataset in the
# "squad-training-data" bucket rather than the train split uploaded in the
# previous cell — confirm that this is intended.
train_s3_uri = "s3://squad-training-data/datasets/training_data.jsonl"
train_input = TrainingInput(
    s3_data=train_s3_uri,
    content_type="application/jsonlines",
)

# PyTorch estimator that runs train.py from the project source directory on a
# single instance.
estimator = PyTorch(
    role=sagemaker_role,
    # Script location: directory containing train.py and requirements.txt
    source_dir="/home/sagemaker-user/SIGROPM1/model/sigropm",
    entry_point="train.py",
    # Ensure requirements.txt is included
    dependencies=["/home/sagemaker-user/SIGROPM1/model/sigropm/requirements.txt"],
    # Container and hardware selection
    framework_version="1.12.0",
    py_version="py38",
    instance_type="ml.t3.xlarge",
    instance_count=1,
    # Values handed to the training script
    hyperparameters={"epochs": 5, "batch_size": 16},
)




In [None]:
# Launch the training job, exposing the S3 dataset to the script as the
# "train" channel.
estimator.fit(inputs={"train": train_input})
