In [1]:
!pip install mlflow
!pip install azure-storage-blob

Collecting mlflow
  Downloading mlflow-2.13.2-py3-none-any.whl.metadata (29 kB)
Collecting cachetools<6,>=5.0.0 (from mlflow)
  Downloading cachetools-5.3.3-py3-none-any.whl.metadata (5.3 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.3-py2.py3-none-any.whl.metadata (7.7 kB)
Collecting querystring-parser<2 (from mlflow)
  Downloading querystring_parser-1.2.4-py2.py3-none-any.whl.metadata (559 bytes)
Collecting gunicorn<23 (from mlflow)
  Downloading gunicorn-22.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting graphql-core<3.3,>=3.1 (from graphene<4->mlflow)
  Downloading graphql_core-3.2.3-py3-none-any.whl.metadata (10 kB)
Collecting graphql-relay<3.3,>=3.1 (from graphene<4->mlflow)
  Downloading graphql_relay-3.2.0-py3-none-any.whl.metadata (12 kB)
Collecting aniso8601<10,>=8 (from graphene<4->mlflow)
  Downloading aniso8601-9.0.1-py2.py3-none-any.whl.metadata (23 kB)
Downloading mlflow-2.13.2-py3-none-any.whl (25.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
%%sh

# Writes a Databricks CLI/MLflow profile to ~/.databrickscfg.
# Credentials are taken from the DATABRICKS_USERNAME / DATABRICKS_PASSWORD
# environment variables of the kernel so they never appear in the notebook
# source; the cell aborts with a message if either one is missing.

# Define the configuration file path
config_file="$HOME/.databrickscfg"

: "${DATABRICKS_USERNAME:?Set DATABRICKS_USERNAME before running this cell}"
: "${DATABRICKS_PASSWORD:?Set DATABRICKS_PASSWORD before running this cell}"

# Make the file private from the moment it is created, not only afterwards
umask 077

# Create the configuration file (printf keeps backslashes in values intact)
{
    printf '%s\n' "[DEFAULT]"
    printf '%s\n' "host = https://community.cloud.databricks.com/"
    printf 'username = %s\n' "$DATABRICKS_USERNAME"
    printf 'password = %s\n' "$DATABRICKS_PASSWORD"
    printf '%s\n' "jobs-api-version = 2.0"
} > "$config_file"

# Set the file permissions to read and write for the user only
chmod 600 "$config_file"

echo "Configuration file created at $config_file"

# Show the result with the password masked so it is not stored in cell output
sed 's/^\(password[[:space:]]*=[[:space:]]*\).*/\1********/' "$config_file"

Configuration file created at /root/.databrickscfg
[DEFAULT]
host = https://community.cloud.databricks.com/
username = ...
password = ...
jobs-api-version = 2.0


In [4]:
import pandas as pd
from azure.storage.blob import BlobServiceClient
import io

import os

# Connection settings are read from the environment so that the SAS token
# (a credential) is never stored in the notebook source or its outputs.
# A missing variable raises KeyError naming the variable that must be set.
account_url = os.environ["AZURE_STORAGE_ACCOUNT_URL"]
sas_token = os.environ["AZURE_STORAGE_SAS_TOKEN"]
container_name = os.environ["AZURE_STORAGE_CONTAINER"]

# Create a BlobServiceClient and a client scoped to the data container
blob_service_client = BlobServiceClient(account_url=account_url, credential=sas_token)
container_client = blob_service_client.get_container_client(container_name)

def read_csv_from_blob(file_name):
    """
    Fetch ``file_name`` from the module-level ``container_client`` and parse it as CSV.

    Returns the resulting pandas DataFrame. If downloading or parsing raises,
    the error is printed and ``None`` is returned instead.
    """
    client = container_client.get_blob_client(blob=file_name)
    try:
        # Pull the whole blob into memory, then let pandas parse the bytes
        payload = client.download_blob().readall()
        frame = pd.read_csv(io.BytesIO(payload))
    except Exception as err:
        print(f"Failed to download or parse {file_name}: {err}")
        return None
    return frame

In [5]:
# read_csv_from_blob("Twitter_Data.csv"), read_csv_from_blob("Reddit_Data.csv")

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from transformers import RobertaTokenizer, RobertaForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
import torch
import torch.nn as nn
import torch.optim as optim
import warnings
# Silence every warning category from here on; note that this also hides
# deprecation notices emitted by the libraries imported above.
warnings.filterwarnings('ignore')

In [6]:
# Load the Twitter sentiment data from blob storage.
data = read_csv_from_blob("Twitter_Data.csv")
if data is None:
    # read_csv_from_blob signals failure by returning None; stop here with a
    # clear message instead of failing below with an AttributeError.
    raise RuntimeError("Twitter_Data.csv could not be loaded from blob storage")

# Drop incomplete rows and repeated texts without mutating the frame in place.
data = data.dropna().drop_duplicates(subset=['clean_text'])

texts = data['clean_text'].values
labels = data['category'].values
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts, labels, test_size=0.2, random_state=42
)

# Fit ONE encoder on the full label set and reuse it for every split.
# Fitting a second encoder on the validation labels would assign different
# integer codes whenever a class is missing from one of the splits, and would
# leave `label_encoder` (used later for the test set) tied to the validation data.
label_encoder = LabelEncoder()
label_encoder.fit(labels)
encoded_labels_train = label_encoder.transform(train_labels)
encoded_labels_valid = label_encoder.transform(val_labels)

# original label -> integer class id, one entry per class
label_mapping = {
    original_label: int_label
    for int_label, original_label in enumerate(label_encoder.classes_)
}

In [7]:
# Tokenizer and classifier are loaded from the same pretrained checkpoint.
PRETRAINED_CHECKPOINT = 'roberta-base'

tokenizer = RobertaTokenizer.from_pretrained(PRETRAINED_CHECKPOINT)
model = RobertaForSequenceClassification.from_pretrained(
    PRETRAINED_CHECKPOINT,
    num_labels=3,
)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
from torch.utils.data import Dataset, DataLoader

class Sentiment(Dataset):
    """Map-style dataset that tokenizes one text per item for classification.

    Args:
        texts: Indexable collection of texts; each item is converted with ``str``.
        labels: Indexable collection of labels aligned with ``texts``; each
            value must be convertible to a ``torch.long`` tensor.
        tokenizer: Callable tokenizer accepting the Hugging Face keyword
            arguments used in ``__getitem__``.
        max_len: Length every example is truncated or padded to.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the raw text, its token ids, attention mask and label tensor."""
        text = str(self.texts[idx])
        label = self.labels[idx]

        # Call the tokenizer directly and request fixed-length padding with
        # padding='max_length' (the supported replacement for the deprecated
        # pad_to_max_length=True flag).
        encoding = self.tokenizer(
            text,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'text': text,
            # return_tensors='pt' yields a leading batch axis; flatten drops it
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# From here on train_labels / val_labels refer to the integer-encoded classes.
train_labels, val_labels = encoded_labels_train, encoded_labels_valid

# Wrap each split in a tokenizing dataset.
train_data = Sentiment(texts=train_texts, labels=train_labels, tokenizer=tokenizer, max_len=128)
val_data = Sentiment(texts=val_texts, labels=val_labels, tokenizer=tokenizer, max_len=128)

# Only the training batches are shuffled; validation keeps dataset order.
train_loader = DataLoader(dataset=train_data, batch_size=32, shuffle=True)
val_loader = DataLoader(dataset=val_data, batch_size=32)

In [9]:
def get_test_scores():
    """Evaluate the global ``model`` on the Reddit dataset.

    Loads ``Reddit_Data.csv`` from blob storage, cleans it the same way as the
    training data, encodes its labels with the global ``label_encoder`` and
    runs the model over it in inference mode.

    Returns:
        tuple: ``(accuracy, report_dict)`` where ``accuracy`` is the
        ``accuracy_score`` over the test set and ``report_dict`` is the
        ``classification_report`` as a dictionary.

    Raises:
        RuntimeError: If the Reddit data could not be loaded.
    """
    reddit = read_csv_from_blob("Reddit_Data.csv")
    if reddit is None:
        # The loader returns None on failure; report that explicitly.
        raise RuntimeError("Reddit_Data.csv could not be loaded from blob storage")

    # Align the column name with the Twitter data, then drop incomplete and
    # duplicated rows. The index is reset so positions run from 0..n-1.
    reddit = (
        reddit
        .rename(columns={'clean_comment': 'clean_text'})
        .dropna()
        .drop_duplicates(subset=['clean_text'])
        .reset_index(drop=True)
    )

    test_enc_labels = label_encoder.transform(reddit['category'])
    test_dataset = Sentiment(reddit['clean_text'].values, test_enc_labels, tokenizer, max_len=128)
    test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

    # Use whatever device the model currently lives on rather than relying on
    # a global `device` that only exists after the training cell has run.
    eval_device = next(model.parameters()).device

    model.eval()

    all_predictions = []
    all_true_labels = []

    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(eval_device)
            attention_mask = batch['attention_mask'].to(eval_device)

            outputs = model(input_ids, attention_mask=attention_mask)
            predictions = torch.argmax(outputs.logits, dim=1)

            all_predictions.extend(predictions.cpu().numpy())
            # Labels are only compared on the host, so they stay on the CPU.
            all_true_labels.extend(batch['labels'].numpy())

    # Map integer class ids back to the original label values for reporting.
    predicted_labels = label_encoder.inverse_transform(all_predictions)
    true_labels = label_encoder.inverse_transform(all_true_labels)

    accuracy = accuracy_score(true_labels, predicted_labels)
    report_dict = classification_report(true_labels, predicted_labels, output_dict=True)

    return accuracy, report_dict

In [14]:
import mlflow
import mlflow.pytorch

# Track the run in the Databricks-hosted MLflow experiment.
mlflow.set_tracking_uri("databricks")
mlflow.set_experiment("/Users/vladyslav.radchenko@outlook.com/MLOPs Tracking")

learning_rate = 1e-5
num_epochs = 1

with mlflow.start_run(run_name="MLOPs Tracking"):
    mlflow.log_param("learning_rate", learning_rate)
    mlflow.log_param("num_epochs", num_epochs)

    # torch.optim.AdamW replaces the deprecated transformers.AdamW. eps and
    # weight_decay are set to that class's defaults (1e-6 and 0.0) because
    # PyTorch's own defaults differ and would change the optimisation.
    optimizer = torch.optim.AdamW(
        model.parameters(), lr=learning_rate, eps=1e-6, weight_decay=0.0
    )
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    for epoch in range(num_epochs):
        # Training loop
        model.train()
        train_loss = 0.0
        for batch in train_loader:
            optimizer.zero_grad()
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
        train_loss /= max(len(train_loader), 1)

        # Validation loop
        model.eval()
        val_loss = 0.0
        correct = 0
        seen = 0
        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)
                outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
                val_loss += outputs.loss.item()

                predictions = torch.argmax(outputs.logits, dim=1)
                # Count hits per example so a short final batch is not
                # over-weighted, as it would be when averaging batch accuracies.
                correct += (predictions == labels).sum().item()
                seen += labels.size(0)

        val_loss /= max(len(val_loader), 1)
        val_accuracy = correct / max(seen, 1)

        # Record the per-epoch metrics so they are visible in the run.
        mlflow.log_metric("train_loss", train_loss, step=epoch)
        mlflow.log_metric("val_loss", val_loss, step=epoch)
        mlflow.log_metric("val_accuracy", val_accuracy, step=epoch)

    mlflow.pytorch.log_model(model, "model")

    # Held-out evaluation on the Reddit data.
    accuracy, report_dict = get_test_scores()
    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_metric("macro_avg_precision", report_dict["macro avg"]["precision"])
    mlflow.log_metric("macro_avg_recall", report_dict["macro avg"]["recall"])
    mlflow.log_metric("macro_avg_f1-score", report_dict["macro avg"]["f1-score"])



Uploading artifacts:   0%|          | 0/10 [00:00<?, ?it/s]