# Access the MentalRiskES data and interact with the server

This notebook has been developed by the [SINAI](https://sinai.ujaen.es/) research group for its usage in the [MentalRiskES](https://sites.google.com/view/mentalriskes/) evaluation campaign at IberLEF 2023.

**NOTE 1**: Please visit the [MentalRiskES competition website](https://sites.google.com/view/mentalriskes/evaluation) to read the instructions about how to download the data and interact with the server to send the predictions of your system.

**NOTE 2**: Throughout the code, please replace "URL" with the server URL and "TOKEN" with your personal token.

Remember that this notebook is only a starting point to help you develop your own system for communicating with our server. We recommend downloading it as a Python script, instead of working directly on Colab, and adapting the code to your needs.

# Install CodeCarbon package

In [None]:
# Mount Google Drive at /content/drive so that files stored there can be read and written from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install codecarbon

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting codecarbon
  Downloading codecarbon-2.2.1-py3-none-any.whl (171 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m171.3/171.3 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting arrow (from codecarbon)
  Downloading arrow-1.2.3-py3-none-any.whl (66 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.4/66.4 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
Collecting pynvml (from codecarbon)
  Downloading pynvml-11.5.0-py3-none-any.whl (53 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m53.1/53.1 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
Collecting fuzzywuzzy (from codecarbon)
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy, pynvml, arrow, codecarbon
Successfully installed arrow-1.2.3 codecarbon-2.2.1 fuzzywuzzy-0.18.0 pynvml-11.5.0


In [None]:
!pip install transformers==4.28.0 accelerate datasets sentence-transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers==4.28.0
  Downloading transformers-4.28.0-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m34.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.19.0-py3-none-any.whl (219 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m219.1/219.1 kB[0m [31m21.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.12.0-py3-none-any.whl (474 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m41.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Coll

# Import libraries

In [None]:
import io
import json
import os
import random
import zipfile
from typing import List, Dict

import numpy as np
import pandas as pd
import requests
from codecarbon import EmissionsTracker
from requests.adapters import HTTPAdapter, Retry

In [None]:
# Change the working directory to the project folder on the mounted Drive; the relative
# paths used later in the notebook (e.g. "modeloD", "./data") are resolved against it.
# NOTE(review): absolute, user-specific path — adjust it to your own Drive layout.
%cd /content/drive/MyDrive/Master/NLP/MentalRISK2023-Task2/Código

/content/drive/MyDrive/Master/NLP/MentalRISK2023-Task2/Código


# Endpoints
These URL addresses are necessary for the connection to the server. 

**IMPORTANT:** Replace "URL" with the server URL and "TOKEN" with your user token.

In [None]:
# Server location and personal token.
# Both values can be supplied through environment variables (MENTALRISK_URL,
# MENTALRISK_TOKEN) so that a real credential never has to be written into the
# notebook. The literals below are only fallbacks that keep the previous behaviour.
# FIXME(security): the fallback token is a credential stored in plain text —
# rotate it and remove the literal once MENTALRISK_TOKEN is provided.
URL = os.environ.get("MENTALRISK_URL", "http://s3-ceatic.ujaen.es:8036/")
TOKEN = os.environ.get("MENTALRISK_TOKEN", "c90775da4a16e3d5ed1cda75fa9d53424dc7ba78")

# The endpoint templates are built by plain concatenation, so URL must end with "/".
if not URL.endswith("/"):
    URL += "/"

# The {TASK}, {SUBTASK}, {TOKEN} and {RUN} placeholders are filled in later with str.format().

# Download endpoints
ENDPOINT_DOWNLOAD_MESSAGES_TRIAL = URL+"{TASK}/download_trial/{TOKEN}"
ENDPOINT_DOWNLOAD_GOLD_TRIAL = URL+"{SUBTASK}/download_trial/{TOKEN}"
ENDPOINT_DOWNLOAD_MESSAGES_TRAIN = URL+"{TASK}/download_train/{TOKEN}"
ENDPOINT_DOWNLOAD_GOLD_TRAIN = URL+"{SUBTASK}/download_train/{TOKEN}"

# Trial endpoints
ENDPOINT_GET_MESSAGES_TRIAL = URL+"{TASK}/getmessages_trial/{TOKEN}"
ENDPOINT_SUBMIT_DECISIONS_TRIAL = URL+"{SUBTASK}/submit_trial/{TOKEN}/{RUN}"

# Test endpoints
ENDPOINT_GET_MESSAGES = URL+"{TASK}/getmessages/{TOKEN}"
ENDPOINT_SUBMIT_DECISIONS = URL+"{SUBTASK}/submit/{TOKEN}/{RUN}"

# Download Data
To download the data, you can use the **functions defined below**.

The following function downloads the trial data. To adapt it to download the train and test data, follow the instructions given on the [website of the competition](https://sites.google.com/view/mentalriskes/evaluation).

In [None]:
def download_messages_trial(task: str, subtasks: List[str], token: str) -> None:
    """Download the trial messages and the trial gold labels into ``./data/<task>/trial/``.

    The messages arrive as a zip archive that is extracted into
    ``./data/<task>/trial/subjects_trial/``. For every entry of ``subtasks`` the
    gold labels are written to ``./data/<task>/trial/gold_trial_<subtask>.txt``.

    A response with a status code other than 200 is reported with ``print`` and
    that download is skipped; the remaining downloads are still attempted.

    Args:
        task: Task identifier used in the messages endpoint and in the output path.
        subtasks: Subtask identifiers whose gold labels are downloaded.
        token: Personal token inserted into every endpoint URL.
    """
    trial_dir = "./data/{task}/trial/".format(task=task)
    subjects_dir = trial_dir + "subjects_trial/"

    response = requests.get(ENDPOINT_DOWNLOAD_MESSAGES_TRIAL.format(TASK=task, TOKEN=token))

    if response.status_code != 200:
        print("Trial - Status Code " + task + ": " + str(response.status_code) + " - Error: " + str(response.text))
    else:
        # exist_ok: running the download a second time must not fail because
        # the folder is already there.
        os.makedirs(subjects_dir, exist_ok=True)
        with zipfile.ZipFile(io.BytesIO(response.content)) as archive:
            archive.extractall(subjects_dir)

    # The gold files are written next to subjects_trial/, so the parent folder
    # has to exist even when the messages download above failed.
    os.makedirs(trial_dir, exist_ok=True)

    for subtask in subtasks:
        response = requests.get(ENDPOINT_DOWNLOAD_GOLD_TRIAL.format(SUBTASK=subtask, TOKEN=token))

        if response.status_code != 200:
            print("Trial - Status Code " + subtask + ": " + str(response.status_code) + " - Error: " + str(response.text))
        else:
            gold_path = "./data/{task}/trial/gold_trial_{subtask}.txt".format(task=task, subtask=subtask)
            # The context manager guarantees the file is flushed and closed.
            with open(gold_path, "w") as file_object:
                file_object.write(response.text)

# Client Server
This class simulates communication with our server. The following code establishes the connection between the client and the server and simulates the GET and POST requests.

**IMPORTANT NOTE:** Please pay attention to the basic functions and remember that it is only a base for your system. 

In [None]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments

In [None]:
# Load the task-D model through the project-local helpers that live in the `modeloD` folder.
# The working directory is switched to that folder for the imports and the load call
# (presumably so that `config`/`task_d` are importable and MODEL_PATH resolves — confirm),
# and is switched back to the parent folder afterwards.
# NOTE(review): load_model presumably deserializes a persisted estimator — only load files you trust.
%cd modeloD
import config as d_conf
import task_d
model_d_ = task_d.load_model(d_conf.MODEL_PATH, d_conf.SERIALIZATION_METHOD)
%cd ..

/content/drive/MyDrive/Master/NLP/MentalRISK2023-Task2/Código/modeloD
/content/drive/MyDrive/Master/NLP/MentalRISK2023-Task2/Código


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [None]:
class ModelD:
    """Wrapper around a model exposing ``predict`` that rescales its output row by row."""

    def __init__(self, model):
        # Wrapped model; only its `predict(messages)` method is used.
        self.model = model

    def scale_predict(self, y):
        """Divide every row of ``y`` by the sum of that row."""
        row_totals = y.sum(axis=1).reshape(-1, 1)
        return y / row_totals

    def predict(self, messages: list):
        """Run the wrapped model on ``messages`` and return the row-scaled predictions."""
        raw_scores = self.model.predict(messages)
        return self.scale_predict(raw_scores)

In [None]:
# Scratch cell: take the first 20 records of a local test file and prepare them with the
# same steps used inside Client_task2.run_task2 (preprocess -> list of texts -> tokenize).
# NOTE: `tokenize` is defined in a later cell of this notebook; that cell must be run first.
messages = task_d.load_data('modeloD/server_test.json').to_dict(orient='records')[:20]
messages_df = task_d.preprocess_test_data(pd.DataFrame(messages))
# From here on `messages` holds the message texts (strings), no longer the raw records.
messages = messages_df['message'].tolist()
tokenized_messages = [tokenize(m) for m in messages]

In [None]:
# Load the task-B sequence-classification checkpoint "modeloB" and wrap it in a Trainer
# so that `predict` can be called on the tokenized messages.
# NOTE(review): `eval_dataset` receives the plain message strings while `predict` is given
# `tokenized_messages` — confirm that the eval_dataset argument is actually needed.
model = AutoModelForSequenceClassification.from_pretrained("modeloB")
model_b = Trainer(
  model=model,
  args=TrainingArguments("./loaded_outputs_b"),
  eval_dataset=messages
)
b_pred = model_b.predict(tokenized_messages)

In [None]:
# Print every message on its own line, prefixed with "-" and its index, for visual inspection.
print(''.join(f'\n-{i} {m}' for i,m in enumerate(messages)))


-0 Hoy es un dia hermoso , y somos personas hermosas , hagamos lo que nos gusta , no dejemos que un segundo feo nos arruine el dia , hay que vivir chicoos , siempre sonriendo
-1 Y pues la anclo para quien sepa responder | O una serie de preguntas mejor dicho
-2 Yo creo que si solo te lo puede sacar el porque tiene tu pasaporte , se lo digas , un dia se lo tendras que decir . Mas tardes mas se alarga tu espera x irte
-3 Forma parte de vivir todos o bueno almenos una parte desea morirse en la vida nosotros la vivimos mas dura pero es una razon por la cual seguir fuerte cada dia
-4 no estudie , lo hacia por gusto antes de meterme en la depresión
-5 Ni granadino ni malagueño
-6 Kawaii se fuerte , orgulleselo con triunfos , logros
-7 Buen día mi nombre es Guillermo , tengo 48 años el próximo 12 de diciembre cumpliré 49 años , hace 10 años decidí dejar de consumir sustancias ilícitas , antes sentía que no valía nada y mi vida era una porquería , lo había dejado todo porque quería vivir anes

In [None]:
# Scratch cell: call the task-B model directly (without the Trainer) on a single tensor
# built from the `input_ids` of every tokenized message.
# NOTE(review): only input_ids are passed, no attention_mask — confirm this is intended
# given that the messages are padded to a fixed length.
import torch
out = model(torch.tensor([a['input_ids'] for a in tokenized_messages]))

In [141]:
from transformers import AutoTokenizer
from pprint import pprint

# Tokenizer used by `tokenize` below.
# NOTE(review): the tokenizer checkpoint should match the one "modeloB" was trained with — confirm.
model_name="bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Length (in tokens) that every message is padded or truncated to.
MAX_LENGTH = 512

def tokenize(example):
    """Encode ``example`` with the module-level tokenizer, padded/truncated to MAX_LENGTH tokens."""
    encoded = tokenizer(
        example,
        padding="max_length",
        truncation=True,
        max_length=MAX_LENGTH,
    )
    return encoded
    
class Client_task2:
    """Client for the task 2 loop: fetch a round of messages, predict, submit, repeat.

    Two models produce the four sets of decisions:
      * the task-B model yields one score per message (clipped to [0, 1]); the
        same scores rounded to 0/1 are used as the task-A decisions;
      * the task-D model yields one score per label; the highest-scoring label
        is used as the task-C decision.
    """

    def __init__(self, task: str, subtasks: List[str], token: str, number_of_runs: int, tracker: EmissionsTracker):
        # Task in which you participate
        self.task = task
        # Subtasks in which you participate
        self.subtasks = subtasks
        # Token identifier
        self.token = token
        # Number of runs (Max: 3)
        self.number_of_runs = number_of_runs
        # Object to calculate CO2 emissions
        self.tracker = tracker
        # Columns of emissions.csv that are sent to the server with each submission
        self.relevant_cols = ['duration', 'emissions', 'cpu_energy', 'gpu_energy', 'ram_energy',
            'energy_consumed', 'cpu_count', 'gpu_count', 'cpu_model', 'gpu_model', 'ram_total_size']

    @staticmethod
    def _retrying_session(retries: int, backoff: float):
        """Return a requests.Session that retries on 500/502/503/504 responses."""
        session = requests.Session()
        retry_policy = Retry(
            total=retries,
            backoff_factor=backoff,
            status_forcelist=[500, 502, 503, 504]
        )
        adapter = HTTPAdapter(max_retries=retry_policy)
        # The adapter has to be mounted for the scheme the server URL really uses.
        # The configured URL is plain http://, so mounting only https:// would
        # leave every request on the default adapter, which never retries.
        session.mount('http://', adapter)
        session.mount('https://', adapter)
        # NOTE(review): urllib3's default retry policy only re-sends idempotent
        # methods, so the POST in submit_decission is presumably not retried on
        # a 5xx status — confirm before relying on it.
        return session

    def _last_emissions(self) -> Dict:
        """Return the relevant columns of the last row of emissions.csv as a dict."""
        df = pd.read_csv("emissions.csv")
        return df.iloc[-1][self.relevant_cols].to_dict()

    # Here a GET request is sent to the server to extract the data.
    def get_messages(self, retries: int, backoff: float) -> List[Dict]:
        """Fetch the messages of the current round.

        Returns the decoded JSON body on HTTP 200. On any other status the code
        and error text are printed and an empty list is returned.
        """
        session = self._retrying_session(retries, backoff)
        response = session.get(ENDPOINT_GET_MESSAGES.format(TASK=self.task, TOKEN=self.token))
        if response.status_code != 200:
            print("GET - Status Code " + self.task + ": " + str(response.status_code) + " - Error: " + str(response.text))
            return []
        return json.loads(response.content)

    # The POST requests are sent to the server to send predictions and carbon emission data
    def submit_decission(self, subtask: int, decisions: Dict, emissions:Dict, retries, backoff):
        """Send the decisions and emissions of one subtask, once per run.

        Args:
            subtask: Index into ``self.subtasks`` selecting the endpoint.
            decisions: Predictions to submit.
            emissions: Emission measurements submitted with the predictions.
            retries: Total number of retries for the HTTP session.
            backoff: Backoff factor between retries.

        The same payload is posted for every run in ``range(number_of_runs)``;
        a non-200 response is printed and the next run is still attempted.
        """
        # The payload is serialised to a JSON string and posted wrapped in a
        # one-element list.
        payload = json.dumps({
            "predictions": decisions,
            "emissions": emissions
        })
        session = self._retrying_session(retries, backoff)

        for run in range(0, self.number_of_runs):
            response = session.post(ENDPOINT_SUBMIT_DECISIONS.format(SUBTASK=self.subtasks[subtask], TOKEN=self.token, RUN=run), json=[payload])
            if response.status_code != 200:
                print("POST - Status Code " + self.task + ": " + str(response.status_code) + " - Error: " + str(response.text))
            else:
                print("Subtask {}: - run {}".format(self.subtasks[subtask], run))

    # Main thread
    def run_task2(self, retries: int, backoff: float):
        """Process rounds until the server returns no more messages.

        For each round: dump the raw messages to ./data/rounds_trial/, predict
        tasks B and A (tracking their emissions together), predict tasks D and C
        (tracking their emissions together), submit the four sets of decisions
        and request the next round.
        """
        # Get messages for the first round
        messages = self.get_messages(retries, backoff)
        # If there are no messages
        if len(messages) == 0:
            print("All rounds processed")
            return

        # Task B model; its rounded scores are also used as the task A decisions
        model = AutoModelForSequenceClassification.from_pretrained("modeloB")
        model_b = Trainer(
            model=model,
            args=TrainingArguments("./loaded_outputs_b"),
            eval_dataset=messages
        )

        # Task D model; its highest-scoring label is also used as the task C decision
        model_d = ModelD(model_d_)

        # The per-round dump below fails if its target folder does not exist yet.
        os.makedirs('./data/rounds_trial', exist_ok=True)

        while len(messages) > 0:
            print("------------------- Processing round {}".format(messages[0]["round"]))
            # Save subjects
            with open('./data/rounds_trial/round{}.json'.format(messages[0]["round"]), 'w+', encoding='utf8') as json_file:
                json.dump(messages, json_file, ensure_ascii=False)

            pprint(messages)

            # Turn the raw records into the message texts and their tokenized form
            messages_df = task_d.preprocess_test_data(pd.DataFrame(messages))
            texts = messages_df['message'].tolist()
            tokenized_messages = [tokenize(m) for m in texts]
            print("First messages:",texts[:2])

            # task b
            self.tracker.start()
            print("predicting task B...")
            b_pred_raw = model_b.predict(tokenized_messages)
            b_pred = np.clip(b_pred_raw.predictions, 0, 1)
            print(b_pred)
            b_pred_df = pd.DataFrame(b_pred, columns=['pred_b'])
            # Attach the user nick to each prediction and keep the first row per nick
            b_pred_df = pd.concat([messages_df[['nick']], b_pred_df], axis=1)\
                .groupby('nick')\
                .first()
            decisionsB = b_pred_df['pred_b'].apply(lambda x: np.round(x, 5)).to_dict()
            print("Decisions B:")
            pprint(decisionsB)

            # task a
            decisionsA = b_pred_df['pred_b'].round().astype(int).to_dict()
            print("Decisions A:")
            pprint(decisionsA)

            print("Emmisions of A and B:")
            self.tracker.stop()
            measurementsAB = self._last_emissions()
            pprint(measurementsAB)

            # task d
            print("predicting task D...")
            self.tracker.start()
            d_pred = model_d.predict(texts)
            d_pred_df = pd.DataFrame(d_pred, columns=task_d.conf.LABELS)
            d_pred_df = pd.concat([messages_df[['nick']], d_pred_df], axis=1)\
                .groupby('nick')\
                .first()
            decisionsD = d_pred_df[task_d.conf.LABELS].round(5).to_dict(orient='index')
            print("Decisions D:")
            pprint(decisionsD)

            # task c: position of the best score per row, mapped back to its column label
            print("Decisions C:")
            decisionsC = d_pred_df[task_d.conf.LABELS]\
                .apply(np.argmax, axis=1)\
                .replace(dict(enumerate(d_pred_df.columns)))\
                .to_dict()
            pprint(decisionsC)

            print("Emmisions of C and D:")
            self.tracker.stop()
            measurementsCD = self._last_emissions()
            pprint(measurementsCD)

            self.submit_decission(0, decisionsA, measurementsAB, retries, backoff) # task2a
            self.submit_decission(1, decisionsB, measurementsAB, retries, backoff) # task2b
            self.submit_decission(2, decisionsC, measurementsCD, retries, backoff) # task2c
            self.submit_decission(3, decisionsD, measurementsCD, retries, backoff) # task2d

            # Only one GET request for each round
            messages = self.get_messages(retries, backoff)

        print("All rounds processed")

In [118]:
# Re-create the task-B model and its Trainer at module level; this is the same
# construction that Client_task2.run_task2 performs internally.
model = AutoModelForSequenceClassification.from_pretrained("modeloB")
model_b = Trainer(
  model=model,
  args=TrainingArguments("./loaded_outputs_b"),
  eval_dataset=messages
)

[codecarbon INFO @ 10:19:09] [setup] RAM Tracking...
[codecarbon INFO @ 10:19:09] [setup] GPU Tracking...
[codecarbon INFO @ 10:19:09] Tracking Nvidia GPU via pynvml
[codecarbon INFO @ 10:19:09] [setup] CPU Tracking...
[codecarbon INFO @ 10:19:12] CPU Model on constant consumption mode: Intel(R) Xeon(R) CPU @ 2.30GHz
[codecarbon INFO @ 10:19:12] >>> Tracker's metadata:
[codecarbon INFO @ 10:19:12]   Platform system: Linux-5.15.107+-x86_64-with-glibc2.31
[codecarbon INFO @ 10:19:12]   Python version: 3.10.11
[codecarbon INFO @ 10:19:12]   CodeCarbon version: 2.2.1
[codecarbon INFO @ 10:19:12]   Available RAM : 12.678 GB
[codecarbon INFO @ 10:19:12]   CPU count: 2
[codecarbon INFO @ 10:19:12]   CPU model: Intel(R) Xeon(R) CPU @ 2.30GHz
[codecarbon INFO @ 10:19:12]   GPU count: 1
[codecarbon INFO @ 10:19:12]   GPU model: 1 x Tesla T4


In [125]:
# Show the last row of emissions.csv as a dict (all columns, not only the ones submitted).
pd.read_csv("emissions.csv").iloc[-1].to_dict()

{'timestamp': '2023-05-23T10:19:46',
 'project_name': 'codecarbon',
 'run_id': '485169cb-887a-44fc-80cc-23f78534aca1',
 'duration': 22.07438945770264,
 'emissions': 6.627390819588935e-06,
 'emissions_rate': 3.002298583291677e-07,
 'cpu_power': 42.5,
 'gpu_power': 72.23,
 'ram_power': 1.9307284355163572,
 'cpu_energy': 8.580601049794092e-06,
 'gpu_energy': 1.4256761204534108e-05,
 'ram_energy': 3.756116278293575e-07,
 'energy_consumed': 2.321297388215756e-05,
 'country_name': 'United States',
 'country_iso_code': 'USA',
 'region': 'south carolina',
 'cloud_provider': nan,
 'cloud_region': nan,
 'os': 'Linux-5.15.107+-x86_64-with-glibc2.31',
 'python_version': '3.10.11',
 'codecarbon_version': '2.2.1',
 'cpu_count': 2,
 'cpu_model': 'Intel(R) Xeon(R) CPU @ 2.30GHz',
 'gpu_count': 1.0,
 'gpu_model': '1 x Tesla T4',
 'longitude': -79.9746,
 'latitude': 32.8608,
 'ram_total_size': 12.67839813232422,
 'tracking_mode': 'process',
 'on_cloud': 'N'}

In [None]:
# Scratch cell: rebuild the four decision dicts with the same expressions used in
# Client_task2.run_task2 and print them.
# NOTE(review): in the visible notebook `b_pred_df` and `d_pred_df` are only assigned
# inside run_task2, so this cell needs them to exist at module level — confirm where
# they come from before relying on it.
from pprint import pprint
# Task B: scores rounded to 5 decimals; task A: the same scores rounded to integers.
decisionsB =  b_pred_df['pred_b'].apply(lambda x: np.round(x, 5)).to_dict()
decisionsA = b_pred_df['pred_b'].round().astype(int).to_dict()

# Task D: one dict of label scores per index entry.
decisionsD = d_pred_df[task_d.conf.LABELS].round(5).to_dict(orient='index')

# Task C: position of the highest score per row, mapped back to the column label.
decisionsC = d_pred_df[task_d.conf.LABELS]\
                .apply(np.argmax, axis=1)\
                .replace(dict(enumerate(d_pred_df.columns)))\
                .to_dict()
print("Decisions A")
pprint(decisionsA)
print("Decisions B")
pprint(decisionsB)
print("Decisions C")
pprint(decisionsC)
print("Decisions D")
pprint(decisionsD)

# Main

Please replace the symbol 'X' with the desired task. For example, for task 1 it would be: task1, task1a and task1b.

In [114]:
def download_data():
    """Download the trial messages of task2 and the gold labels of its four subtasks."""
    task2_subtasks = ["task2a", "task2b", "task2c", "task2d"]
    download_messages_trial("task2", task2_subtasks, TOKEN)

def get_post_data():
    """Create the emissions tracker and run the task2 prediction/submission loop."""
    # Emissions tracker: results are saved to a file in the current directory
    tracker = EmissionsTracker(
        save_to_file=True,
        log_level="DEBUG",
        tracking_mode="process",
        output_dir=".",
    )

    # Max: 3
    number_runs = 3
    task2_subtasks = ["task2a", "task2b", "task2c", "task2d"]

    # Prediction period: 5 retries with a backoff factor of 0.1
    client_task2 = Client_task2("task2", task2_subtasks, TOKEN, number_runs, tracker)
    client_task2.run_task2(5, 0.1)

In [None]:
# Entry point: run the prediction/submission loop.
# Uncomment download_data() to fetch the trial data first.
if __name__ == '__main__':
    # download_data()
    get_post_data()