<a href="https://colab.research.google.com/github/sagacemente/PAN_2023_crypto/blob/main/Crypto_influencers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import

In [1]:
!pip install simpletransformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting simpletransformers
  Downloading simpletransformers-0.63.9-py3-none-any.whl (250 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m250.5/250.5 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.11.0-py3-none-any.whl (468 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m468.7/468.7 kB[0m [31m19.8 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m64.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sentencepiece
  Downloading sentencepiece-0.1.98-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m29.4 MB/s[0m eta [36m0:0

In [2]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import losses
import os
import pandas as pd
import json
import sklearn
from sklearn.preprocessing import OneHotEncoder
from urllib import request
from simpletransformers.classification import ClassificationModel, ClassificationArgs

In [3]:
def _fetch_module(url):
  """Download a single-file Python module into the working directory.

  The file is stored under the last path component of ``url`` so that it can
  be imported by name immediately afterwards.

  Args:
    url: Address of the raw ``.py`` file.

  Returns:
    The local file name the module was written to.
  """
  import importlib

  file_name = url.split('/')[-1]
  print(f'Fetching {url}')
  # Copy the payload as raw bytes: no decode/encode round trip, so the result
  # does not depend on the platform's default text encoding or newline rules.
  with request.urlopen(url) as response, open(file_name, 'wb') as out_file:
    out_file.write(response.read())
  # The file was created after the interpreter started; make sure the import
  # system notices it.
  importlib.invalidate_caches()
  return file_name


# Import class Vectorizer
module_url = "https://raw.githubusercontent.com/marco-siino/DA-BT/main/code/vectorizer.py"
module_name = _fetch_module(module_url)
from vectorizer import Vectorizer

# Import class Simulator
module_url = "https://raw.githubusercontent.com/marco-siino/DA-BT/main/code/simulator.py"
module_name = _fetch_module(module_url)
from simulator import Simulator

Fetching https://raw.githubusercontent.com/marco-siino/DA-BT/main/code/vectorizer.py
Fetching https://raw.githubusercontent.com/marco-siino/DA-BT/main/code/simulator.py
Fetching https://raw.githubusercontent.com/marco-siino/DA-BT/main/code/vectorizer.py


# Dataset class

In [4]:
class Dataset:
    """Training data of one subtask of the PAN 2023 cryptocurrency-influencer archive.

    The pipeline is: download and extract the archive (``fetch_ds_files``),
    load labels and tweets into pandas DataFrames (``organize_ds_folders``),
    then expose a train/test split as ``tf.data`` datasets
    (``generate_keras_ds``). ``build_ds`` runs the three steps in order.

    Attributes populated by the pipeline:
        id_label_dict: user id -> class name, from ``train_truth.json``.
        id_texts_dict: user id -> all tweets of the user joined by ``<NEWTW>``.
        df_texts: one ``text`` column indexed by user id.
        df_labels: one ``label`` column (integer category codes) indexed by user id.
        df: ``df_texts`` and ``df_labels`` concatenated column-wise.
        ds, train_set, test_set: ``tf.data`` datasets built from the frames.
    """

    # Folder created in the working directory when the archive is extracted.
    _ROOT_DIR = 'pan23-profiling-cryptocurrency-influencers'

    def __init__(self, url:str, n_subtask:int):
        """Store the archive location and select the subtask folder.

        Args:
            url: Address of the zipped dataset.
            n_subtask: Subtask number; the data is read from ``subtask<n_subtask>``.
        """
        self.url = url
        self.subtask = 'subtask' + str(n_subtask)

    def fetch_ds_files(self):
        """Download the archive and extract it into the current working directory."""
        tf.keras.utils.get_file(self._ROOT_DIR + '.zip', self.url,
                                extract=True, archive_format='zip', cache_dir='.',
                                cache_subdir='')

    def _read_json_lines(self, file_name):
        """Return the parsed records of a JSON-lines file of the selected subtask."""
        path = os.path.join(os.getcwd(), self._ROOT_DIR, self.subtask, file_name)
        # The context manager closes the handle; blank lines are skipped so a
        # trailing newline does not break json.loads.
        with open(path, "r", encoding="utf-8") as handle:
            return [json.loads(line) for line in handle if line.strip()]

    def organize_ds_folders(self):
        """Load labels and texts of the subtask into dictionaries and DataFrames."""
        self.id_label_dict = {
            record['twitter user id']: record['class']
            for record in self._read_json_lines('train_truth.json')
        }

        # Every user contributes several tweets; they are merged into a single
        # string with an explicit separator token.
        self.id_texts_dict = {
            record['twitter user id']: '<NEWTW>'.join(tweet['text'] for tweet in record['texts'])
            for record in self._read_json_lines('train_text.json')
        }

        self.df_texts = pd.DataFrame.from_dict(self.id_texts_dict, orient='index', columns=['text'])
        self.df_labels = pd.DataFrame.from_dict(self.id_label_dict, orient='index', columns=['label'])
        # Class names become integer codes (alphabetical category order).
        self.df_labels['label'] = pd.Categorical(self.df_labels['label']).codes

        # Texts and labels side by side, aligned on the user id index.
        self.df = pd.concat([self.df_texts, self.df_labels], axis=1)

    def generate_keras_ds(self, batch_size, left_size=0.8):
        """Create ``self.ds`` and split it into ``self.train_set`` / ``self.test_set``.

        Args:
            batch_size: Currently unused; the datasets are left unbatched.
            left_size: Passed to ``tf.keras.utils.split_dataset`` as the size of
                the training part.
        """
        self.ds = tf.data.Dataset.from_tensor_slices((self.df_texts, self.df_labels))
        # NOTE(review): the split is taken in file order and only afterwards is
        # each part shuffled (once, with a fixed seed).
        self.train_set, self.test_set = tf.keras.utils.split_dataset(self.ds, left_size=left_size)
        self.train_set = self.train_set.shuffle(len(self.train_set), seed=1, reshuffle_each_iteration=False)
        self.test_set = self.test_set.shuffle(len(self.test_set), seed=1, reshuffle_each_iteration=False)

    def build_ds(self, batch_size, left_size=0.8):
        """Run the whole pipeline: fetch, load into DataFrames, build tf.data splits."""
        self.fetch_ds_files()
        self.organize_ds_folders()
        self.generate_keras_ds(batch_size, left_size)

In [5]:
# Location of the zipped training data inside the project repository.
url = 'https://github.com/sagacemente/PAN_2023_crypto/raw/main/pan23-profiling-cryptocurrency-influencers.zip'

# Download, load and split the data of subtask 1.
ds = Dataset(url=url, n_subtask=1)
ds.build_ds(batch_size=1)

Downloading data from https://github.com/sagacemente/PAN_2023_crypto/raw/main/pan23-profiling-cryptocurrency-influencers.zip


In [None]:
# Build the Vectorizer helper from the training split only.
vct_layer_obj = Vectorizer(ds.train_set)

# One more than the number of entries in the layer's vocabulary.
max_features = 1 + len(
    vct_layer_obj.vectorize_layer.get_vocabulary()
)

Length of the longest sample is: 378

Vocabulary size is: 6801


# Models

## CNN

In [None]:
nr_runs = 2
embedding_dim = 100
nr_epochs = 4

# The labels are integer class codes (see Dataset.organize_ds_folders), so the
# sparse variant of the accuracy metric is the one that matches the targets.
METRICS = [
  tf.keras.metrics.SparseCategoricalAccuracy(name='acc')
  ]

# NOTE(review): ds.train_set / ds.test_set are built without .batch(); confirm
# that the element shapes are what model.fit expects.
for run in range(1,(nr_runs+1)):
  # Validation accuracy after each epoch of the current run.
  epochs_accuracy = []
  model = tf.keras.Sequential([
                                  tf.keras.Input(shape=(1,), dtype=tf.string),
                                  vct_layer_obj.vectorize_layer,
                                  layers.Embedding(max_features + 1, embedding_dim),
                                  layers.Dropout(0.8),

                                  layers.Conv1D(256,16,activation='relu'),
                                  layers.MaxPooling1D(),
                                  layers.Dropout(0.6),

                                  layers.Dense(512,activation='relu'),

                                  layers.GlobalAveragePooling1D(),
                                  layers.Dropout(0.2),
                                  # No activation: the model outputs one raw logit per class.
                                  layers.Dense(5)
  ])
  # from_logits=True because the last layer has no softmax; the string
  # shortcut 'sparse_categorical_crossentropy' would treat the logits as
  # probabilities.
  model.compile(loss=losses.SparseCategoricalCrossentropy(from_logits=True),
                optimizer='RMSprop',
                metrics=METRICS)

  # Train one epoch at a time so the validation accuracy can be reported
  # after every epoch.
  for epoch in range (0,nr_epochs):
      history = model.fit(
        ds.train_set,
        validation_data = ds.test_set,
        epochs=1,
        shuffle=False,
        verbose=1
        )
      # 'val_acc' is the validation value of the metric named 'acc' above.
      accuracy = history.history['val_acc']
      print("Run: ",run,"/ Accuracy on test set at epoch ",epoch," is: ", accuracy[0],"\n")
      epochs_accuracy.append(accuracy[0])

  print("Accuracies over epochs:",epochs_accuracy,"\n")

InvalidArgumentError: ignored

## Simple Transformers

In [6]:
def f1(y_true, y_pred):
  """Binary F1 score of boolean predictions against boolean ground truth.

  Args:
    y_true: Sequence of booleans, True where the sample belongs to the class.
    y_pred: Sequence of booleans of the same length, True where the class
      was predicted.

  Returns:
    The harmonic mean of precision and recall, or 0.0 when there is no true
    positive. That includes the cases where precision or recall would be
    0/0 (nothing predicted positive, or no positive sample at all), which
    would otherwise produce NaN.
  """
  actual = np.asarray(y_true, dtype=bool)
  predicted = np.asarray(y_pred, dtype=bool)

  true_pos = int(np.count_nonzero(actual & predicted))
  if true_pos == 0:
    # Precision and/or recall is zero or undefined; by convention F1 is 0.
    return 0.0

  false_pos = int(np.count_nonzero(~actual & predicted))
  false_neg = int(np.count_nonzero(actual & ~predicted))
  precision = true_pos / (true_pos + false_pos)
  recall = true_pos / (true_pos + false_neg)
  return (2 * precision * recall) / (precision + recall)

def f1_macro(y_true, y_pred):
  """Unweighted mean of the one-vs-rest F1 scores.

  One score is computed for each distinct value found in ``y_true``: both
  sequences are turned into boolean "is this class" masks and handed to
  ``f1``. Values that occur only in ``y_pred`` do not get a score of their own.
  """
  per_class_scores = [
    f1([cls==label for label in y_true],
       [cls==label for label in y_pred])
    for cls in np.unique(y_true)
  ]
  return np.mean(per_class_scores)

# Metric handed to the Simple Transformers train/eval calls.
metric = f1_macro

In [7]:
# Hold out the last 16 rows of ds.df for evaluation and train on the rest.
# Both frames get the column names "text" and "labels".
eval_df = ds.df.iloc[-16:]
train_df = ds.df.iloc[:-16]
train_df.columns = ["text", "labels"]
eval_df.columns = ["text", "labels"]

# Optional model configuration
model_args = ClassificationArgs(
    num_train_epochs=20,
    overwrite_output_dir=True,
    manual_seed=4,
    use_multiprocessing=True,
    train_batch_size=16,
    eval_batch_size=2,
)

# Create a ClassificationModel: BERT (cased, base size) with a 5-class head
model = ClassificationModel(
    'bert',
    'bert-base-cased',
    num_labels=5,
    args=model_args,
    use_cuda=True,
)

# Train the model
model.train_model(
    train_df,
    show_running_loss=True,
    acc=metric,  # alternative: sklearn.metrics.f1_score
)

# Evaluate the model on the held-out rows; `metric` is reported under the key 'acc'
result, model_outputs, wrong_predictions = model.eval_model(eval_df, acc=metric)
print('result\n', result)
# Make predictions with the model
#predictions, raw_outputs = model.predict(["Sam was a Wizard"])

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

  0%|          | 0/144 [00:00<?, ?it/s]

Epoch:   0%|          | 0/20 [00:00<?, ?it/s]

Running Epoch 0 of 20:   0%|          | 0/9 [00:00<?, ?it/s]

Running Epoch 1 of 20:   0%|          | 0/9 [00:00<?, ?it/s]

Running Epoch 2 of 20:   0%|          | 0/9 [00:00<?, ?it/s]

Running Epoch 3 of 20:   0%|          | 0/9 [00:00<?, ?it/s]

Running Epoch 4 of 20:   0%|          | 0/9 [00:00<?, ?it/s]

Running Epoch 5 of 20:   0%|          | 0/9 [00:00<?, ?it/s]

Running Epoch 6 of 20:   0%|          | 0/9 [00:00<?, ?it/s]

Running Epoch 7 of 20:   0%|          | 0/9 [00:00<?, ?it/s]

Running Epoch 8 of 20:   0%|          | 0/9 [00:00<?, ?it/s]

Running Epoch 9 of 20:   0%|          | 0/9 [00:00<?, ?it/s]

Running Epoch 10 of 20:   0%|          | 0/9 [00:00<?, ?it/s]

Running Epoch 11 of 20:   0%|          | 0/9 [00:00<?, ?it/s]

Running Epoch 12 of 20:   0%|          | 0/9 [00:00<?, ?it/s]

Running Epoch 13 of 20:   0%|          | 0/9 [00:00<?, ?it/s]

Running Epoch 14 of 20:   0%|          | 0/9 [00:00<?, ?it/s]

Running Epoch 15 of 20:   0%|          | 0/9 [00:00<?, ?it/s]

Running Epoch 16 of 20:   0%|          | 0/9 [00:00<?, ?it/s]

Running Epoch 17 of 20:   0%|          | 0/9 [00:00<?, ?it/s]

Running Epoch 18 of 20:   0%|          | 0/9 [00:00<?, ?it/s]

Running Epoch 19 of 20:   0%|          | 0/9 [00:00<?, ?it/s]

  0%|          | 0/16 [00:00<?, ?it/s]

Running Evaluation:   0%|          | 0/8 [00:00<?, ?it/s]

result
 {'mcc': 0.6326860007533812, 'acc': 0.6599999999999999, 'eval_loss': 1.207991600036621}


In [9]:
# Size of the third value returned by model.eval_model above
# (presumably the misclassified evaluation samples; confirm against the library docs).
len(wrong_predictions)

5

In [10]:
# Display every entry of wrong_predictions.
# NOTE(review): this prints the full tweet text of each entry; consider showing a slice before sharing.
wrong_predictions

[{'guid': 0, 'text_a': "@JoshBobrowsky I just checked the floor one and it’s not close. Minted for 2.75 ETH at $3,160 is $8,698 mint and currently it’s $22,470 that’s over 150%, $APE claim is $10,753 and lowest mutant otherdeed is $2,600 so $8.7K mint to currently $34K (backing out APE for deed mint), 300%<NEWTW>RT @VentureCoinist: If you add up all the value that @yugalabs has airdropped to BAYC holders, it makes minting an ape one of the greatest…<NEWTW>Regulatory risk is skyrocketing in crypto given $LUNA $UST blow up of ~5% of the market, can find wild swings to trade but this is going to take a while to sort out<NEWTW>Been spending my NFT time during this slow period in the $SOL ecosystem learning it and the community. Volume still smaller $ wise but trend and adoption is still up and to the right. $ETH lumpy based on whale activity. $SOL overall better experience for NFTs imo. https://t.co/N56v8AkZHr<NEWTW>$ETH losing longer term support here. Seems like it has a good bit still t

# Old code

In [None]:
# nsubtask = '1'
# subtask = 'subtask' + nsubtask
# subtask

# train_set_archive = tf.keras.utils.get_file('pan23-profiling-cryptocurrency-influencers.zip',url,
#                                     extract=True, archive_format='zip',cache_dir='.',
#                                     cache_subdir='')

# train_truth_file_path = os.getcwd() + '/pan23-profiling-cryptocurrency-influencers/' + subtask + '/train_truth.json'
# f = open(train_truth_file_path, "r")
# id_label_dict = {}
# labels = []
# for line in f:
#     line = json.loads(line)
#     label = line['class']
#     user_id = line['twitter user id']
#     id_label_dict[user_id] = label
# print('id_label_dict',id_label_dict)

# train_texts_path = os.getcwd() + '/pan23-profiling-cryptocurrency-influencers/'  + subtask  + '/train_text.json'
# f = open(train_texts_path, "r")
# id_texts_dict = {}
# for line in f:
#     line = json.loads(line)
    
#     texts = line['texts']
#     texts = [i['text'] for i in texts]
#     texts = '<NEWTW>'.join(texts)
#     user_id = line['twitter user id']
    
#     id_texts_dict[user_id] = texts
# print('id_texts_dict', id_texts_dict)

# df_texts = pd.DataFrame.from_dict(id_texts_dict, orient='index', columns=['text'])
# df_labels = pd.DataFrame.from_dict(id_label_dict, orient='index', columns=['label'])
# df_labels['label'] = pd.Categorical(df_labels['label']).codes
# X = df_labels['label'].values.reshape(-1, 1)
# enc = OneHotEncoder().fit(X)
# X = enc.transform(X).toarray()
# df_labels['label'] = X 
# df = pd.concat([df_texts, df_labels], axis=1)
# ds = tf.data.Dataset.from_tensor_slices((df_texts, df_labels))
# train_ds, test_ds = tf.keras.utils.split_dataset(ds, left_size=0.8)
# for row in train_ds.take(3):
#   print(row)
# for row in test_ds.take(3):
#   print(row)