<a href="https://colab.research.google.com/github/sagacemente/PAN_2023_crypto/blob/main/Crypto_influencers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [72]:
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import losses
import os
import pandas as pd
import json
from sklearn.preprocessing import OneHotEncoder
from urllib import request

!pip install simpletransformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [69]:
# Download the helper modules (Vectorizer, Simulator) next to the notebook and import them.
# NOTE(review): this executes code fetched from the `main` branch of an external
# repository; consider pinning the URLs to a specific commit.
def _fetch_module(module_url, timeout=30):
    """Download a Python source file into the working directory.

    Args:
        module_url: URL of the raw ``.py`` file; the last path segment is
            used as the local file name.
        timeout: seconds to wait for the server before giving up.

    Returns:
        The local file name the module was saved under.
    """
    import importlib

    module_name = module_url.split('/')[-1]
    print(f'Fetching {module_url}')
    # Copy the bytes verbatim: no decode/encode round trip, so the saved file
    # does not depend on the platform's default text encoding or newline rules.
    with request.urlopen(module_url, timeout=timeout) as f, open(module_name, 'wb') as outf:
        outf.write(f.read())
    # The file was created after the interpreter started; make sure the import
    # system notices it.
    importlib.invalidate_caches()
    return module_name


# Import class Vectorizer
module_url = "https://raw.githubusercontent.com/marco-siino/DA-BT/main/code/vectorizer.py"
module_name = _fetch_module(module_url)
from vectorizer import Vectorizer

# Import class Simulator
module_url = "https://raw.githubusercontent.com/marco-siino/DA-BT/main/code/simulator.py"
module_name = _fetch_module(module_url)
from simulator import Simulator

Fetching https://raw.githubusercontent.com/marco-siino/DA-BT/main/code/vectorizer.py
Fetching https://raw.githubusercontent.com/marco-siino/DA-BT/main/code/simulator.py
Fetching https://raw.githubusercontent.com/marco-siino/DA-BT/main/code/vectorizer.py


In [95]:
class Dataset:
    """Loader for one subtask of the PAN 2023 cryptocurrency-influencer corpus.

    The corpus archive is downloaded and extracted into the current working
    directory. For the chosen subtask, two JSON-lines files are read:

    * ``train_truth.json`` - one object per line with the keys
      ``'twitter user id'`` and ``'class'``;
    * ``train_text.json`` - one object per line with the keys
      ``'twitter user id'`` and ``'texts'`` (a list of objects that each
      carry a ``'text'`` key).

    Attributes set by ``organize_ds_folders``:
        id_label_dict: user id -> class name.
        id_texts_dict: user id -> all texts of the user joined by ``<NEWTW>``.
        df_texts: DataFrame indexed by user id with a ``text`` column.
        df_labels: DataFrame with a ``label`` column (class name), row-aligned
            with ``df_texts``.
        class_names: sorted list of the distinct class names.
        labels_onehot: float DataFrame, one column per entry of
            ``class_names``, row-aligned with ``df_texts``.
        df: ``df_texts`` and ``df_labels`` side by side.

    Attributes set by ``generate_keras_ds``:
        ds: unbatched ``tf.data.Dataset`` of ``(text, one-hot label)`` pairs.
        train_set, test_set: shuffled and batched splits of ``ds``.
    """

    # Folder expected in the working directory once the archive is extracted.
    ARCHIVE_DIR = 'pan23-profiling-cryptocurrency-influencers'
    # Marker inserted between consecutive texts of the same user.
    TWEET_SEPARATOR = '<NEWTW>'

    def __init__(self, url:str, n_subtask:int):
        """Remember where the archive lives and which subtask folder to read.

        Args:
            url: download URL of the corpus zip archive.
            n_subtask: subtask number; the folder read is ``subtask<n_subtask>``.
        """
        self.url = url
        self.subtask = 'subtask' + str(n_subtask)

    def fetch_ds_files(self):
        """Download the zip archive and extract it into the working directory."""
        self.archive_path = tf.keras.utils.get_file('pan23-profiling-cryptocurrency-influencers.zip',self.url,
                                            extract=True, archive_format='zip',cache_dir='.',
                                            cache_subdir='')

    def _subtask_file(self, file_name):
        """Return the path of ``file_name`` inside the extracted subtask folder."""
        return os.path.join(os.getcwd(), self.ARCHIVE_DIR, self.subtask, file_name)

    @staticmethod
    def _read_json_lines(path):
        """Yield one parsed JSON object per non-blank line of ``path``."""
        # The context manager closes the file; UTF-8 is set explicitly so that
        # reading does not depend on the platform default encoding.
        with open(path, 'r', encoding='utf-8') as f:
            for line in f:
                if line.strip():
                    yield json.loads(line)

    def organize_ds_folders(self):
        """Read labels and texts of the subtask and build the aligned tables.

        Raises:
            ValueError: if some user has texts but no label.
        """
        self.id_label_dict = {
            record['twitter user id']: record['class']
            for record in self._read_json_lines(self._subtask_file('train_truth.json'))
        }
        self.id_texts_dict = {
            record['twitter user id']: self.TWEET_SEPARATOR.join(
                item['text'] for item in record['texts'])
            for record in self._read_json_lines(self._subtask_file('train_text.json'))
        }

        self.df_texts = pd.DataFrame.from_dict(self.id_texts_dict, orient='index', columns=['text'])
        df_labels = pd.DataFrame.from_dict(self.id_label_dict, orient='index', columns=['label'])

        # Texts and labels come from two files whose line order is not
        # guaranteed to match; align labels on user id instead of on position.
        unlabeled = self.df_texts.index.difference(df_labels.index)
        if len(unlabeled) > 0:
            raise ValueError(
                'No label found for %d user id(s), e.g. %s'
                % (len(unlabeled), list(unlabeled[:5])))
        self.df_labels = df_labels.reindex(self.df_texts.index)

        # One float column per class, columns in sorted class-name order.
        # Kept in its own frame: a (n_samples, n_classes) matrix does not fit
        # into the single 'label' column.
        self.labels_onehot = pd.get_dummies(self.df_labels['label'], dtype=float)
        self.class_names = list(self.labels_onehot.columns)

        #Dataframe texts and label
        self.df = pd.concat([self.df_texts, self.df_labels], axis=1)

    def generate_keras_ds(self, batch_size, left_size=0.8):
        """Build the shuffled, batched train/test ``tf.data`` datasets.

        Args:
            batch_size: number of samples per batch in both splits.
            left_size: share of the samples that goes to the training split
                (passed to ``tf.keras.utils.split_dataset``).
        """
        texts = self.df_texts['text'].tolist()
        labels = self.labels_onehot.to_numpy(dtype='float32')
        self.ds = tf.data.Dataset.from_tensor_slices((texts, labels))
        self.train_set, self.test_set = tf.keras.utils.split_dataset(self.ds, left_size=left_size)
        # Shuffle once with a fixed seed (same order on every epoch), then
        # batch so that elements have a leading batch dimension.
        # NOTE(review): the external Vectorizer consumes train_set; confirm it
        # accepts batched (text, label) elements.
        self.train_set = self.train_set.shuffle(
            len(self.train_set), seed=1, reshuffle_each_iteration=False).batch(batch_size)
        self.test_set = self.test_set.shuffle(
            len(self.test_set), seed=1, reshuffle_each_iteration=False).batch(batch_size)

    def build_ds(self,batch_size, left_size=0.8):
        """Download, parse and convert the corpus in one call."""
        self.fetch_ds_files()
        self.organize_ds_folders()
        self.generate_keras_ds(batch_size, left_size)

In [96]:
# Location of the zipped PAN 2023 corpus, then download + parse subtask 1
# with a batch size of 1.
url = (
    'https://github.com/sagacemente/PAN_2023_crypto/raw/main/'
    'pan23-profiling-cryptocurrency-influencers.zip'
)
ds = Dataset(url=url, n_subtask=1)
ds.build_ds(batch_size=1)

In [97]:
# Build the vectorization layer from the training split; max_features is the
# size of the layer's vocabulary plus one.
vct_layer_obj = Vectorizer(ds.train_set)
vocabulary = vct_layer_obj.vectorize_layer.get_vocabulary()
max_features = len(vocabulary) + 1

Length of the longest sample is: 378

Vocabulary size is: 6801


In [98]:
# Training configuration.
nr_runs = 2          # independent trainings, each starting from a fresh model
embedding_dim = 100  # size of the learned token embeddings
nr_epochs = 4        # epochs per run
nr_classes = 5       # width of the output layer (one logit per class)

METRICS = [
  tf.keras.metrics.CategoricalAccuracy(name='acc')
  ]

for run in range(1,(nr_runs+1)):
  # Validation accuracy after each epoch of this run.
  epochs_accuracy = []
  model = tf.keras.Sequential([
                                  tf.keras.Input(shape=(1,), dtype=tf.string),
                                  vct_layer_obj.vectorize_layer,
                                  layers.Embedding(max_features + 1, embedding_dim),
                                  layers.Dropout(0.8),

                                  layers.Conv1D(256,16,activation='relu'),
                                  layers.MaxPooling1D(),
                                  layers.Dropout(0.6),

                                  layers.Dense(512,activation='relu'),

                                  layers.GlobalAveragePooling1D(),
                                  layers.Dropout(0.2),
                                  # No activation: the model outputs raw logits.
                                  layers.Dense(nr_classes)
  ])
  # Labels are one-hot vectors (and the metric is CategoricalAccuracy), so the
  # non-sparse categorical loss is required; from_logits=True matches the
  # activation-free output layer.
  model.compile(loss=losses.CategoricalCrossentropy(from_logits=True),
                optimizer='RMSprop', metrics=METRICS)

  # One fit() call per epoch so the validation accuracy can be logged as
  # training progresses.
  for epoch in range (0,nr_epochs):
      history = model.fit(
        ds.train_set,
        validation_data = ds.test_set,
        epochs=1,
        shuffle=False,
        verbose=1
        # Comment the following line to do not save and download the model.
        #callbacks=[callbacks]
        )
      # The metric is named 'acc', hence the 'val_acc' history key.
      accuracy = history.history['val_acc']
      print("Run: ",run,"/ Accuracy on test set at epoch ",epoch," is: ", accuracy[0],"\n")
      epochs_accuracy.append(accuracy[0])

  print("Accuracies over epochs:",epochs_accuracy,"\n")

InvalidArgumentError: ignored

In [58]:
# nsubtask = '1'
# subtask = 'subtask' + nsubtask
# subtask

# train_set_archive = tf.keras.utils.get_file('pan23-profiling-cryptocurrency-influencers.zip',url,
#                                     extract=True, archive_format='zip',cache_dir='.',
#                                     cache_subdir='')

# train_truth_file_path = os.getcwd() + '/pan23-profiling-cryptocurrency-influencers/' + subtask + '/train_truth.json'
# f = open(train_truth_file_path, "r")
# id_label_dict = {}
# labels = []
# for line in f:
#     line = json.loads(line)
#     label = line['class']
#     user_id = line['twitter user id']
#     id_label_dict[user_id] = label
# print('id_label_dict',id_label_dict)

# train_texts_path = os.getcwd() + '/pan23-profiling-cryptocurrency-influencers/'  + subtask  + '/train_text.json'
# f = open(train_texts_path, "r")
# id_texts_dict = {}
# for line in f:
#     line = json.loads(line)
    
#     texts = line['texts']
#     texts = [i['text'] for i in texts]
#     texts = '<NEWTW>'.join(texts)
#     user_id = line['twitter user id']
    
#     id_texts_dict[user_id] = texts
# print('id_texts_dict', id_texts_dict)

# df_texts = pd.DataFrame.from_dict(id_texts_dict, orient='index', columns=['text'])
# df_labels = pd.DataFrame.from_dict(id_label_dict, orient='index', columns=['label'])
# df_labels['label'] = pd.Categorical(df_labels['label']).codes
# X = df_labels['label'].values.reshape(-1, 1)
# enc = OneHotEncoder().fit(X)
# X = enc.transform(X).toarray()
# df_labels['label'] = X 
# df = pd.concat([df_texts, df_labels], axis=1)
# ds = tf.data.Dataset.from_tensor_slices((df_texts, df_labels))
# train_ds, test_ds = tf.keras.utils.split_dataset(ds, left_size=0.8)
# for row in train_ds.take(3):
#   print(row)
# for row in test_ds.take(3):
#   print(row)