In [1]:
import pandas as pd

In [2]:
df = pd.read_pickle('../../delphes/data/cleaned_tweet_df')

In [3]:
df.head()

Unnamed: 0,mep_id,name,country,group,nat_group,twitter,content,age,sex
0,197490,Magdalena ADAMOWICZ,Poland,Group of the European People's Party (Christia...,Independent,Adamowicz_Magda,thank much free media independent authorities ...,47,1.0
1,197490,Magdalena ADAMOWICZ,Poland,Group of the European People's Party (Christia...,Independent,Adamowicz_Magda,the commission adopted major pilot projects fu...,47,1.0
2,197490,Magdalena ADAMOWICZ,Poland,Group of the European People's Party (Christia...,Independent,Adamowicz_Magda,the commission adopted authored pilot projects...,47,1.0
3,197490,Magdalena ADAMOWICZ,Poland,Group of the European People's Party (Christia...,Independent,Adamowicz_Magda,day point hour asking one empty slogans unders...,47,1.0
4,197490,Magdalena ADAMOWICZ,Poland,Group of the European People's Party (Christia...,Independent,Adamowicz_Magda,god need defended anyone want name used terror...,47,1.0


In [4]:
df_data = df[[ "group", "content"]]

In [5]:
df_data.columns = ["labels", "texts"]

In [6]:
df_data.labels.unique()

array(["Group of the European People's Party (Christian Democrats)",
       'Group of the Progressive Alliance of Socialists and Democrats in the European Parliament',
       'European Conservatives and Reformists Group',
       'Renew Europe Group', 'Non-attached Members',
       'Identity and Democracy Group',
       'Group of the European United Left - Nordic Green Left',
       'Group of the Greens/European Free Alliance'], dtype=object)

In [7]:
df_data.labels.value_counts()

Group of the European People's Party (Christian Democrats)                                  44252
Group of the Progressive Alliance of Socialists and Democrats in the European Parliament    24519
Renew Europe Group                                                                          18192
Group of the Greens/European Free Alliance                                                  14247
Identity and Democracy Group                                                                10086
European Conservatives and Reformists Group                                                  9438
Group of the European United Left - Nordic Green Left                                        8553
Non-attached Members                                                                         5437
Name: labels, dtype: int64

In [8]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df_data[["texts"]], df_data["labels"], test_size=0.3)

In [None]:
X_train.to_csv('X_train.txt', sep=' ', index=False)

In [33]:
from transformers import XLNetTokenizer
from transformers import TFAutoModelWithLMHead

tokenizer = XLNetTokenizer.from_pretrained("xlnet-base-cased")

In [34]:
sentences = [i[0] for i in X_train.values.tolist()]

In [35]:
max_len  = 64

full_input_ids = []
full_input_masks = []
full_segment_ids = []

SEG_ID_A   = 0
SEG_ID_B   = 1
SEG_ID_CLS = 2
SEG_ID_SEP = 3
SEG_ID_PAD = 4

UNK_ID = tokenizer.encode("<unk>")[0]
CLS_ID = tokenizer.encode("<cls>")[0]
SEP_ID = tokenizer.encode("<sep>")[0]
MASK_ID = tokenizer.encode("<mask>")[0]
EOD_ID = tokenizer.encode("<eod>")[0]

for i,sentence in enumerate(sentences):
    # Tokenize sentence to token id list
    tokens_a = tokenizer.encode(sentence)
    
    # Trim the len of text
    if(len(tokens_a)>max_len-2):
        tokens_a = tokens_a[:max_len-2]
        
        
    tokens = []
    segment_ids = []
    
    for token in tokens_a:
        tokens.append(token)
        segment_ids.append(SEG_ID_A)
        
    # Add <sep> token 
    tokens.append(SEP_ID)
    segment_ids.append(SEG_ID_A)
    
    
    # Add <cls> token
    tokens.append(CLS_ID)
    segment_ids.append(SEG_ID_CLS)
    
    input_ids = tokens
    
    # The mask has 0 for real tokens and 1 for padding tokens. Only real
    # tokens are attended to.
    input_mask = [0] * len(input_ids)

    # Zero-pad up to the sequence length at fornt
    if len(input_ids) < max_len:
        delta_len = max_len - len(input_ids)
        input_ids = [0] * delta_len + input_ids
        input_mask = [1] * delta_len + input_mask
        segment_ids = [SEG_ID_PAD] * delta_len + segment_ids

    assert len(input_ids) == max_len
    assert len(input_mask) == max_len
    assert len(segment_ids) == max_len
    
    full_input_ids.append(input_ids)
    full_input_masks.append(input_mask)
    full_segment_ids.append(segment_ids)
    
    if 3 > i:
        print("No.:%d"%(i))
        print("sentence: %s"%(sentence))
        print("input_ids:%s"%(input_ids))
        print("attention_masks:%s"%(input_mask))
        print("segment_ids:%s"%(segment_ids))
        print("\n")

No.:0
sentence: participate sure candidate argi commissioner understand concept answers nothing modernization precision farming innovation broadband rural areas
input_ids:[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3137, 512, 2036, 17, 617, 3141, 6913, 1111, 2963, 4666, 805, 24041, 13228, 9153, 7767, 13361, 3184, 689, 4, 3, 4, 3]
attention_masks:[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
segment_ids:[4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2]


No.:1
sentence: far set new special committee every tax scandal created permanent structures permanent committee work regularly focused issue tax justice future
input_id

In [36]:
from transformers import TFXLNetForSequenceClassification


# # It's highly recommended to download bert prtrained model first, then save them into local file 
# # In this document, contain confg(txt) and weight(bin) files
model_file_address = 'xlnet-base-cased'

# # Will load config and weight with from_pretrained()
# # Recommand download the model before using
# # Download model from "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-pytorch_model.bin"
# # Download model from "https://s3.amazonaws.com/models.huggingface.co/bert/xlnet-base-cased-config.json" 
model = TFXLNetForSequenceClassification.from_pretrained(model_file_address,num_labels=len(tag2idx))


# model = TFXLNetForSequenceClassification.from_pretrained('xlnet-base-cased')

Some weights of the model checkpoint at xlnet-base-cased were not used when initializing TFXLNetForSequenceClassification: ['lm_loss']
- This IS expected if you are initializing TFXLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFXLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of TFXLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['sequence_summary', 'logits_proj']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [54]:
from transformers import TFAutoModel, AutoTokenizer

model_name = 'xlnet-base-cased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = TFAutoModel.from_pretrained(model_name)

Some weights of the model checkpoint at xlnet-base-cased were not used when initializing TFXLNetModel: ['lm_loss']
- This IS expected if you are initializing TFXLNetModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing TFXLNetModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFXLNetModel were initialized from the model checkpoint at xlnet-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFXLNetModel for predictions without further training.
