In [20]:
!pip install transformers



In [40]:
from google.colab import drive
drive.mount('/content/drive')

import sys
sys.path.append('/content/drive/My\ Drive/')

%cd /content/drive//My\ Drive/languageDetection

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/My Drive/languageDetection


In [22]:
import pandas as pd
import os 
import numpy as np
import torch

from torch.utils.data import DataLoader, TensorDataset, SequentialSampler
from transformers import BertTokenizer, BertForSequenceClassification

In [23]:
langs = ['english', 'danish', 'turkish', 'arabic', 'greek']
dataroot = '/content/drive/MyDrive/data/'
label_dict = {'english': 0, 'danish': 1, 'turkish':2, 'arabic':3, 'greek':4}

test_files = {}
li = []

#test dataset
for lang in langs:
    df0 = os.path.join(dataroot, lang, 'test.tsv')
    test_files.update({df0:lang})
    
for filename, lang in test_files.items():
    df1 = pd.read_csv(filename, sep='\t', usecols = list(range(1,3)))
    df1['language'] = label_dict[lang]
    li.append(df1)

frame = pd.concat(li, axis=0, ignore_index = True)
df = frame.sample(frac=1).reset_index(drop=True)
print(df)

                                                  tweet subtask_a  language
0     Sönmüyor ateşimiz, ama alev alevde yanmıyor   ...       NOT         2
1     @USER Ben dokuz senedir böyle yaşıyorum gayet ...       NOT         2
2                     أبوظبي يا شامخة يا دار الأسياد ❤️       NOT         3
3     RT @USER: ده شنو ده يا شعب يا قاسي يرضيكم الحا...       NOT         3
4     بكرا عندي اختبار موت وحياه يا انجح يا احمل الم...       NOT         3
...                                                 ...       ...       ...
8070  İşsiz olduğun halde ebeveynlerin seni zorla uy...       NOT         2
8071  Angels now have 6 runs. Five of them have come...       NOT         0
8072  RT @USER: يا كافر يا زنديق يا مرتد يا<LF>انت ع...       OFF         3
8073  Sayın başkan @USER adana ya denizi getirmişsin...       OFF         2
8074  @USER Είχατε ρωσικά που έχουν γεννήσει τη μαφί...       OFF         4

[8075 rows x 3 columns]


In [24]:
print(len(df[df.language==label_dict['english']]))
print(len(df[df.language==label_dict['danish']]))
print(len(df[df.language==label_dict['turkish']]))
print(len(df[df.language==label_dict['arabic']]))
print(len(df[df.language==label_dict['greek']]))

860
329
3515
1827
1544


In [25]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', 
                                          do_lower_case=True)

encoded_data_test = tokenizer.batch_encode_plus(
    df.tweet.values, 
    add_special_tokens=True, 
    return_attention_mask=True, 
    pad_to_max_length=True, 
    max_length=256, 
    return_tensors='pt'
)


input_ids_test = encoded_data_test['input_ids']
attention_masks_test = encoded_data_test['attention_mask']
labels_test = torch.tensor(df.language.values.astype(int))

dataset_test = TensorDataset(input_ids_test, attention_masks_test, labels_test)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [26]:
batch_size = 3

dataloader_test = DataLoader(dataset_test, 
                              sampler=SequentialSampler(dataset_test), 
                              batch_size=batch_size)

In [27]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [41]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=len(label_dict),
                                                      output_attentions=False,
                                                      output_hidden_states=False)

model.to(device)

model.load_state_dict(torch.load('finetuned_BERT_epoch_1_trial.model', map_location=torch.device('cpu')))

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

<All keys matched successfully>

In [42]:
def evaluate(dataloader_test):

    model.eval()
    
    loss_test_total = 0
    predictions, true_vals = [], []
    
    for batch in dataloader_test:
        
        batch = tuple(b.to(device) for b in batch)
        
        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        with torch.no_grad():        
            outputs = model(**inputs)
            
        loss = outputs[0]
        logits = outputs[1]
        loss_test_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)
    
    loss_val_avg = loss_test_total/len(dataloader_test) 
    
    predictions = np.concatenate(predictions, axis=0)
    true_vals = np.concatenate(true_vals, axis=0)
            
    return loss_val_avg, predictions, true_vals

In [43]:
from sklearn.metrics import f1_score

def f1_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds_flat, average='weighted')

def accuracy_per_class(preds, labels):
    label_dict_inverse = {v: k for k, v in label_dict.items()}
    
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    # print(preds_flat)
    # print(labels_flat)

    for label in np.unique(labels_flat):
        y_preds = preds_flat[labels_flat==label]
        y_true = labels_flat[labels_flat==label]
        print(f'Class: {label_dict_inverse[label]}')
        print(f'Accuracy: {len(y_preds[y_preds==label])}/{len(y_true)}\n')

In [44]:
_, predictions, true_vals = evaluate(dataloader_test)
accuracy_per_class(predictions, true_vals)

Class: english
Accuracy: 841/860

Class: danish
Accuracy: 327/329

Class: turkish
Accuracy: 3510/3515

Class: arabic
Accuracy: 1827/1827

Class: greek
Accuracy: 1544/1544



In [45]:
len(np.argmax(predictions, axis=1).flatten())

8075

In [46]:
# label_dict = {'english': 0, 'danish': 1, 'turkish':2, 'arabic':3, 'greek':4}
preds_flat = np.argmax(predictions, axis=1).flatten()
english_li = []
danish_li = []
turkish_li = []
arabic_li = []
greek_li = []

count =0
for index, row in df.iloc[:, :].iterrows():
    if row.language == preds_flat[index]:
        row_value = pd.DataFrame({'tweet': [row.tweet], 'subtask_a': row.subtask_a})
        if row.language==0:
            english_li.append(row_value)
        elif row.language==1:
            danish_li.append(row_value)
        elif row.language==2:
            turkish_li.append(row_value)
        elif row.language==3:
            arabic_li.append(row_value)
        elif row.language==4:
            greek_li.append(row_value)
        else: 
          pass

english_df = pd.concat(english_li, axis=0, ignore_index = True)
danish_df = pd.concat(danish_li, axis=0, ignore_index = True)
turkish_df = pd.concat(turkish_li, axis=0, ignore_index = True)
arabic_df = pd.concat(arabic_li, axis=0, ignore_index = True)
greek_df = pd.concat(greek_li, axis=0, ignore_index = True)
print(english_df)

                                                 tweet subtask_a
0    #TRUMP:  I'd be a real good witness! #JOHNDowd...       NOT
1    #Sugardaddy Retweet if you are under 30 and yo...       NOT
2    #LiberalHypocrisy #TacoBell When Liberals ask ...       OFF
3    #Believe When you have a fat belly then you ar...       NOT
4    #Hillary and @ least 16 other #AngryDemocrats ...       OFF
..                                                 ...       ...
836  #GreatAwakening #QAnon #PatriotsUnited #WWG1WG...       NOT
837  #Conservatism101   It's not about our disagree...       OFF
838  #CNN ruthlessly continues #Fakenews onslot. Wh...       NOT
839  28, 27, 25 and 21 but like,, it’s still really...       OFF
840  Angels now have 6 runs. Five of them have come...       NOT

[841 rows x 2 columns]


In [47]:
%cd /content/drive//My\ Drive/data1/english

/content/drive/My Drive/data1/english


In [48]:
#English
%cd /content/drive//My\ Drive/data1/english
english_df.to_csv('test.csv')

#Danish
%cd /content/drive//My\ Drive/data1/danish
danish_df.to_csv('test.csv')

#Turkish
%cd /content/drive//My\ Drive/data1/turkish
turkish_df.to_csv('test.csv')


#Arabic
%cd /content/drive//My\ Drive/data1/arabic
arabic_df.to_csv('test.csv')

#Greek
%cd /content/drive//My\ Drive/data1/greek
greek_df.to_csv('test.csv')

/content/drive/My Drive/data1/english
/content/drive/My Drive/data1/danish
/content/drive/My Drive/data1/turkish
/content/drive/My Drive/data1/arabic
/content/drive/My Drive/data1/greek


In [57]:
%cd /content/drive//My\ Drive/languageDetection
error_list=[]

for index, row in df.iloc[:, :].iterrows():
    # break
    if row.language != preds_flat[index]:
        row_value = pd.DataFrame({'tweet': [row.tweet], 'true_label': row.language, 'predicted_label': preds_flat[index]})
        error_list.append(row_value)

error_df = pd.concat(error_list, axis=0, ignore_index = True)
error_df.to_csv('error.csv')


/content/drive/My Drive/languageDetection


In [55]:
error_df[error_df.true_label != error_df.predicted_label]

1254                    Didn't work too well, did it? URL
1480    *17. Celine_SwittinS Follback Erza_Jullian #Op...
1604    0-9 : B-1, J-1, R-1, B-2, Q-2, B-3, BX-3, B-4,...
2089           7 fucking years. #MyTwitterAnniversary URL
2111    -tsk stays.... Drop foto HAN JISUNG dong, plea...
2171                 Göğsünde uyumam gereken geceler var.
2698             #LisaxMichaelKors  she is soo beautifull
2852                   26. Biggest accomplishment? school
2885      #ConsTOO THE PLACE FOR FED UP CONSERVATIVES !!!
2891    #ALDUBLoveAndBeyond when you rise in life, you...
4452    #AntifaHorst - whuuuuut?! 😂  #antifa #maassen ...
4457    thats why u r lame duck :))       biz desek li...
4538    #StopEtchecopar? Fuck you all 🖕🖕🖕🖕🖕 Que florez...
4935    6ix9ine aus den speakern, fick deine political...
5046                             @USER ryger du hash. ???
5277                                     Bu iş T A M A M.
5613                                        5k Bitches 🎉💙
5721          