In [None]:
from google.colab import drive

# Mount Google Drive inside the Colab VM; the dataset CSVs are later read from
# the relative path 'drive/My Drive/CS695' (presumably resolved against /content,
# Colab's default working directory -- verify if the cwd is changed).
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
# Install the LASER wrapper into the kernel's environment. The version is pinned
# to the one this notebook was originally run with (1.1.0) so that the embeddings,
# and therefore every score below, stay reproducible on a fresh runtime.
%pip install -q laserembeddings==1.1.0

Collecting laserembeddings
  Downloading https://files.pythonhosted.org/packages/a2/4b/a9e3ee9f4825bd2bb6b48f26370e2c341860ec0cb2a9a27deea9be6c2299/laserembeddings-1.1.0-py3-none-any.whl
Collecting transliterate==1.10.2
[?25l  Downloading https://files.pythonhosted.org/packages/a1/6e/9a9d597dbdd6d0172427c8cc07c35736471e631060df9e59eeb87687f817/transliterate-1.10.2-py2.py3-none-any.whl (45kB)
[K     |████████████████████████████████| 51kB 2.7MB/s 
[?25hCollecting subword-nmt<0.4.0,>=0.3.6
  Downloading https://files.pythonhosted.org/packages/74/60/6600a7bc09e7ab38bc53a48a20d8cae49b837f93f5842a41fe513a694912/subword_nmt-0.3.7-py2.py3-none-any.whl
Collecting sacremoses==0.0.35
[?25l  Downloading https://files.pythonhosted.org/packages/1f/8e/ed5364a06a9ba720fddd9820155cc57300d28f5f43a6fd7b7e817177e642/sacremoses-0.0.35.tar.gz (859kB)
[K     |████████████████████████████████| 860kB 7.1MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ..

In [None]:
# Download the pretrained LASER artefacts (93langs.fcodes, 93langs.fvocab and the
# bilstm.93langs encoder checkpoint) into the installed package's data directory.
# Presumably these are the files Laser() loads by default -- confirm against the
# laserembeddings documentation. Must be re-run on every fresh runtime.
!python -m laserembeddings download-models

Downloading models into /usr/local/lib/python3.6/dist-packages/laserembeddings/data

✅   Downloaded https://dl.fbaipublicfiles.com/laser/models/93langs.fcodes    
✅   Downloaded https://dl.fbaipublicfiles.com/laser/models/93langs.fvocab    
✅   Downloaded https://dl.fbaipublicfiles.com/laser/models/bilstm.93langs.2018-12-26.pt    

✨ You're all set!


In [None]:
import pandas as pd
import numpy as np
import os
from laserembeddings import Laser
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
import pickle

In [None]:
# Languages in the study. Each name doubles as the base name of its CSV file
# and as the identifier passed around by the experiment functions.
lang = ['Arabic', 'English', 'French']

# Integer column labels 0..1023 used to select the embedding features out of the
# combined features+label frame (assumes the sentence vectors are 1024-dimensional,
# which is what pd.DataFrame(embeddings) then labels 0..1023).
cols = list(range(1024))

In [None]:
# Per-language cache of the tuple returned by preprocessing(). Embedding and
# splitting are deterministic (fixed random_state), yet the experiment grid asks
# for the same language many times, so each language is computed only once.
# Re-running this cell resets the cache.
_PREPROCESSING_CACHE = {}


def preprocessing(language):
    """Embed one language's tweets with LASER and split them into train/test/val.

    Reads ``<language>.csv`` from ``drive/My Drive/CS695`` (columns ``tweet`` and
    ``label``), embeds every tweet, then performs two stratified shuffle splits
    with ``random_state=0``:

    * 70% train / 30% hold-out
    * the hold-out is split again 65% test / 35% validation
      (19.5% and 10.5% of the full data, respectively).

    Parameters
    ----------
    language : str
        Base name of the CSV file, e.g. ``'Arabic'``, ``'English'``, ``'French'``.

    Returns
    -------
    tuple
        ``(train_x, test_x, val_x, train_y, test_y, val_y)``. The ``*_x`` items
        are DataFrames of embeddings (integer column labels); the ``*_y`` items
        are single-column DataFrames holding ``label``. The same objects are
        returned on repeated calls for a language, so treat them as read-only.
    """
    if language in _PREPROCESSING_CACHE:
        return _PREPROCESSING_CACHE[language]

    # LASER picks its tokenizer from the language code, so each corpus must be
    # embedded with its own code rather than always 'en'. Unlisted languages
    # fall back to 'en', which was the previous behaviour for every input.
    lang_codes = {'Arabic': 'ar', 'English': 'en', 'French': 'fr'}

    path = 'drive/My Drive/CS695'
    data = pd.read_csv(os.path.join(path, language + '.csv'))
    sentences = data['tweet'].to_list()

    laser = Laser()
    embeddings = laser.embed_sentences(sentences, lang=lang_codes.get(language, 'en'))

    features = pd.DataFrame(embeddings)
    labels = pd.DataFrame(data['label'])

    # First split: train vs. hold-out (30%).
    outer_split = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=0)
    train_idx, holdout_idx = next(outer_split.split(features, labels))
    train_x, holdout_x = features.iloc[train_idx, :], features.iloc[holdout_idx, :]
    train_y, holdout_y = labels.iloc[train_idx, :], labels.iloc[holdout_idx, :]

    # Second split: the "train" side of this split is the test set and the
    # 35% "test" side is the validation set.
    inner_split = StratifiedShuffleSplit(n_splits=1, test_size=0.35, random_state=0)
    test_idx, val_idx = next(inner_split.split(holdout_x, holdout_y))
    test_x, val_x = holdout_x.iloc[test_idx, :], holdout_x.iloc[val_idx, :]
    test_y, val_y = holdout_y.iloc[test_idx, :], holdout_y.iloc[val_idx, :]

    result = (train_x, test_x, val_x, train_y, test_y, val_y)
    _PREPROCESSING_CACHE[language] = result
    return result

In [None]:
def model_for_zeroshot(language, c, seed, max_iter=1000):
    """Train on every language except ``language`` and evaluate on ``language``.

    The training set is the concatenation of the *train* splits of all entries
    of the module-level ``lang`` list other than ``language``; the held-out
    language contributes only its test and validation splits (zero-shot).

    Parameters
    ----------
    language : str
        Target language that is evaluated but never trained on.
    c : float
        Inverse regularisation strength passed to ``LogisticRegression(C=...)``.
    seed : int
        Seed for shuffling the training rows and for the classifier's
        ``random_state``. NOTE(review): per the scikit-learn docs ``random_state``
        only matters for the sag/saga/liblinear solvers, so with ``lbfgs`` the
        seed is expected to have little or no effect on the scores -- confirm.
    max_iter : int, optional
        Iteration limit for the solver. The library default was too low for
        these features (the solver stopped with "TOTAL NO. of ITERATIONS REACHED
        LIMIT"), hence the larger default here.

    Returns
    -------
    float
        Macro-averaged F1 on the target language's test split. Test/validation
        F1 and accuracy are also printed.

    Raises
    ------
    ValueError
        If ``lang`` contains no language other than ``language`` to train on.
    """
    train_frames = []
    for source_language in lang:
        if source_language != language:
            train_x, _, _, train_y, _, _ = preprocessing(source_language)
            # Features and labels share the same index, so a column-wise concat
            # lines each embedding up with its label.
            train_frames.append(
                pd.concat([train_x, train_y], axis=1).reset_index(drop=True)
            )

    if not train_frames:
        raise ValueError(
            'No source languages to train on for target language %r' % (language,)
        )

    _, test_x, val_x, _, test_y, val_y = preprocessing(language)

    # The previous implementation prepended each new language, so the frames
    # are reversed here to keep the row order (and hence the seeded shuffle)
    # unchanged while concatenating only once.
    train_df = pd.concat(train_frames[::-1], axis=0)
    train_df = train_df.sample(frac=1, random_state=seed).reset_index(drop=True)

    model = LogisticRegression(
        C=c,
        solver='lbfgs',
        class_weight='balanced',
        random_state=seed,
        max_iter=max_iter,
    )
    model.fit(train_df[cols], train_df['label'])

    test_predict = model.predict(test_x)
    val_predict = model.predict(val_x)

    test_f1_score = f1_score(test_y, test_predict, average='macro')
    val_f1_score = f1_score(val_y, val_predict, average='macro')
    test_acc = accuracy_score(test_y, test_predict)
    val_acc = accuracy_score(val_y, val_predict)

    print('Language:',language, '    ','Model loaded: Zero Shot', '    ','c:',c,'    ','seed:',seed)
    print('\n F1 macro score for test:', test_f1_score)
    print('\n F1 macro score for val:', val_f1_score)
    print('\n Accuracy for test:', test_acc)
    print('\n Accuracy for val:', val_acc)

    return test_f1_score

In [None]:
# Zero-shot experiment grid: for every target language, run each
# (regularisation strength, seed) combination and report the mean test macro-F1
# over the whole grid (i.e. averaged across all C values and seeds together).
reg_strengths = [0.01, 0.1, 1, 10]
run_seeds = [2018, 2019, 2020, 2021, 2022]
grid = [(strength, run_seed) for strength in reg_strengths for run_seed in run_seeds]

for target_language in ['Arabic', 'English', 'French']:
    scores = []
    for strength, run_seed in grid:
        # Seed NumPy's global RNG before each run, as well as passing the seed on.
        np.random.seed(run_seed)
        scores.append(model_for_zeroshot(target_language, strength, run_seed))

    print(scores)
    mean_f1 = sum(scores) / len(scores)

    print('Average F1 macro:', mean_f1, '    ', 'language:', target_language)

Language: Arabic      Model loaded: Zero Shot      c: 0.01      seed: 2018

 F1 macro score for test: 0.6311284119338336

 F1 macro score for val: 0.6309072913229957

 Accuracy for test: 0.6899911426040743

 Accuracy for val: 0.6973684210526315
Language: Arabic      Model loaded: Zero Shot      c: 0.01      seed: 2019

 F1 macro score for test: 0.6311284119338336

 F1 macro score for val: 0.6309072913229957

 Accuracy for test: 0.6899911426040743

 Accuracy for val: 0.6973684210526315
Language: Arabic      Model loaded: Zero Shot      c: 0.01      seed: 2020

 F1 macro score for test: 0.6311284119338336

 F1 macro score for val: 0.6309072913229957

 Accuracy for test: 0.6899911426040743

 Accuracy for val: 0.6973684210526315
Language: Arabic      Model loaded: Zero Shot      c: 0.01      seed: 2021

 F1 macro score for test: 0.6311284119338336

 F1 macro score for val: 0.6309072913229957

 Accuracy for test: 0.6899911426040743

 Accuracy for val: 0.6973684210526315
Language: Arabic    

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Language: Arabic      Model loaded: Zero Shot      c: 1      seed: 2018

 F1 macro score for test: 0.6104715137236215

 F1 macro score for val: 0.6252801434334379

 Accuracy for test: 0.6536758193091231

 Accuracy for val: 0.6743421052631579


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Language: Arabic      Model loaded: Zero Shot      c: 1      seed: 2019

 F1 macro score for test: 0.6104715137236215

 F1 macro score for val: 0.6276814328744214

 Accuracy for test: 0.6536758193091231

 Accuracy for val: 0.6759868421052632


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Language: Arabic      Model loaded: Zero Shot      c: 1      seed: 2020

 F1 macro score for test: 0.6104715137236215

 F1 macro score for val: 0.6276814328744214

 Accuracy for test: 0.6536758193091231

 Accuracy for val: 0.6759868421052632


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Language: Arabic      Model loaded: Zero Shot      c: 1      seed: 2021

 F1 macro score for test: 0.6104715137236215

 F1 macro score for val: 0.6252801434334379

 Accuracy for test: 0.6536758193091231

 Accuracy for val: 0.6743421052631579


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Language: Arabic      Model loaded: Zero Shot      c: 1      seed: 2022

 F1 macro score for test: 0.6097339014617077

 F1 macro score for val: 0.6252801434334379

 Accuracy for test: 0.6527900797165633

 Accuracy for val: 0.6743421052631579


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Language: Arabic      Model loaded: Zero Shot      c: 10      seed: 2018

 F1 macro score for test: 0.5966972098007739

 F1 macro score for val: 0.6029921369847204

 Accuracy for test: 0.6324180690876883

 Accuracy for val: 0.6463815789473685


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Language: Arabic      Model loaded: Zero Shot      c: 10      seed: 2019

 F1 macro score for test: 0.5966972098007739

 F1 macro score for val: 0.6029921369847204

 Accuracy for test: 0.6324180690876883

 Accuracy for val: 0.6463815789473685


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Language: Arabic      Model loaded: Zero Shot      c: 10      seed: 2020

 F1 macro score for test: 0.5966972098007739

 F1 macro score for val: 0.6029921369847204

 Accuracy for test: 0.6324180690876883

 Accuracy for val: 0.6463815789473685


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Language: Arabic      Model loaded: Zero Shot      c: 10      seed: 2021

 F1 macro score for test: 0.5966972098007739

 F1 macro score for val: 0.6029921369847204

 Accuracy for test: 0.6324180690876883

 Accuracy for val: 0.6463815789473685


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Language: Arabic      Model loaded: Zero Shot      c: 10      seed: 2022

 F1 macro score for test: 0.5966972098007739

 F1 macro score for val: 0.6029921369847204

 Accuracy for test: 0.6324180690876883

 Accuracy for val: 0.6463815789473685
[0.6311284119338336, 0.6311284119338336, 0.6311284119338336, 0.6311284119338336, 0.6311284119338336, 0.6331846324581643, 0.6331846324581643, 0.6331846324581643, 0.6331846324581643, 0.6331846324581643, 0.6104715137236215, 0.6104715137236215, 0.6104715137236215, 0.6104715137236215, 0.6097339014617077, 0.5966972098007739, 0.5966972098007739, 0.5966972098007739, 0.5966972098007739, 0.5966972098007739]
Average F1 macro: 0.6178335613660024      language: Arabic
Language: English      Model loaded: Zero Shot      c: 0.01      seed: 2018

 F1 macro score for test: 0.31604195294361415

 F1 macro score for val: 0.3204008139747576

 Accuracy for test: 0.3165191166455881

 Accuracy for val: 0.32086720867208673
Language: English      Model loaded: Zero Shot   

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Language: English      Model loaded: Zero Shot      c: 10      seed: 2018

 F1 macro score for test: 0.5611631776729344

 F1 macro score for val: 0.5691635085674751

 Accuracy for test: 0.6403346629049519

 Accuracy for val: 0.6495934959349593


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Language: English      Model loaded: Zero Shot      c: 10      seed: 2019

 F1 macro score for test: 0.5610515079292167

 F1 macro score for val: 0.5692461777766092

 Accuracy for test: 0.6401887343126763

 Accuracy for val: 0.6495934959349593


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Language: English      Model loaded: Zero Shot      c: 10      seed: 2020

 F1 macro score for test: 0.5611631776729344

 F1 macro score for val: 0.5691635085674751

 Accuracy for test: 0.6403346629049519

 Accuracy for val: 0.6495934959349593


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Language: English      Model loaded: Zero Shot      c: 10      seed: 2021

 F1 macro score for test: 0.5612004033593648

 F1 macro score for val: 0.5691635085674751

 Accuracy for test: 0.6403833057690437

 Accuracy for val: 0.6495934959349593


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Language: English      Model loaded: Zero Shot      c: 10      seed: 2022

 F1 macro score for test: 0.5611631776729344

 F1 macro score for val: 0.5692332287131208

 Accuracy for test: 0.6403346629049519

 Accuracy for val: 0.649683830171635
[0.31604195294361415, 0.31604195294361415, 0.31604195294361415, 0.31604195294361415, 0.31604195294361415, 0.36034386109159466, 0.36034386109159466, 0.36034386109159466, 0.36034386109159466, 0.36034386109159466, 0.4858641607277278, 0.4858641607277278, 0.4858641607277278, 0.4858641607277278, 0.4858641607277278, 0.5611631776729344, 0.5610515079292167, 0.5611631776729344, 0.5612004033593648, 0.5611631776729344]
Average F1 macro: 0.4308495659061034      language: English
Language: French      Model loaded: Zero Shot      c: 0.01      seed: 2018

 F1 macro score for test: 0.44135714285714284

 F1 macro score for val: 0.5346320346320346

 Accuracy for test: 0.4430379746835443

 Accuracy for val: 0.5348837209302325
Language: French      Model loaded: Zero

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Language: French      Model loaded: Zero Shot      c: 1      seed: 2018

 F1 macro score for test: 0.3990001449065353

 F1 macro score for val: 0.4774802007133788

 Accuracy for test: 0.4092827004219409

 Accuracy for val: 0.4806201550387597


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Language: French      Model loaded: Zero Shot      c: 1      seed: 2019

 F1 macro score for test: 0.3990001449065353

 F1 macro score for val: 0.4774802007133788

 Accuracy for test: 0.4092827004219409

 Accuracy for val: 0.4806201550387597


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Language: French      Model loaded: Zero Shot      c: 1      seed: 2020

 F1 macro score for test: 0.3990001449065353

 F1 macro score for val: 0.4774802007133788

 Accuracy for test: 0.4092827004219409

 Accuracy for val: 0.4806201550387597


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Language: French      Model loaded: Zero Shot      c: 1      seed: 2021

 F1 macro score for test: 0.3990001449065353

 F1 macro score for val: 0.4774802007133788

 Accuracy for test: 0.4092827004219409

 Accuracy for val: 0.4806201550387597


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Language: French      Model loaded: Zero Shot      c: 1      seed: 2022

 F1 macro score for test: 0.3990001449065353

 F1 macro score for val: 0.4774802007133788

 Accuracy for test: 0.4092827004219409

 Accuracy for val: 0.4806201550387597


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Language: French      Model loaded: Zero Shot      c: 10      seed: 2018

 F1 macro score for test: 0.4161715693377771

 F1 macro score for val: 0.4448081469358065

 Accuracy for test: 0.42616033755274263

 Accuracy for val: 0.4496124031007752


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Language: French      Model loaded: Zero Shot      c: 10      seed: 2019

 F1 macro score for test: 0.41120681838788653

 F1 macro score for val: 0.4448081469358065

 Accuracy for test: 0.4219409282700422

 Accuracy for val: 0.4496124031007752


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Language: French      Model loaded: Zero Shot      c: 10      seed: 2020

 F1 macro score for test: 0.4062091503267974

 F1 macro score for val: 0.4448081469358065

 Accuracy for test: 0.4177215189873418

 Accuracy for val: 0.4496124031007752


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Language: French      Model loaded: Zero Shot      c: 10      seed: 2021

 F1 macro score for test: 0.4062091503267974

 F1 macro score for val: 0.4448081469358065

 Accuracy for test: 0.4177215189873418

 Accuracy for val: 0.4496124031007752
Language: French      Model loaded: Zero Shot      c: 10      seed: 2022

 F1 macro score for test: 0.4148148148148148

 F1 macro score for val: 0.4448081469358065

 Accuracy for test: 0.42616033755274263

 Accuracy for val: 0.4496124031007752
[0.44135714285714284, 0.44135714285714284, 0.44135714285714284, 0.44135714285714284, 0.44135714285714284, 0.436770848336454, 0.436770848336454, 0.436770848336454, 0.436770848336454, 0.436770848336454, 0.3990001449065353, 0.3990001449065353, 0.3990001449065353, 0.3990001449065353, 0.3990001449065353, 0.4161715693377771, 0.41120681838788653, 0.4062091503267974, 0.4062091503267974, 0.4148148148148148]
Average F1 macro: 0.42201260918473676      language: French


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
