In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
pip install laserembeddings

Collecting laserembeddings
  Downloading https://files.pythonhosted.org/packages/a2/4b/a9e3ee9f4825bd2bb6b48f26370e2c341860ec0cb2a9a27deea9be6c2299/laserembeddings-1.1.0-py3-none-any.whl
Collecting sacremoses==0.0.35
[?25l  Downloading https://files.pythonhosted.org/packages/1f/8e/ed5364a06a9ba720fddd9820155cc57300d28f5f43a6fd7b7e817177e642/sacremoses-0.0.35.tar.gz (859kB)
[K     |████████████████████████████████| 860kB 4.0MB/s 
[?25hCollecting transliterate==1.10.2
[?25l  Downloading https://files.pythonhosted.org/packages/a1/6e/9a9d597dbdd6d0172427c8cc07c35736471e631060df9e59eeb87687f817/transliterate-1.10.2-py2.py3-none-any.whl (45kB)
[K     |████████████████████████████████| 51kB 6.8MB/s 
Collecting subword-nmt<0.4.0,>=0.3.6
  Downloading https://files.pythonhosted.org/packages/74/60/6600a7bc09e7ab38bc53a48a20d8cae49b837f93f5842a41fe513a694912/subword_nmt-0.3.7-py2.py3-none-any.whl
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ..

In [None]:
!python -m laserembeddings download-models

Downloading models into /usr/local/lib/python3.6/dist-packages/laserembeddings/data

✅   Downloaded https://dl.fbaipublicfiles.com/laser/models/93langs.fcodes    
✅   Downloaded https://dl.fbaipublicfiles.com/laser/models/93langs.fvocab    
✅   Downloaded https://dl.fbaipublicfiles.com/laser/models/bilstm.93langs.2018-12-26.pt    

✨ You're all set!


In [None]:
import pandas as pd
import numpy as np
import os
from pandas import DataFrame
from laserembeddings import Laser
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
import pickle

In [None]:
lang = ['Arabic','English','French']
cols = []
for i in range(1024):
  cols.append(i)

In [None]:
def preprocessing(language):
    path = 'drive/My Drive/CS695'
    data = pd.read_csv(os.path.join(path,language+'.csv'))
    data_to_list = data['tweet'].to_list()

    laser = Laser()
    embeddings = laser.embed_sentences(data_to_list, lang='en') 

    df1=pd.DataFrame(embeddings)
    df2 = pd.DataFrame(data['label'])

    sss1 = StratifiedShuffleSplit(n_splits=1,test_size=0.3,random_state=0)
    a = list(sss1.split(df1, df2))
    train_x,interX = df1.iloc[a[0][0],:],df1.iloc[a[0][1],:]
    train_y,interY = df2.iloc[a[0][0],:],df2.iloc[a[0][1],:]

    sss2 = StratifiedShuffleSplit(n_splits=1,test_size=0.35,random_state=0)
    a = list(sss2.split(interX, interY))
    test_x,val_x = interX.iloc[a[0][0],:],interX.iloc[a[0][1],:]
    test_y,val_y = interY.iloc[a[0][0],:],interY.iloc[a[0][1],:]

    return train_x, test_x, val_x, train_y, test_y, val_y 

In [None]:
Arabic_train_x, Arabic_test_x, Arabic_val_x, Arabic_train_y, Arabic_test_y, Arabic_val_y = preprocessing('Arabic')
English_train_x, English_test_x, English_val_x, English_train_y, English_test_y, English_val_y = preprocessing('English')
French_train_x, French_test_x, French_val_x, French_train_y, French_test_y, French_val_y = preprocessing('French')

In [None]:
_English = pd.concat([English_train_x, English_train_y], axis=1)
_Arabic = pd.concat([Arabic_train_x, Arabic_train_y], axis=1)
_French = pd.concat([French_train_x, French_train_y], axis=1)

In [None]:
df_ = pd.concat([_English, _Arabic], axis=0)
df_ = pd.concat([df_, _French], axis= 0)

In [None]:
def model_for_all(c, seed):
  
  df_new = df_.sample(frac=1,random_state=seed).reset_index(drop=True)
            
  df_x = df_new[cols]
  df_y = df_new['label']

  model = LogisticRegression(C=c,solver='lbfgs',class_weight='balanced',random_state=seed)
  model.fit(df_x, df_y)
    
  Arabic_test_predict = model.predict(Arabic_test_x)
  French_test_predict = model.predict(French_test_x)
  English_test_predict = model.predict(English_test_x)
 

  Arabic_f1_score = f1_score(Arabic_test_y, Arabic_test_predict, average='macro')
  French_f1_score = f1_score(French_test_y, French_test_predict, average='macro')
  English_f1_score = f1_score(English_test_y, English_test_predict, average='macro')
 

  #saved_model = pickle.dumps(model) 

  print('c:',c,'    ','seed:',seed)
  print('\n F1 macro score for test Arabic:', Arabic_f1_score)
  print('\n F1 macro score for test French:', French_f1_score)
  print('\n F1 macro score for test English:', English_f1_score)
 

  return Arabic_f1_score, French_f1_score, English_f1_score

In [None]:
Arabic_f1_list = []
English_f1_list = []
French_f1_list = []

for c in [0.01,0.1,1,10]:
  for seeds in [2018,2019,2020,2021,2022]:
    seed = seeds
    np.random.seed(seed)
    a, b, c = model_for_all(c,seed)
    Arabic_f1_list.append(a)
    French_f1_list.append(b)
    English_f1_list.append(c)
  
print('Scores for Arabic:', Arabic_f1_list)
print('Scores for English:', English_f1_list)
print('Scores for French:', French_f1_list)

Arabic_avg = sum(Arabic_f1_list)/len(Arabic_f1_list)
English_avg = sum(English_f1_list)/len(English_f1_list)
French_avg = sum(French_f1_list)/len(French_f1_list)

print('Average F1 macro for Arabic:',Arabic_avg)
print('Average F1 macro for English:',English_avg)
print('Average F1 macro for French:',French_avg)

c: 0.01      seed: 2018

 F1 macro score for test Arabic: 0.6545133865978234

 F1 macro score for test French: 0.43556271823798015

 F1 macro score for test English: 0.6233345432330414


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


c: 0.6233345432330414      seed: 2019

 F1 macro score for test Arabic: 0.6904171638473597

 F1 macro score for test French: 0.4100649350649351

 F1 macro score for test English: 0.6841667420460402


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


c: 0.6841667420460402      seed: 2020

 F1 macro score for test Arabic: 0.6909412642933954

 F1 macro score for test French: 0.4100649350649351

 F1 macro score for test English: 0.6849900179626853


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


c: 0.6849900179626853      seed: 2021

 F1 macro score for test Arabic: 0.6917354259083326

 F1 macro score for test French: 0.4100649350649351

 F1 macro score for test English: 0.6849544960146576


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


c: 0.6849544960146576      seed: 2022

 F1 macro score for test Arabic: 0.6917354259083326

 F1 macro score for test French: 0.4100649350649351

 F1 macro score for test English: 0.6849107060334678
c: 0.1      seed: 2018

 F1 macro score for test Arabic: 0.6814645551834045

 F1 macro score for test French: 0.43198241219613287

 F1 macro score for test English: 0.6662242279065476


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


c: 0.6662242279065476      seed: 2019

 F1 macro score for test Arabic: 0.6917354259083326

 F1 macro score for test French: 0.4100649350649351

 F1 macro score for test English: 0.6850338118330248


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


c: 0.6850338118330248      seed: 2020

 F1 macro score for test Arabic: 0.6925303572681969

 F1 macro score for test French: 0.4100649350649351

 F1 macro score for test English: 0.6851214087316129


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


c: 0.6851214087316129      seed: 2021

 F1 macro score for test Arabic: 0.6917354259083326

 F1 macro score for test French: 0.4100649350649351

 F1 macro score for test English: 0.685165211762468


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


c: 0.685165211762468      seed: 2022

 F1 macro score for test Arabic: 0.6909412642933954

 F1 macro score for test French: 0.4100649350649351

 F1 macro score for test English: 0.6850858842753499


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


c: 1      seed: 2018

 F1 macro score for test Arabic: 0.6927719376724073

 F1 macro score for test French: 0.4100649350649351

 F1 macro score for test English: 0.6866457682716585


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


c: 0.6866457682716585      seed: 2019

 F1 macro score for test Arabic: 0.6917354259083326

 F1 macro score for test French: 0.4100649350649351

 F1 macro score for test English: 0.685165211762468


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


c: 0.685165211762468      seed: 2020

 F1 macro score for test Arabic: 0.6909412642933954

 F1 macro score for test French: 0.4100649350649351

 F1 macro score for test English: 0.6852528269938207


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


c: 0.6852528269938207      seed: 2021

 F1 macro score for test Arabic: 0.6917354259083326

 F1 macro score for test French: 0.4100649350649351

 F1 macro score for test English: 0.6850338118330248


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


c: 0.6850338118330248      seed: 2022

 F1 macro score for test Arabic: 0.6925303572681969

 F1 macro score for test French: 0.4100649350649351

 F1 macro score for test English: 0.6851214087316129


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


c: 10      seed: 2018

 F1 macro score for test Arabic: 0.6914612910055674

 F1 macro score for test French: 0.41861471861471866

 F1 macro score for test English: 0.6909668909824628


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


c: 0.6909668909824628      seed: 2019

 F1 macro score for test Arabic: 0.6909412642933954

 F1 macro score for test French: 0.4100649350649351

 F1 macro score for test English: 0.6852090178494359


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


c: 0.6852090178494359      seed: 2020

 F1 macro score for test Arabic: 0.6909412642933954

 F1 macro score for test French: 0.4100649350649351

 F1 macro score for test English: 0.6852090178494359


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


c: 0.6852090178494359      seed: 2021

 F1 macro score for test Arabic: 0.6917354259083326

 F1 macro score for test French: 0.4100649350649351

 F1 macro score for test English: 0.6848396171271781
c: 0.6848396171271781      seed: 2022

 F1 macro score for test Arabic: 0.6909412642933954

 F1 macro score for test French: 0.4100649350649351

 F1 macro score for test English: 0.6852883383906407
Scores for Arabic: [0.6545133865978234, 0.6904171638473597, 0.6909412642933954, 0.6917354259083326, 0.6917354259083326, 0.6814645551834045, 0.6917354259083326, 0.6925303572681969, 0.6917354259083326, 0.6909412642933954, 0.6927719376724073, 0.6917354259083326, 0.6909412642933954, 0.6917354259083326, 0.6925303572681969, 0.6914612910055674, 0.6909412642933954, 0.6909412642933954, 0.6917354259083326, 0.6909412642933954]
Scores for English: [0.6233345432330414, 0.6841667420460402, 0.6849900179626853, 0.6849544960146576, 0.6849107060334678, 0.6662242279065476, 0.6850338118330248, 0.6851214087316129, 0.6

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
