>[Installation and Setup](#scrollTo=exY3is1ILr_F)

>[Data Loading and Validation](#scrollTo=P7mSZGr-Ln8U)

>[Statistics](#scrollTo=D1VzHuOKUkYU)

>[Create Merged Datasets](#scrollTo=5-QvhUDFdAxu)

>[Alignment](#scrollTo=2Hw9I84MgN7g)

>[Experiments](#scrollTo=SIs5sRIzc8Z3)

>>[CK](#scrollTo=tPLqugwWdKMk)

>>[CK Character Augmentation](#scrollTo=vb51VkkJeHzq)

>>[CK_E](#scrollTo=mDn8U6aUdOlT)

>>[CK_K](#scrollTo=L6ecz2UVdbki)

>>[CK_E_K](#scrollTo=Qp74868Ac6sj)

>>[CK Word Alignment Augmentation](#scrollTo=RXZO8ymieo_J)

>>[CK Translated Test Test](#scrollTo=BpI2vxmiZONa)



#Installation and Setup

In [12]:
from google.colab import drive
drive.mount('/content/drive')
%cd /content/drive/MyDrive/CS769

Mounted at /content/drive
/content/drive/.shortcut-targets-by-id/1pyPDXXgFYu_gD-IDMCmfmchXQboVGhNs/CS769


In [None]:
%%capture
!pip install emoji transformers

#Data Loading and Validation

In [7]:
import pandas as pd
df_ck_train =  pd.read_csv("original/ck_train.csv", encoding='utf-8')
df_ck_test =  pd.read_csv("original/ck_test.csv", encoding='utf-8')
df_ck_dev =  pd.read_csv("original/ck_dev.csv", encoding='utf-8')
df_k_train = pd.read_csv("translated/k_train.csv", encoding='utf-8')
df_e_train = pd.read_csv("translated/e_train.csv", encoding='utf-8')

In [None]:
def validate(df):
  assert set(df['label'])=={"Offensive","Not_offensive"}
  assert df.isna().any().any()==False

In [None]:
validate(df_ck_train)
validate(df_ck_test)
validate(df_ck_dev)
validate(df_k_train)
validate(df_e_train)
assert len(df_k_train)==len(df_e_train)==len(df_ck_train)
print(len(df_k_train))

4695


#Statistics

In [None]:
def balance_stats(df):
  dupes=df['text'].value_counts()
  labels=df['label'].value_counts()
  return pd.Series(dict(
      n_duplicates=dupes[dupes>1].sum()-len(dupes[dupes>1]),
      offensive=labels['Offensive'],
      not_offensive=labels['Not_offensive'],
      total=labels.sum()
  ))

In [None]:
stats_original=pd.DataFrame(dict(
    ck_train=balance_stats(df_ck_train),
    ck_test=balance_stats(df_ck_test),
    ck_dev=balance_stats(df_ck_dev),
)).transpose()
stats_translated=pd.DataFrame(dict(
    k_train=balance_stats(df_k_train),
    e_train=balance_stats(df_e_train),
)).transpose()
print(stats_original)
stats_original.to_csv('stats/stats_original.csv')
print(stats_translated)
stats_translated.to_csv('stats/stats_translated.csv')

          n_duplicates  offensive  not_offensive  total
ck_train           174       1151           3544   4695
ck_test              5        166            427    593
ck_dev              14        160            426    586
         n_duplicates  offensive  not_offensive  total
k_train           216       1151           3544   4695
e_train           209       1151           3544   4695


In [None]:
import string
import unicodedata
  
def dominance_stats(df):

  unique_words = set()
  for sentence in df['text']:
      words = sentence.split()
      unique_words.update(words)
  unique_words = set(word for word in unique_words if not (all(char in string.punctuation for char in word) or word.isdigit()))
  
  def is_english_word(word):
      try:
        for char in word:
            if char.isalpha() or char.isspace():
                continue
            if 'Emoji' in unicodedata.name(char):
                continue
            if char in string.punctuation:
                continue
            return False
      except ValueError:
        return False
      return True
  english_words = set(filter(is_english_word, unique_words))
  return pd.Series(dict(
      e_dominance=len(english_words)/len(unique_words),
  ))

dominance_e  + dominance_k = 1

In [None]:
dominance_stats_original=pd.DataFrame(dict(
    ck_train=dominance_stats(df_ck_train),
    ck_test=dominance_stats(df_ck_test),
    ck_dev=dominance_stats(df_ck_dev),
)).transpose()
dominance_stats_translated=pd.DataFrame(dict(
    k_train=dominance_stats(df_k_train),
    e_train=dominance_stats(df_e_train),
)).transpose()
print(dominance_stats_original)
dominance_stats_original.to_csv('stats/dominance_stats_original.csv')
print(dominance_stats_translated)
dominance_stats_translated.to_csv('stats/dominance_stats_translated.csv')

          e_dominance
ck_train     0.628155
ck_test      0.621958
ck_dev       0.671467
         e_dominance
k_train     0.256410
e_train     0.974296


#Create Merged Datasets

In [None]:
df_ck_e_k_train=pd.concat([df_ck_train,df_k_train,df_e_train])
df_ck_k_train=pd.concat([df_ck_train,df_k_train])
df_ck_e_train=pd.concat([df_ck_train,df_e_train])

In [None]:
df_ck_e_k_train.to_csv('translated/ck_e_k_train.csv', index=False)
df_ck_k_train.to_csv('translated/ck_k_train.csv', index=False)
df_ck_e_train.to_csv('translated/ck_e_train.csv', index=False)

#Alignment

In [None]:
%%capture
# Install fast_align using apt-get
! sudo apt-get install -y cmake
! rm -rf fast_align
! git clone https://github.com/clab/fast_align.git
! cd fast_align && mkdir build && cd build && cmake .. && make
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

In [None]:
aligned_text_k2e = []
aligned_text_e2k = []
for k_sentence, e_sentence in zip(df_k_train["text"],df_e_train["text"]):
  k_sentence=' '.join(word_tokenize(k_sentence))
  e_sentence=' '.join(word_tokenize(e_sentence))
  aligned_text_k2e.append(k_sentence + ' ||| ' + e_sentence)
  aligned_text_e2k.append(e_sentence + ' ||| ' + k_sentence)

with open('aligned/aligned_text_k2e.txt', 'w') as f:
    f.write('\n'.join(aligned_text_k2e))
with open('aligned/aligned_text_e2k.txt', 'w') as f:
    f.write('\n'.join(aligned_text_e2k))

In [None]:
%%capture
!fast_align/build/fast_align -i aligned/aligned_text_k2e.txt -d -o -v > aligned/k2e_align.txt
!fast_align/build/fast_align -i aligned/aligned_text_e2k.txt -d -o -v > aligned/e2k_align.txt

In [None]:
def align_dict(direction):
  with open(f'aligned/{direction}_align.txt', 'r', encoding='utf-8') as f:
    alignments = [line.strip().split() for line in f]
  dictionary = list()
  for sentence in alignments:
    temp=dict()
    for pair in sentence:
      i, j = pair.split("-")
      i, j = int(i), int(j)
      temp[i]=j
    dictionary.append(temp)  
  return dictionary

In [None]:
dictionary_k2e=align_dict('k2e')
dictionary_e2k=align_dict('e2k')

In [None]:
import random
dominance_e=0.628155
def augment_aligned(df_l1_train,df_l2_train,dictionary,dominance):
  text=  [] 
  labels = []
  for l1_sentence,l2_sentence,label,dict_line  in zip(df_l1_train["text"],df_l2_train["text"],df_l1_train["label"],dictionary):
    sent=[]
    l1_sentence=word_tokenize(l1_sentence)
    l2_sentence=word_tokenize(l2_sentence)
    for index,word in enumerate(l1_sentence):
      if index in dict_line and random.random()<dominance:
        sent.append(l2_sentence[dict_line[index]])
      else:
        sent.append(word)
    labels.append(label)
    text.append(' '.join(sent))
  return pd.DataFrame(dict(text=text,label=labels))

In [None]:
df_ckwk2e_train=[]
df_ckwe2k_train=[]
for i in range(3):
  df_ckwk2e_train.append(augment_aligned(df_k_train,df_e_train, dictionary_k2e, dominance_e))
  df_ckwe2k_train.append(augment_aligned(df_e_train,df_k_train, dictionary_e2k, 1-dominance_e))
df_ckwk2e_train=pd.concat(df_ckwk2e_train)
df_ckwe2k_train=pd.concat(df_ckwe2k_train)
df_ckwk2e_train = df_ckwk2e_train.append(df_ck_train)
df_ckwe2k_train = df_ckwe2k_train.append(df_ck_train)
print(len(df_ckwk2e_train), len(df_ckwe2k_train))

In [None]:
print(df_ck_train.head())

                                                text          label
0  Tik tok alli jagala madtidralla adra baggenu o...  Not_offensive
1                          Movie rerelease madi plss  Not_offensive
2  Amazon prime alli bittidira....yella manele no...  Not_offensive
3  Guru sure news nanu tik tok dawn lod madeda ya...  Not_offensive
4  ಸುದೀಪ್ ಸರ್ ಅಂಡ್ ದರ್ಶನ್ ಸರ್ ಅವರಿಗೆ ಇರೋ ಫ್ಯಾನ್ಸ್...  Not_offensive


In [None]:
print(df_ckwk2e_train.head())

                                                text          label
0  ತಿಕ್ Tok ಅಲ್ಲಿ ಜಗಳ ಮಾಡ್ತಿದ್ರಲ್ಲ ಅದ್ರ ಬಗ್ಗೆನೂ ಒ...  Not_offensive
1                         Movie ರೇರೆಳೆಯಾಸೆ ಮಾಡಿ plss  Not_offensive
2  Amazon ಪ್ರೈಮ್ ಅಲ್ಲಿ Prime .... is there ನೋಡ್ತಾ...  Not_offensive
3  ಗುರು Nivs ನಿವ್ಸ್ ನಾನು ತಿಕ್ Tok ಡಾನ್ನ ಲೋಡ್ ಮಾಡಿ...  Not_offensive
4  Sudeep sir and ದರ್ಶನ್ ಸರ್ ಅವರಿಗೆ ಇರೋ ಫ್ಯಾನ್ಸ್ ...  Not_offensive


In [None]:
print(df_ckwe2k_train.head())

                                                text          label
0  Did you make ಒಂದು ವಿಡಿಯೋ about Tik Tok and bec...  Not_offensive
1                         Movie rerelease madi ಪ್ಲಸ್  Not_offensive
2                        Amazon .... is ನೋಡ್ತಾರೆ ...  Not_offensive
3  ಗುರು Sure Nivs When I ಡಾನ್ನ Tik Tok ಮಾಡಿದ ಯಾವಾ...  Not_offensive
4  ಸುದೀಪ್ ಸರ್ and ದರ್ಶನ್ ಸರ್ have Ero ಫ್ಯಾನ್ಸ್ fo...  Not_offensive


In [None]:
print(df_k_train.head())

                                                text          label
0  ತಿಕ್ ಟಾಕ್ ಅಲ್ಲಿ ಜಗಳ ಮಾಡ್ತಿದ್ರಲ್ಲ ಅದ್ರ ಬಗ್ಗೆನೂ ...  Not_offensive
1                         ಮೂವಿ ರೇರೆಳೆಯಾಸೆ ಮಾಡಿ ಪ್ಲಸ್  Not_offensive
2  ಅಮೆಜಾನ್ ಪ್ರೈಮ್ ಅಲ್ಲಿ ಬಿತ್ತಿದಿರಾ....ಎಲ್ಲ ಮನೇಲೆ ...  Not_offensive
3  ಗುರು ಸುರೆ ನಿವ್ಸ್ ನಾನು ತಿಕ್ ತೊಕ್ ಡಾನ್ನ ಲೋಡ್ ಮಾಡ...  Not_offensive
4  ಸುದೀಪ್ ಸರ್ ಅಂಡ್ ದರ್ಶನ್ ಸರ್ ಅವರಿಗೆ ಇರೋ ಫ್ಯಾನ್ಸ್...  Not_offensive


In [None]:
print(df_e_train.head())

                                                text          label
0  Did you make a video about Tik Tok and become ...  Not_offensive
1                          Movie rerelease madi plss  Not_offensive
2                           Amazon Prime is there...  Not_offensive
3  Guru Sure Nivs When I Loaded Tik Tok Don When ...  Not_offensive
4  Sudeep sir and Darshan sir have Ero fans follo...  Not_offensive


In [None]:
df_ckwk2e_train.to_csv('augmented/ckwk2e_train.csv', index=False)
df_ckwe2k_train.to_csv('augmented/ckwe2e_train.csv', index=False)

In [None]:
print(dominance_stats(df_ckwk2e_train))
print(dominance_stats(df_ckwe2k_train))

e_dominance    0.488936
dtype: float64
e_dominance    0.581811
dtype: float64


In [None]:
df_ckwk2e_train = []
e_dominance=0.628155
for k_sentence,e_sentence,label, e2k in zip(df_k_train["text"],df_e_train["text"],df_k_train["label"],dictionary_e2k):
  sent=[]
  k_sentence=word_tokenize(k_sentence)
  e_sentence=word_tokenize(e_sentence)
  for index,word in enumerate(e_sentence):
    if random.random()>e_dominance:
      sent.append(e_sentence[e2k[index]])
    else:
      sent.append(word)
  df_ckwk2e_train.append(' '.join(sent))
df_ckwk2e_train=pd.DataFrame(df_ckwk2e_train)



Plan for CKalignK2E

```
Start from a sentence from K-> (k, k_i)
for each word j in k:
  if roll dice with p (dominance_e):
    replace word with e[dictionaryK2E[k_i][j]]

dominance_e(CKalign)==dominance_e(CK)
```

 CKalignE2K



CKalignCK2CK

```
Start from a sentence in CK -> (ck, ck_i)
for each word j in ck:
  if word is english:
    if roll dice p=dominence_k:
      replace word with k[dictionaryE2K[ck_i][j]]
  same fr k

dominance_e(CKalign)==1-dominance_e(CK)
```



#Experiments


In [3]:
from sklearn.metrics import *
def show_metrics(folder_name):
      df=pd.read_csv(f'models/{folder_name}/test_pred.csv')
      y_true, y_pred=list(df['label']), list(df['pred'])
      accuracy = accuracy_score(y_true, y_pred)
      mf1Score = f1_score(y_true, y_pred, average='macro')
      f1Score  = f1_score(y_true, y_pred)
      fpr, tpr, _ = roc_curve(y_true, y_pred)
      area_under_c = auc(fpr, tpr)
      recallScore = recall_score(y_true, y_pred)
      precisionScore = precision_score(y_true, y_pred)
      return {"accuracy": accuracy, 'mF1Score': mf1Score, 'f1Score': f1Score, 'auc': area_under_c,'precision': precisionScore, 'recall': recallScore}

## CK

In [None]:
!python train_generic.py original/ck_train.csv original/ck_dev.csv original/ck_test.csv  models/ck/

In [8]:
show_metrics('ck')

{'accuracy': 0.8145025295109612,
 'mF1Score': 0.7636799698576935,
 'f1Score': 0.6540880503144654,
 'auc': 0.7570469230552186,
 'precision': 0.6842105263157895,
 'recall': 0.6265060240963856}

## CK Character Augmentation

In [None]:
!python train_generic.py augmented/cka_train.csv original/ck_dev.csv original/ck_test.csv  models/cka/

In [9]:
show_metrics('cka')

{'accuracy': 0.7976391231028668,
 'mF1Score': 0.7471431008016374,
 'f1Score': 0.6341463414634146,
 'auc': 0.7453373211816822,
 'precision': 0.6419753086419753,
 'recall': 0.6265060240963856}

## CK_E

In [None]:
!python train_generic.py translated/ck_e_train.csv original/ck_dev.csv original/ck_test.csv  models/ck_e/

In [10]:
show_metrics('ck_e')

{'accuracy': 0.8293918918918919,
 'mF1Score': 0.7806788128661192,
 'f1Score': 0.6773162939297125,
 'auc': 0.7711550427060354,
 'precision': 0.7210884353741497,
 'recall': 0.6385542168674698}

##CK_K


In [None]:
!python train_generic.py translated/ck_k_train.csv original/ck_dev.csv original/ck_test.csv  models/ck_k/

In [11]:
show_metrics('ck_k')

{'accuracy': 0.8246205733558178,
 'mF1Score': 0.7689910401246591,
 'f1Score': 0.6556291390728476,
 'auc': 0.754867244152253,
 'precision': 0.7279411764705882,
 'recall': 0.5963855421686747}

## CK_E_K

In [None]:
!python train_generic.py translated/ck_e_k_train.csv  original/ck_dev.csv original/ck_test.csv  models/ck_e_k/

In [12]:
show_metrics('ck_e_k')

{'accuracy': 0.8361486486486487,
 'mF1Score': 0.7820216444793671,
 'f1Score': 0.6734006734006734,
 'auc': 0.7648198427512868,
 'precision': 0.7633587786259542,
 'recall': 0.6024096385542169}

## CK Word Alignment Augmentation

In [None]:
!python train_generic.py augmented/ckwk2e_train.csv  original/ck_dev.csv original/ck_test.csv  models/ckwk2e/

In [None]:
show_metrics('ckwk2e')

In [None]:
!python train_generic.py augmented/ckwe2k_train.csv  original/ck_dev.csv original/ck_test.csv  models/ckwe2k/

In [16]:
show_metrics('ckwe2k')

{'accuracy': 0.8010118043844857,
 'mF1Score': 0.7592416735480318,
 'f1Score': 0.6589595375722543,
 'auc': 0.7660901216105641,
 'precision': 0.6333333333333333,
 'recall': 0.6867469879518072}

## CK Translated Test Test  

In [None]:
!cp -R models/ck_e models/translatedtest_ck_e
!cp -R models/ck_k models/translatedtest_ck_k

In [None]:
!python train_generic.py translated/ck_k_train.csv  translated/k_dev.csv translated/k_test.csv  models/translatedtest_ck_k/

In [17]:
show_metrics('translatedtest_ck_k')

{'accuracy': 0.806070826306914,
 'mF1Score': 0.7657599791160742,
 'f1Score': 0.6685878962536024,
 'auc': 0.7732851781834599,
 'precision': 0.6408839779005525,
 'recall': 0.6987951807228916}

In [None]:
!python train_generic.py translated/ck_e_train.csv  translated/e_dev.csv translated/e_test.csv  models/translatedtest_ck_e/

In [None]:
show_metrics('translatedtest_ck_e')

{'accuracy': 0.7672849915682968,
 'mF1Score': 0.7313041765169426,
 'f1Score': 0.6329787234042553,
 'auc': 0.7518763578905787,
 'precision': 0.5666666666666667,
 'recall': 0.7168674698795181}

CK Train -> K test 

In [None]:
!cp -r models/ck models/ck_test_kannada 

In [None]:
!python test_generic.py original/ck_train.csv  translated/k_dev.csv translated/k_test.csv  models/ck_test_kannada/

In [None]:
show_metrics('ck_test_kannada')

{'accuracy': 0.7689713322091062,
 'mF1Score': 0.7062583838624884,
 'f1Score': 0.5705329153605015,
 'auc': 0.7014968539262437,
 'precision': 0.5947712418300654,
 'recall': 0.5481927710843374}

In [None]:
!cp -r models/ck models/ck_test_english 

In [None]:
!python test_generic.py original/ck_train.csv  translated/e_dev.csv translated/e_test.csv  models/ck_test_english/

In [None]:
show_metrics('ck_test_english')

{'accuracy': 0.7841483979763912,
 'mF1Score': 0.728223196127295,
 'f1Score': 0.6049382716049382,
 'auc': 0.7249231116503485,
 'precision': 0.620253164556962,
 'recall': 0.5903614457831325}

In [None]:
!cp -r models/ck_e_k models/ck_e_k_test_english

In [None]:
!python test_generic.py augmented/ck_e_k_train.csv  translated/e_dev.csv translated/e_test.csv  models/ck_e_k_test_english/

In [None]:
!cp -r models/ck_e_k models/ck_e_k_test_kannada

In [None]:
!python test_generic.py augmented/ck_e_k_train.csv  translated/k_dev.csv translated/k_test.csv  models/ck_e_k_test_kannada/

In [None]:
!cp -r models/ck_k models/ck_k_test_kannada

In [None]:
!python test_generic.py augmented/ck_k_train.csv  translated/k_dev.csv translated/k_test.csv  models/ck_k_test_kannada/

In [None]:
!cp -r models/ck_e models/ck_e_test_english

In [None]:
!python test_generic.py augmented/ck_e_train.csv  translated/k_dev.csv translated/k_test.csv  models/ck_e_test_english/

In [None]:
!cp -r models/ck_e models/ck_e_test_english