In [None]:
# Make the project folder the working directory so the relative file path used
# when loading the TSV resolves. NOTE(review): the path looks like a mounted
# Google Drive in Colab - confirm the drive is mounted before running this cell.
%cd /content/drive/MyDrive/research/PROJ201 20221/Detect English Text

In [None]:
import pandas as pd

In [None]:
# Load the annotated tweets; the file is tab-separated.
data = pd.read_csv('all_annotated.tsv', sep='\t')

In [None]:
# Rows whose 'Definitely English' flag equals 1.
data_en = data[data['Definitely English'].eq(1)]
data_en

In [None]:
# Rows whose 'Definitely Not English' flag equals 1.
data_non_en = data[data['Definitely Not English'].eq(1)]
data_non_en

In [None]:
# Stack the two subsets (English rows first) and renumber the rows from 0.
data = pd.concat([data_en, data_non_en], ignore_index=True)
data

In [None]:
# Keep only the tweet text and the two annotation flags.
# .copy() makes the result an independent frame: later cells assign new columns
# to `data`, and doing that on a column slice of the previous frame can raise
# SettingWithCopyWarning on pandas versions without copy-on-write.
data = data[['Tweet', 'Definitely English', 'Definitely Not English']].copy()
data

In [None]:
def define_label(definitely_en, definitely_non_en):
    """Turn the annotation flags of one tweet into a label string.

    Returns 'en' when ``definitely_en`` equals 1 and 'non-en' otherwise.
    ``definitely_non_en`` is accepted but never consulted.
    """
    return 'en' if definitely_en == 1 else 'non-en'

In [None]:
# Build the label column by passing each row's two flags to define_label.
data['label'] = [
    define_label(en_flag, non_en_flag)
    for en_flag, non_en_flag in zip(data['Definitely English'],
                                    data['Definitely Not English'])
]
data

In [None]:
# The flag columns are no longer needed once 'label' exists.
data = data.drop(columns=['Definitely English', 'Definitely Not English'])
data

In [None]:
import re
# Character class of emoji / pictographic code points, compiled once.
#
# It deliberately does NOT contain a range running from U+24C2 to U+1F251:
# that span covers almost the whole Basic Multilingual Plane above U+24C2
# (Hiragana, Katakana, CJK ideographs, Hangul, Arabic presentation forms, ...),
# so it would wipe the text of tweets written in those scripts. Only the
# individual emoji code points from that region are listed instead.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # regional indicators (flags)
    "\U0001F926-\U0001F937"  # people / gesture emoji
    "\U00010000-\U0010FFFF"  # every other supplementary-plane code point
    "\u2500-\u2BEF"          # box drawing .. misc symbols and arrows
                             # (includes dingbats and U+2600-U+2B55)
    "\u24C2"                 # circled latin capital letter M
    "\u200d"                 # zero-width joiner
    "\u23cf"                 # eject symbol
    "\u23e9"                 # fast-forward symbol
    "\u231a"                 # watch
    "\ufe0f"                 # variation selector-16
    "\u3030"                 # wavy dash
    "\u303d"                 # part alternation mark
    "\u3297\u3299"           # circled ideograph congratulation / secret
    "]+"
)


def remove_emojis(data):
    """Return the string ``data`` with emoji and pictographic symbols removed.

    Runs of matching characters are deleted outright (nothing is inserted in
    their place). Ordinary text in any script of the Basic Multilingual Plane,
    including CJK, kana and Hangul, is left untouched.
    """
    return _EMOJI_PATTERN.sub('', data)

In [None]:
import string
def remove_punctuations(text):
    """Return ``text`` without the characters listed in ``string.punctuation``."""
    return ''.join(symbol for symbol in text if symbol not in string.punctuation)

In [None]:
# E-mail address pattern. The local part accepts ASCII ' and - in addition to
# the typographic ‘ and – that the class already contained (NOTE(review): those
# look like mis-encoded ASCII characters - confirm; both forms are accepted).
_EMAIL_PATTERN = r"[a-zA-Z\d#!%\$'‘&\+\*\-–/=\?\^_`\.\{\|\}~]+@[a-zA-Z\d]+\.[a-zA-Z\.]+"


def clean_tweets(tweet):
    """Strip non-linguistic content from one tweet and normalise its spacing.

    In order, this removes emoji, e-mail addresses, @mentions, #hashtags and
    http links, turns runs of newlines into a single space, deletes ASCII
    punctuation, collapses repeated spaces and trims both ends.

    Returns the cleaned string, which may be empty.
    """
    # Remove emoji
    tweet = remove_emojis(tweet)
    # Remove e-mails. This has to run before the @mention step: otherwise the
    # "@domain" part is stripped first and the address can never match.
    tweet = re.sub(_EMAIL_PATTERN, "", tweet)
    # Remove @mentions
    tweet = re.sub("@[A-Za-z0-9_]+", "", tweet)
    # Remove hashtags
    tweet = re.sub("#[A-Za-z0-9_]+", "", tweet)
    # Remove links
    tweet = re.sub(r'http\S+', '', tweet)
    # Replace each run of newlines with one space
    tweet = re.sub('\n+', ' ', tweet)
    # Remove punctuation
    tweet = remove_punctuations(tweet)
    # Collapse repeated spaces and trim
    tweet = re.sub(' +', ' ', tweet)
    return tweet.strip()

In [None]:
# Clean every tweet in place of the raw text.
data['Tweet'] = data['Tweet'].apply(clean_tweets)
data

In [None]:
# Flag the tweets whose text has length zero.
data['delete'] = data['Tweet'].map(len) == 0
data

In [None]:
# Keep the rows not flagged for deletion, then discard the helper column.
# .copy() detaches the result from the unfiltered frame: later cells add
# prediction columns to `data`, and assigning into a row slice can raise
# SettingWithCopyWarning on pandas versions without copy-on-write.
data = data.loc[~data['delete']].copy()
del data['delete']
data

In [None]:
# Class balance of the remaining tweets. The mean of a boolean mask is the
# share of True values; it is formatted with ':.2%' so the printed number
# really is a percentage, matching the label.
en_share = (data['label'] == 'en').mean()
non_en_share = (data['label'] == 'non-en').mean()
print('Number of tweets:', len(data))
print(f'Percentage of English Tweets: {en_share:.2%}')
print(f'Percentage of Non-English Tweets: {non_en_share:.2%}')

In [None]:
# Install langdetect, which the next cell imports.
!pip install langdetect

In [None]:
from langdetect import DetectorFactory, detect
from langdetect.lang_detect_exception import LangDetectException
# Pin langdetect's seed; presumably this keeps predictions identical between
# runs (NOTE(review): confirm against the langdetect README).
DetectorFactory.seed = 7


def langdetect_predict(txt):
    """Label ``txt`` as 'en' or 'non-en' using langdetect.

    Returns 'en' only when ``detect`` reports exactly "en". Any other result,
    or a ``LangDetectException`` raised by ``detect``, yields 'non-en'.
    """
    try:
        detected = detect(txt)
    except LangDetectException:
        return 'non-en'
    return 'en' if detected == "en" else 'non-en'

In [None]:
# One langdetect prediction per cleaned tweet.
data['langdetect_pred'] = data['Tweet'].apply(langdetect_predict)
data

In [None]:
# Install pycld2, which is imported further down as cld2.
! pip install pycld2

In [None]:
import regex
def remove_bad_chars(text):
    """Return ``text`` without control (Cc) and surrogate (Cs) characters.

    Runs of characters in those two Unicode general categories are deleted.
    """
    return regex.sub(r"[\p{Cc}\p{Cs}]+", "", text)

In [None]:
import pycld2 as cld2
def pycld2_predict(txt, threshold=0.8):
    """Label ``txt`` as 'en' or 'non-en' from pycld2's per-chunk vectors.

    The text is first passed through ``remove_bad_chars``. Each vector returned
    by ``cld2.detect(..., returnVectors=True)`` contributes its element 1 as a
    weight (NOTE(review): per the pycld2 docs this is the chunk's byte length -
    confirm) and its element 2 as the language name.

    Args:
        txt: the text to classify.
        threshold: the English share of the total weight must be strictly
            greater than this value for the text to be labelled 'en'.
            Defaults to 0.8.

    Returns:
        'en' or 'non-en'. Text that produces no weight at all is 'non-en'.
    """
    txt = remove_bad_chars(txt)
    # Only the vectors (fourth element of the result) are used here.
    _, _, _, vectors = cld2.detect(txt, returnVectors=True)

    total_score = 0
    english_score = 0
    for vector in vectors:
        weight = vector[1]
        if vector[2].lower() == 'english':
            english_score += weight
        total_score += weight

    # Guard against division by zero when no vectors (or only zero weights) come back.
    score = english_score / total_score if total_score > 0 else 0
    return 'en' if score > threshold else 'non-en'

In [None]:
# One pycld2 prediction per cleaned tweet.
data['pycld2_pred'] = data['Tweet'].apply(pycld2_predict)
data

In [None]:
# Install langid, which the next cell imports.
!pip install langid

In [None]:
import langid
def langid_predict(txt):
    """Label ``txt`` as 'en' or 'non-en' using langid.

    Returns 'en' when the first element of ``langid.classify(txt)`` is 'en',
    and 'non-en' for anything else.
    """
    top_language = langid.classify(txt)[0]
    return 'en' if top_language == 'en' else 'non-en'

In [None]:
# One langid prediction per cleaned tweet.
data['langid_pred'] = data['Tweet'].apply(langid_predict)

In [None]:
# Display the frame, which by now carries the label and the prediction columns.
data

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [None]:
# pycld2 vs. the annotated labels; rows/columns follow the order en, non-en.
pycld2_label_order = ["en", "non-en"]
cm = confusion_matrix(data['label'], data['pycld2_pred'], labels=pycld2_label_order)
print(cm)
pycld2_report = classification_report(data['label'], data['pycld2_pred'], labels=pycld2_label_order)
print(pycld2_report)

In [None]:
# langid vs. the annotated labels; rows/columns follow the order en, non-en.
langid_label_order = ["en", "non-en"]
cm = confusion_matrix(data['label'], data['langid_pred'], labels=langid_label_order)
print(cm)
langid_report = classification_report(data['label'], data['langid_pred'], labels=langid_label_order)
print(langid_report)

In [None]:
# Find the confusion matrix, precision, recall and accuracy for the langdetect predictions