<a href="https://colab.research.google.com/github/rfahrn/Shared_Task/blob/main/Shared_Task.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
import pandas as pd
train = pd.read_csv('./data/offenseval-ar-training-v1.tsv', sep='\t')

In [None]:
train.head(5)

In [None]:
# !pip install emoji
# !pip install DSAraby 
# !pip install Tashaphyne

In [10]:
# Preprocessing Arabic Tweets
import emoji

def avg_word(sentence):
    words = sentence.split()
    if len(words) == 0:
        return 0
    return sum(len(word) for word in words) / len(words)

def emoji_counter(sentence):
    return emoji.emoji_count(sentence)


train['word_count'] = train['tweet'].apply(lambda x: len(str(x).split(" ")))
train['char_count'] = train['tweet'].str.len() ## this also includes spaces
train['avg_char_per_word'] = train['tweet'].apply(lambda x: avg_word(x))
train['emoji_count'] = train['tweet'].apply(lambda x: emoji_counter(x))
train = train.sort_values(by='word_count',ascending=[0])
train.head()

Unnamed: 0,id,tweet,subtask_a,word_count,char_count,avg_char_per_word,emoji_count
2922,2946,<LF>من المظاهر المؤسفه ان الاب والام <LF>يقعدو...,NOT,941,5857,4.403137,74
6062,6160,<LF>جده الحُب ،،<LF>جده يا مُنتهى كل الكلام يا...,NOT,862,5416,4.437751,36
4761,4833,"يا مالي عمري رضا يا شبيه الورد , يا معنى الحي...",NOT,531,3178,4.226974,108
6564,6707,يا من أظهر الجميل.. وستر القبيح.. يا من لا يؤا...,NOT,367,2283,4.397163,8
2753,2760,يا مُكعب السُكرَ يا زمُردي الأحمر و يأسريُ ال...,NOT,363,2129,4.165049,34


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
sns.countplot(train.subtask_a)
plt.title('count NOT/OFF') # much mor not than offendent tweets

In [None]:
# first we define a list of arabic and english punctiations that we want to get rid of in our text
import string
import re
punctuations = '''`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ''' + string.punctuation

arabic_diacritics = re.compile("""
                             ّ    | # Shadda
                             َ    | # Fatha
                             ً    | # Tanwin Fath
                             ُ    | # Damma
                             ٌ    | # Tanwin Damm
                             ِ    | # Kasra
                             ٍ    | # Tanwin Kasr
                             ْ    | # Sukun
                             ـ     # Tatwil/Kashida
                         """, re.VERBOSE)

def preprocess(text):
    """
    text is an arabic string input

    the preprocessed text is returned
    """
    
    #remove punctuations
    translator = str.maketrans('', '', punctuations)
    text = text.translate(translator)
    
    # remove Tashkeel
    text = re.sub(arabic_diacritics, '', text)
    
    #remove longation
    text = re.sub("[إأآا]", "ا", text)
    text = re.sub("ى", "ي", text)
    text = re.sub("ؤ", "ء", text)
    text = re.sub("ئ", "ء", text)
    text = re.sub("ة", "ه", text)
    text = re.sub("گ", "ك", text)

    text = ' '.join(word for word in text.split())

    return text
  
train['tweet'] = train['tweet'].apply(preprocess)
print(train.head(5))

In [None]:
# removing emojis 
def remove_emoji(string):
    return emoji.get_emoji_regexp().sub(u'', string)

# removing USER
def user_remove(string):
  string = re.sub('(USER)', '', string)
  return string

train['tweet'] = train['tweet'].apply(remove_emoji)
train['tweet'] = train['tweet'].apply(user_remove)
train.head(40)

In [None]:
from sklearn.model_selection import train_test_split
data = train
X = data.tweet.values
y = data.subtask_a.values

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=2020)


In [32]:
# removing emojis 
def remove_emoji(string):
    return emoji.get_emoji_regexp().sub(u'', string)

# removing USER
def user_remove(string):
  string = re.sub('(USER)', '', string)
  return string
train['tweet'] = train['tweet'].apply(remove_emoji)
train['tweet'] = train['tweet'].apply(user_remove)
train.head(40)

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,id,tweet,subtask_a,word_count,char_count,avg_char_per_word,stopwords,emoji_count
2922,2946,LFمن المظاهر المءسفه ان الاب والام LFيقعدون يت...,NOT,941,5857,4.403137,211,74
6062,6160,LFجده الحب LFجده منتهي الكلام سيده المدن URL ...,NOT,862,5416,4.437751,203,36
4761,4833,مالي عمري رضا شبيه الورد معني الحياه NOT 4834 ...,NOT,531,3178,4.226974,147,108
6564,6707,اظهر الجميل وستر القبيح يءاخذ بالجريره يهتك ال...,NOT,367,2283,4.397163,102,8
2753,2760,مكعب السكر زمردي الاحمر ياسري الاكبر فتنتي الع...,NOT,363,2129,4.165049,79,34
2124,2125,اللهم اني اسالك مساله الباءس الفقير وادعوك دعا...,OFF,179,1113,4.673469,36,3
5316,5414,رنا رنا رنا رنا رنا رنا رنا رنا رنا رنا رنا رن...,NOT,84,279,2.333333,56,0
6428,6571,باي باي باي باي باي باي باي باي باي باي باي با...,NOT,78,272,2.5,39,0
4030,4102,فهد باص فهد باص فهد باص فهد باص فهد باص فهد با...,OFF,74,276,2.743243,24,0
3777,3849,رب اتسفه رب اتسفه رب اتسفه رب اتسفه رب اتسفه ر...,NOT,73,278,2.821918,36,0


In [None]:

from sklearn.model_selection import train_test_split
data = train
X = data.tweet.values
y = data.subtask_a.values

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=2020)
