In [1]:
import pandas as pd
import re
import string

In [2]:
# Characters to strip from text: a hand-written set of Arabic/extended symbols plus
# the ASCII punctuation provided by the standard library. Some characters (for
# example '_' and '.') occur in both strings; the duplication is harmless because
# the combined string is only used as the "delete these" argument of str.maketrans.
# NOTE(review): the non-ASCII characters in the literal below look mis-encoded in
# this view (possible mojibake) — confirm the notebook is stored and read as UTF-8.
arabic_punctuations = '''`รทรุ<>_()*&^%][ูุ/:"ุ.,'{}~ยฆ+|!โโฆโโู'''
english_punctuations = string.punctuation
punctuations_list = arabic_punctuations + english_punctuations

# Compiled pattern that matches any one of the marks labelled inside the pattern
# (the diacritics listed, plus the tatweel/kashida elongation character).
# re.VERBOSE allows one alternative per line with an inline label: whitespace and
# '#' comments inside the pattern text are ignored by the regex engine.
# NOTE(review): every alternative renders as the same glyph in this view, which
# suggests the literals were mis-encoded somewhere — verify against the UTF-8 source.
arabic_diacritics = re.compile("""
                             ู    | # Tashdid
                             ู    | # Fatha
                             ู    | # Tanwin Fath
                             ู    | # Damma
                             ู    | # Tanwin Damm
                             ู    | # Kasra
                             ู    | # Tanwin Kasr
                             ู    | # Sukun
                             ู     # Tatwil/Kashida
                             """, re.VERBOSE)

def remove_diacritics(text):
    """Return ``text`` with every match of ``arabic_diacritics`` deleted.

    ``arabic_diacritics`` is the module-level compiled pattern; each match is
    replaced by the empty string and the remaining characters are left untouched.
    """
    stripped = arabic_diacritics.sub('', text)
    return stripped

def normalize_arabic(text: str) -> str:
    """Map several letter variants onto a single replacement letter each.

    Runs six regex substitutions over ``text`` in the order written below and
    returns the resulting string. The first substitution replaces any character
    of a character class; the others replace one literal sequence each.

    NOTE(review): the Arabic literals look mis-encoded in this view (some
    pattern/replacement pairs render identically), so the exact letters involved
    cannot be confirmed from here — verify against the UTF-8 original.
    """
    text = re.sub("[ุฅุฃุขุง]", "ุง", text)
    text = re.sub("ู", "ู", text)
    text = re.sub("ุค", "ุก", text)
    text = re.sub("ุฆ", "ุก", text)
    text = re.sub("ุฉ", "ู", text)
    text = re.sub("ฺฏ", "ู", text)   # NOTE(review): the author's original remark said this character is *kept* because it is a useful feature (used especially in Iraq), yet this line replaces it — confirm which behaviour is intended
    return text

def remove_punctuations(text):
    """Return ``text`` with every character found in ``punctuations_list`` removed.

    Characters are deleted outright (not replaced by a space), so the text on
    either side of a removed symbol becomes adjacent.
    """
    # Mapping a code point to None tells str.translate to drop that character.
    deletion_table = {ord(symbol): None for symbol in punctuations_list}
    return text.translate(deletion_table)

def remove_repeating_char(text):
    """Collapse each run of one repeated character down to a single occurrence.

    Any character that ``.`` matches is affected — letters, digits and spaces
    alike — so genuinely doubled letters are reduced too. Newlines are not
    matched by ``.`` and therefore runs of newlines are left as they are.
    """
    repeated_run = re.compile(r'(.)\1+')
    return repeated_run.sub(r'\1', text)

def clean_text(text):
    """Run the full cleaning pipeline on one string and return the result.

    Steps, in order:
      1. Replace '#', then '_', then every character outside U+0600-U+06FF
         with a space.
      2. Delete diacritics, delete the remaining punctuation, and normalise
         letter variants (``remove_diacritics``, ``remove_punctuations``,
         ``normalize_arabic``).
      3. Squeeze runs of spaces into one space.
      4. Collapse runs of any repeated character (``remove_repeating_char``).

    No strip is applied, so the result can begin or end with a single space
    when the input started or ended with non-Arabic characters.
    """
    # Step 1: everything that is not in the Arabic Unicode block becomes a space.
    for pattern in (r'#', r'_', r'[^\u0600-\u06FF]'):
        text = re.sub(pattern, ' ', text)

    # Step 2: character-level clean-up; the order of these helpers is significant.
    for step in (remove_diacritics, remove_punctuations, normalize_arabic):
        text = step(text)

    # Steps 3 and 4: squeeze spaces first, then de-duplicate repeated characters.
    single_spaced = re.sub(r' +', ' ', text)
    return remove_repeating_char(single_spaced)
    

In [3]:
# Load the raw text table; the CSV's "id" column becomes the DataFrame index.
raw_df = pd.read_csv(
    "../csv_files/raw_txt_df.csv",
    index_col="id",
)
raw_df

Unnamed: 0_level_0,raw_txt
id,Unnamed: 1_level_1
1175358310087892992,@Nw8ieJUwaCAAreT ููู ุจุงูููุงูุฉ .. ููุชูุถ .. ูุบูุฑ .
1175416117793349632,@7zNqXP0yrODdRjK ูุนูู ูุฐุง ูุญุณูุจ ุนูู ุงูุจุดุฑ .. ุญ...
1175450108898565888,@KanaanRema ูุจูู ูู ููุงูู ุฎููุฌู
1175471073770573824,@HAIDER76128900 ูุณูููู ูุฑูุฑู ูุฑูุญู ุงูุญููู๐
1175496913145217024,@hmo2406 ููู ูู ุงูุบูุจู ุงุฎ ูุญูุฏ ๐ธ๐บ
...,...
1019484980282580992,@Al_mhbaa_7 ูุจุณูุทูู ููู ุงููู ุจุงุณุทุงูุง๐
1021083283709407232,@Zzainabali @P_ameerah ูุงููู ูุงููุฏู ุงุจุด ูุฎุชู
1017477537889431552,@Al_mhbaa_7 ุดู ุนูููุง ูู ุญูุง ุชูุฑุจู ูููุง ุงุญูุง ูุณ...
1022430374696239232,@haneenalmwla ุงููู ูุจุงุฑู ูููุง ูุจุงูุนุงููู ๐๐๐


In [4]:
# Add a "txt" column holding the cleaned form of every "raw_txt" entry.
raw_df["txt"] = list(map(clean_text, raw_df["raw_txt"]))
raw_df

Unnamed: 0_level_0,raw_txt,txt
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1175358310087892992,@Nw8ieJUwaCAAreT ููู ุจุงูููุงูุฉ .. ููุชูุถ .. ูุบูุฑ .,ููู ุจุงูููุงูู ููุชูุถ ูุบูุฑ
1175416117793349632,@7zNqXP0yrODdRjK ูุนูู ูุฐุง ูุญุณูุจ ุนูู ุงูุจุดุฑ .. ุญ...,ูุนูู ูุฐุง ูุญุณูุจ ุนูู ุงูุจุดุฑ ุญูููู ูุญุดูู ูุชุทูุจูู ...
1175450108898565888,@KanaanRema ูุจูู ูู ููุงูู ุฎููุฌู,ูุจูู ูู ููุงูู ุฎููุฌู
1175471073770573824,@HAIDER76128900 ูุณูููู ูุฑูุฑู ูุฑูุญู ุงูุญููู๐,ูุณูููู ูุฑูุฑู ูุฑูุญู ุงูุญููู
1175496913145217024,@hmo2406 ููู ูู ุงูุบูุจู ุงุฎ ูุญูุฏ ๐ธ๐บ,ููู ูู ุงูุบูุจู ุงุฎ ูุญูุฏ
...,...,...
1019484980282580992,@Al_mhbaa_7 ูุจุณูุทูู ููู ุงููู ุจุงุณุทุงูุง๐,ูุจุณูุทูู ููู ุงูู ุจุงุณุทุงูุง
1021083283709407232,@Zzainabali @P_ameerah ูุงููู ูุงููุฏู ุงุจุด ูุฎุชู,ูุงูู ูุงููุฏู ุงุจุด ูุฎุชู
1017477537889431552,@Al_mhbaa_7 ุดู ุนูููุง ูู ุญูุง ุชูุฑุจู ูููุง ุงุญูุง ูุณ...,ุดู ุนูููุง ูู ุญูุง ุชูุฑุจู ููุง ุงุญูุง ูุณุงููู ููุด ุจุชุน...
1022430374696239232,@haneenalmwla ุงููู ูุจุงุฑู ูููุง ูุจุงูุนุงููู ๐๐๐,ุงูู ูุจุงุฑู ูููุง ูุจุงูุนุงููู


In [5]:
# Persist only the cleaned column; the "id" index is written alongside it.
raw_df["txt"].to_csv(path_or_buf="../csv_files/cleaned_txt_df.csv")

In [6]:
# pd.read_csv("../csv_files/cleaned_txt_df.csv", index_col="id")

Unnamed: 0_level_0,txt
id,Unnamed: 1_level_1
1175358310087892992,ููู ุจุงูููุงูู ููุชูุถ ูุบูุฑ
1175416117793349632,ูุนูู ูุฐุง ูุญุณูุจ ุนูู ุงูุจุดุฑ ุญูููู ูุญุดูู ูุชุทูุจูู ...
1175450108898565888,ูุจูู ูู ููุงูู ุฎููุฌู
1175471073770573824,ูุณูููู ูุฑูุฑู ูุฑูุญู ุงูุญููู
1175496913145217024,ููู ูู ุงูุบูุจู ุงุฎ ูุญูุฏ
...,...
1019484980282580992,ูุจุณูุทูู ููู ุงูู ุจุงุณุทุงูุง
1021083283709407232,ูุงูู ูุงููุฏู ุงุจุด ูุฎุชู
1017477537889431552,ุดู ุนูููุง ูู ุญูุง ุชูุฑุจู ููุง ุงุญูุง ูุณุงููู ููุด ุจุชุน...
1022430374696239232,ุงูู ูุจุงุฑู ูููุง ูุจุงูุนุงููู
