In [8]:
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.corpus import stopwords

import sklearn
import pickle

from tensorflow.keras.models import load_model
from tensorflow.keras import utils
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.metrics import Accuracy,Recall,Precision
from spacy.lang.fr.stop_words import STOP_WORDS as fr_stop
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tag import pos_tag

In [2]:
# Fetch the NLTK "stopwords" corpus; stopwords.words('french') reads from it.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Rasta\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [9]:
# Latin alphabets and accented letters. Only referenced by the commented-out
# `all_latin_chars` line in the cells shown here.
UP = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
LOW = "abcdefghijklmnopqrstuvwxyz"
LATIN = "Ààéêèç"
# Whitespace control characters (tab, newline, carriage return, vertical tab,
# form feed). The plain space character is not part of this set.
SPACE = '\t\n\r\v\f'
# NLTK French stop-word list, consumed by remove_fr_stop_words.
fr_stopwords=stopwords.words('french')

# Arabic letters that remove_arabic_chars strips from a text.
LETTERS = 'ابتةثجحخدذرزسشصضطظعغفقكلمنهويءآأؤإؤ'

#all_latin_chars = UP + LOW + LATIN

# ASCII punctuation characters from the standard library.
punctuations = string.punctuation

# Punctuation plus the whitespace control characters above.
punctuations_list =  punctuations + SPACE

# Same set without the apostrophe, so elisions such as "c'est" keep it when
# remove_french_punctuations is applied.
french_punctuations_list= punctuations_list.replace("'","")

# Character class of emojis and invisible formatting characters to remove.
# The trailing "+" makes one match cover a whole run of such characters.
emoji_pattern = re.compile("["
                       # Emoticon block U+1F600-U+1F64F is left out on purpose
                       # of this class, so those characters are kept:
                       #u"\U0001F600-\U0001F64F"  # emoticons to keep
                       u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                       u"\U0001F680-\U0001F6FF"  # transport & map symbols
                       u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                       u"\U00002702-\U000027B0"
                       # NOTE(review): this range covers every code point from
                       # U+24C2 to U+1F251, which is far more than emoji (CJK
                       # ideographs, for example, fall inside it) - confirm.
                       u"\U000024C2-\U0001F251"
                       u"\U0001f932"
                       u"\u200f"
                       u"\U0001F914"
                       u"\U0001F923"
                       u"\u200D"
                       u"\u202c"
                       u"\u2069"
                       u"\u2066"
                       u"\U0001F926"
                       u"\U0001F917"
                       u"\U0001f928"
                       u"\t"
                       u"\u200e"
                       "]+", flags=re.UNICODE)

In [4]:
# Show the punctuation characters taken from string.punctuation.
print(punctuations)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [10]:
#Remove small words 
def remove_minlen_word(text, threshold):
    """Drop every whitespace-separated word of length ``threshold`` or less.

    The surviving words are re-joined with single spaces, so any run of
    whitespace in the input is collapsed as a side effect.
    """
    kept_words = []
    for word in text.split():
        if len(word) > threshold:
            kept_words.append(word)
    return " ".join(kept_words)

#lower text
def lower(text):
    """Return ``text`` converted to lower case."""
    lowered = text.lower()
    return lowered

#Remove urls
def remove_url(text):
    """Strip URLs from ``text``.

    Three kinds of token are removed, each running up to the next whitespace:
    ``http://`` / ``https://`` links, ``ftp://`` / ``ftps://`` links, and bare
    hosts that start with a literal ``www.``.

    Returns the text with every match replaced by the empty string; the
    whitespace surrounding a removed URL is left untouched.
    """
    # Raw strings keep "\S" a regex escape rather than an invalid string escape.
    text = re.sub(r'https?://\S+', '', text)
    # The optional character is the trailing "s" (ftps), not the "p".
    text = re.sub(r'ftps?://\S+', '', text)
    # The dot is escaped so only a real "www." prefix matches; an unescaped dot
    # also deleted ordinary words such as "wwwooow".
    text = re.sub(r'www\.\S+', '', text)
    return text


#remove tags @ and hashtags #
def remove_prefix(text, prefix):
    """Drop every whitespace-separated word that starts with ``prefix``.

    Mainly used to remove mentions ("@") and hashtags ("#"). The remaining
    words are re-joined with single spaces.
    """
    return " ".join(
        word for word in text.split() if not word.startswith(prefix)
    )

# Remove punctuation
def remove_french_punctuations(text):
    """Delete every character of ``french_punctuations_list`` from ``text``.

    Characters are removed outright (not replaced by a space).
    """
    # Mapping a character to None in a translation table deletes it.
    deletion_table = str.maketrans(dict.fromkeys(french_punctuations_list))
    return text.translate(deletion_table)

# Remove French stop words. Only the French list is used here; you can add other DZ words that you judge to be stop words.
def remove_fr_stop_words(text):
    """Remove the words of ``text`` that appear in ``fr_stopwords``.

    The text is split on single spaces (not on arbitrary whitespace), so empty
    tokens produced by consecutive spaces are kept, and the comparison is
    case-sensitive.
    """
    return ' '.join(
        word for word in text.split(" ") if word not in fr_stopwords
    )

# Remove repeated letters
def remove_repeating_char(text):
    """Collapse runs of a repeated character to one occurrence, word by word.

    Words are delimited by single spaces. A word that ends up as exactly one
    character is written twice (e.g. "aaa" becomes "aa").
    """
    def _squeeze(word):
        squeezed = re.sub(r'(.)\1+', r'\1', word)
        # A word reduced to a single character is doubled.
        return squeezed + squeezed if len(squeezed) == 1 else squeezed

    return " ".join(_squeeze(word) for word in text.split(" "))

# Remove emojis
def remove_emoji(text):
    """Replace each run of characters matched by ``emoji_pattern`` with a space."""
    return re.sub(emoji_pattern, ' ', text)

# Remove extra white space
def remove_extra_white_spaces(text):
    """Collapse every run of whitespace to one space and trim both ends."""
    words = text.split()
    return ' '.join(words)

# remove digits delimited with white space
def remove_digits(text):
    """Remove stand-alone numbers (digit-only tokens) from ``text``.

    A number is removed only when it is delimited by whitespace or by the
    start/end of the string, so digits embedded in a word ("7aba") are kept.
    Each removed run of numbers, together with the single whitespace character
    on either side of it, is replaced by one space.
    """
    # A run of whitespace-separated numbers is matched as one unit. Matching
    # them one at a time consumes the separator that the next number needs as
    # its own left delimiter, so every second number in "1 2 3" would survive.
    # The (?:^|\s) ... (?:\s|$) delimiters also cover a text made only of digits.
    return re.sub(r"(?:^|\s)\d+(?:\s+\d+)*(?:\s|$)", " ", text)

# Remove a caller-supplied list of generic words; the Arabizi stop words can be added here.
def remove_words_generic(text,list_words=[]):
    """Remove from ``text`` every word contained in ``list_words``.

    Words are delimited by single spaces and compared exactly
    (case-sensitive). ``list_words`` is only read, never modified.
    """
    return " ".join(
        word for word in text.split(" ") if word not in list_words
    )


#remove all arabic characters from a text
def remove_arabic_chars(text):
    """Return ``text`` without the characters listed in ``LETTERS``.

    Only characters present in ``LETTERS`` are dropped; anything else,
    including spaces and digits, is kept in place.
    """
    return "".join(char for char in text if char not in LETTERS)

#normalize arabic numbers
def normalize_arabic_numbers(text):
    """Convert Arabic-Indic digits in ``text`` to the ASCII digits 0-9."""
    digit_map = {
        "٠": "0",
        "١": "1",
        "٢": "2",
        "٣": "3",
        "٤": "4",
        "٥": "5",
        "٦": "6",
        "٧": "7",
        "٨": "8",
        "٩": "9",
    }
    # One translate pass replaces the ten independent single-character swaps.
    return text.translate(str.maketrans(digit_map))

# normalize arabizi characters
def normalize_arabic(text):
    text = re.sub("2", "a", text)
    text = re.sub("3", "aa", text)
    text = re.sub("7", "h", text)
    text = re.sub("9", "q", text)
    text = re.sub("8","gh", text)
    text = re.sub("5", "kh", text)
    text = re.sub("sh","ch", text)
    text = re.sub("dj","j", text)
    return text
###################################################################################################
#Data Cleaning and Lemmatization
def remove_noise(text):
    """Tokenize, clean and lemmatize ``text``; return the kept tokens.

    Each token produced by ``nltk.word_tokenize`` is stripped of http(s) URLs
    and of "@name" mentions, lemmatized with ``WordNetLemmatizer`` using a part
    of speech derived from its ``pos_tag`` tag (NN* -> noun, VB* -> verb,
    anything else -> adjective), and kept - lower-cased - unless it is empty,
    is found in ``string.punctuation``, or is a French stop word.

    NOTE(review): the lemmatizer is WordNet-based while the stop words are the
    French list - confirm this pairing is intended.

    Returns:
        list of str: the cleaned, lower-cased tokens in their original order.
    """
    cleaned_tokens = []
    # Built once per call: a set gives constant-time membership tests and the
    # lemmatizer does not need to be re-created for every token.
    stop_words = set(stopwords.words('french'))
    lemmatizer = WordNetLemmatizer()

    for token, tag in pos_tag(nltk.word_tokenize(text)):
        # Raw strings keep the pattern text unchanged while avoiding invalid
        # string escapes such as "\(".
        token = re.sub(r'http[s]?://(?:[a-zA-Z]|[$-_@.&+#]|[!*\(\),]|'
                       r'(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', token)
        token = re.sub(r"(@[A-Za-z]+)", "", token)

        if tag.startswith("NN"):
            pos = 'n'
        elif tag.startswith('VB'):
            pos = 'v'
        else:
            pos = 'a'

        token = lemmatizer.lemmatize(token, pos)

        # `token not in string.punctuation` is a substring test, so it also
        # rejects short punctuation sequences contained in that string.
        if len(token) > 0 and token not in string.punctuation and token.lower() not in stop_words:
            cleaned_tokens.append(token.lower())
    return cleaned_tokens
#remove vowel
def rem_vowel(text):
    """Return ``text`` with its vowels removed, regardless of case.

    The vowels are a, e, i, o, u and the accented é, à, è. Each character is
    compared in lower case, so an upper-case vowel is removed as well. The
    previous version lower-cased the character it looked at but then deleted
    only the lower-case form, which left upper-case vowels in place.
    """
    vowels = ('a', 'e', 'i', 'o', 'u', 'é', 'à', 'è')
    return "".join(char for char in text if char.lower() not in vowels)


In [11]:
#this is the pipeline of preprocessing 
def my_preproc(data="",list_words=[]):
    """Run the full cleaning pipeline on one piece of text.

    Args:
        data: the raw text to clean.
        list_words: extra words to drop (exact, space-delimited match). It is
            only read, never modified.

    Returns:
        The cleaned text, with single spaces between words and no leading or
        trailing whitespace.

    rem_vowel and remove_noise are defined in this notebook but are not
    applied here.
    """
    text = remove_url(data)
    text = lower(text)
    text = remove_minlen_word(text, 1)       # drop one-character words
    text = normalize_arabic_numbers(text)
    text = remove_arabic_chars(text)
    # Stand-alone numbers go before normalize_arabic turns digits into letters.
    text = remove_digits(text)
    text = normalize_arabic(text)
    text = remove_prefix(text, "@")
    text = remove_prefix(text, "#")
    text = remove_words_generic(text, list_words)
    text = remove_french_punctuations(text)
    text = remove_fr_stop_words(text)
    text = remove_repeating_char(text)
    # remove_emoji puts a space in place of each removed run, so whitespace is
    # normalised after it; the reverse order left stray, double or trailing
    # spaces in the result.
    text = remove_emoji(text)
    text = remove_extra_white_spaces(text)
    return text

In [14]:
def remove_t(text):
    """Turn a trailing "yt" of ``text`` into "ya".

    Only the very end of the whole string is examined, not the end of each
    word; any other text is returned unchanged.
    """
    if not text.endswith('yt'):
        return text
    return text[:-1] + "a"

In [17]:
def my_preproc_arabic(data="",list_words=[]):
    """Clean one piece of text with ``my_preproc``, then apply ``remove_t``.

    This variant previously repeated every step of ``my_preproc`` verbatim and
    differed only by the final ``remove_t`` call; it now delegates, so the two
    pipelines cannot drift apart.

    Args:
        data: the raw text to clean.
        list_words: extra words to drop, forwarded to ``my_preproc``.

    Returns:
        The cleaned text, with a trailing "yt" rewritten to "ya".
    """
    return remove_t(my_preproc(data, list_words))

In [7]:
# Quick check of the pipeline on a short sample containing Arabizi digits,
# a one-letter word and a trailing number.
text="rani 7aba nro7 l alg8ia 187"
my_preproc(data=text)


'rani haba nroh alghia'

In [12]:
import csv
import pandas as pd

def write_to_csv(comments, path='out.csv'):
    """Write ``comments`` to a CSV file, one value per row.

    Args:
        comments: the values to save; anything ``pd.Series`` accepts.
        path: destination file. Defaults to 'out.csv', the location that used
            to be hard-coded, so existing calls keep working. Pass a distinct
            path per dataset to avoid overwriting an earlier export.

    The file is written without an index column and without a header row.
    """
    series = pd.Series(comments)
    series.to_csv(path, index=False, header=False)



In [66]:
# Read dataset.csv without a header row, so columns are labelled 0, 1, ...
data = pd.read_csv('dataset.csv',header=None)
# Work on the first column only (a Series).
df=data[0]

In [67]:
# Clean every entry with my_preproc. str(x) turns a missing value (NaN) into
# the literal text "nan" - NOTE(review): consider dropping empty rows first.
df = df.apply(lambda x:my_preproc(data=str(x)))

In [68]:
# Discard cleaned texts that occur more than once.
df = df.drop_duplicates()

In [71]:
# Save the cleaned, de-duplicated texts through write_to_csv.
write_to_csv(df)

In [3]:
import fasttext

# Model file loaded below; presumably fastText's pretrained
# language-identification model - TODO confirm. The path is relative to the
# notebook's working directory.
PRETRAINED_MODEL_PATH = 'lid.176.bin'
# Module-level model object used by predict_language.
model = fasttext.load_model(PRETRAINED_MODEL_PATH)

def predict_language(text):
    """Predict the language of ``text`` with the module-level ``model``.

    Returns:
        tuple: ``(language, probability)`` - the top predicted label with its
        "__label__" prefix removed, and the probability as a Python float.

    The label is read from the prediction itself. The earlier version sliced
    characters 12-13 out of the printed list, which cut any language code
    longer than two letters down to two and depended on the list's repr.
    """
    # Assumes model.predict on a list returns (labels per text, probabilities
    # per text), as the original indexing did - TODO confirm against the
    # installed fastText version.
    labels, probabilities = model.predict([text])
    top_label = labels[0][0]
    prefix = "__label__"
    if top_label.startswith(prefix):
        top_label = top_label[len(prefix):]
    return top_label, probabilities[0].item()



In [19]:
import pandas as pd

# Same load -> clean -> de-duplicate -> save sequence as for dataset.csv, here
# on arabic_dataset.csv with my_preproc_arabic.
data = pd.read_csv('arabic_dataset.csv',header=None)
# Work on the first column only (a Series).
df=data[0]
# str(x) turns a missing value (NaN) into the literal text "nan".
df = df.apply(lambda x:my_preproc_arabic(data=str(x)))
df = df.drop_duplicates()
# NOTE(review): write_to_csv targets 'out.csv', the same file written by the
# dataset.csv run - confirm that overwriting it is intended.
write_to_csv(df)

In [5]:
import pandas as pd 

# Load the preprocessed file without a header row. NOTE(review): no cell shown
# here writes a file with this name (write_to_csv produces 'out.csv') -
# presumably it was renamed by hand; confirm.
df = pd.read_csv('preprocessed_datased.csv',header=None)

In [7]:
# Display the loaded frame as the cell's output.
df

Unnamed: 0,0
0,alah ibarek artistes
1,yo voudrais jouer dayz xbox gamertag vaxfire
2,salam mabrok awachrkom daiw maya chi doriya sa...
3,merci beaucoup monsieur jamal l'efort l'explic...
4,amen
...,...
17340,abdelhak zwin mardi lwalidin
17341,top khouya
17342,ca rapel bele epoque rai q0
17343,c'est trop bien
