#### Import Libraries:

In [3]:
import re
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm
import arabic_reshaper
import pyarabic.araby as araby
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from bidi.algorithm import get_display
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split

In [4]:
# Fetch the NLTK resources used later in this notebook
# (word_tokenize and stopwords.words are imported above).
nltk.download('punkt')
# Kept as the cell's final expression, so the notebook echoes its return value.
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Rawan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Rawan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

#### Import Data:

In [7]:
# Maps the label codes used in the data files to readable names.
# Both "PS" and "PL" resolve to Palestine.
code_to_country = {
    "AE": "United Arab Emirates",
    "BH": "Bahrain",
    "DZ": "Algeria",
    "EG": "Egypt",
    "IQ": "Iraq",
    "JO": "Jordan",
    "KW": "Kuwait",
    "LB": "Lebanon",
    "LY": "Libya",
    "MA": "Morocco",
    "OM": "Oman",
    "PS": "Palestine",
    "PL": "Palestine",
    "QA": "Qatar",
    "SA": "Saudi Arabia",
    "SD": "Sudan",
    "SY": "Syria",
    "TN": "Tunisia",
    "YE": "Yemen",
    "MSA": "Modern Standard Arabic"
}


def read_data(path):
    """Load a ``__label__<code> <text>`` file into a DataFrame.

    Each line is stripped and split on the first space into a label token and
    the text. The ``__label__`` marker is removed from the label token and the
    remaining code is translated through ``code_to_country``.

    Lines that cannot be split into exactly two parts (blank lines, lines
    without a space) are skipped.

    Args:
        path: Path of the UTF-8 encoded text file to read.

    Returns:
        pandas.DataFrame with the columns ``label`` (mapped name) and ``text``.

    Raises:
        KeyError: If a line carries a code that is missing from
            ``code_to_country``.
    """
    data = {'label': [], 'text': []}

    with open(path, 'r', encoding='utf-8') as file:
        for line in file:
            try:
                label, text = line.strip().split(' ', 1)
            except ValueError:
                # Malformed line: skip it. Falling through here would re-append
                # the previous line's values (or fail on the very first line).
                continue

            label = label.replace("__label__", "")
            data['label'].append(code_to_country[label])
            data['text'].append(text)

    return pd.DataFrame(data)


# Directory holding the raw QADI files; the CSV exports are written here too.
main_dir = 'D:/NLP Project/Arabic-Dialect-Identification-using-LLAMA-3/data/'

# Parse both splits into DataFrames.
# NOTE(review): the test file name has a space before ".txt" - confirm it
# matches the file on disk.
train = read_data(main_dir + 'QADI_train.txt')
test = read_data(main_dir + 'QADI_test .txt')

# Persist each split as CSV without the index column.
for frame, csv_name in ((train, "train.csv"), (test, "test.csv")):
    frame.to_csv(main_dir + csv_name, index=False)

# Report how many rows each split contains.
train_size = len(train)
test_size = len(test)
print("train size:", train_size)
print("test size:", test_size)
print("Total:", train_size + test_size)

train size: 492235
test size: 3797
Total: 496032


#### Sample part of the dataset (10000 rows from each class, i.e. dialect):
    
- I chose to sample the longest rows in each class, since some papers report that accuracy increases as text length increases.

- I sample only 10000 rows from each label.

In [8]:
# Order all rows by character length, longest first, so that .head() below
# keeps the longest texts of each label.
train["text_length"] = train["text"].apply(len)
train = train.sort_values(by="text_length", ascending=False).reset_index(drop=True)

n_samples = 10000  # maximum number of rows kept per label

# Gather the per-label slices and concatenate them once. Growing a frame with
# pd.concat inside the loop re-copies the accumulated rows on every iteration
# and depends on how concat treats the initial empty DataFrame.
label_frames = [
    train[train["label"] == label].head(n_samples)
    for label in np.unique(train["label"])
]

# pd.concat raises on an empty list, so fall back to an empty frame when
# `train` has no rows. ignore_index=True renumbers the result from 0.
train_subset = (
    pd.concat(label_frames, ignore_index=True) if label_frames else pd.DataFrame()
)

#### Preprocessing Data:

In [9]:
# NLTK's Arabic stopword list, extended with extra words chosen for this notebook.
arabic_stopwords = set(stopwords.words('arabic')) | {
    'الله', 'والله', 'اللي', 'علي', 'على', 'مع', 'لا', 'من', 'ما', 'في', 'الي', 'هو', 'انا', 'أنا', 'اله',
}

def clean_text(text):
    """Strip markup, punctuation and Latin letters from *text* and squeeze repeats.

    Steps, in order:
      1. remove ``<...>`` tags;
      2. remove every character that is neither a word character nor whitespace;
      3. remove ASCII letters;
      4. collapse each run of one repeated character into a single occurrence
         (this applies to any character except a newline, spaces included);
      5. tokenize with ``araby.tokenize`` and keep tokens longer than one character.

    Returns the surviving tokens joined by single spaces.
    """
    without_tags = re.sub(r'<[^>]+>', '', text)
    without_symbols = re.sub(r'[^\w\s]', '', without_tags)
    without_latin = re.sub(r'[A-Za-z]', '', without_symbols)
    squeezed = re.sub(r'(.)\1+', r'\1', without_latin).strip()

    kept_tokens = [token for token in araby.tokenize(squeezed) if len(token) > 1]
    return ' '.join(kept_tokens).strip()
#==========================================================================

# Function to normalize Arabic text
def normalize_text(text, stem=False):
    """Map Arabic letter variants onto a single form, optionally stemming.

    The substitutions are applied one after another in the order listed
    below. When *stem* is true, the normalized text is then passed through
    ``FarasaStemmer().stem``.

    Args:
        text: String to normalize.
        stem: Whether to stem the normalized text (default ``False``).

    Returns:
        The normalized (and optionally stemmed) string.
    """
    substitutions = (
        ("[إأآا]", "ا"),
        ("ى", "ي"),
        ("ؤ", "ء"),
        ("ئ", "ء"),
        ("ة", "ه"),
        ("گ", "ك"),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    if not stem:
        return text

    # NOTE(review): FarasaStemmer is not imported in the visible part of this
    # file - confirm it is in scope before calling with stem=True.
    return FarasaStemmer().stem(text)
#==========================================================================

# Function to remove stopwords from text
def remove_stopwords(text):
    """Return *text* with every token found in ``arabic_stopwords`` dropped.

    The text is split with ``word_tokenize``; the remaining tokens are joined
    back together with single spaces.
    """
    kept = []
    for token in word_tokenize(text):
        if token in arabic_stopwords:
            continue  # stopword: leave it out of the result
        kept.append(token)
    return ' '.join(kept)
#==========================================================================

# Function to preprocess text by cleaning, normalizing, and removing stopwords
def preprocess(text):
    """Run the full pipeline on *text*: clean, drop stopwords, then normalize.

    Stopwords are removed before normalization, so they are matched against
    the un-normalized spelling of each token.
    """
    return normalize_text(remove_stopwords(clean_text(text)))

In [10]:
# Add a 'text_cleaned' column to the sampled training set and to the test set.
for frame in (train_subset, test):
    frame['text_cleaned'] = frame['text'].apply(preprocess)

In [12]:
# Write the preprocessed frames next to the raw data, without the index column.
for frame, csv_name in ((train_subset, "train_cleaned.csv"), (test, "test_cleaned.csv")):
    frame.to_csv(main_dir + csv_name, index=False)