# Overview

**Dataset**
labeled datasset collected from twitter

**Objective**
Classify tweets according to the arabic dialect in the text. <br>

## Import Libraries

In [1]:
import os
import pandas as pd
import urllib

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import StackingClassifier
from sklearn.pipeline import make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.naive_bayes import MultinomialNB

from sklearn.metrics import classification_report, f1_score

import joblib  # For saving model

In [2]:
SEED = 42

## Load Dataset

In [3]:
# Check whether we're using google colab
using_colab = None
CWD = os.getcwd()
if os.path.exists('/content'):
  using_colab = True
else:
  using_colab = False
  PWD = os.path.dirname(CWD)

In [4]:
if using_colab:  #Check if using Colab
    data_folder_path = CWD #+ "/data/"
else:
    data_folder_path = PWD + "/data/"
train_csv_file_path = data_folder_path + "/" + "arabic_dialects_train.csv"
test_csv_file_path = data_folder_path + "/" + "arabic_dialects_test.csv"

In [5]:
pd.set_option('display.max_colwidth', 0)

In [6]:
train_url = 'https://github.com/AmgadHasan/arabic-dialect-detection/raw/main/data/arabic_dialects_train.csv'
test_url = 'https://github.com/AmgadHasan/arabic-dialect-detection/raw/main/data/arabic_dialects_test.csv'

In [7]:
if not os.path.isfile(train_csv_file_path):
    # If the file doesn't exist, download it from the URL
    print("Downloading file...")
    urllib.request.urlretrieve(train_url, train_csv_file_path)
    print("File downloaded.")
else:
    print("File exists.")

File exists.


In [8]:
if not os.path.isfile(test_csv_file_path):
    # If the file doesn't exist, download it from the URL
    print("Downloading file...")
    urllib.request.urlretrieve(test_url, test_csv_file_path)
    print("File downloaded.")
else:
    print("File exists.")

File exists.


In [9]:
df = pd.read_csv(train_csv_file_path, lineterminator='\n')
pd.set_option('display.max_colwidth', 0)
df.head()

Unnamed: 0,tweet,label
0,انتي جافية والا الغلا ماتبيسؤالموسيقي,LY
1,باركوا لجوجو اجاها عريس,LB
2,لابأس عليك يالزينه ان شاء الله هذا حد السو,LY
3,الزن ده هو اكتر حاجة مدمرة للسلام النفسي واحنا بصراحة سلامنا النفسي اهم من كل شيء الحياة قصيرة ومش مستحملة حوارات كتير,EG
4,الكوت كلها ناس بتدور ع علاقات تانيه بس تكون احلي م اللي لسه خالصه اول انبارح,EG


# Train Validation Splitting

In [95]:
df.shape, df.columns

((132952, 2), Index(['tweet', 'label'], dtype='object'))

In [96]:
df_train, df_valid = train_test_split(df, test_size=0.1, stratify=df['label'], random_state=SEED)

In [11]:
df_train.label.value_counts(), df_valid.label.value_counts(), df_valid.label.value_counts()

(EG    46685
 LY    29564
 LB    22369
 SD    11692
 MA    9346 
 Name: label, dtype: int64,
 EG    5187
 LY    3285
 LB    2486
 SD    1299
 MA    1039
 Name: label, dtype: int64,
 EG    5187
 LY    3285
 LB    2486
 SD    1299
 MA    1039
 Name: label, dtype: int64)

# Modelling

In [12]:
vec0 = CountVectorizer()
clf0 = LogisticRegression(max_iter=200)
pipe0 = make_pipeline(vec0, clf0)
pipe0.fit(df_train.tweet, df_train.label);

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# Evaluation f1_macro 0.791
Old: 0.78
New: 0.791

In [13]:
def print_report(pipe, x_test, y_test):
    y_pred = pipe.predict(x_test)
    report = classification_report(y_test, y_pred)
    print(report)
    print("macro f1_score: {:0.3f}".format(f1_score(y_test, y_pred, average='macro')))

In [14]:
print_report(pipe0, df_valid.tweet, df_valid.label)

              precision    recall  f1-score   support

          EG       0.83      0.92      0.87      5187
          LB       0.86      0.84      0.85      2486
          LY       0.79      0.81      0.80      3285
          MA       0.84      0.69      0.76      1039
          SD       0.80      0.58      0.67      1299

    accuracy                           0.83     13296
   macro avg       0.83      0.77      0.79     13296
weighted avg       0.83      0.83      0.82     13296

macro f1_score: 0.791


# Enhancement

- Using different N-grams
- Using different text representation technique

## 1. Using TfidfVectorizer f1_macro  0.783
Old: 0.77
New: 0.783

In [15]:
vec1 = TfidfVectorizer()
clf1 = LogisticRegression()
pipe1 = make_pipeline(vec1, clf1)
pipe1.fit(df_train.tweet, df_train.label);

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [16]:
feature_names = pipe1[0].get_feature_names_out()
feature_names.shape

(234140,)

In [17]:
print_report(pipe1, df_valid.tweet, df_valid.label)

              precision    recall  f1-score   support

          EG       0.81      0.93      0.87      5187
          LB       0.86      0.82      0.84      2486
          LY       0.79      0.81      0.80      3285
          MA       0.87      0.66      0.75      1039
          SD       0.86      0.53      0.66      1299

    accuracy                           0.82     13296
   macro avg       0.84      0.75      0.78     13296
weighted avg       0.82      0.82      0.82     13296

macro f1_score: 0.783


<font color='darkOrange' size='4'>
Using Tf-Idf lowered the score!
</font>

## 2. Using SVC f1_macro 0.785
Old: 0.77
New: 0.785

In [18]:
vec2 = CountVectorizer()
clf2 = LinearSVC()
pipe2 = make_pipeline(vec2, clf2)
pipe2.fit(df_train.tweet, df_train.label);



In [19]:
feature_names = pipe2[0].get_feature_names_out()
feature_names.shape

(234140,)

In [20]:
print_report(pipe2, df_valid.tweet, df_valid.label)

              precision    recall  f1-score   support

          EG       0.83      0.90      0.87      5187
          LB       0.86      0.83      0.85      2486
          LY       0.79      0.80      0.79      3285
          MA       0.80      0.71      0.75      1039
          SD       0.75      0.60      0.67      1299

    accuracy                           0.82     13296
   macro avg       0.81      0.77      0.78     13296
weighted avg       0.82      0.82      0.82     13296

macro f1_score: 0.785


<font color='darkOrange' size='4'>
Using SVM Classifier did NOT improve the score!
</font>

## 3. Using ngram Tf-Idf + SVC f1_macro 0.81
Old: 0.80
New: 0.81

In [21]:
vec3 = TfidfVectorizer( ngram_range=(1, 2))
clf3 = LinearSVC()
pipe3 = make_pipeline(vec3, clf3)
pipe3.fit(df_train.tweet, df_train.label);

In [22]:
feature_names = pipe3[0].get_feature_names_out()
feature_names.shape

(1331139,)

In [23]:
print_report(pipe3, df_valid.tweet, df_valid.label)

              precision    recall  f1-score   support

          EG       0.84      0.94      0.89      5187
          LB       0.87      0.86      0.86      2486
          LY       0.83      0.82      0.82      3285
          MA       0.85      0.72      0.78      1039
          SD       0.84      0.59      0.70      1299

    accuracy                           0.84     13296
   macro avg       0.84      0.79      0.81     13296
weighted avg       0.84      0.84      0.84     13296

macro f1_score: 0.810


<font color='darkOrange' size='4'>
Using ngram_range=(1, 2)  improved the score!
</font>

## 4. Using ngram Tf-Idf + Naive Bayes f1_macro 0.512

In [24]:
vec4 = TfidfVectorizer( ngram_range=(1, 2))
clf4 = MultinomialNB(force_alpha=True)
pipe4 = make_pipeline(vec4, clf4)
pipe4.fit(df_train.tweet, df_train.label);

In [25]:
feature_names = pipe4[0].get_feature_names_out()
feature_names.shape

(1331139,)

In [26]:
print_report(pipe4, df_valid.tweet, df_valid.label)

              precision    recall  f1-score   support

          EG       0.57      0.99      0.72      5187
          LB       0.97      0.62      0.75      2486
          LY       0.83      0.60      0.70      3285
          MA       1.00      0.16      0.28      1039
          SD       0.99      0.05      0.10      1299

    accuracy                           0.67     13296
   macro avg       0.87      0.49      0.51     13296
weighted avg       0.78      0.67      0.63     13296

macro f1_score: 0.512


<font color='darkOrange' size='4'>
Using naive bayes did NOT improve the score!
</font>

## 5. Using another ngram Tf-Idf + SVC f1_macro 0.802
Old: 0.79
New: 0.802

In [27]:
vec5 = TfidfVectorizer( ngram_range=(1, 3))
clf5 = LinearSVC()
pipe5 = make_pipeline(vec5, clf5)
pipe5.fit(df_train.tweet, df_train.label);

In [28]:
feature_names = pipe5[0].get_feature_names_out()
feature_names.shape

(2625707,)

In [29]:
print_report(pipe5, df_valid.tweet, df_valid.label)

              precision    recall  f1-score   support

          EG       0.83      0.93      0.88      5187
          LB       0.86      0.85      0.86      2486
          LY       0.82      0.81      0.82      3285
          MA       0.84      0.71      0.77      1039
          SD       0.83      0.58      0.69      1299

    accuracy                           0.84     13296
   macro avg       0.84      0.78      0.80     13296
weighted avg       0.84      0.84      0.83     13296

macro f1_score: 0.802


## 6. Using ngram Count_vect + SVC f1_macro 0.796
Old: 0.80
New: 0.796

In [30]:
vec6 = CountVectorizer( ngram_range=(1, 2))
clf6 = LinearSVC()
pipe6 = make_pipeline(vec6, clf6)
pipe6.fit(df_train.tweet, df_train.label);



In [31]:
feature_names = pipe6[0].get_feature_names_out()
feature_names.shape

(1331139,)

In [32]:
print_report(pipe6, df_valid.tweet, df_valid.label)

              precision    recall  f1-score   support

          EG       0.83      0.92      0.88      5187
          LB       0.86      0.84      0.85      2486
          LY       0.80      0.82      0.81      3285
          MA       0.86      0.70      0.77      1039
          SD       0.81      0.57      0.67      1299

    accuracy                           0.83     13296
   macro avg       0.83      0.77      0.80     13296
weighted avg       0.83      0.83      0.83     13296

macro f1_score: 0.796


## 7. Using ngram Tf-Idf + Logistic f1_macro 0.771
Old: 0.
New: 0.771

In [33]:
vec7 = TfidfVectorizer( ngram_range=(1, 2))
clf7 = LogisticRegression()
pipe7 = make_pipeline(vec7, clf7)
pipe7.fit(df_train.tweet, df_train.label);

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [34]:
feature_names = pipe7[0].get_feature_names_out()
feature_names.shape

(1331139,)

In [35]:
print_report(pipe7, df_valid.tweet, df_valid.label)

              precision    recall  f1-score   support

          EG       0.80      0.93      0.86      5187
          LB       0.85      0.81      0.83      2486
          LY       0.78      0.80      0.79      3285
          MA       0.85      0.64      0.73      1039
          SD       0.85      0.52      0.64      1299

    accuracy                           0.81     13296
   macro avg       0.83      0.74      0.77     13296
weighted avg       0.81      0.81      0.81     13296

macro f1_score: 0.771


<font color='darkOrange' size='4'>
Using logistic regression lowered the score!
</font>

In [36]:
pipe_stack = StackingClassifier(estimators=[('bigram-svm',pipe3), ('trigram-svm', pipe5)], final_estimator=LogisticRegression())
pipe_stack.fit(df_train.tweet, df_train.label);

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [39]:
print_report(pipe_stack, df_valid.tweet, df_valid.label)

              precision    recall  f1-score   support

          EG       0.86      0.92      0.89      5187
          LB       0.87      0.86      0.87      2486
          LY       0.82      0.84      0.83      3285
          MA       0.83      0.75      0.79      1039
          SD       0.82      0.64      0.72      1299

    accuracy                           0.85     13296
   macro avg       0.84      0.80      0.82     13296
weighted avg       0.85      0.85      0.85     13296

macro f1_score: 0.820


In [42]:
pipe_stack2 = StackingClassifier(estimators=[('bigram-svm',pipe3), ('trigram-svm', pipe5)], final_estimator=LinearSVC())
pipe_stack2.fit(df_train.tweet, df_train.label);



In [43]:
print_report(pipe_stack2, df_valid.tweet, df_valid.label)

              precision    recall  f1-score   support

          EG       0.85      0.93      0.89      5187
          LB       0.87      0.86      0.87      2486
          LY       0.83      0.83      0.83      3285
          MA       0.85      0.74      0.79      1039
          SD       0.83      0.62      0.71      1299

    accuracy                           0.85     13296
   macro avg       0.85      0.80      0.82     13296
weighted avg       0.85      0.85      0.84     13296

macro f1_score: 0.817


# Final Pipeline:
ngram Tf-Idf + SVC

In [44]:
final_model = StackingClassifier(estimators=[('bigram-svm',pipe3), ('trigram-svm', pipe5)], final_estimator=LogisticRegression())

##  Retraining on train + valid

In [46]:
# Merge train and valid to gain small performance boost 
# (We still have a held-out test set to assess the model)
df_train_total = pd.concat([df_train, df_valid], axis=0).sample(frac=1)
df_train_total

Unnamed: 0,tweet,label
59559,ديماجمالي في تصريح لها اذا بكرا ما خلصت الموازنة ما بكون اسمي ديما صادق,LB
56113,والله يوائل ياريت اتكوان الفقره هدي كل يوم جمعه,LY
131901,المستفيذ الوحيد في هالمعمعه حكومة الكلاب كيف ما تمشي تجيهممعيتيق اعطي الامر للكانيات لدخول طرابلس السراج اعطي الامر للطرابلسي وبادي والزوز مفهمين مليشيات طرابلس انهم معاهمهاسيديها,LY
84003,واضح انك ما حكيت عن السيد بس لو عنجد حضرتك بتحترمو ما بتحكي عندو على شيخ وبتشتم فيه وهوي ما جاب سيرتو الاحترام يقتضي انتقاء الالفاظ المناسبة لمقام يلي عم نحكي معه او عنده,LB
53193,شفت حاجات على لينكد ان,EG
...,...,...
110218,يعني هانت انا يا خوفي من الفرحة يوم خروجه تجيني جلطة واموت,LY
41007,مش عارف ليه لما بتفرج على الحلقة بتاعتك وتيجي تخلص بحس انى زعلان انها خلصت مبدع كالعادة وبالتوفيق دايمااااا,EG
21503,سؤال محيرالسودانيين لمن يجو يقشرو علي بعض بكتب بالعربي ويجدع ليه كلمة كلمتين بالانجيزي ف تبقي الرسالة نصها عربي ونصها انجليزيوالانجليز والامريكان مثلا لمن يجو يقشرو بتكلمو باالانجليزي ويجدعو ليهم كلمة بالعربي اثناء الرسالة نحتاج لتفسير منبرالمغردينالسودانيين,SD
24372,ما بعطي رقمي لمين ما كان شكرا,LB


In [49]:
final_model.fit(df_train_total.tweet, df_train_total.label);

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Assessing the model on test set

In [48]:
df_test = pd.read_csv(test_csv_file_path)
df_test

Unnamed: 0,tweet,label
0,شفت دي ف الفيفوريت عندكمش تقول يا راجل من بدري,EG
1,علي اعتبار إن أعداءنا دلوقت مش اللي ف حارة اليهود لأ أبسيليوتلي دول بقوا اللي فالحارة المزنوقه,EG
2,اكيد هو حر لكن الطريقة كيفاش دخل فالنص غلط وشي فانز كانو من مدة كيضغطو على اهاب يرجعله فولو ما عرفتش واش عمله او لا,MA
3,زي هدا وزي عبدالرحمن بيكرهوني في عروبتي,LY
4,على ما يبدو دمها خفيف,LB
...,...,...
14768,يارب يكون احسن حاجه بجد نفرح جدااااااا عقبال مانصنع وتنتج وتزرع ومانحتاج لحد يارب,EG
14769,كيف تتجنب أمراض الكلى,SD
14770,هههههه على كلشي المهام الموكلة لنا ههههه,MA
14771,ايه موقع شيزلونج دا منصة للعلاج النفسي على الانترنت عن طريق مكالمات فيديو بمقابل مادي الموقع ده شبيه ومجاني لكن دعم للاكتئاب وكرب ما بعد الصدمة فقط,EG


In [50]:
print_report(final_model, df_test.tweet, df_test.label)

              precision    recall  f1-score   support

          EG       0.87      0.92      0.89      5764
          LB       0.87      0.86      0.87      2762
          LY       0.83      0.85      0.84      3650
          MA       0.85      0.74      0.80      1154
          SD       0.82      0.68      0.74      1443

    accuracy                           0.85     14773
   macro avg       0.85      0.81      0.83     14773
weighted avg       0.85      0.85      0.85     14773

macro f1_score: 0.828


<font color='Green' size='6'>
Final Model f1_macro: 0.828<br>
YAY!
</font>

## Saving the model

In [71]:
final_model.predict(['هلا ومرحبا بالطعمية شو اخبارك'])

array(['LB'], dtype=object)

In [65]:
if using_colab:
  models_folder_path = CWD 
else:
  models_folder_path = PWD + "/models/"

In [75]:
joblib.dump(final_model, models_folder_path + "/" "machine_model.joblib", compress=9)

['/content/machine_model.joblib']

In [77]:
tmp = joblib.load('/content/machine_model.joblib')
# took 45 seconds to load

In [78]:
tmp.predict(['هلا ومرحبا بالطعمية شو اخبارك'])

array(['LB'], dtype=object)

In [79]:
joblib.dump(pipe3, models_folder_path + "/" "svm_model.joblib", compress=9)

['/content/svm_model.joblib']

In [81]:
tmp2 = joblib.load('/content/svm_model.joblib')
# took 10 seconds to load

In [82]:
tmp2.predict(['هلا ومرحبا بالطعمية شو اخبارك'])

array(['LB'], dtype=object)