In [1]:
import pickle
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn import metrics

In [2]:
train_df = pd.read_csv('data/train_dataset.csv')
test_df = pd.read_csv('data/test_dataset.csv')

In [3]:
train_df

Unnamed: 0,id,dialect,tweet
0,1186031522786230272,DZ,ايه راني بعتهالو ماخرجتش عندك
1,1102632838019592192,IQ,شيء مو كيفه يسويه
2,986991775234646016,TN,معلش حبيبتي هو اكيد مقدر ولكن له اسبابه واكيد ...
3,825136678855643136,YE,ماش ما عجبني قالو يخوف و طلع عادي
4,983055547980185600,SY,الف نعمه انو فش شيعه عندكم احمدو ربكم
...,...,...,...
408191,1169192062559891712,PL,انا وانا ماشيه بالشارع لحالي
408192,1134056408608137216,SY,الشغل مش عيب بس هالشي فعلا محزن وكثير اسباب مح...
408193,1119776894612848640,OM,امين يارب العالمين الله يرحمهم ويجمعكم واياهم ...
408194,1091109679617503232,EG,الواحد يغيب مهما يغيب عن المخروب ده ويرجع يلاق...


In [4]:
test_df

Unnamed: 0,id,dialect,tweet
0,948860366372601984,DZ,ههه للاسف ما اعرفك بس شخص راقي
1,1152585801865338752,AE,وام حمد لما عرفت السالفه قررت تفضح ولدها في ال...
2,901815108766224384,EG,كوم هتيم الوطن اهم الثار لا بيرد المقتول للحيا...
3,1105118511163162624,IQ,سطحي شعاري بالحياه حتي صديق مقرب ماعندي بس معارف
4,725062587948347392,KW,خطا بيلغيري بقرايه اللعب اصابه سيلفا حط دي برو...
...,...,...,...
49995,964237433389899776,BH,اذا خان او استغفلني ما اتردد وله دقيقه في انها...
49996,1173001622491979776,LY,عاجبني فيك ديما صريح وهذا من حق الصاحب علي صاحبه
49997,943617132230791168,LB,شو بيشتغل اميل رحمه
49998,1176586297617669888,QA,بتموت هي قبل تجيب ال


In [5]:
train_df['dialect'].value_counts(normalize=True)

EG    0.125788
PL    0.095466
KW    0.091902
LY    0.079658
QA    0.067808
JO    0.060936
LB    0.060273
SA    0.058558
AE    0.057389
BH    0.057382
OM    0.041720
SY    0.035449
DZ    0.035319
IQ    0.033822
SD    0.031502
MA    0.025184
YE    0.021666
TN    0.020179
Name: dialect, dtype: float64

In [6]:
class_weight = (1/train_df['dialect'].value_counts(normalize=True)).to_dict()

In [7]:
class_weight

{'EG': 7.94990846414521,
 'PL': 10.474890297415895,
 'KW': 10.88116436530362,
 'LY': 12.55369664165334,
 'QA': 14.747498103255174,
 'JO': 16.41054916780574,
 'LB': 16.59131000284518,
 'SA': 17.077186963979415,
 'AE': 17.424912490395286,
 'BH': 17.42714425991547,
 'OM': 23.96923076923077,
 'SY': 28.20981340704907,
 'DZ': 28.313518762571967,
 'IQ': 29.566565261480513,
 'SD': 31.743992534411696,
 'MA': 39.70778210116732,
 'YE': 46.15513342379014,
 'TN': 49.556391890251305}

# Model

In [8]:
vectorizer = TfidfVectorizer()
classifier = LogisticRegression(class_weight=class_weight)

In [9]:
pipeline = make_pipeline(vectorizer, classifier)

In [10]:
pipeline.fit(train_df.tweet, train_df.dialect)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer()),
                ('logisticregression',
                 LogisticRegression(class_weight={'AE': 17.424912490395286,
                                                  'BH': 17.42714425991547,
                                                  'DZ': 28.313518762571967,
                                                  'EG': 7.94990846414521,
                                                  'IQ': 29.566565261480513,
                                                  'JO': 16.41054916780574,
                                                  'KW': 10.88116436530362,
                                                  'LB': 16.59131000284518,
                                                  'LY': 12.55369664165334,
                                                  'MA': 39.70778210116732,
                                                  'OM': 23.96923076923077,
                                                  'PL': 10.474890297415895,
  

# Calculate metrics for train data

In [11]:
y_pred = pipeline.predict(train_df.tweet)

In [12]:
report = metrics.classification_report(train_df.dialect, y_pred, output_dict=True)

In [13]:
pd.DataFrame(report).T

Unnamed: 0,precision,recall,f1-score,support
AE,0.614491,0.609323,0.611896,23426.0
BH,0.542398,0.635743,0.585373,23423.0
DZ,0.649377,0.748561,0.69545,14417.0
EG,0.855686,0.757995,0.803883,51346.0
IQ,0.67286,0.797479,0.729888,13806.0
JO,0.69191,0.521951,0.595032,24874.0
KW,0.743734,0.605134,0.667313,37514.0
LB,0.74156,0.779417,0.760017,24603.0
LY,0.781519,0.750154,0.765515,32516.0
MA,0.71022,0.849708,0.773728,10280.0


# Calculate metrics for test data

In [14]:
y_pred = pipeline.predict(test_df.tweet)

In [15]:
report = metrics.classification_report(test_df.dialect, y_pred, output_dict=True)

In [16]:
pd.DataFrame(report).T

Unnamed: 0,precision,recall,f1-score,support
AE,0.410982,0.419861,0.415374,2870.0
BH,0.323787,0.386197,0.352249,2869.0
DZ,0.480475,0.550396,0.513064,1766.0
EG,0.775565,0.687281,0.728759,6290.0
IQ,0.507312,0.594914,0.547632,1691.0
JO,0.45709,0.326879,0.381171,3047.0
KW,0.574543,0.464635,0.513777,4595.0
LB,0.59726,0.650962,0.622956,3014.0
LY,0.674628,0.659553,0.667005,3983.0
MA,0.553061,0.645751,0.595823,1259.0


# Save the model

In [18]:
pickle.dump(pipeline, open('models/model.sav', 'wb'))