# Arabic dialect identification
## The second step
#### Auther: Rawan Hahi

In [56]:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.naive_bayes import GaussianNB,ComplementNB,MultinomialNB,BernoulliNB
import numpy as np
from time import time
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, classification_report
from sklearn.linear_model import SGDClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer


In [44]:
train_data_path = 'Cleaned_Training_Data.csv'
test_data_path = 'Cleaned_Test_Data.csv'

In [45]:
train_data = pd.read_csv(train_data_path)
test_data = pd.read_csv(test_data_path)

In [46]:
train_data.head()

Unnamed: 0.1,Unnamed: 0,tweet,dialect
0,0,حاجة حلوة اكيد,Egypt
1,1,عم بشتغلوا للشعب الاميركي اما نحن يكذبوا ويغشو...,Iraq
2,2,ابشر طال عمرك,Saudi_Arabia
3,3,منطق 2017: أنا والغريب علي إبن عمي وأنا والغري...,Mauritania
4,4,شهرين وتروح والباقي غير صيف ملينا,Algeria


In [47]:
test_data.head()

Unnamed: 0.1,Unnamed: 0,tweet,dialect
0,0,قولنا اون لاين لا يا علي اون لاين لا,Egypt
1,1,ههههه بايخه ههههه URL …,Oman
2,2,ربنا يخليك يا دوك ولك المثل :D,Lebanon
3,3,#اوامر_ملكيه ياشباب اي واحد فيكم عنده شي يذكره...,Syria
4,4,شد عالخط حتى هيا اكويسه,Libya


In [48]:
X_train=train_data['tweet'].values.tolist()
Y_train=pd.get_dummies(train_data['dialect'].values)

In [49]:
X_test=test_data['tweet'].values.tolist()
Y_test=pd.get_dummies(test_data['dialect'].values)

## ConutVectorizer
##### ngram_range = (1, 2)


In [50]:
cv = CountVectorizer(binary=False,ngram_range=(1,2),
                     max_features=1500000)
cv.fit(X_train)


CountVectorizer(max_features=1500000, ngram_range=(1, 2))

In [51]:
X_train_cv = cv.transform(X_train)
X_test_cv = cv.transform(X_test) 

In [52]:
def train_predict(classifier,X_train,y_train,X_test,y_test, average= 'macro'): 
    '''

    inputs:
       - classifier: the learning algorithm to be trained and predicted on
       - x_train  : train set Features
       - y_train  : train set Labels
       - x_test   : test set Features
       - y_train  : test set Labels
 
    '''
    
    results = {}
    
    # Fit the classifier to the training data and Calculate the time
    start_train = time()
    classifier = classifier.fit(X_train, y_train)
    end_train = time()
    results['train_time'] = round((end_train-start_train), 2)

    # Predict the training/testing labels and Calculate the time
    start_test = time()
    y_pred_train = classifier.predict(X_train)
    y_pred_test = classifier.predict(X_test)
    end_test = time()   
    results['pred_time'] = round((end_test-start_test), 2)
            
    # Model evaluation (training/testing data) using accuracy
    results['acc_train'] = round(accuracy_score(y_train, y_pred_train), 2)
    results['acc_test'] = round(accuracy_score(y_test, y_pred_test), 2)
    
    # Model evaluation (training/testing data) using f1-score
    results['f1_train'] = round(f1_score(y_train, y_pred_train, average= average), 2)
    results['f1_test'] = round(f1_score(y_test, y_pred_test, average= average), 2)
    
    # Display classification_report
    report = classification_report(y_test, y_pred_test)
       
    
    # Return the results
    return results, report

## Multinomial Naive Bayes classifier 

In [53]:
clf_MNB = MultinomialNB()
clf_MNB_evaluation = {}
clf_MNB_report = {}

clf_MNB_evaluation, clf_MNB_report= train_predict(clf_MNB,
                                                  X_train_cv, np.argmax(Y_train.to_numpy(), axis=1),
                                                  X_test_cv, np.argmax(Y_test.to_numpy(), axis=1)
                                                                                                                                                                                           )
print(clf_MNB_evaluation )

{'train_time': 0.41, 'pred_time': 0.15, 'acc_train': 0.88, 'acc_test': 0.36, 'f1_train': 0.81, 'f1_test': 0.11}


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [54]:
clf_MNB = MultinomialNB()
clf_LR = LogisticRegression()
clf_SGD=SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42,
                      tol=None)
clf_list = [ clf_MNB, clf_LR, clf_SGD]

# Compare evaluation of the different classifiers
clf_evaluation = {}
clf_report = {}

for clf in clf_list:
    clf_evaluation[clf.__class__.__name__], clf_report[clf.__class__.__name__] = train_predict(clf,
                                                                                              X_train_cv,
                                                                                              np.argmax(Y_train.to_numpy(), axis=1),
                                                                                              X_test_cv,
                                                                                              np.argmax(Y_test.to_numpy(), axis=1)
                                                                                              )
print(clf_evaluation )

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


{'MultinomialNB': {'train_time': 0.41, 'pred_time': 0.1, 'acc_train': 0.88, 'acc_test': 0.36, 'f1_train': 0.81, 'f1_test': 0.11}, 'LogisticRegression': {'train_time': 165.11, 'pred_time': 0.16, 'acc_train': 1.0, 'acc_test': 0.38, 'f1_train': 1.0, 'f1_test': 0.15}, 'SGDClassifier': {'train_time': 249.98, 'pred_time': 0.13, 'acc_train': 0.97, 'acc_test': 0.39, 'f1_train': 0.97, 'f1_test': 0.19}}


## TfIDf

In [59]:
cv = TfidfVectorizer()
cv.fit(X_train)
print("vocabulary : ",len(cv.vocabulary_))

vocabulary :  58079


In [60]:
X_train_cv = cv.transform(X_train)
X_test_cv = cv.transform(X_test) 

In [61]:
clf_MNB = MultinomialNB()
clf_LR = LogisticRegression()
clf_SGD=SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42,
                      tol=None)
clf_list = [ clf_MNB, clf_LR, clf_SGD]

# Compare evaluation of the different classifiers
clf_evaluation = {}
clf_report = {}

for clf in clf_list:
    clf_evaluation[clf.__class__.__name__], clf_report[clf.__class__.__name__] = train_predict(clf,
                                                                                              X_train_cv,
                                                                                              np.argmax(Y_train.to_numpy(), axis=1),
                                                                                              X_test_cv,
                                                                                              np.argmax(Y_test.to_numpy(), axis=1)
                                                                                              )
print(clf_evaluation )

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'MultinomialNB': {'train_time': 0.14, 'pred_time': 0.04, 'acc_train': 0.44, 'acc_test': 0.29, 'f1_train': 0.15, 'f1_test': 0.06}, 'LogisticRegression': {'train_time': 47.16, 'pred_time': 0.07, 'acc_train': 0.64, 'acc_test': 0.37, 'f1_train': 0.35, 'f1_test': 0.13}, 'SGDClassifier': {'train_time': 125.56, 'pred_time': 0.06, 'acc_train': 0.91, 'acc_test': 0.37, 'f1_train': 0.92, 'f1_test': 0.19}}
