In [1]:
from __future__ import print_function
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer , CountVectorizer, TfidfTransformer
from sklearn.metrics import classification_report
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline

In [2]:
#recupération des données
TRAIN_SET_PATH = "/media/sameh/data/france/madar_shared_task/MADAR-Shared-Task-Subtask-1/MADAR-Corpus-26-train.txt"
import pandas as pd

with open(TRAIN_SET_PATH, "r", encoding='utf_8') as infile:
    X, y = [], []
    for line in infile:
        text, label  = line.split("\t")
        X.append(text)
        y.append(label)
data = {'phrase':X,'label':y}
df= pd.DataFrame(data)


In [3]:
#pretraitement des données 
import re 
import tashaphyne ,sys 
import tashaphyne.arabic_const as arabcons
def strip_tashkeel(text): 
    return tashaphyne.arabic_const.HARAKAT_PAT.sub('', text) 
def strip_tatweel(text): 
    return re.sub(tashaphyne.arabic_const.TATWEEL, '', text) 
def normalize_hamza(text):
    text = tashaphyne.arabic_const.ALEFAT_PAT.sub(tashaphyne.arabic_const.ALEF, text) 
    return tashaphyne.arabic_const.HAMZAT_PAT.sub(tashaphyne.arabic_const.HAMZA, text) 
def normalize_lamalef(text):
    return tashaphyne.arabic_const.LAMALEFAT_PAT.sub(\
                                       u'%s%s'%(tashaphyne.arabic_const.LAM, tashaphyne.arabic_const.ALEF), text) 
def normalize_spellerrors(text): 
    text = re.sub(tashaphyne.arabic_const.TEH_MARBUTA,tashaphyne.arabic_const.HEH, text) 
    return re.sub(tashaphyne.arabic_const.ALEF_MAKSURA,tashaphyne.arabic_const.YEH, text)

arabic_diacritics = re.compile("""
                             ّ    | 
                             َ    | 
                             ً    | 
                             ُ    | 
                             ٌ    | 
                             ِ    | 
                             ٍ    | 
                             ْ    | 
                             ـ     
                         """, re.VERBOSE)

arabic_punctuations = '''`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ'''
listtrain=[]
#supprime les diactritics 
def remove_diacritics(text):
    text = re.sub(arabic_diacritics, '', text)
    return text
#supprime les signes des ponctuations 
def remove_punctuations(text):
    translator = str.maketrans('', '', arabic_punctuations)
    return text.translate(translator)
#Supprimer les nombres 
def remove_numbers(text):
    regex = re.compile(r"(\d|[\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669])+")
    return re.sub(regex, ' ', text)
#supprimer les noms non arabe 
def remove_non_arabic_words(text):
    return ' '.join([word for word in text.split() if not re.findall(
        r'[^\s\u0621\u0622\u0623\u0624\u0625\u0626\u0627\u0628\u0629\u062A\u062B\u062C\u062D\u062E\u062F\u0630\u0631\u0632\u0633\u0634\u0635\u0636\u0637\u0638\u0639\u063A\u0640\u0641\u0642\u0643\u0644\u0645\u0646\u0647\u0648\u0649\u064A]',
        word)])
train = list(df['phrase'])
for data in train:
    data = normalize_spellerrors(data)
    data = normalize_lamalef(data)
    data = normalize_hamza(data)
    data = strip_tashkeel(data)
    data = strip_tatweel(data)
    data = remove_diacritics(data)
    data = remove_punctuations(data)
    data = remove_numbers(data)
    data = remove_non_arabic_words(data)
    listtrain.append(data)


In [4]:
df['phrase']= listtrain

In [5]:

from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import make_pipeline as make_pipeline_imb
from sklearn.naive_bayes import MultinomialNB
import string
from nltk.stem import PorterStemmer
from nltk import word_tokenize
def stemming_tokenizer(text):
    stemmer = PorterStemmer()
    return [stemmer.stem(w) for w in word_tokenize(text)]
#Pipeline
text_clf = make_pipeline_imb( TfidfVectorizer(tokenizer=stemming_tokenizer,analyzer='word',ngram_range=(1,1)),
                      RandomUnderSampler(),
                     MultinomialNB(alpha=0.5,fit_prior=True, class_prior=None)) 
text_clf.fit(df['phrase'], df['label'])

Pipeline(memory=None,
     steps=[('tfidfvectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth...ng_strategy='auto')), ('multinomialnb', MultinomialNB(alpha=0.5, class_prior=None, fit_prior=True))])

In [6]:
TRAIN_SET_PATH = "/media/sameh/data/france/madar_shared_task/MADAR-Shared-Task-Subtask-1/MADAR-Corpus-26-dev.tsv"
with open(TRAIN_SET_PATH, "r", encoding='utf_8') as infile:
    xt,yt = [],[]
    for line in infile:
        text, label  = line.split("\t")
        xt.append(text)
        yt.append(label)
#recupére donée de test dans data frame 
datatest = {'phrase':xt,'label':yt}
dftest= pd.DataFrame(datatest)
test = list(dftest['phrase'])
listtest=[]
for data in test:
    data = normalize_spellerrors(data)
    data = normalize_lamalef(data)
    data = normalize_hamza(data)
    data = strip_tashkeel(data)
    data = strip_tatweel(data)
    data = remove_diacritics(data)
    data = remove_punctuations(data)
    data = remove_numbers(data)
    data = remove_non_arabic_words(data)
    listtest.append(data)
dftest['phrase']= listtest
predicted = text_clf.predict(dftest['phrase'])
print (len(predicted))
print('Accuracy = ',np.mean(predicted == dftest['label']))
#print(dftest)

5200
Accuracy =  0.5963461538461539


In [7]:

import kenlm , math, glob 
import os,sys
import pandas as pd
#function qui return la liste de score de chaque phrase
def language(text,code):
    text = ' '.join('#'.join(text.split()))
    nommodel = '/media/sameh/data/CORPUS/LM-corpus/modelm/'+ code +'.binary'
    model=kenlm.LanguageModel(nommodel)
    prob = math.pow(10,model.score(text))
    #prob = math.floor(1000*prob)/1000            
    #print('listscore',len(listscore))
    return prob

class TextStats(BaseEstimator, TransformerMixin):
    """Extract features from each document for DictVectorizer"""

    def fit(self, x, y=None):
        return self

    def transform(self, X):
        res=[{'score': x[0],'model':x[1]} 
            for x in X[[ 'score','model']].values]
        return res


class ItemSelector(BaseEstimator, TransformerMixin):

    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        return self

    def transform(self, data_dict):
        return data_dict[self.key]

# function qui return la meilleur score+ le nom d model 
corp = ['ale-char', 'asw-char', 'msa-char', 'sal-char', 'jed-char', 'bag-char', 'san-char', 'kha-char', 'rab-char', 'mus-char', 'jer-char', 'bas-char', 'tri-char', 'alg-char', 'riy-char', 'amm-char', 'tun-char', 'fes-char', 'ben-char', 'alx-char', 'sfx-char', 'dam-char', 'bei-char', 'cai-char', 'mos-char', 'doh-char']
#print(models)
def scoremodel(s):
    m = list(map(lambda code: kenlm.LanguageModel('/media/sameh/data/CORPUS/LM-corpus/modelm/' + code + ".binary"), corp))

    s = ' '.join('#'.join(s.split()))
    maxl = ''
    maxp =  -sys.maxsize + 1 #minimum integer in python
    totalp = 0.0
    for j in range(len(m)):
        model = m[j]
        prob = model.score(s)
        totalp += math.pow(10.0, prob)
        if(prob > maxp):
            maxp = prob
            maxl = corp[j]
    #if else yetna7aw et round to proba sera dans boucle for pui l'ajout dans liste  
    if(totalp==0.0): prob = 0.0
    else: prob = math.pow(10.0,maxp)/totalp
    #round to thousandths
    prob = math.floor(1000*prob)/1000
    tupler=(maxl,prob)
    return tupler       

In [8]:
#ajouter au df 1 column pour 26 score par phrase
listscoreph=[ ]
listmodelph=[ ]
for i in list(df['phrase']):
    listscoreph.append(scoremodel(i)[1])
    listmodelph.append(scoremodel(i)[0])
df['score'] = listscoreph
df['model'] = listmodelph
print(df.shape)

(41600, 4)


In [9]:
print(df)

                                                  phrase  label  score  \
0                               امام بيانات الساءح تماما  MSA\n  0.999   
1              لم اسمع بهذا العنوان من قبل بالقرب من هنا  MSA\n  0.999   
2            استمر في السير في هذا الطريق حتي تجد صيدليه  MSA\n  0.999   
3                                       كم تكلفه الافطار  MSA\n  0.521   
4                                     كيف استطيع مساعدتك  MSA\n  0.994   
5                         اتجه يسارا عند الناصيه الثالثه  MSA\n  0.999   
6                       هل تحب ان تضع قشده وسكر في قهوتك  MSA\n  0.999   
7          هل يمكنكم صرف الشيك ذو الماءتي دولار الخاص بي  MSA\n  0.999   
8                            اذا حدث ذلك من فضلك اتصل بي  MSA\n  0.999   
9                                             اين المقهي  MSA\n  0.300   
10                                    اريد جاكيت للاطفال  MSA\n  0.993   
11     حسنا انها تتكلف خمسه وثمانون دولارا في البريد ...  MSA\n  0.999   
12                   هل لديك احذيه حري

In [10]:
class Feature3(BaseEstimator, TransformerMixin):
    """Extract features from each document for DictVectorizer"""
    def fit(self, x, y=None):
        return self

    def transform(self, X):
        res=[{'model':x[0]} 
            for x in X[['model']].values]
        return res
pipeline = Pipeline([

      ('features', FeatureUnion(
           transformer_list =[

       ('feature1', Pipeline([
                    ('selector', ItemSelector(key='phrase')),
('tfidf', TfidfVectorizer(tokenizer=stemming_tokenizer,analyzer='word',ngram_range=(1,1))),
        ])),
       ('feature2', Pipeline([
            ('stats',TextStats()), ('vect', DictVectorizer())

       ]))
           ]
)),

    ('clasifier', MultinomialNB(alpha=0.5,fit_prior=True, class_prior=None)),
                 ])

pipeline.fit(df, df['label'])

Pipeline(memory=None,
     steps=[('features', FeatureUnion(n_jobs=None,
       transformer_list=[('feature1', Pipeline(memory=None,
     steps=[('selector', ItemSelector(key='phrase')), ('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', inp...ansformer_weights=None)), ('clasifier', MultinomialNB(alpha=0.5, class_prior=None, fit_prior=True))])

In [11]:
#pour evaluation 
listscoretest=[ ]
listmodeltest=[]
for i in list(dftest['phrase']):
    listscoretest.append(scoremodel(i)[1])
    listmodeltest.append(scoremodel(i)[0])
dftest['score'] = listscoretest
dftest['model'] = listmodeltest
#data2 =  {'phrase':listtest, 'score':listscoretest}
#dftest2 =  pd.DataFrame(data2)

In [12]:
ytest = pipeline.predict(dftest)
print('Accuracy = ',np.mean(ytest==dftest['label']))
print(classification_report(ytest,dftest['label']))

Accuracy =  0.8534615384615385
              precision    recall  f1-score   support

        ALE
       0.86      0.81      0.83       213
        ALG
       0.93      0.98      0.95       190
        ALX
       0.78      0.82      0.79       190
        AMM
       0.80      0.80      0.80       199
        ASW
       0.87      0.76      0.81       228
        BAG
       0.89      0.71      0.79       249
        BAS
       0.82      0.89      0.85       185
        BEI
       0.87      0.90      0.88       194
        BEN
       0.85      0.90      0.87       189
        CAI
       0.80      0.85      0.82       186
        DAM
       0.77      0.84      0.80       183
        DOH
       0.85      0.72      0.78       238
        FES
       0.90      0.92      0.91       196
        JED
       0.87      0.83      0.85       210
        JER
       0.80      0.78      0.79       206
        KHA
       0.86      0.89      0.88       195
        MOS
       0.91      0.93      0.92       

In [14]:
#create pour chaque phrase leur score par différent modéle 
import subprocess, os
corpdir = os.path.dirname("/media/sameh/data/CORPUS/LM-corpus/modelm/*.binary ")
for root,dirs,files in os.walk(corpdir):
    for file in files:
        lissc=[]
        code = file[:8]
        for i in listtrain:
            lissc.append(language(i,code))            
        df[code]=lissc

"""corp = {'ale-char':0, 'asw-char':0, 'msa-char':0, 'sal-char':0, 'jed-char':0, 'bag-char':0, 'san-char':0,
        'kha-char':0, 'rab-char':0, 'mus-char':0, 'jer-char':0, 'bas-char':0, 'tri-char':0, 'alg-char':0, 
        'riy-char':0, 'amm-char':0, 'tun-char':0, 'fes-char':0, 'ben-char':0, 'alx-char':0, 'sfx-char':0,
        'dam-char':0, 'bei-char':0, 'cai-char':0, 'mos-char':0, 'doh-char':0}

def scoremodel_fethi(sentence):
    dialects = corp.keys()
    for did in dialects:
        model = kenlm.LanguageModel('/media/sameh/data/CORPUS/LM-corpus/modelm/' + did + ".binary")
        corp[did] = model.score(sentence)
    return corp
"""

'corp = {\'ale-char\':0, \'asw-char\':0, \'msa-char\':0, \'sal-char\':0, \'jed-char\':0, \'bag-char\':0, \'san-char\':0,\n        \'kha-char\':0, \'rab-char\':0, \'mus-char\':0, \'jer-char\':0, \'bas-char\':0, \'tri-char\':0, \'alg-char\':0, \n        \'riy-char\':0, \'amm-char\':0, \'tun-char\':0, \'fes-char\':0, \'ben-char\':0, \'alx-char\':0, \'sfx-char\':0,\n        \'dam-char\':0, \'bei-char\':0, \'cai-char\':0, \'mos-char\':0, \'doh-char\':0}\n\ndef scoremodel_fethi(sentence):\n    dialects = corp.keys()\n    for did in dialects:\n        model = kenlm.LanguageModel(\'/media/sameh/data/CORPUS/LM-corpus/modelm/\' + did + ".binary")\n        corp[did] = model.score(sentence)\n    return corp\nfor i in listtrain :\n    scoremodel_fethi(i)\n    corp[\'phrase\'] = i\ndfunion = pd.DataFrame(corp)\nprint(dfunion.shape)'