In [5]:
from __future__ import print_function
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer , CountVectorizer, TfidfTransformer
from sklearn.metrics import classification_report
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
# Load the training data
TRAIN_SET_PATH = "/media/sameh/data/france/madar_shared_task/MADAR-Shared-Task-Subtask-1/MADAR-Corpus-26-train.txt"
import pandas as pd

# Every line is split on the tab into (sentence, label). The line is not
# stripped first, so each label keeps its trailing newline; the dev labels are
# read the same way further down, which keeps the two comparable.
with open(TRAIN_SET_PATH, "r", encoding='utf_8') as infile:
    X, y = [], []
    for text, label in (line.split("\t") for line in infile):
        X.append(text)
        y.append(label)
data = {'phrase': X, 'label': y}
df = pd.DataFrame(data)
# Data preprocessing
import re 
# Pattern used by remove_diacritics. It is compiled with re.VERBOSE, so the
# whitespace around each alternative is ignored and only the listed characters
# themselves are matched.
arabic_diacritics = re.compile("""
                             ّ    | 
                             َ    | 
                             ً    | 
                             ُ    | 
                             ٌ    | 
                             ِ    | 
                             ٍ    | 
                             ْ    | 
                             ـ     
                         """, re.VERBOSE)
# Characters deleted by remove_punctuations (one deletion per character).
arabic_punctuations = '''`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ'''
# Receives the preprocessed training sentences (filled by the loop below).
listtrain=[]
# Remove the diacritics
def remove_diacritics(text):
    """Return ``text`` with every match of ``arabic_diacritics`` deleted."""
    return arabic_diacritics.sub('', text)
# Remove the punctuation marks
def remove_punctuations(text):
    """Return ``text`` without any character listed in ``arabic_punctuations``."""
    # Mapping a code point to None tells str.translate to drop that character.
    deletions = dict.fromkeys(map(ord, arabic_punctuations))
    return text.translate(deletions)
# Remove the numbers
def remove_numbers(text):
    """Replace every run of digits in ``text`` with a single space."""
    digit_run = re.compile(r"(\d|[\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669])+")
    return digit_run.sub(' ', text)
# Drop the non-Arabic words
def remove_non_arabic_words(text):
    """Keep only the whitespace-separated words made solely of the listed letters.

    A word is discarded as soon as it contains one character outside the
    character class below; the surviving words are re-joined with single spaces.
    """
    outside_alphabet = re.compile(
        r'[^\s\u0621\u0622\u0623\u0624\u0625\u0626\u0627\u0628\u0629\u062A\u062B\u062C\u062D\u062E\u062F\u0630\u0631\u0632\u0633\u0634\u0635\u0636\u0637\u0638\u0639\u063A\u0640\u0641\u0642\u0643\u0644\u0645\u0646\u0647\u0648\u0649\u064A]')
    kept_words = []
    for word in text.split():
        if outside_alphabet.search(word) is None:
            kept_words.append(word)
    return ' '.join(kept_words)
import tashaphyne ,sys 
import tashaphyne.arabic_const as arabcons
def strip_tashkeel(text): 
    """Delete every match of tashaphyne's ``HARAKAT_PAT`` from ``text``."""
    harakat_pattern = arabcons.HARAKAT_PAT
    return harakat_pattern.sub('', text)
def strip_tatweel(text): 
    """Delete every occurrence of tashaphyne's ``TATWEEL`` from ``text``."""
    tatweel = arabcons.TATWEEL
    return re.sub(tatweel, '', text)
def normalize_hamza(text):
    """Unify alef and hamza variants.

    Matches of ``ALEFAT_PAT`` become ``ALEF`` first; matches of ``HAMZAT_PAT``
    in that result then become ``HAMZA``.
    """
    alef_unified = arabcons.ALEFAT_PAT.sub(arabcons.ALEF, text)
    return arabcons.HAMZAT_PAT.sub(arabcons.HAMZA, alef_unified)
def normalize_lamalef(text):
    """Replace every match of ``LAMALEFAT_PAT`` with ``LAM`` followed by ``ALEF``."""
    lam_then_alef = u'%s%s' % (arabcons.LAM, arabcons.ALEF)
    return arabcons.LAMALEFAT_PAT.sub(lam_then_alef, text)
def normalize_spellerrors(text): 
    """Replace ``TEH_MARBUTA`` with ``HEH``, then ``ALEF_MAKSURA`` with ``YEH``."""
    substitutions = (
        (arabcons.TEH_MARBUTA, arabcons.HEH),
        (arabcons.ALEF_MAKSURA, arabcons.YEH),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text

# Normalisation steps applied to every sentence, in this exact order.
PREPROCESSING_STEPS = (
    normalize_spellerrors,
    normalize_lamalef,
    normalize_hamza,
    strip_tashkeel,
    strip_tatweel,
    remove_diacritics,
    remove_punctuations,
    remove_numbers,
    remove_non_arabic_words,
)


def preprocess_sentence(sentence):
    """Run ``sentence`` through every function of ``PREPROCESSING_STEPS`` in order.

    Args:
        sentence: raw sentence text.

    Returns:
        The sentence after the last step has been applied.
    """
    for step in PREPROCESSING_STEPS:
        sentence = step(sentence)
    return sentence


train = list(df['phrase'])
# A dedicated loop name is used so the ``data`` dict that built ``df`` is not
# rebound to a sentence string.
listtrain.extend(preprocess_sentence(sentence) for sentence in train)


In [6]:
# Reverse the retrieved data
def reversed_string(a_string):
    """Return the characters of ``a_string`` in reverse order."""
    return ''.join(reversed(a_string))
# Quick visual check of the reversal on a short sample
print(reversed_string('وهيك عملت'))
# Store the reversed, preprocessed sentences back into the frame
listrev = [reversed_string(sentence) for sentence in listtrain]
df['phrase'] = listrev

تلمع كيهو


In [7]:
import kenlm , math, glob 
import os,sys
import pandas as pd
class TextStats(BaseEstimator, TransformerMixin):
    """Turn the ``score`` and ``model`` columns into dicts for DictVectorizer."""

    def fit(self, x, y=None):
        """Nothing is learned; the transformer itself is returned."""
        return self

    def transform(self, X):
        """Return one ``{'score': ..., 'model': ...}`` dict per row of ``X``."""
        records = []
        for score, model in X[['score', 'model']].values:
            records.append({'score': score, 'model': model})
        return records
    
class ItemSelector(BaseEstimator, TransformerMixin):
    """Select a single entry of the input by key (e.g. one DataFrame column)."""

    def __init__(self, key):
        # Stored under the same name as the constructor argument.
        self.key = key

    def fit(self, x, y=None):
        """Nothing is learned; the transformer itself is returned."""
        return self

    def transform(self, data_dict):
        """Return ``data_dict[self.key]``."""
        selected = data_dict[self.key]
        return selected

In [8]:
import kenlm , math, glob 
import os,sys
import pandas as pd
# Names of the KenLM models queried by scoremodel; the file of a model is
# "<name>.binary" inside LM_DIR.
corp = ['ale-char', 'asw-char', 'msa-char', 'sal-char', 'jed-char', 'bag-char', 'san-char', 'kha-char', 'rab-char', 'mus-char', 'jer-char', 'bas-char', 'tri-char', 'alg-char', 'riy-char', 'amm-char', 'tun-char', 'fes-char', 'ben-char', 'alx-char', 'sfx-char', 'dam-char', 'bei-char', 'cai-char', 'mos-char', 'doh-char']
# Directory of the reversed-text language models.
LM_DIR = '/media/sameh/data/CORPUS/LM-corpus/model-lm-reverse/'
# name -> loaded kenlm model, filled on first use.
_LM_CACHE = {}


def _load_models():
    """Return the models of ``corp`` in order, reading each file from disk only once."""
    for code in corp:
        if code not in _LM_CACHE:
            _LM_CACHE[code] = kenlm.LanguageModel(LM_DIR + code + ".binary")
    return [_LM_CACHE[code] for code in corp]


def scoremodel(s):
    """Score ``s`` with every model of ``corp`` and report the best one.

    The sentence is rewritten as space-separated characters, with ``#``
    standing for the original word boundaries, before being scored.

    Args:
        s: the sentence to score.

    Returns:
        A tuple ``(name, confidence)``. ``name`` is the ``corp`` entry whose
        model gave the highest score (the first one on ties). ``confidence``
        is ``10**best / sum(10**score over all models)``, truncated to three
        decimals. ``('', 0.0)`` is returned when there is no usable score.
    """
    # Loading 26 binary models is expensive, so they are cached across calls
    # instead of being re-read for every sentence.
    models = _load_models()

    s = ' '.join('#'.join(s.split()))
    scores = [model.score(s) for model in models]
    if not scores:
        return ('', 0.0)

    best_index = max(range(len(scores)), key=scores.__getitem__)
    maxp = scores[best_index]
    if maxp == float('-inf'):
        return ('', 0.0)

    # The scores are exponents of 10. Raising 10 to them directly underflows to
    # 0.0 for very negative scores, which used to zero the confidence. Shifting
    # by the best score keeps every term in (0, 1] and the ratio unchanged.
    relative_total = sum(math.pow(10.0, score - maxp) for score in scores)
    prob = 1.0 / relative_total
    # Truncate (not round) to thousandths.
    prob = math.floor(1000*prob)/1000
    return (corp[best_index], prob)
# Add two columns to df: the best model's name and its confidence, per phrase.
# scoremodel is evaluated once per phrase and its (model, score) tuple unpacked,
# rather than being called separately for each column.
train_results = [scoremodel(phrase) for phrase in df['phrase']]
listscoreph = [score for _, score in train_results]
listmodelph = [model for model, _ in train_results]
df['score'] = listscoreph
df['model'] = listmodelph
print(df.shape)

(41600, 4)


In [9]:
from sklearn.naive_bayes import MultinomialNB
import string
from nltk.stem import PorterStemmer
from nltk import word_tokenize
def stemming_tokenizer(text):
    """Tokenise ``text`` with ``word_tokenize`` and Porter-stem every token.

    NOTE(review): the phrases fed to this tokenizer are reversed Arabic text;
    confirm that PorterStemmer is the intended stemmer for them.
    """
    stem = PorterStemmer().stem
    return list(map(stem, word_tokenize(text)))
# Branch 1: TF-IDF over the stemmed word unigrams of the 'phrase' column.
phrase_tfidf = Pipeline([
    ('selector', ItemSelector(key='phrase')),
    ('tfidf', TfidfVectorizer(tokenizer=stemming_tokenizer, analyzer='word', ngram_range=(1, 1))),
])

# Branch 2: the language-model 'score' / 'model' columns, vectorised from dicts.
lm_stats = Pipeline([
    ('stats', TextStats()),
    ('vect', DictVectorizer()),
])

# Both branches are concatenated into one feature matrix.
combined_features = FeatureUnion(transformer_list=[
    ('feature1', phrase_tfidf),
    ('feature2', lm_stats),
])

pipeline = Pipeline([
    ('features', combined_features),
    ('clasifier', MultinomialNB(alpha=0.5, fit_prior=True, class_prior=None)),
])

pipeline.fit(df, df['label'])

Pipeline(memory=None,
     steps=[('features', FeatureUnion(n_jobs=None,
       transformer_list=[('feature1', Pipeline(memory=None,
     steps=[('selector', ItemSelector(key='phrase')), ('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', inp...ansformer_weights=None)), ('clasifier', MultinomialNB(alpha=0.5, class_prior=None, fit_prior=True))])

In [10]:
# The dev split gets its own constant so TRAIN_SET_PATH keeps pointing at the
# training file instead of being overwritten here.
DEV_SET_PATH = "/media/sameh/data/france/madar_shared_task/MADAR-Shared-Task-Subtask-1/MADAR-Corpus-26-dev.tsv"
with open(DEV_SET_PATH, "r", encoding='utf_8') as infile:
    xt,yt = [],[]
    for line in infile:
        # Same parsing as the training file: split on the tab without stripping,
        # so the labels keep the same form as the training labels.
        text, label  = line.split("\t")
        xt.append(text)
        yt.append(label)
# Collect the dev data in a DataFrame
datatest = {'phrase':xt,'label':yt}
dfdev= pd.DataFrame(datatest)
# Fetch the dev sentences and preprocess them with the same steps, in the same
# order, as the training sentences
test = list(dfdev['phrase'])
dev_steps = (
    normalize_spellerrors,
    normalize_lamalef,
    normalize_hamza,
    strip_tashkeel,
    strip_tatweel,
    remove_diacritics,
    remove_punctuations,
    remove_numbers,
    remove_non_arabic_words,
)
listtest = []
for sentence in test:
    for step in dev_steps:
        sentence = step(sentence)
    listtest.append(sentence)
# Reverse the preprocessed sentences and store them in the dev DataFrame
listdevrev = [reversed_string(sentence) for sentence in listtest]
dfdev['phrase'] = listdevrev
# Add two columns to dfdev: the best model's name and its confidence, per phrase.
# scoremodel is evaluated once per phrase and its (model, score) tuple unpacked,
# rather than being called separately for each column.
dev_results = [scoremodel(phrase) for phrase in dfdev['phrase']]
listscoredev = [score for _, score in dev_results]
listmodeldev = [model for model, _ in dev_results]
dfdev['score'] = listscoredev
dfdev['model'] = listmodeldev
print(dfdev.shape)

(5200, 4)


In [11]:
# Predict the dev labels and report the share that match the gold labels
predicted = pipeline.predict(dfdev)
print (len(predicted))
accuracy = np.mean(predicted == dfdev['label'])
print('Accuracy = ', accuracy)

5200
Accuracy =  0.8521153846153846


In [8]:
# Test 1: reversed data with TfidfVectorizer() alone (word n-grams, n = 1 to 5) => the result still does not change: 64.27%
# The preprocessing now applies the same data normalisation that was used when building the reversed language models
# Test 2: union of the TfidfVectorizer() features + the reversed LMs => accuracy drops from 85.35% to 85.21%


In [12]:
# Preview the first rows only instead of dumping the whole training frame.
print(df.head(10))

                                                  phrase  label  score  \
0                               امامت حءاسلا تانايب ماما  MSA\n  0.999   
1              انه نم برقلاب لبق نم ناونعلا اذهب عمسا مل  MSA\n  0.999   
2            هيلديص دجت يتح قيرطلا اذه يف ريسلا يف رمتسا  MSA\n  0.999   
3                                       راطفالا هفلكت مك  MSA\n  0.720   
4                                     كتدعاسم عيطتسا فيك  MSA\n  0.982   
5                         هثلاثلا هيصانلا دنع اراسي هجتا  MSA\n  0.999   
6                       كتوهق يف ركسو هدشق عضت نا بحت له  MSA\n  0.999   
7          يب صاخلا رالود يتءاملا وذ كيشلا فرص مكنكمي له  MSA\n  0.999   
8                            يب لصتا كلضف نم كلذ ثدح اذا  MSA\n  0.999   
9                                             يهقملا نيا  MSA\n  0.283   
10                                    لافطالل تيكاج ديرا  MSA\n  0.993   
11     صاخلا ليصوتلا يف تارالود هثالثو يداعلا ديربلا ...  MSA\n  0.999   
12                   مدقلا عباصا دنع ي