In [1]:
from __future__ import print_function
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer , CountVectorizer, TfidfTransformer
from sklearn.metrics import classification_report
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline

In [2]:
#recupération des données
TRAIN_SET_PATH = "/media/sameh/data/france/madar_shared_task/MADAR-Shared-Task-Subtask-1/MADAR-Corpus-26-train.txt"
import pandas as pd

with open(TRAIN_SET_PATH, "r", encoding='utf_8') as infile:
    X, y = [], []
    for line in infile:
        text, label  = line.split("\t")
        X.append(text)
        y.append(label)
data = {'phrase':X,'label':y}
df= pd.DataFrame(data)


In [3]:
#pretraitement des données 
import re 
arabic_diacritics = re.compile("""
                             ّ    | 
                             َ    | 
                             ً    | 
                             ُ    | 
                             ٌ    | 
                             ِ    | 
                             ٍ    | 
                             ْ    | 
                             ـ     
                         """, re.VERBOSE)
arabic_punctuations = '''`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ'''
listtrain=[]
#supprime les diactritics 
def remove_diacritics(text):
    text = re.sub(arabic_diacritics, '', text)
    return text
#supprime les signes des ponctuations 
def remove_punctuations(text):
    translator = str.maketrans('', '', arabic_punctuations)
    return text.translate(translator)
#Supprimer les nombres 
def remove_numbers(text):
    regex = re.compile(r"(\d|[\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669])+")
    return re.sub(regex, ' ', text)
#supprimer les noms non arabe 
def remove_non_arabic_words(text):
    return ' '.join([word for word in text.split() if not re.findall(
        r'[^\s\u0621\u0622\u0623\u0624\u0625\u0626\u0627\u0628\u0629\u062A\u062B\u062C\u062D\u062E\u062F\u0630\u0631\u0632\u0633\u0634\u0635\u0636\u0637\u0638\u0639\u063A\u0640\u0641\u0642\u0643\u0644\u0645\u0646\u0647\u0648\u0649\u064A]',
        word)])
train = list(df['phrase'])
for data in train:
    data = remove_diacritics(data)
    data = remove_punctuations(data)
    data = remove_numbers(data)
    data = remove_non_arabic_words(data)
    listtrain.append(data)


In [4]:
df['phrase']= listtrain

In [5]:
from sklearn.naive_bayes import MultinomialNB
import string
from nltk.stem import PorterStemmer
from nltk import word_tokenize
def stemming_tokenizer(text):
    stemmer = PorterStemmer()
    return [stemmer.stem(w) for w in word_tokenize(text)]
#Pipeline
text_clf = Pipeline(memory=None,
         steps=[ ('vect', CountVectorizer(tokenizer=stemming_tokenizer,analyzer='word',ngram_range=(1,1))),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB(alpha=0.5,fit_prior=True, class_prior=None)) ])
text_clf.fit(df['phrase'], df['label'])

Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=0.5, class_prior=None, fit_prior=True))])

In [6]:
TRAIN_SET_PATH = "/media/sameh/data/france/madar_shared_task/MADAR-Shared-Task-Subtask-1/MADAR-Corpus-26-dev.tsv"
with open(TRAIN_SET_PATH, "r", encoding='utf_8') as infile:
    xt,yt = [],[]
    for line in infile:
        text, label  = line.split("\t")
        xt.append(text)
        yt.append(label)
#recupére donée de test dans data frame 
datatest = {'phrase':xt,'label':yt}
dftest= pd.DataFrame(datatest)
test = list(dftest['phrase'])
listtest=[]
for data in test:
    data = remove_diacritics(data)
    data = remove_punctuations(data)
    data = remove_numbers(data)
    data = remove_non_arabic_words(data)
    listtest.append(data)
dftest['phrase']= listtest
predicted = text_clf.predict(dftest['phrase'])
print (len(predicted))
print('Accuracy = ',np.mean(predicted == dftest['label']))

5200
Accuracy =  0.6426923076923077


In [7]:
#function qui return la liste de score de chaque phrase
import kenlm , math, glob 
import os,sys
import pandas as pd
def language(text):
    text = ' '.join('#'.join(text.split()))
    listscore=[]
    for nommodel in glob.glob('/media/sameh/data/CORPUS/LM-corpus/modelm/*.binary') :
        model=kenlm.LanguageModel(nommodel)
        prob = math.pow(10,model.score(text))
        #prob = math.floor(1000*prob)/1000            
        listscore.append(prob)
    #print('listscore',len(listscore))
    return listscore


class TextStats(BaseEstimator, TransformerMixin):
    """Extract features from each document for DictVectorizer"""

    def fit(self, x, y=None):
        return self

    def transform(self, X):
        res=[{'score': x[0]} 
            for x in X[[ 'score']].values]
        return res

class ItemSelector(BaseEstimator, TransformerMixin):

    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        return self

    def transform(self, data_dict):
        return data_dict[self.key]

# function qui return la meilleur score+ le nom d model 
corp = ['ale-char', 'asw-char', 'msa-char', 'sal-char', 'jed-char', 'bag-char', 'san-char', 'kha-char', 'rab-char', 'mus-char', 'jer-char', 'bas-char', 'tri-char', 'alg-char', 'riy-char', 'amm-char', 'tun-char', 'fes-char', 'ben-char', 'alx-char', 'sfx-char', 'dam-char', 'bei-char', 'cai-char', 'mos-char', 'doh-char']
#print(models)
def scoremodel(s):
    m = list(map(lambda code: kenlm.LanguageModel('/media/sameh/data/CORPUS/LM-corpus/modelm/' + code + ".binary"), corp))

    s = ' '.join('#'.join(s.split()))
    maxl = ''
    maxp =  -sys.maxsize + 1 #minimum integer in python
    totalp = 0.0
    for j in range(len(m)):
        model = m[j]
        prob = model.score(s)
        totalp += math.pow(10.0, prob)
        if(prob > maxp):
            maxp = prob
            maxl = corp[j]
    #if else yetna7aw et round to proba sera dans boucle for pui l'ajout dans liste  
    if(totalp==0.0): prob = 0.0
    else: prob = math.pow(10.0,maxp)/totalp
    #round to thousandths
    prob = math.floor(1000*prob)/1000
    return maxl       

In [8]:
#ajouter au df 1 column pour 26 score par phrase
listscoreph=[ ]
for i in list(df['phrase']):
    listscoreph.append(scoremodel(i))
df['score'] = listscoreph 
print(df.shape)

(41600, 3)


In [9]:

pipeline = Pipeline([

      ('features', FeatureUnion(
           transformer_list =[

       ('feature1', Pipeline([
                    ('selector', ItemSelector(key='phrase')),
('tfidf', TfidfVectorizer(tokenizer=stemming_tokenizer,analyzer='word',ngram_range=(1,1))),
        ])),
       ('feature2', Pipeline([
            ('stats',TextStats()), ('vect', DictVectorizer())

       ]))
           ]
)),

    ('clasifier', MultinomialNB(alpha=0.5,fit_prior=True, class_prior=None)),
                 ])

pipeline.fit(df, df['label'])

Pipeline(memory=None,
     steps=[('features', FeatureUnion(n_jobs=None,
       transformer_list=[('feature1', Pipeline(memory=None,
     steps=[('selector', ItemSelector(key='phrase')), ('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', inp...ansformer_weights=None)), ('clasifier', MultinomialNB(alpha=0.5, class_prior=None, fit_prior=True))])

In [10]:
#pour evaluation 
listscoretest=[ ]

for i in list(dftest['phrase']):
    listscoretest.append(scoremodel(i))
dftest['score'] = listscoretest


In [11]:
y = pipeline.predict(dftest)
print('Accuracy = ',np.mean(y == dftest['label']))
print(classification_report(y, dftest['label']))

Accuracy =  0.8192307692307692
              precision    recall  f1-score   support

        ALE
       0.81      0.80      0.81       202
        ALG
       0.91      0.98      0.94       186
        ALX
       0.68      0.76      0.71       178
        AMM
       0.74      0.76      0.75       195
        ASW
       0.81      0.74      0.78       218
        BAG
       0.83      0.75      0.79       223
        BAS
       0.82      0.89      0.85       184
        BEI
       0.82      0.83      0.83       197
        BEN
       0.80      0.89      0.84       179
        CAI
       0.72      0.75      0.74       194
        DAM
       0.67      0.82      0.74       164
        DOH
       0.83      0.67      0.74       251
        FES
       0.82      0.89      0.85       186
        JED
       0.88      0.80      0.84       218
        JER
       0.77      0.70      0.73       218
        KHA
       0.85      0.84      0.85       203
        MOS
       0.91      0.85      0.88       

In [12]:
#64.48% par intégration score de chaque phrase comme feature 
#81.92% par intégration de label de modéle de chaque phrase comme feature 
#print(dftest)