In [1]:
from __future__ import print_function
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer , CountVectorizer, TfidfTransformer
from sklearn.metrics import classification_report
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline

In [2]:
from sklearn.datasets import load_files
import re

#phase de prétraitement 
corpus = load_files('/media/sameh/data/split/train26/', encoding = 'utf-8',decode_error='ignore')

arabic_diacritics = re.compile("""
                             ّ    | 
                             َ    | 
                             ً    | 
                             ُ    | 
                             ٌ    | 
                             ِ    | 
                             ٍ    | 
                             ْ    | 
                             ـ     
                         """, re.VERBOSE)
arabic_punctuations = '''`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ'''
listc=[]

#supprime les diactritics 
def remove_diacritics(text):
    text = re.sub(arabic_diacritics, '', text)
    return text

#supprime les signes des ponctuations 
def remove_punctuations(text):
    translator = str.maketrans('', '', arabic_punctuations)
    return text.translate(translator)

#Supprimer les nombres 
def remove_numbers(text):
    regex = re.compile(r"(\d|[\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669])+")
    return re.sub(regex, ' ', text)

#supprimer les noms non arabe 
def remove_non_arabic_words(text):
    return ' '.join([word for word in text.split() if not re.findall(
        r'[^\s\u0621\u0622\u0623\u0624\u0625\u0626\u0627\u0628\u0629\u062A\u062B\u062C\u062D\u062E\u062F\u0630\u0631\u0632\u0633\u0634\u0635\u0636\u0637\u0638\u0639\u063A\u0640\u0641\u0642\u0643\u0644\u0645\u0646\u0647\u0648\u0649\u064A]',
        word)])

for data in corpus.data:
    data = remove_diacritics(data)
    data = remove_punctuations(data)
    data = remove_numbers(data)
    data = remove_non_arabic_words(data)
    listc.append(data)
    # Evaluation de performance sur donnée de test 
Dtest = load_files('/media/sameh/data/split/dev26/',encoding='utf-8',decode_error='ignore')
listtest=[]
for data in Dtest.data:
    data = remove_diacritics(data)
    data = remove_punctuations(data)
    data = remove_numbers(data)
    data = remove_non_arabic_words(data)
    listtest.append(data)

In [6]:

import kenlm , math, glob 
import os,sys
import pandas as pd
corp = ['ale-char', 'asw-char', 'msa-char', 'sal-char', 'jed-char', 'bag-char', 'san-char', 'kha-char', 'rab-char', 'mus-char', 'jer-char', 'bas-char', 'tri-char', 'alg-char', 'riy-char', 'amm-char', 'tun-char', 'fes-char', 'ben-char', 'alx-char', 'sfx-char', 'dam-char', 'bei-char', 'cai-char', 'mos-char', 'doh-char']

def scoremodel(s):
    m = list(map(lambda code: kenlm.LanguageModel('/media/sameh/data/CORPUS/LM-corpus/modelm/' + code + ".binary"), corp))

    s = ' '.join('#'.join(s.split()))
    maxl = ''
    maxp =  -sys.maxsize + 1 #minimum integer in python
    totalp = 0.0
    for j in range(len(m)):
        model = m[j]
        prob = model.score(s)
        totalp += math.pow(10.0, prob)
        if(prob > maxp):
            maxp = prob
            maxl = corp[j]
    #if else yetna7aw et round to proba sera dans boucle for pui l'ajout dans liste  
    if(totalp==0.0): prob = 0.0
    else: prob = math.pow(10.0,maxp)/totalp
    #round to thousandths
    prob = math.floor(1000*prob)/1000
    return maxl       
class TextStats(BaseEstimator, TransformerMixin):
    """Extract features from each document for DictVectorizer"""

    def fit(self, x, y=None):
        return self

    def transform(self, X ):
        listy=[ ]
        for i in X :
            listy.append(scoremodel(i))
        data= {'phrase':X,'score':listy}

        res= pd.DataFrame(data)
        
        return [{'score': x[0]} 
            for x in res[[ 'score']].values]


In [7]:
from sklearn.naive_bayes import MultinomialNB
 
import string
from nltk.stem import PorterStemmer
from nltk import word_tokenize
def stemming_tokenizer(text):
    stemmer = PorterStemmer()
    return [stemmer.stem(w) for w in word_tokenize(text)]
pipeline = Pipeline([

      ('features', FeatureUnion([  
          
      ('feature1', Pipeline([
      ('tfidf', TfidfVectorizer(tokenizer=stemming_tokenizer,analyzer='word',ngram_range=(1,1))),

 
    ])),
       ('feature2', Pipeline([
            ('stats',TextStats()), ('vect', DictVectorizer())
     ]))
        ])),
    ('clasifier', MultinomialNB(alpha=0.5,fit_prior=True, class_prior=None)),
                 ])
"""pipeline = Pipeline([
      ('model', Feature2()), ('clasifier', MultinomialNB(alpha=0.5,fit_prior=True, class_prior=None)), 
      #('vect', DictVectorizer()),  # list of dicts -> feature matrix  
            ])"""

pipeline.fit(listc, corpus.target)
y = pipeline.predict(listtest)

print (len(y))
print('Accuracy = ',np.mean(y == Dtest.target))
print(classification_report(y, Dtest.target_names))

5200
Accuracy =  0.8192307692307692
              precision    recall  f1-score   support

           0       0.81      0.81      0.81       201
           1       0.91      0.98      0.94       186
           2       0.68      0.76      0.71       178
           3       0.74      0.76      0.75       196
           4       0.81      0.74      0.78       218
           5       0.83      0.75      0.79       222
           6       0.82      0.89      0.85       184
           7       0.82      0.83      0.83       197
           8       0.80      0.89      0.84       179
           9       0.72      0.75      0.74       194
          10       0.67      0.82      0.74       164
          11       0.83      0.67      0.74       251
          12       0.82      0.89      0.85       186
          13       0.88      0.77      0.82       228
          14       0.77      0.71      0.73       217
          15       0.85      0.85      0.85       202
          16       0.91      0.85      0.88  