In [1]:
from __future__ import print_function
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer , CountVectorizer, TfidfTransformer
from sklearn.metrics import classification_report
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline

In [2]:
from sklearn.datasets import load_files
import re

#phase de prétraitement 
corpus = load_files('/media/sameh/data/split/train26/', encoding = 'utf-8',decode_error='ignore')

arabic_diacritics = re.compile("""
                             ّ    | 
                             َ    | 
                             ً    | 
                             ُ    | 
                             ٌ    | 
                             ِ    | 
                             ٍ    | 
                             ْ    | 
                             ـ     
                         """, re.VERBOSE)
arabic_punctuations = '''`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ'''
list=[]

#supprime les diactritics 
def remove_diacritics(text):
    text = re.sub(arabic_diacritics, '', text)
    return text

#supprime les signes des ponctuations 
def remove_punctuations(text):
    translator = str.maketrans('', '', arabic_punctuations)
    return text.translate(translator)

#Supprimer les nombres 
def remove_numbers(text):
    regex = re.compile(r"(\d|[\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669])+")
    return re.sub(regex, ' ', text)

#supprimer les noms non arabe 
def remove_non_arabic_words(text):
    return ' '.join([word for word in text.split() if not re.findall(
        r'[^\s\u0621\u0622\u0623\u0624\u0625\u0626\u0627\u0628\u0629\u062A\u062B\u062C\u062D\u062E\u062F\u0630\u0631\u0632\u0633\u0634\u0635\u0636\u0637\u0638\u0639\u063A\u0640\u0641\u0642\u0643\u0644\u0645\u0646\u0647\u0648\u0649\u064A]',
        word)])

for data in corpus.data:
    data = remove_diacritics(data)
    data = remove_punctuations(data)
    data = remove_numbers(data)
    data = remove_non_arabic_words(data)
    list.append(data)
    # Evaluation de performance sur donnée de test 
Dtest = load_files('/media/sameh/data/split/dev26/',encoding='utf-8',decode_error='ignore')
listtest=[]
for data in Dtest.data:
    data = remove_diacritics(data)
    data = remove_punctuations(data)
    data = remove_numbers(data)
    data = remove_non_arabic_words(data)
    listtest.append(data)

In [3]:
import kenlm , math, glob 
import pandas as pd

def language(list):
    listscore=[]
    for text in list:
        listproba=[]
        text = ' '.join('#'.join(text.split()))
        for nommodel in glob.glob('/media/sameh/data/CORPUS/LM-corpus/binary/*') :
   
            model=kenlm.LanguageModel(nommodel)
            prob = math.pow(10,model.score(text))
            #prob = math.floor(1000*prob)/1000
            listproba.append(prob)
            
        listscore.append(listproba)
    print('listscore',len(listscore))
    return listscore

    
class fctio( ):
    def fit(self, X, y=None):
        return self
    def transform(self, X, **transform_params):
        #return pd.DataFrame(data=language(X))
        return np.matrix(language(X))
    
    

In [7]:
from sklearn.naive_bayes import MultinomialNB
 
import string
from nltk.stem import PorterStemmer
from nltk import word_tokenize
def stemming_tokenizer(text):
    stemmer = PorterStemmer()
    return [stemmer.stem(w) for w in word_tokenize(text)]
pipeline = Pipeline([

      ('features', FeatureUnion([  
          
      ('feature1', Pipeline([
         ('vect', CountVectorizer(tokenizer=stemming_tokenizer,analyzer='word',ngram_range=(1,1))),
                     ('tfidf', TfidfTransformer())
 
    ])),
      ('feature2', Pipeline([
      ('model', fctio()),  
      #('vect', DictVectorizer()),  # list of dicts -> feature matrix  
            ]))
        ])),
    ('clasifier', MultinomialNB(alpha=0.5,fit_prior=True, class_prior=None)),
    ])
"""                

pipeline = Pipeline([
      ('model', fctio()),  ('clasifier', MultinomialNB(alpha=0.5,fit_prior=True, class_prior=None)), 
      #('vect', DictVectorizer()),  # list of dicts -> feature matrix  
            ])"""

pipeline.fit(list, corpus.target)
y = pipeline.predict(listtest)

print (len(y))
print('Accuracy = ',np.mean(y == Dtest.target))
print(classification_report(y, Dtest.target))

listscore 41600
listscore 5200
5200
Accuracy =  0.6426923076923077
              precision    recall  f1-score   support

           0       0.58      0.63      0.61       185
           1       0.81      0.73      0.77       223
           2       0.78      0.72      0.75       216
           3       0.52      0.47      0.50       220
           4       0.59      0.49      0.53       243
           5       0.59      0.75      0.66       159
           6       0.65      0.68      0.66       189
           7       0.58      0.77      0.66       151
           8       0.75      0.67      0.71       223
           9       0.47      0.63      0.54       147
          10       0.54      0.64      0.58       166
          11       0.62      0.60      0.61       207
          12       0.69      0.65      0.67       211
          13       0.63      0.64      0.63       197
          14       0.59      0.46      0.51       259
          15       0.72      0.53      0.61       273
          16  