In [1]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
import numpy as np
import re
import string
import sys

In [2]:
# Preprocessing phase: read the training corpus from disk.
# Files are decoded as UTF-8 and undecodable bytes are ignored.
corpus = load_files(
    'D:/split/train26/',
    encoding='utf-8',
    decode_error='ignore',
)

# Arabic diacritics plus the tatweel (elongation) character, written as
# explicit code points so the pattern does not depend on invisible combining
# marks surviving copy/paste or editor normalisation.
arabic_diacritics = re.compile(
    "["
    "\u0651"  # shadda
    "\u064E"  # fatha
    "\u064B"  # fathatan (tanwin fath)
    "\u064F"  # damma
    "\u064C"  # dammatan (tanwin damm)
    "\u0650"  # kasra
    "\u064D"  # kasratan (tanwin kasr)
    "\u0652"  # sukun
    "\u0640"  # tatweel / kashida
    "]"
)
# Characters that remove_punctuations deletes from a text.
arabic_punctuations = '''`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ'''
# Accumulator for the cleaned training documents.
# NOTE: this name shadows the built-in `list`; it is kept unchanged because
# later cells refer to the cleaned documents by this name.
list=[]

# Strip diacritics
def remove_diacritics(text):
    """Return ``text`` with every match of ``arabic_diacritics`` deleted."""
    return arabic_diacritics.sub('', text)

# Strip punctuation marks
def remove_punctuations(text):
    """Return ``text`` with every character of ``arabic_punctuations`` removed."""
    # Mapping a code point to None makes str.translate drop that character.
    deletions = dict.fromkeys(map(ord, arabic_punctuations))
    return text.translate(deletions)

# Strip numbers
def remove_numbers(text):
    """Replace every run of consecutive digits in ``text`` with one space.

    The pattern accepts the regex digit class as well as the Arabic-Indic
    digits U+0660..U+0669, which are listed explicitly.
    """
    digit_run = re.compile(r"(\d|[\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669])+")
    return digit_run.sub(' ', text)

# Drop non-Arabic words
def remove_non_arabic_words(text):
    """Keep only the whitespace-separated words built from allowed characters.

    A word is discarded when it contains at least one character outside the
    explicit set in the pattern below (code points U+0621..U+063A and
    U+0640..U+064A). The surviving words are re-joined with single spaces.
    """
    disallowed_char = re.compile(
        r'[^\s\u0621\u0622\u0623\u0624\u0625\u0626\u0627\u0628\u0629\u062A\u062B\u062C\u062D\u062E\u062F\u0630\u0631\u0632\u0633\u0634\u0635\u0636\u0637\u0638\u0639\u063A\u0640\u0641\u0642\u0643\u0644\u0645\u0646\u0647\u0648\u0649\u064A]')
    kept_words = []
    for token in text.split():
        if disallowed_char.search(token) is None:
            kept_words.append(token)
    return ' '.join(kept_words)

# Run every training document through the cleaning steps, in this order,
# and collect the cleaned text in `list`.
cleaning_steps = (
    remove_diacritics,
    remove_punctuations,
    remove_numbers,
    remove_non_arabic_words,
)
for data in corpus.data:
    for clean in cleaning_steps:
        data = clean(data)
    list.append(data)

# Tokenizing text with scikit-learn: counts of word 2-grams and 3-grams.
count_vect = CountVectorizer(
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(2,3),
)
X_train_counts = count_vect.fit_transform(list)
print(corpus.target_names)

['ALE', 'ALG', 'ALX', 'AMM', 'ASW', 'BAG', 'BAS', 'BEI', 'BEN', 'CAI', 'DAM', 'DOH', 'FES', 'JED', 'JER', 'KHA', 'MOS', 'MSA', 'MUS', 'RAB', 'RIY', 'SAL', 'SAN', 'SFX', 'TRI', 'TUN']


In [3]:
# From occurrences to frequencies: fit the TF-IDF weighting on the training
# counts, then apply it to those same counts.
tfidf_transformer = TfidfTransformer().fit(X_train_counts)
X_train_tfidf = tfidf_transformer.transform(X_train_counts)


In [4]:
# Training a classifier: multinomial naive Bayes fitted on the TF-IDF matrix
# with the corpus labels as targets.
clf = MultinomialNB().fit(
    X_train_tfidf,
    corpus.target,
)

In [5]:
import string
from nltk.stem import PorterStemmer
from nltk import word_tokenize
# Built once and shared by every call: constructing a new stemmer for each
# document being tokenized is repeated work with no effect on the result.
_porter_stemmer = PorterStemmer()


def stemming_tokenizer(text):
    """Tokenize ``text`` with NLTK's ``word_tokenize`` and stem each token.

    Args:
        text: the document to split into tokens.

    Returns:
        A list with the Porter stem of every token, in token order.

    NOTE(review): PorterStemmer targets English, while the documents passed
    in have been reduced to Arabic-script words by remove_non_arabic_words,
    so the stemming step presumably changes little here. Confirm, and
    consider an Arabic stemmer if stemming is actually wanted.
    """
    return [_porter_stemmer.stem(token) for token in word_tokenize(text)]

# Pipeline: stemmed token counts -> TF-IDF weighting -> multinomial naive Bayes.
pipeline_steps = [
    ('vect', CountVectorizer(tokenizer=stemming_tokenizer)),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB(alpha=0.5)),
]
text_clf = Pipeline(pipeline_steps)
# Kept as the last expression so the notebook displays the fitted pipeline.
text_clf.fit(list, corpus.target)


Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=0.5, class_prior=None, fit_prior=True))])

In [8]:
# Performance evaluation on held-out data (loaded from the dev26 folder).
Dtest = load_files(
    'D:/split/dev26/',
    encoding='utf-8',
    decode_error='ignore',
)

# Apply the same cleaning sequence that the training documents went through.
listtest=[]
for data in Dtest.data:
    for clean in (remove_diacritics, remove_punctuations,
                  remove_numbers, remove_non_arabic_words):
        data = clean(data)
    listtest.append(data)

predicted = text_clf.predict(listtest)
print(len(predicted))
# Fraction of documents whose predicted label equals the true label.
is_correct = predicted == Dtest.target
print('Accuracy = ', np.mean(is_correct))


5200
Accuracy =  0.6426923076923077


In [9]:
# Evaluation metrics: per-class precision, recall and F1 on the held-out data.
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
evaluation_report = classification_report(
    Dtest.target,
    predicted,
    target_names=Dtest.target_names,
)
print(evaluation_report)


              precision    recall  f1-score   support

         ALE       0.63      0.58      0.61       200
         ALG       0.73      0.81      0.77       200
         ALX       0.72      0.78      0.75       200
         AMM       0.47      0.52      0.50       200
         ASW       0.49      0.59      0.53       200
         BAG       0.75      0.59      0.66       200
         BAS       0.68      0.65      0.66       200
         BEI       0.77      0.58      0.66       200
         BEN       0.67      0.75      0.71       200
         CAI       0.63      0.47      0.54       200
         DAM       0.64      0.54      0.58       200
         DOH       0.60      0.62      0.61       200
         FES       0.65      0.69      0.67       200
         JED       0.64      0.63      0.63       200
         JER       0.46      0.59      0.51       200
         KHA       0.53      0.72      0.61       200
         MOS       0.83      0.79      0.81       200
         MSA       0.62    