In [45]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
import numpy as np
import re
import string
import sys

In [46]:
# Preprocessing phase.
# Load the training corpus with scikit-learn's load_files. Files are decoded as
# UTF-8 and, because decode_error='ignore', byte sequences that are not valid
# UTF-8 are dropped silently instead of raising.
# NOTE(review): absolute, machine-specific path -- the notebook only runs where
# this directory exists.
corpus = load_files('D:/CORPUS/MADAR-Shared-Task-Subtask-1/', encoding = 'utf-8',decode_error='ignore')

# Compiled pattern used by remove_diacritics. Each alternative is a single
# character to strip (Arabic diacritic marks; the last alternative looks like
# the tatweel/kashida elongation character -- confirm). re.VERBOSE makes the
# regex engine ignore the whitespace and line breaks inside the pattern, so
# only the characters themselves and the `|` separators are significant.
arabic_diacritics = re.compile("""
                             ّ    | 
                             َ    | 
                             ً    | 
                             ُ    | 
                             ٌ    | 
                             ِ    | 
                             ٍ    | 
                             ْ    | 
                             ـ     
                         """, re.VERBOSE)
# Every character in this string is deleted by remove_punctuations.
arabic_punctuations = '''`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ'''
# Accumulates the cleaned training documents, in the same order as corpus.data.
# NOTE(review): this name shadows the builtin `list`; it is kept because later
# cells reference it (vectorizer and pipeline fitting).
list=[]

# Strip the diacritics.
def remove_diacritics(text):
    """Return ``text`` with every match of ``arabic_diacritics`` removed.

    ``arabic_diacritics`` is the module-level compiled pattern; each match is
    replaced by the empty string.
    """
    stripped = arabic_diacritics.sub('', text)
    return stripped

# Strip the punctuation marks.
def remove_punctuations(text):
    """Return ``text`` with every character of ``arabic_punctuations`` deleted."""
    # A translation table whose values are all None tells str.translate to
    # drop the corresponding characters.
    deletion_table = str.maketrans(dict.fromkeys(arabic_punctuations))
    return text.translate(deletion_table)

# Clean every training document (diacritics first, then punctuation) and
# collect the results, in corpus order, in `list`.
list.extend(
    remove_punctuations(remove_diacritics(document))
    for document in corpus.data
)

# Tokenizing text with scikit-learn: learn the vocabulary from the cleaned
# documents and build the document-term count matrix in one step.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(list)


In [47]:
# From occurrences to frequencies: fit the TF-IDF weighting on the raw counts,
# then apply it to those same counts.
tfidf_transformer = TfidfTransformer()
tfidf_transformer.fit(X_train_counts)
X_train_tfidf = tfidf_transformer.transform(X_train_counts)


In [48]:
# Training a classifier: multinomial naive Bayes on the TF-IDF features, with
# corpus.target as the labels.
nb_model = MultinomialNB()
# fit() returns the estimator itself, so `clf` is the fitted model.
clf = nb_model.fit(X_train_tfidf, corpus.target)


In [49]:
# Pipeline: chain vectorizer -> TF-IDF transformer -> classifier so that the
# exact same steps run on the text passed to fit() and to predict().
# (The original cell started with a bare `Pipeline` expression statement -- a
# heading that lost its `#`; it had no effect and is now this comment.)
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])
# Train on the cleaned documents. Kept as the last expression of the cell so
# the notebook still displays the fitted pipeline's repr.
text_clf.fit(list, corpus.target)


Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [51]:
# Performance evaluation on the test data.
Dtest = load_files('D:/CORPUS/test/', encoding='utf-8', decode_error='ignore')
# Apply the same cleaning as for the training documents: diacritics first,
# then punctuation.
listtest = [
    remove_punctuations(remove_diacritics(document))
    for document in Dtest.data
]
predicted = text_clf.predict(listtest)
# Accuracy is the fraction of test documents whose predicted label equals the
# true label.
is_correct = predicted == Dtest.target
print('Accuracy = ', np.mean(is_correct))

Accuracy =  1.0


In [52]:
# Evaluation metrics: text report computed from the true and predicted test
# labels, with one row per entry of Dtest.target_names.
report = metrics.classification_report(
    Dtest.target,
    predicted,
    target_names=Dtest.target_names,
)
print(report)


              precision    recall  f1-score   support

         ALE       1.00      1.00      1.00         1
         ALG       1.00      1.00      1.00         1
         ALX       1.00      1.00      1.00         1
         AMM       1.00      1.00      1.00         1
         ASW       1.00      1.00      1.00         1
         BAG       1.00      1.00      1.00         1
         BAS       1.00      1.00      1.00         1
         BEI       1.00      1.00      1.00         1
         BEN       1.00      1.00      1.00         1
         CAI       1.00      1.00      1.00         1
         DAM       1.00      1.00      1.00         1
         DOH       1.00      1.00      1.00         1
         FES       1.00      1.00      1.00         1
         JED       1.00      1.00      1.00         1
         JER       1.00      1.00      1.00         1
         KHA       1.00      1.00      1.00         1
         MOS       1.00      1.00      1.00         1
         MSA       1.00    