In [15]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
import numpy as np
import re
import string
import sys

In [16]:
#phase de prétraitement 
corpus = load_files('D:/CORPUS/MADAR-Shared-Task-Subtask-1/', encoding = 'utf-8',decode_error='ignore')

arabic_diacritics = re.compile("""
                             ّ    | 
                             َ    | 
                             ً    | 
                             ُ    | 
                             ٌ    | 
                             ِ    | 
                             ٍ    | 
                             ْ    | 
                             ـ     
                         """, re.VERBOSE)
arabic_punctuations = '''`÷×؛<>_()*&^%][ـ،/:"؟.,'{}~¦+|!”…“–ـ'''
list=[]

#supprime les diactritics 
def remove_diacritics(text):
    text = re.sub(arabic_diacritics, '', text)
    return text

#supprime les signes des ponctuations 
def remove_punctuations(text):
    translator = str.maketrans('', '', arabic_punctuations)
    return text.translate(translator)

#Supprimer les nombres 
def remove_numbers(text):
    regex = re.compile(r"(\d|[\u0660\u0661\u0662\u0663\u0664\u0665\u0666\u0667\u0668\u0669])+")
    return re.sub(regex, ' ', text)

#supprimer les noms non arabe 
def remove_non_arabic_words(text):
    return ' '.join([word for word in text.split() if not re.findall(
        r'[^\s\u0621\u0622\u0623\u0624\u0625\u0626\u0627\u0628\u0629\u062A\u062B\u062C\u062D\u062E\u062F\u0630\u0631\u0632\u0633\u0634\u0635\u0636\u0637\u0638\u0639\u063A\u0640\u0641\u0642\u0643\u0644\u0645\u0646\u0647\u0648\u0649\u064A]',
        word)])

for data in corpus.data:
    data = remove_diacritics(data)
    data = remove_punctuations(data)
    data = remove_numbers(data)
    data = remove_non_arabic_words(data)
    list.append(data)
#print(list)
#Tokenizing text with scikit-learn
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(list)
print(corpus.target_names)

['ALE', 'ALG', 'ALX', 'AMM', 'ASW', 'BAG', 'BAS', 'BEI', 'BEN', 'CAI', 'DAM', 'DOH', 'FES', 'JED', 'JER', 'KHA', 'MOS', 'MSA', 'MUS', 'RAB', 'RIY', 'SAL', 'SAN', 'SFX', 'TRI', 'TUN']


In [17]:
#From occurrences to frequencies
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)


In [18]:
#Training a classifier
clf = MultinomialNB().fit(X_train_tfidf, corpus.target)

In [19]:
#Pipeline
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB()), ])
text_clf.fit(list, corpus.target)


Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip...inear_tf=False, use_idf=True)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [20]:
# Evaluation de performance sur donnée de test 
Dtest = load_files('D:/CORPUS/dev6-test/',encoding='utf-8',decode_error='ignore')
listtest=[]
for data in Dtest.data:
    data = remove_diacritics(data)
    data = remove_punctuations(data)
    data = remove_numbers(data)
    data = remove_non_arabic_words(data)
    listtest.append(data)
predicted = text_clf.predict(listtest)
print('Accuracy = ',np.mean(predicted == Dtest.target))


Accuracy =  0.125


In [21]:
# Mesure d'evaluation 
print(metrics.classification_report(Dtest.target, predicted, target_names=Dtest.target_names))


              precision    recall  f1-score   support

         CAI       0.12      1.00      0.22         1
         DOH       0.00      0.00      0.00         2
         MSA       0.00      0.00      0.00         1
         RAB       0.00      0.00      0.00         1
         TUN       0.00      0.00      0.00         3

   micro avg       0.12      0.12      0.12         8
   macro avg       0.03      0.20      0.04         8
weighted avg       0.02      0.12      0.03         8



  'precision', 'predicted', average, warn_for)
