In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.cluster import KMeans
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

## Travailler avec des données texte 

L'objectif de ce guide est d'explorer quelques-uns des principaux outils de scikit-learn à travers une seule tâche pratique : analyser une collection de documents texte (articles de groupes de discussion) portant sur vingt sujets différents.

In [5]:
# Newsgroup categories used throughout the notebook.
categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]

In [7]:
# Load the training split restricted to the selected categories; the fixed
# random_state makes the shuffled document order repeatable.
twenty_train = fetch_20newsgroups(
    subset='train',
    categories=categories,
    shuffle=True,
    random_state=42,
)

In [9]:
# Category names; the integer labels in twenty_train.target index into this list.
twenty_train.target_names

['alt.atheism', 'comp.graphics', 'sci.med', 'soc.religion.christian']

In [10]:
# Number of documents loaded for the training subset.
len(twenty_train.data)

2257

In [11]:
# Number of entries in filenames (presumably one file path per document — verify).
len(twenty_train.filenames)

2257

In [19]:
# Show the first three lines of the first training document.
print(*twenty_train.data[0].split("\n")[:3], sep="\n")

From: sd345@city.ac.uk (Michael Collier)________Subject: Converting images to HP LaserJet III?________Nntp-Posting-Host: hampton


In [13]:
# Integer class labels of the first ten training documents.
twenty_train.target[:10]

array([1, 1, 3, 3, 3, 3, 3, 2, 2, 2], dtype=int64)

In [14]:
# Translate the first ten integer labels into their category names.
for label in twenty_train.target[:10]:
    print(twenty_train.target_names[label])

comp.graphics
comp.graphics
soc.religion.christian
soc.religion.christian
soc.religion.christian
soc.religion.christian
soc.religion.christian
sci.med
sci.med
sci.med


In [28]:
# Bag-of-words step: learn the vocabulary from the training documents and
# count how often each token occurs in each document.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)

# Printing lists (row, column) count entries; the listing is elided in the middle.
print(X_train_counts)

# Shape of the count matrix, displayed as the cell result.
X_train_counts.shape

  (0, 14887)	1
  (0, 29022)	1
  (0, 8696)	4
  (0, 4017)	2
  (0, 33256)	2
  (0, 21661)	3
  (0, 9031)	3
  (0, 31077)	1
  (0, 9805)	2
  (0, 17366)	1
  (0, 32493)	4
  (0, 16916)	2
  (0, 19780)	2
  (0, 17302)	2
  (0, 23122)	1
  (0, 25663)	1
  (0, 16881)	1
  (0, 16082)	1
  (0, 23915)	1
  (0, 32142)	5
  (0, 33597)	2
  (0, 20253)	1
  (0, 587)	1
  (0, 12051)	1
  (0, 5201)	1
  :	:
  (2256, 13740)	1
  (2256, 14662)	1
  (2256, 20201)	1
  (2256, 12443)	6
  (2256, 30325)	3
  (2256, 4610)	1
  (2256, 33844)	1
  (2256, 17354)	1
  (2256, 26998)	1
  (2256, 20277)	1
  (2256, 20695)	1
  (2256, 20702)	1
  (2256, 9649)	1
  (2256, 9086)	1
  (2256, 26254)	1
  (2256, 17133)	2
  (2256, 4490)	1
  (2256, 13720)	1
  (2256, 5016)	1
  (2256, 9632)	1
  (2256, 11824)	1
  (2256, 29993)	1
  (2256, 1298)	1
  (2256, 2375)	1
  (2256, 3921)	1


(2257, 35788)

In [39]:
# Look up the vocabulary entry for the token 'got' (the recorded output is an integer index).
count_vect.vocabulary_.get(u'got')

15601

In [29]:
# Term-frequency step only: use_idf=False switches off the inverse-document-
# frequency weighting, so the counts are just rescaled per document.
tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)

# Shape of the transformed matrix, displayed as the cell result.
X_train_tf.shape

(2257, 35788)

In [30]:
# Print the term-frequency matrix as (row, column) value entries.
print(X_train_tf)

  (0, 177)	0.15075567228888181
  (0, 230)	0.07537783614444091
  (0, 587)	0.07537783614444091
  (0, 2326)	0.15075567228888181
  (0, 3062)	0.07537783614444091
  (0, 3166)	0.07537783614444091
  (0, 4017)	0.15075567228888181
  (0, 4378)	0.07537783614444091
  (0, 4808)	0.07537783614444091
  (0, 5195)	0.07537783614444091
  (0, 5201)	0.07537783614444091
  (0, 5285)	0.07537783614444091
  (0, 8696)	0.30151134457776363
  (0, 9031)	0.22613350843332272
  (0, 9338)	0.07537783614444091
  (0, 9801)	0.07537783614444091
  (0, 9805)	0.15075567228888181
  (0, 9932)	0.07537783614444091
  (0, 12014)	0.07537783614444091
  (0, 12051)	0.07537783614444091
  (0, 12541)	0.07537783614444091
  (0, 12833)	0.15075567228888181
  (0, 14085)	0.07537783614444091
  (0, 14281)	0.15075567228888181
  (0, 14676)	0.07537783614444091
  :	:
  (2256, 24052)	0.07216878364870323
  (2256, 25560)	0.07216878364870323
  (2256, 26254)	0.07216878364870323
  (2256, 26998)	0.07216878364870323
  (2256, 27031)	0.14433756729740646
  (2256, 2

In [31]:
# Full tf-idf weighting this time (default TfidfTransformer settings).
tfidf_transformer = TfidfTransformer()
# fit_transform fits on the count matrix and transforms it in a single call.
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
# Shape of the tf-idf matrix, displayed as the cell result.
X_train_tfidf.shape

(2257, 35788)

In [32]:
# Print the tf-idf matrix as (row, column) value entries.
print(X_train_tfidf)

  (0, 35416)	0.1348710554299733
  (0, 35312)	0.0312703097833574
  (0, 34775)	0.034481472140846715
  (0, 34755)	0.043341654399042764
  (0, 33915)	0.0999409997803694
  (0, 33597)	0.06567578043186388
  (0, 33572)	0.09313007554599557
  (0, 33256)	0.11819702490105698
  (0, 32493)	0.07283773941616518
  (0, 32391)	0.12806013119559947
  (0, 32270)	0.023871142738151236
  (0, 32142)	0.08865416253721688
  (0, 32135)	0.04910237380446671
  (0, 32116)	0.10218403421141944
  (0, 31915)	0.08631915131162177
  (0, 31077)	0.016797806021219684
  (0, 30623)	0.0686611288079694
  (0, 29022)	0.1348710554299733
  (0, 28619)	0.047271576160535234
  (0, 27836)	0.06899050810672397
  (0, 26175)	0.08497460943470851
  (0, 25663)	0.034290706362898604
  (0, 25361)	0.11947938145690981
  (0, 25337)	0.04935883383975408
  (0, 24677)	0.09796250319482307
  :	:
  (2256, 13720)	0.0969927054646086
  (2256, 13521)	0.06264742916622883
  (2256, 13498)	0.08574361554718753
  (2256, 12626)	0.047531848081675473
  (2256, 12443)	0.553384

In [None]:
# Train a multinomial Naive Bayes classifier on the tf-idf features, using the
# integer category labels as targets.
clf = MultinomialNB().fit(X_train_tfidf, twenty_train.target)

In [None]:
# Two unseen example documents to classify.
docs_new = ['God is love', 'OpenGL on the GPU is fast']

# Only transform (never re-fit) with the vectorizer and tf-idf transformer that
# were fitted on the training data, then classify the resulting features.
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)
predicted = clf.predict(X_new_tfidf)

# Report each document next to the name of its predicted category.
for doc, label in zip(docs_new, predicted):
    category_name = twenty_train.target_names[label]
    print('%r => %s' % (doc, category_name))

In [None]:
# Chain vectorizer -> tf-idf transformer -> classifier into one estimator so
# raw documents can be passed straight to fit/predict.
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

In [None]:
# Fit the whole pipeline on the raw training documents and their labels.
text_clf.fit(twenty_train.data, twenty_train.target)

In [None]:
# Load the test split with the same categories and shuffle seed as the
# training split.
twenty_test = fetch_20newsgroups(
    subset='test',
    categories=categories,
    shuffle=True,
    random_state=42,
)
docs_test = twenty_test.data

# Predict a category for every test document with the fitted pipeline.
predicted = text_clf.predict(docs_test)

# Fraction of test documents whose predicted label equals the true label.
np.mean(predicted == twenty_test.target)

In [None]:
# Same vectorizer and tf-idf steps as before, but with a linear classifier
# trained by SGD (hinge loss, L2 penalty) as the final step.
# tol=None together with max_iter=5 is passed through unchanged from the
# original cell; random_state fixes the seed.
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(loss='hinge', penalty='l2',
                          alpha=1e-3, random_state=42,
                          max_iter=5, tol=None)),
])

# Fit on the raw training documents, then score on the test documents.
text_clf.fit(twenty_train.data, twenty_train.target)
predicted = text_clf.predict(docs_test)

# Fraction of test documents whose predicted label equals the true label.
np.mean(predicted == twenty_test.target)