# Pre-processing using TfidfVectorizer

In [1]:
categories = ['alt.atheism', 'soc.religion.christian', 'comp.graphics', 'sci.med']

In [2]:
from sklearn.datasets import fetch_20newsgroups
twenty_train = fetch_20newsgroups(subset='train',categories=categories, shuffle=True, random_state=42)

In [3]:
twenty_train.target_names
twenty_train.data[0]
#As you can see, the example trains the model with the target simultaneously
twenty_train.target

array([1, 1, 3, ..., 2, 2, 2], dtype=int64)

In [4]:
len(twenty_train.data)

2257

In [5]:
len(twenty_train.filenames)

2257

In [6]:
print("\n".join(twenty_train.data[0].split("\n")[:3]))

From: sd345@city.ac.uk (Michael Collier)
Subject: Converting images to HP LaserJet III?
Nntp-Posting-Host: hampton


In [7]:
print(twenty_train.target_names[twenty_train.target[0]])

comp.graphics


In [8]:
twenty_train.target[:10]

array([1, 1, 3, 3, 3, 3, 3, 2, 2, 2], dtype=int64)

In [9]:
for t in twenty_train.target[:10]:
...     print(twenty_train.target_names[t])

comp.graphics
comp.graphics
soc.religion.christian
soc.religion.christian
soc.religion.christian
soc.religion.christian
soc.religion.christian
sci.med
sci.med
sci.med


In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [15]:
vectorizer = TfidfVectorizer(min_df=1)
vectorizer.fit_transform(twenty_train.data)
# print(type(vectorizer))
vectorizer.vocabulary_

<class 'sklearn.feature_extraction.text.TfidfVectorizer'>


{'from': 14887,
 'sd345': 29022,
 'city': 8696,
 'ac': 4017,
 'uk': 33256,
 'michael': 21661,
 'collier': 9031,
 'subject': 31077,
 'converting': 9805,
 'images': 17366,
 'to': 32493,
 'hp': 16916,
 'laserjet': 19780,
 'iii': 17302,
 'nntp': 23122,
 'posting': 25663,
 'host': 16881,
 'hampton': 16082,
 'organization': 23915,
 'the': 32142,
 'university': 33597,
 'lines': 20253,
 '14': 587,
 'does': 12051,
 'anyone': 5201,
 'know': 19458,
 'of': 23610,
 'good': 15576,
 'way': 34755,
 'standard': 30623,
 'pc': 24651,
 'application': 5285,
 'pd': 24677,
 'utility': 33915,
 'convert': 9801,
 'tif': 32391,
 'img': 17389,
 'tga': 32116,
 'files': 14281,
 'into': 18268,
 'format': 14676,
 'we': 34775,
 'would': 35312,
 'also': 4808,
 'like': 20198,
 'do': 12014,
 'same': 28619,
 'hpgl': 16927,
 'plotter': 25361,
 'please': 25337,
 'email': 12833,
 'any': 5195,
 'response': 27836,
 'is': 18474,
 'this': 32270,
 'correct': 9932,
 'group': 15837,
 'thanks': 32135,
 'in': 17556,
 'advance': 4378,

In [16]:
#We firstly use the fit(..) method to fit our estimator to the data 
#Learn the idf vector (global term weights)

#idf = the inverse of the value how often this term appears on the document
tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)

In [17]:
#Secondly the transform(..) method to transform our count-matrix to a tf-idf representation

#This generates the tf-idf matrix
#Can I retrieve the array of words? 

X_train_tf = tf_transformer.transform(X_train_counts)

In [48]:
#For each row, find the list of words in a descending order of values  
#To do so, I need to retrieve the each row's value
#To do so, I need to be able to retrive the word at [x,y]

print(type(X_train_tf))
# X_train_tf.nnz
# X_train_tf.asformat(csr)
# X_train_tf.getrow(0)
X_train_tf.asfptype().toarray()
print(X_train_tf)

<class 'scipy.sparse.csr.csr_matrix'>
  (0, 230)	0.0753778361444
  (0, 12541)	0.0753778361444
  (0, 3166)	0.0753778361444
  (0, 14085)	0.0753778361444
  (0, 20459)	0.0753778361444
  (0, 35416)	0.0753778361444
  (0, 3062)	0.0753778361444
  (0, 2326)	0.150755672289
  (0, 177)	0.150755672289
  (0, 31915)	0.0753778361444
  (0, 33572)	0.0753778361444
  (0, 9338)	0.0753778361444
  (0, 26175)	0.0753778361444
  (0, 4378)	0.0753778361444
  (0, 17556)	0.0753778361444
  (0, 32135)	0.0753778361444
  (0, 15837)	0.0753778361444
  (0, 9932)	0.0753778361444
  (0, 32270)	0.0753778361444
  (0, 18474)	0.0753778361444
  (0, 27836)	0.0753778361444
  (0, 5195)	0.0753778361444
  (0, 12833)	0.150755672289
  (0, 25337)	0.0753778361444
  (0, 25361)	0.0753778361444
  :	:
  (2256, 6430)	0.0721687836487
  (2256, 24052)	0.0721687836487
  (2256, 22270)	0.0721687836487
  (2256, 35638)	0.144337567297
  (2256, 32233)	0.0721687836487
  (2256, 35157)	0.0721687836487
  (2256, 4938)	0.0721687836487
  (2256, 34923)	0.072168

In [55]:
X_train_tf.vocabulary

AttributeError: vocabulary not found

In [36]:
#Note that the above process can be a single process
# X_train_tf_2 = TfidfTransformer(use_idf=False).fit_transform(X_train_counts)
# X_train_tfidf = X_train_tf

In [37]:
#Check to see if fit_transform = fit + transform
#print(X_train_tf)
#print(X_train_tf_2)

# Training a classifier

In [49]:
#As you can see, the example trains the model with the target simultaneously
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train_tfidf, twenty_train.target)