# <center> NLP Lab 3 </center>

## Importing the count and TF-IDF vectorizers from sklearn

In [1]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

### Fitting and transforming a list of documents (in this case a list of sentences)

In [2]:
# The corpus: four short sentences, each one treated as a separate document
sent = [
    "The sky is blue.",
    "The sun is bright today.",
    "The sun in the sky is bright.",
    "We can see the shining sun, the bright sun.",
]

# Fit first (this builds the word -> id vocabulary), then map the same
# sentences onto that vocabulary to obtain their word counts
v = CountVectorizer()
v.fit(sent)
transformed = v.transform(sent)

# Show the whole vocabulary mapping, then look up the id of a single word
sky_id = v.vocabulary_['sky']
print(v.vocabulary_, '\nWord sky has id %d' % sky_id)

{'the': 9, 'sky': 7, 'is': 4, 'blue': 0, 'sun': 8, 'bright': 1, 'today': 10, 'in': 3, 'we': 11, 'can': 2, 'see': 5, 'shining': 6} 
Word sky has id 7


### Printing the vocabulary of the fitted vectorizer and the term frequency matrix

In [3]:
# Order the (word, id) pairs of the fitted vocabulary by their id so the list
# lines up with the columns of the matrix printed below
vocab_by_id = sorted(v.vocabulary_.items(), key=lambda word_and_id: word_and_id[1])
print(vocab_by_id)

# Dense view of the term frequency matrix: one row per sentence, one column per word id
term_counts = transformed.toarray()
print(term_counts)

[('blue', 0), ('bright', 1), ('can', 2), ('in', 3), ('is', 4), ('see', 5), ('shining', 6), ('sky', 7), ('sun', 8), ('the', 9), ('today', 10), ('we', 11)]
[[1 0 0 0 1 0 0 1 0 1 0 0]
 [0 1 0 0 1 0 0 0 1 1 1 0]
 [0 1 0 1 1 0 0 1 1 2 0 0]
 [0 1 1 0 0 1 1 0 2 2 0 1]]


### The sparse matrix: each line shows a (doc_id, word_id) tuple followed by the corresponding word count

In [4]:
# Printing the sparse matrix lists only its stored (non-zero) counts, one
# "(doc_id, word_id)  count" entry per line
print(transformed)

  (0, 0)	1
  (0, 4)	1
  (0, 7)	1
  (0, 9)	1
  (1, 1)	1
  (1, 4)	1
  (1, 8)	1
  (1, 9)	1
  (1, 10)	1
  (2, 1)	1
  (2, 3)	1
  (2, 4)	1
  (2, 7)	1
  (2, 8)	1
  (2, 9)	2
  (3, 1)	1
  (3, 2)	1
  (3, 5)	1
  (3, 6)	1
  (3, 8)	2
  (3, 9)	2
  (3, 11)	1


### Transforming a new sentence using the same vectorizer (will maintain word ids)

In [5]:
# A sentence that was not part of the data the vectorizer was fitted on
sent2 = ['The moon is bright today']

# Reuse the already-fitted vectorizer, so the known words keep their existing ids
sent2_counts = v.transform(sent2)
print(sent2_counts)

  (0, 1)	1
  (0, 4)	1
  (0, 9)	1
  (0, 10)	1


### The word "moon" does not appear in the original vocabulary, so it is still absent from the vocabulary even after transforming the new sentence

In [6]:
# Check whether the fitted vocabulary has an entry for the unseen word
moon_in_vocabulary = 'moon' in v.vocabulary_
print(moon_in_vocabulary)

False


### Generating the Tfidf Matrix

In [7]:

# norm=None switches off row normalisation, so the printed weights are the
# unnormalised tf-idf values
vv = TfidfVectorizer(norm=None)
tfidf = vv.fit_transform(sent)

# Vocabulary as (word, id) pairs ordered by id, matching the matrix columns
tfidf_vocab_by_id = sorted(vv.vocabulary_.items(), key=lambda word_and_id: word_and_id[1])
print(tfidf_vocab_by_id)

# One printed list per sentence, every weight formatted to four decimals
tfidf_dense = tfidf.toarray()
for doc_weights in tfidf_dense:
    formatted_weights = ["%.4f" % weight for weight in doc_weights]
    print(formatted_weights)

[('blue', 0), ('bright', 1), ('can', 2), ('in', 3), ('is', 4), ('see', 5), ('shining', 6), ('sky', 7), ('sun', 8), ('the', 9), ('today', 10), ('we', 11)]
['1.9163', '0.0000', '0.0000', '0.0000', '1.2231', '0.0000', '0.0000', '1.5108', '0.0000', '1.0000', '0.0000', '0.0000']
['0.0000', '1.2231', '0.0000', '0.0000', '1.2231', '0.0000', '0.0000', '0.0000', '1.2231', '1.0000', '1.9163', '0.0000']
['0.0000', '1.2231', '0.0000', '1.9163', '1.2231', '0.0000', '0.0000', '1.5108', '1.2231', '2.0000', '0.0000', '0.0000']
['0.0000', '1.2231', '1.9163', '0.0000', '0.0000', '1.9163', '1.9163', '0.0000', '2.4463', '2.0000', '0.0000', '1.9163']


## (Optional Task) Build the TF-IDF Matrix for the Gutenberg corpus

In [8]:
### Write code Here