## Bag of words

In [31]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import nltk

In [4]:
# Three short example documents that share most of their vocabulary.
doc1 = 'Game of Thrones is an amazing tv series!'
doc2 = 'Game of Thrones is the best tv series!'
doc3 = 'Game of Thrones is so great'

# Learn the vocabulary and build the sparse document-term count matrix.
vectorizer = CountVectorizer()
x = vectorizer.fit_transform([doc1, doc2, doc3])

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement (and is what the TF-IDF cells use).
# It returns an ndarray, so .tolist() keeps the plain-list display.
vectorizer.get_feature_names_out().tolist()



['amazing',
 'an',
 'best',
 'game',
 'great',
 'is',
 'of',
 'series',
 'so',
 'the',
 'thrones',
 'tv']

In [5]:
# Dense view of the count matrix: one row per document, one column per vocabulary term.
x.toarray()

array([[1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1],
       [0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1],
       [0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0]], dtype=int64)

In [6]:
# Label each count column with its vocabulary term.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is the supported accessor.
bow_df = pd.DataFrame(x.toarray(), columns=vectorizer.get_feature_names_out())
bow_df

Unnamed: 0,amazing,an,best,game,great,is,of,series,so,the,thrones,tv
0,1,1,0,1,0,1,1,1,0,0,1,1
1,0,0,1,1,0,1,1,1,0,1,1,1
2,0,0,0,1,1,1,1,0,1,0,1,0


In [7]:
# Same bag-of-words counts, but drop scikit-learn's built-in English stop words first.
# NOTE: this rebinds `vectorizer` and `x` from the earlier cell.
vectorizer = CountVectorizer(stop_words='english')
x = vectorizer.fit_transform([doc1, doc2, doc3])
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
pd.DataFrame(x.toarray(), columns=vectorizer.get_feature_names_out())



Unnamed: 0,amazing,best,game,great,series,thrones,tv
0,1,0,1,0,1,1,1
1,0,1,1,0,1,1,1
2,0,0,1,1,0,1,0


In [8]:
# Count bigrams only: ngram_range=(2, 2) sets both the minimum and maximum n to 2.
# NOTE: this rebinds `vectorizer` and `x` from the earlier cells.
vectorizer = CountVectorizer(ngram_range=(2, 2))
x = vectorizer.fit_transform([doc1, doc2, doc3])
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
pd.DataFrame(x.toarray(), columns=vectorizer.get_feature_names_out())



Unnamed: 0,amazing tv,an amazing,best tv,game of,is an,is so,is the,of thrones,so great,the best,thrones is,tv series
0,1,1,0,1,1,0,0,1,0,0,1,1
1,0,0,1,1,0,0,1,1,0,1,1,1
2,0,0,0,1,0,1,0,1,1,0,1,0


## TF-IDF(term frequency-inverse document frequency)

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

### example 1

In [12]:
# Toy corpus for the TF-IDF example: five short sentences, one string per document.
docs = [
    "the house had a tiny little mouse",
    "the cat saw the mouse",
    "the mouse ran away from the house",
    "the cat finally ate the mouse",
    "the end of the mouse story",
]

In [13]:
# Fit a TF-IDF vectorizer with default settings on the toy corpus.
# NOTE: this rebinds `vectorizer`, previously a CountVectorizer.
vectorizer = TfidfVectorizer()
# One row per sentence in `docs`, one column per vocabulary term.
sentence_vectors = vectorizer.fit_transform(docs)

In [14]:
# Vocabulary learned by the TF-IDF vectorizer; used below as the column labels of the matrix.
vectorizer.get_feature_names_out()

array(['ate', 'away', 'cat', 'end', 'finally', 'from', 'had', 'house',
       'little', 'mouse', 'of', 'ran', 'saw', 'story', 'the', 'tiny'],
      dtype=object)

In [15]:
# Dense view of the TF-IDF matrix (one row per sentence in `docs`).
sentence_vectors.toarray()

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.49356209, 0.39820278, 0.49356209, 0.23518498,
        0.        , 0.        , 0.        , 0.        , 0.23518498,
        0.49356209],
       [0.        , 0.        , 0.48334378, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.28547062,
        0.        , 0.        , 0.59909216, 0.        , 0.57094124,
        0.        ],
       [0.        , 0.45709287, 0.        , 0.        , 0.        ,
        0.45709287, 0.        , 0.36877965, 0.        , 0.2178072 ,
        0.        , 0.45709287, 0.        , 0.        , 0.43561441,
        0.        ],
       [0.51392301, 0.        , 0.41462985, 0.        , 0.51392301,
        0.        , 0.        , 0.        , 0.        , 0.24488707,
        0.        , 0.        , 0.        , 0.        , 0.48977413,
        0.        ],
       [0.        , 0.        , 0.        , 0.49175319, 0.        ,
        0.        , 0.        , 

In [19]:
# TF-IDF row for the first sentence only.
first_vector = sentence_vectors[0]
# Transpose the row into a column so each vocabulary term becomes a row label,
# then put the weights in a single-column data frame.
df = pd.DataFrame(
    data=first_vector.T.toarray(),
    index=vectorizer.get_feature_names_out(),
    columns=["tfidf"],
)
# Highest-weighted terms first.
df.sort_values(by="tfidf", ascending=False)

Unnamed: 0,tfidf
had,0.493562
little,0.493562
tiny,0.493562
house,0.398203
mouse,0.235185
the,0.235185
ate,0.0
away,0.0
cat,0.0
end,0.0


In [22]:
# Full TF-IDF matrix as a labelled frame: rows are sentences, columns are vocabulary terms.
pd.DataFrame(sentence_vectors.toarray(),columns=vectorizer.get_feature_names_out())

Unnamed: 0,ate,away,cat,end,finally,from,had,house,little,mouse,of,ran,saw,story,the,tiny
0,0.0,0.0,0.0,0.0,0.0,0.0,0.493562,0.398203,0.493562,0.235185,0.0,0.0,0.0,0.0,0.235185,0.493562
1,0.0,0.0,0.483344,0.0,0.0,0.0,0.0,0.0,0.0,0.285471,0.0,0.0,0.599092,0.0,0.570941,0.0
2,0.0,0.457093,0.0,0.0,0.0,0.457093,0.0,0.36878,0.0,0.217807,0.0,0.457093,0.0,0.0,0.435614,0.0
3,0.513923,0.0,0.41463,0.0,0.513923,0.0,0.0,0.0,0.0,0.244887,0.0,0.0,0.0,0.0,0.489774,0.0
4,0.0,0.0,0.0,0.491753,0.0,0.0,0.0,0.0,0.0,0.234323,0.491753,0.0,0.0,0.491753,0.468646,0.0


##  Word2Vec

In [27]:
pip install gensim

Collecting gensim
  Downloading gensim-4.2.0-cp39-cp39-win_amd64.whl (23.9 MB)
Collecting smart-open>=1.8.1
  Downloading smart_open-6.2.0-py3-none-any.whl (58 kB)
Collecting Cython==0.29.28
  Downloading Cython-0.29.28-py2.py3-none-any.whl (983 kB)
Installing collected packages: smart-open, Cython, gensim
  Attempting uninstall: Cython
    Found existing installation: Cython 0.29.24
    Uninstalling Cython-0.29.24:
      Successfully uninstalled Cython-0.29.24
Successfully installed Cython-0.29.28 gensim-4.2.0 smart-open-6.2.0
Note: you may need to restart the kernel to use updated packages.


In [28]:
import gensim
from gensim.models import Word2Vec
from nltk.corpus import brown,movie_reviews,treebank

In [32]:
# Fetch the Brown corpus used for Word2Vec training below.
# The call is a no-op when the corpus is already installed and up to date.
nltk.download('brown')

[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\brown.zip.


True

In [35]:
# Fetch the movie_reviews corpus used for Word2Vec training below.
# The call is a no-op when the corpus is already installed and up to date.
nltk.download('movie_reviews')

[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\movie_reviews.zip.


True

In [36]:
# Fetch the treebank corpus sample used for Word2Vec training below.
# The call is a no-op when the corpus is already installed and up to date.
nltk.download('treebank')

[nltk_data] Downloading package treebank to
[nltk_data]     C:\Users\Lenovo\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\treebank.zip.


True

In [None]:
# Train one Word2Vec model per NLTK corpus with default hyper-parameters and keep
# only the word vectors (`.wv`); the full model objects are discarded.
brown_corpus = Word2Vec(sentences=brown.sents()).wv
movie_review_corpus = Word2Vec(sentences=movie_reviews.sents()).wv
tree_corpus = Word2Vec(sentences=treebank.sents()).wv

In [None]:
# List the words that the Brown-trained vectors cover.
brown_corpus.index_to_key  # get the vocabulary

In [None]:
movi