# Bag of Words (Document-Term Matrix)

In [1]:
import nltk
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

In [2]:
# Sample text used throughout the notebook. Adjacent string literals inside
# the parentheses are concatenated into one single string.
paragraph = (
    "Gross domestic products GDP is a monetary measure of the market value "
    "of all the final goods and services produced in a specific time period, "
    "often annually. "
    "GDP nominal per capita does not, however, reflect differences in the "
    "cost of living and the inflation rates of the countries; therefore "
    "using a basis of GDP per capita at purchasing power parity PPP is "
    "arguably more useful when comparing differences in living standards "
    "between nations."
)

In [3]:
# Porter stemmer instance; used below to stem tokens when building corpus1.
ps = PorterStemmer()

In [4]:
# WordNet lemmatizer instance; used below to lemmatize tokens when building corpus2.
wordnet = WordNetLemmatizer()

In [5]:
# Split the paragraph into sentences; each sentence is treated as one document below.
sentences = nltk.sent_tokenize(paragraph)

In [6]:
# Inspect the sentence split.
sentences

['Gross domestic products GDP is a monetary measure of the market value of all the final goods and services produced in a specific time period, often annually.',
 'GDP nominal per capita does not, however, reflect differences in the cost of living and the inflation rates of the countries; therefore using a basis of GDP per capita at purchasing power parity PPP is arguably more useful when comparing differences in living standards between nations.']

In [7]:
# Bag of words with stemming

In [8]:
# Stemmed corpus: receives one cleaned, stemmed string per sentence.
corpus1 = []

In [9]:
# Build the stemmed corpus: one cleaned, space-joined string per sentence.

# Build the English stop-word set once, outside the loop, instead of once
# per token; it does not change between iterations.
stop_words = set(stopwords.words('english'))

# NOTE: this appends to corpus1, so re-run the `corpus1 = []` cell before
# re-executing this one, otherwise every sentence is added a second time.
for sentence in sentences:
    # Replace every non-letter (digits, punctuation) with a space, then
    # lowercase and split on whitespace to get the raw tokens.
    tokens = re.sub('[^a-zA-Z]', ' ', sentence).lower().split()
    # Drop stop words first, then reduce each remaining token to its stem.
    stems = [ps.stem(token) for token in tokens if token not in stop_words]
    corpus1.append(' '.join(stems))
    
    

In [10]:
# Inspect the stemmed corpus (one string per sentence).
corpus1

[u'gross domest product gdp monetari measur market valu final good servic produc specif time period often annual',
 u'gdp nomin per capita howev reflect differ cost live inflat rate countri therefor use basi gdp per capita purchas power pariti ppp arguabl use compar differ live standard nation']

In [11]:
from sklearn.feature_extraction.text import CountVectorizer

In [12]:
# Bag-of-words counts for the stemmed corpus, with the vocabulary capped at 20 terms.
cv = CountVectorizer(max_features = 20)
# Fit the vocabulary on corpus1 and convert the resulting matrix to a dense array.
X = cv.fit_transform(corpus1).toarray()


In [13]:
# Show the dense count matrix: one row per sentence, one column per retained term.
X

array([[1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0],
       [0, 2, 2, 2, 2, 2, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 2]], dtype=int64)

In [14]:
# Bag of words with lemmatization


In [15]:
# Lemmatized corpus: receives one cleaned, lemmatized string per sentence.
corpus2 = []

In [16]:
# Build the lemmatized corpus: one cleaned, space-joined string per sentence.

# Build the English stop-word set once, outside the loop, instead of once
# per token; it does not change between iterations.
stop_words = set(stopwords.words('english'))

# NOTE: this appends to corpus2, so re-run the `corpus2 = []` cell before
# re-executing this one, otherwise every sentence is added a second time.
for sentence in sentences:
    # Replace every non-letter (digits, punctuation) with a space, then
    # lowercase and split on whitespace to get the raw tokens.
    tokens = re.sub('[^a-zA-Z]', ' ', sentence).lower().split()
    # Drop stop words first, then lemmatize each remaining token.
    lemmas = [wordnet.lemmatize(token) for token in tokens if token not in stop_words]
    corpus2.append(' '.join(lemmas))

In [17]:
# Inspect the lemmatized corpus (one string per sentence).
corpus2

[u'gross domestic product gdp monetary measure market value final good service produced specific time period often annually',
 u'gdp nominal per caput however reflect difference cost living inflation rate country therefore using basis gdp per caput purchasing power parity ppp arguably useful comparing difference living standard nation']

In [18]:
from sklearn.feature_extraction.text import CountVectorizer

In [19]:
# Bag-of-words counts for the lemmatized corpus, with the vocabulary capped at 20 terms.
cv1 = CountVectorizer(max_features = 20)
# Fit the vocabulary on corpus2 and convert the resulting matrix to a dense array.
Y = cv1.fit_transform(corpus2).toarray()


In [20]:
# Show the dense count matrix: one row per sentence, one column per retained term.
Y

array([[1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0],
       [0, 2, 2, 2, 2, 2, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1]], dtype=int64)

In [21]:
# TF-IDF on the lemmatized corpus

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [23]:
# TF-IDF weighting of the lemmatized corpus, with the vocabulary capped at 20 terms.
TFidf = TfidfVectorizer(max_features = 20)
# Fit on corpus2 and convert the resulting matrix to a dense array of weights.
TfidfMatrix = TFidf.fit_transform(corpus2).toarray()


In [24]:
# Show the dense TF-IDF matrix: one row per sentence, one column per retained term.
TfidfMatrix

array([[ 0.36499647,  0.        ,  0.        ,  0.25969799,  0.        ,
         0.        ,  0.36499647,  0.        ,  0.        ,  0.36499647,
         0.36499647,  0.        ,  0.        ,  0.        ,  0.36499647,
         0.36499647,  0.        ,  0.        ,  0.36499647,  0.        ],
       [ 0.        ,  0.39204401,  0.39204401,  0.27894255,  0.39204401,
         0.39204401,  0.        ,  0.19602201,  0.19602201,  0.        ,
         0.        ,  0.19602201,  0.19602201,  0.19602201,  0.        ,
         0.        ,  0.19602201,  0.19602201,  0.        ,  0.19602201]])

In [25]:
import pandas as pd

In [26]:
# Wrap the TF-IDF array in a DataFrame for tabular display. No column labels
# are passed, so the columns are integer positions rather than the terms.
Data = pd.DataFrame(TfidfMatrix)

In [27]:
# Render the TF-IDF table: rows are sentences, columns are positional feature indices.
Data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,0.364996,0.0,0.0,0.259698,0.0,0.0,0.364996,0.0,0.0,0.364996,0.364996,0.0,0.0,0.0,0.364996,0.364996,0.0,0.0,0.364996,0.0
1,0.0,0.392044,0.392044,0.278943,0.392044,0.392044,0.0,0.196022,0.196022,0.0,0.0,0.196022,0.196022,0.196022,0.0,0.0,0.196022,0.196022,0.0,0.196022
