### Fit and transform separately

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
from pandas import DataFrame

text_data = ['I am starting a new deep learning course today',
             'Deep learning is very interesting',
             'I really need some deep sleep'
    ]

vec = CountVectorizer()
vec = vec.fit(text_data)

data_transformed = vec.transform(text_data)

df = DataFrame(data_transformed.toarray(), columns = vec.vocabulary_)
df

Unnamed: 0,am,starting,new,deep,learning,course,today,is,very,interesting,really,need,some,sleep
0,1,1,1,0,0,1,0,1,0,0,0,1,1,0
1,0,0,1,1,1,1,0,0,0,0,0,0,0,1
2,0,0,1,0,0,0,1,0,1,1,1,0,0,0


### Fit and transform combined

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
from pandas import DataFrame

text_data = ['I am starting a new deep learning course today',
             'Deep learning is very interesting',
             'I really need some deep sleep'
    ]

vec = CountVectorizer()

data_transformed = vec.fit_transform(text_data)

df = DataFrame(data_transformed.toarray(), columns = vec.vocabulary_)
df

Unnamed: 0,am,starting,new,deep,learning,course,today,is,very,interesting,really,need,some,sleep
0,1,1,1,0,0,1,0,1,0,0,0,1,1,0
1,0,0,1,1,1,1,0,0,0,0,0,0,0,1
2,0,0,1,0,0,0,1,0,1,1,1,0,0,0


### Dealing with stopwords

In [6]:
from sklearn.feature_extraction.text import CountVectorizer
from pandas import DataFrame

text_data = ['I am starting a new deep learning course today',
             'Deep learning is very interesting',
             'I really need some deep sleep'
    ]

vec = CountVectorizer(stop_words = 'english')

data_transformed = vec.fit_transform(text_data)

df = DataFrame(data_transformed.toarray(), columns = vec.vocabulary_)
df

Unnamed: 0,starting,new,deep,learning,course,today,interesting,really,need,sleep
0,1,1,0,1,0,1,0,0,1,1
1,0,1,1,1,0,0,0,0,0,0
2,0,1,0,0,1,0,1,1,0,0


### Dealing with N-gram

In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from pandas import DataFrame

text_data = ['I am starting a new deep learning course today',
             'Deep learning is very interesting',
             'I really need some deep sleep'
    ]

vec = CountVectorizer(ngram_range = (1,3))

data_transformed = vec.fit_transform(text_data)

df = DataFrame(data_transformed.toarray(), columns = vec.vocabulary_)
df

Unnamed: 0,am,starting,new,deep,learning,course,today,am starting,starting new,new deep,...,need,some,sleep,really need,need some,some deep,deep sleep,really need some,need some deep,some deep sleep
0,1,1,1,1,1,1,1,1,0,0,...,0,0,0,0,1,1,1,1,0,0
1,0,0,0,0,0,1,1,0,1,0,...,0,0,0,0,0,0,0,0,1,1
2,0,0,0,0,0,1,0,0,0,1,...,1,1,1,1,0,0,0,0,0,0


### Count vectorizer with data preprocessing

In [4]:
from sklearn.feature_extraction.text import CountVectorizer
from pandas import DataFrame

text_data = ['I am starting a new deep learning course today',
             'Deep learning is very interesting',
             'I really need some deep sleep'
    ]

vec = CountVectorizer(ngram_range = (2,2), stop_words = 'english', lowercase = True)

data_transformed = vec.fit_transform(text_data)

df = DataFrame(data_transformed.toarray(), columns = vec.vocabulary_)
df

Unnamed: 0,starting new,new deep,deep learning,learning course,course today,learning interesting,really need,need deep,deep sleep
0,1,1,0,1,0,0,1,0,1
1,0,1,0,0,1,0,0,0,0
2,0,0,1,0,0,1,0,1,0
