### N-gram using NLTK

In [2]:
import nltk
from nltk.util import ngrams
 
# Function to generate n-grams from sentences.
def extract_ngrams(data, num):
    """Return the word-level n-grams of ``data`` as space-joined strings.

    Args:
        data: Text to split into tokens with ``nltk.word_tokenize``.
        num: Number of consecutive tokens in each n-gram.

    Returns:
        A list of strings, one per n-gram, in the order they occur in
        ``data``; the tokens of each n-gram are joined by a single space.
    """
    tokens = nltk.word_tokenize(data)
    joined = []
    for gram in ngrams(tokens, num):
        joined.append(' '.join(gram))
    return joined
 
data = 'A class is a blueprint for the object.'

# Print the n-grams of the sample sentence for n = 1 through 4.
for size in range(1, 5):
    print(f"{size}-gram: ", extract_ngrams(data, size))

1-gram:  ['A', 'class', 'is', 'a', 'blueprint', 'for', 'the', 'object', '.']
2-gram:  ['A class', 'class is', 'is a', 'a blueprint', 'blueprint for', 'for the', 'the object', 'object .']
3-gram:  ['A class is', 'class is a', 'is a blueprint', 'a blueprint for', 'blueprint for the', 'for the object', 'the object .']
4-gram:  ['A class is a', 'class is a blueprint', 'is a blueprint for', 'a blueprint for the', 'blueprint for the object', 'for the object .']


### N-gram using TextBlob

In [3]:
from textblob import TextBlob
 
# Function to generate n-grams from sentences.
def extract_ngrams(data, num):
    """Return the n-grams of ``data`` as space-joined strings, via TextBlob.

    Args:
        data: Text to wrap in a ``TextBlob``.
        num: Number of consecutive words in each n-gram.

    Returns:
        A list of strings, one per n-gram produced by ``TextBlob.ngrams``,
        with the words of each n-gram joined by a single space.
    """
    blob = TextBlob(data)
    return list(map(' '.join, blob.ngrams(num)))
 
data = 'A class is a blueprint for the object.'

# Print the n-grams of the sample sentence for each size from 1 to 4.
for size in (1, 2, 3, 4):
    print("%d-gram: " % size, extract_ngrams(data, size))

1-gram:  ['A', 'class', 'is', 'a', 'blueprint', 'for', 'the', 'object']
2-gram:  ['A class', 'class is', 'is a', 'a blueprint', 'blueprint for', 'for the', 'the object']
3-gram:  ['A class is', 'class is a', 'is a blueprint', 'a blueprint for', 'blueprint for the', 'for the object']
4-gram:  ['A class is a', 'class is a blueprint', 'is a blueprint for', 'a blueprint for the', 'blueprint for the object']


### CountVectorizer

In [4]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

In [5]:
# Read the product table from 'nutrition.csv', resolved relative to the
# notebook's working directory.
df = pd.read_csv('nutrition.csv')
# Bare last expression so the notebook renders the frame.
df

Unnamed: 0,product_name,product_description
0,Whey Protein Isolate 90,What is Whey Protein Isolate? Whey Protein Iso...
1,Whey Protein 80,What is Whey Protein 80? Whey Protein 80 is an...
2,Volt Preworkout™,What is Volt™? Our Volt pre workout formula in...


### Raw Information

In [15]:
# Unigram counts over the product names, with no stop-word filtering.
text = df['product_name']
model = CountVectorizer(ngram_range = (1, 1))
# Densify the sparse document-term matrix so it can be shown as a DataFrame.
matrix = model.fit_transform(text).toarray()
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (available since 1.0) returns the same vocabulary in column order.
df_output = pd.DataFrame(data = matrix, columns = model.get_feature_names_out())
# Transpose so each row is a term and each column a document; show the last 5 terms.
df_output.T.tail(5)



Unnamed: 0,0,1,2
isolate,1,0,0
preworkout,0,0,1
protein,1,1,0
volt,0,0,1
whey,1,1,0


In [9]:
# Display the document-term count matrix (one row per document, one column per term).
# NOTE(review): the saved output of this cell lists terms that do not occur in
# the product names selected in the preceding cell, and the execution counts
# are out of order — presumably stale; re-run the notebook top to bottom to confirm.
df_output

Unnamed: 0,100,11,12,14,1400,150,17g,19,1g,20,...,work,worked,working,workout,world,would,you,your,zinc,zma
0,0,1,0,0,0,1,2,1,1,1,...,0,0,0,2,0,0,10,8,0,0
1,2,0,0,1,0,0,0,0,1,1,...,2,0,0,4,2,0,7,6,0,0
2,0,0,2,0,1,0,0,0,1,0,...,3,1,3,18,0,1,7,7,1,1


### Remove stop words

In [10]:
# Unigram counts over the product descriptions, dropping scikit-learn's
# built-in English stop words.
text = df['product_description']
model = CountVectorizer(ngram_range = (1, 1), stop_words='english')
# Densify the sparse document-term matrix so it can be shown as a DataFrame.
matrix = model.fit_transform(text).toarray()
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (available since 1.0) returns the same vocabulary in column order.
df_output = pd.DataFrame(data = matrix, columns = model.get_feature_names_out())
# Transpose so each row is a term and each column a document; show the last 5 terms.
df_output.T.tail(5)



Unnamed: 0,0,1,2
working,0,0,3
workout,2,4,18
world,0,2,0
zinc,0,0,1
zma,0,0,1


In [11]:
# Display the document-term count matrix (one row per document, one column per term).
df_output

Unnamed: 0,100,11,12,14,1400,150,17g,19,1g,20,...,whilst,whipped,women,work,worked,working,workout,world,zinc,zma
0,0,1,0,0,0,1,2,1,1,1,...,0,1,0,0,0,0,2,0,0,0
1,2,0,0,1,0,0,0,0,1,1,...,1,0,0,2,0,0,4,2,0,0
2,0,0,2,0,1,0,0,0,1,0,...,1,0,1,3,1,3,18,0,1,1


### Bi-gram

In [17]:
# Bigram counts over the product names, dropping scikit-learn's built-in
# English stop words.
text = df['product_name']
model = CountVectorizer(ngram_range = (2, 2), stop_words='english')
# Densify the sparse document-term matrix so it can be shown as a DataFrame.
matrix = model.fit_transform(text).toarray()
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (available since 1.0) returns the same vocabulary in column order.
df_output = pd.DataFrame(data = matrix, columns = model.get_feature_names_out())
# Transpose so each row is a bigram and each column a document.
df_output.T



Unnamed: 0,0,1,2
isolate 90,1,0,0
protein 80,0,1,0
protein isolate,1,0,0
volt preworkout,0,0,1
whey protein,1,1,0
