## N-grams: An n-gram is a contiguous sequence of n items (here, words) taken from a given text.

In [1]:
import nltk
from nltk.util import ngrams
from collections import Counter

In [6]:
# Download the 'punkt' tokenizer data into the local nltk_data directory.
# Presumably needed by nltk.word_tokenize used below; NOTE(review): newer NLTK
# releases may ask for 'punkt_tab' instead -- confirm for the installed version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/rahul.b.sarkar/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
#Build the list of n-grams found in a piece of text
def get_ngrams(txt, n):
    """Return the n-grams of ``txt`` as space-joined strings.

    ``txt`` is split into tokens with ``nltk.word_tokenize``; every run of
    ``n`` consecutive tokens produced by ``nltk.util.ngrams`` becomes one
    string in the returned list, in order of appearance.
    """
    tokens = nltk.word_tokenize(txt)
    return list(map(' '.join, ngrams(tokens, n)))

In [4]:
# Sample sentence used to demonstrate get_ngrams
txt = "This is a course of Natural Language Processing"

In [7]:
# Show the n-grams of the sample sentence for n = 1 through 4
for size, label in enumerate(("1-gram:", "2-gram:", "3-gram:", "4-gram:"), start=1):
    print(label, get_ngrams(txt, size))

1-gram: ['This', 'is', 'a', 'course', 'of', 'Natural', 'Language', 'Processing']
2-gram: ['This is', 'is a', 'a course', 'course of', 'of Natural', 'Natural Language', 'Language Processing']
3-gram: ['This is a', 'is a course', 'a course of', 'course of Natural', 'of Natural Language', 'Natural Language Processing']
4-gram: ['This is a course', 'is a course of', 'a course of Natural', 'course of Natural Language', 'of Natural Language Processing']


### Note: A 1-gram is called a unigram; 2-grams and 3-grams are called bigrams and trigrams, respectively.

In [8]:
import string

def remove_punctuations(text):
    """Return ``text`` with punctuation-only tokens removed.

    The text is split with ``nltk.word_tokenize`` and the surviving tokens
    are re-joined with single spaces. A token is dropped when every
    character in it is listed in ``string.punctuation``, so multi-character
    tokens such as ``...`` or ``--`` are removed as well.
    """
    # Compare per character against a set: a plain ``token in
    # string.punctuation`` is a substring test, which lets repeated-character
    # tokens such as "..." slip through.
    punctuation = set(string.punctuation)
    words = nltk.word_tokenize(text)
    kept = [w for w in words if not all(ch in punctuation for ch in w)]
    return " ".join(kept)

In [9]:
# Two-sentence sample whose repeated phrases yield bigrams with counts above 1
text = (
    "Python skills, and Numpy skills are equally important for data analysis. "
    "Python skills, and linear algebra are important for machine learning algorithms"
)

In [10]:
# Strip punctuation tokens so they do not end up inside the bigrams
text = remove_punctuations(text)

In [11]:
# Extract bigrams (n = 2) from the cleaned text as space-joined strings
bigrams = get_ngrams(text, 2)

In [12]:
# Count how many times each bigram string occurs
bigrams_count = Counter(bigrams)

In [13]:
import pandas as pd
# One row per bigram: the Counter keys become the index, the counts land in column 0
df = pd.DataFrame.from_dict(bigrams_count, orient='index')

In [14]:
# Label the single count column (0) as 'frequency'. The bigram strings stay as
# the unnamed index: there is no column called 'index', so mapping it had no effect.
df = df.rename(columns={0: 'frequency'})

In [15]:
# Display the bigram frequency table
print(df)

                     frequency
Python skills                2
skills and                   2
and Numpy                    1
Numpy skills                 1
skills are                   1
are equally                  1
equally important            1
important for                2
for data                     1
data analysis                1
analysis Python              1
and linear                   1
linear algebra               1
algebra are                  1
are important                1
for machine                  1
machine learning             1
learning algorithms          1


## Reference:

http://www.nltk.org/api/nltk.html#nltk.util.ngrams