**Download necessary files**

In [1]:
import nltk
# Download NLTK's 'punkt' tokenizer data; the sentence/word tokenization cells
# further down call nltk.sent_tokenize and nltk.word_tokenize.
# NOTE(review): newer NLTK releases reportedly look for 'punkt_tab' instead —
# confirm against the installed NLTK version.
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/subashgandyer/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

**Import the necessary libraries**

In [2]:
import numpy as np
import random
import string

**Import libraries for scraping - BeautifulSoup**

In [3]:
import bs4 as bs
import urllib.request
import re

**Read the web link**

In [4]:
# Fetch the raw HTML bytes of the Wikipedia article.  The response is used as a
# context manager so the underlying HTTP connection is closed as soon as the
# body has been read, instead of being left open for the life of the kernel.
with urllib.request.urlopen('https://en.wikipedia.org/wiki/Natural_language_processing') as response:
    raw_html = response.read()

**Parse the downloaded HTML into a BeautifulSoup object**

In [5]:
# Parse the downloaded bytes into a BeautifulSoup document using the 'lxml' parser.
article_html = bs.BeautifulSoup(raw_html, 'lxml')

**Find only the HTML elements inside `<p>` tags**

In [6]:
# Collect every <p> element of the parsed page; their text is joined in the next cell.
article_paragraphs = article_html.find_all('p')

**Read all the text inside paragraphs**

In [7]:
# Concatenate the text content of every paragraph into a single string
# (no separator is inserted between consecutive paragraphs).
article_text = ''.join(para.text for para in article_paragraphs)

article_text

'Natural language processing (NLP) is a subfield of linguistics, computer science, and artificial intelligence concerned with the interactions between computers and human language, in particular how to program computers to process and analyze large amounts of natural language data.  The result is a computer capable of "understanding" the contents of documents, including the contextual nuances of the language within them. The technology can then accurately extract information and insights contained in the documents as well as categorize and organize the documents themselves. \nChallenges in natural language processing frequently involve speech recognition, natural language understanding, and natural-language generation.\nNatural language processing has its roots in the 1950s. Already in 1950, Alan Turing published an article titled "Computing Machinery and Intelligence" which proposed what is now called the Turing test as a criterion of intelligence, a task that involves the automated i

**Split the text into sentences**

In [8]:
# Split the article text into sentences; each sentence is treated as one
# "document" by the IDF / TF cells later in the notebook.
corpus = nltk.sent_tokenize(article_text)
corpus

['Natural language processing (NLP) is a subfield of linguistics, computer science, and artificial intelligence concerned with the interactions between computers and human language, in particular how to program computers to process and analyze large amounts of natural language data.',
 'The result is a computer capable of "understanding" the contents of documents, including the contextual nuances of the language within them.',
 'The technology can then accurately extract information and insights contained in the documents as well as categorize and organize the documents themselves.',
 'Challenges in natural language processing frequently involve speech recognition, natural language understanding, and natural-language generation.',
 'Natural language processing has its roots in the 1950s.',
 'Already in 1950, Alan Turing published an article titled "Computing Machinery and Intelligence" which proposed what is now called the Turing test as a criterion of intelligence, a task that involve

**Clean the text**

In [9]:
# Normalise each sentence in place: lowercase it, turn every non-word
# character into a space, then squeeze whitespace runs down to one space.
corpus[:] = [
    re.sub(r'\s+', ' ', re.sub(r'\W', ' ', sentence.lower()))
    for sentence in corpus
]

**Create a dictionary of word frequency**

In [10]:
# Tally how many times each token occurs across all cleaned sentences.
wordfreq = {}
for sentence in corpus:
    for token in nltk.word_tokenize(sentence):
        wordfreq[token] = wordfreq.get(token, 0) + 1

**Filter the top 200 words**

In [11]:
import heapq
# Vocabulary for the model: the 200 tokens with the highest counts in
# wordfreq, ordered from most to least frequent.
most_freq = heapq.nlargest(200, wordfreq, key=wordfreq.get)

**Compute IDF values**

In [12]:
# Tokenize every sentence exactly once up front.  Only membership matters for
# the document frequency, so each sentence is stored as a set of its tokens;
# this avoids re-tokenizing the whole corpus for every vocabulary word.
doc_token_sets = [set(nltk.word_tokenize(document)) for document in corpus]

# idf(token) = ln(N / (1 + df)), where N is the number of sentences and df is
# the number of sentences containing the token.  The +1 keeps the denominator
# non-zero; note the value is <= 0 for a token found in at least N-1 sentences.
word_idf_values = {}
for token in most_freq:
    doc_containing_word = sum(token in doc_tokens for doc_tokens in doc_token_sets)
    word_idf_values[token] = np.log(len(corpus) / (1 + doc_containing_word))

**Compute TF values**

In [13]:
# Tokenize every sentence exactly once; the token lists are reused for every
# vocabulary word instead of being recomputed twice per (word, sentence) pair.
tokenized_docs = [nltk.word_tokenize(document) for document in corpus]

# word_tf_values[token] is a list with one entry per sentence:
#   tf = (occurrences of token in the sentence) / (number of tokens in the sentence)
# A sentence that tokenizes to nothing (e.g. it was only punctuation before
# cleaning) gets tf = 0.0 rather than raising ZeroDivisionError.
word_tf_values = {}
for token in most_freq:
    word_tf_values[token] = [
        doc_tokens.count(token) / len(doc_tokens) if doc_tokens else 0.0
        for doc_tokens in tokenized_docs
    ]

**Compute TF-IDF values**

In [14]:
# Scale each token's per-sentence TF values by that token's IDF.
# Outer list: one entry per token (in word_tf_values order);
# inner list: one TF-IDF score per sentence.
tfidf_values = [
    [tf * word_idf_values[token] for tf in tf_per_sentence]
    for token, tf_per_sentence in word_tf_values.items()
]

In [15]:
# Convert the nested list to an ndarray; at this point each row corresponds to
# a token and each column to a sentence.
tf_idf_model = np.asarray(tfidf_values)

In [16]:
# Put sentences on the rows and tokens on the columns.  The matrix is rebuilt
# from tfidf_values rather than transposing tf_idf_model onto itself, so
# re-running this cell cannot flip the orientation back.
tf_idf_model = np.asarray(tfidf_values).T

In [17]:
# Display the TF-IDF matrix (NumPy abbreviates large arrays with "...").
tf_idf_model

array([[0.00830223, 0.01511793, 0.0583927 , ..., 0.        , 0.        ,
        0.        ],
       [0.06167373, 0.04211422, 0.03614786, ..., 0.        , 0.        ,
        0.        ],
       [0.04415278, 0.        , 0.        , ..., 0.14350002, 0.14350002,
        0.14350002],
       ...,
       [0.0140777 , 0.02563474, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.01734115, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [18]:
import pandas as pd
# Show the matrix as a table with one column per vocabulary token; the columns
# line up with most_freq because the TF / TF-IDF cells iterate tokens in that order.
pd.DataFrame(tf_idf_model, columns=most_freq)

Unnamed: 0,the,of,language,to,and,in,a,natural,processing,nlp,...,capable,contents,including,contextual,nuances,within,them,technology,accurately,extract
0,0.008302,0.015118,0.058393,0.034466,0.065724,0.021908,0.023223,0.05215,0.02763,0.031054,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.061674,0.042114,0.036148,0.0,0.0,0.0,0.043129,0.0,0.0,0.0,...,0.150333,0.150333,0.150333,0.150333,0.150333,0.150333,0.150333,0.0,0.0,0.0
2,0.044153,0.0,0.0,0.0,0.077674,0.038837,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.1435,0.1435,0.1435
3,0.0,0.0,0.142332,0.0,0.053401,0.053401,0.0,0.190675,0.067347,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.035976,0.0,0.084345,0.0,0.0,0.094935,0.0,0.112993,0.119729,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.01868,0.011338,0.014598,0.0,0.032862,0.016431,0.052252,0.019556,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.019046,0.011561,0.014884,0.013178,0.016753,0.0,0.035518,0.01994,0.0,0.047494,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.017988,0.016378,0.042173,0.037339,0.0,0.0,0.0,0.056496,0.059864,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.026982,0.012283,0.063259,0.0,0.0,0.071201,0.037738,0.042372,0.089797,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.033495,0.030497,0.0,0.023176,0.029463,0.029463,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
