In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


###### Importing libraries, getting the list of commonly used English words (stopwords), and creating a Porter stemmer to stem the words.

In [None]:
import nltk
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import string
import pandas as pd
import math
import requests
from bs4 import BeautifulSoup

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Fetch the NLTK data packages used below: the stopword lists and the punkt
# tokenizer data.
for resource in ("stopwords", "punkt"):
    nltk.download(resource)

# Shared Porter stemmer and the English stopwords, held as a set so that
# membership tests during filtering are cheap.
porter = PorterStemmer()
stop_words = set(stopwords.words("english"))

###### Ans 1.(i)
###### Getting contents of text files using file.read()

In [None]:
# Read Text1.txt .. Text10.txt from the mounted Drive folder into `texts`,
# one string per file, in file-number order.
texts = []
for i in range(1, 11):
    # The context manager closes the handle even when read() raises, so a
    # failing file no longer leaks an open descriptor.
    with open(f"/content/drive/My Drive/ColabFiles/Text{i}.txt", "r") as file:
        texts.append(file.read())

###### Ans 1.(ii)
###### Getting contents of text files using PlaintextCorpusReader()

In [None]:
# Read Text1.txt .. Text10.txt through a single NLTK corpus reader.
# The reader is built once (it does not depend on the loop variable), and the
# file-id pattern is a raw string so that `\d` is a regex digit class rather
# than an invalid string escape; the dot before "txt" is escaped so it only
# matches a literal '.'.
reader = PlaintextCorpusReader("/content/drive/My Drive/ColabFiles/", r"Text\d{1,2}\.txt")
texts = []
for i in range(1, 11):
    texts.append(reader.raw(f"Text{i}.txt"))

###### Ans 2.
###### Function that lowercases the text string and then returns the list of sentence tokens along with, for each sentence, its word tokens after stopword removal and stemming.

In [None]:
def preprocess(s):
    """Lowercase ``s`` and split it into sentences and stemmed word tokens.

    Returns a pair ``(sentence_tokens, word_tokens)``:

    * ``sentence_tokens`` - the sentences ``sent_tokenize`` finds in the
      lowercased text.
    * ``word_tokens`` - one list per sentence, in the same order, holding the
      Porter stem of every token that is purely alphabetic, is not
      punctuation, and is not in ``stop_words``.
    """
    sentence_tokens = sent_tokenize(s.lower())
    word_tokens = []
    for sentence in sentence_tokens:
        stems = []
        for token in word_tokenize(sentence):
            # Skip punctuation, stopwords and anything containing non-letters.
            if token in string.punctuation or token in stop_words or not token.isalpha():
                continue
            stems.append(porter.stem(token))
        word_tokens.append(stems)
    return sentence_tokens, word_tokens

###### Ans 3.(i)
###### Function to convert the corpus into Bag-of-Words and tf-idf feature matrix using TfidfVectorizer() and CountVectorizer().

In [None]:
class Tokenizer:
    """Callable that turns one string into a list of filtered Porter stems.

    Instances are passed as the ``tokenizer`` argument of the scikit-learn
    vectorizers in this notebook.
    """

    def __call__(self, s):
        """Return the stems of the alphabetic, non-stopword tokens of ``s``."""
        stems = []
        for token in word_tokenize(s):
            # Skip punctuation, stopwords and anything containing non-letters.
            if token in string.punctuation or token in stop_words or not token.isalpha():
                continue
            stems.append(porter.stem(token))
        return stems
def autoVectorizer(sentence_tokens):
    """Print Bag-of-Words and TF-IDF tables built with scikit-learn.

    ``sentence_tokens`` is a list of sentence strings; each sentence becomes
    one row, labelled 1..n. Both vectorizers tokenize with ``Tokenizer`` and
    leave case untouched. The TF-IDF vectorizer is configured with
    ``norm=None`` and ``smooth_idf=False``. Nothing is returned; both tables
    are printed.
    """
    row_labels = list(range(1, len(sentence_tokens) + 1))

    bow_model = CountVectorizer(lowercase = False, tokenizer = Tokenizer())
    bow_rows = bow_model.fit_transform(sentence_tokens).toarray().tolist()
    print("Bag of words using Built in function")
    bow_table = pd.DataFrame(bow_rows, index = row_labels, columns = bow_model.get_feature_names_out())
    print(bow_table)

    tfidf_model = TfidfVectorizer(norm = None, smooth_idf = False, lowercase = False, tokenizer = Tokenizer())
    tfidf_rows = tfidf_model.fit_transform(sentence_tokens).toarray().tolist()
    print("TF-IDF using Built in function")
    tfidf_table = pd.DataFrame(tfidf_rows, index = row_labels, columns = tfidf_model.get_feature_names_out())
    print(tfidf_table)

###### Ans 3.(ii)
###### Function to convert the corpus into Bag-of-Words and tf-idf feature matrix without using in-built functions.

In [None]:
def manualVectorizer(sentence_tokens, word_tokens):
    """Print Bag-of-Words and TF-IDF tables computed without scikit-learn.

    Parameters
    ----------
    sentence_tokens : list of str
        The sentences; only their number is used, to label rows 1..n.
    word_tokens : list of list of str
        The preprocessed tokens of each sentence, in the same order and of
        the same length as ``sentence_tokens``.

    The vocabulary (columns) is the sorted set of all tokens, so the column
    order is the same on every run. TF-IDF for a term in a sentence is
    ``tf * (ln(N / df) + 1)``, where ``tf`` is the term's count in that
    sentence, ``N`` the number of sentences and ``df`` the number of
    sentences containing the term. Nothing is returned; both tables are
    printed.
    """
    # Sorted for a reproducible column order; plain set iteration order can
    # change between interpreter runs.
    vocabulary = sorted({token for tokens in word_tokens for token in tokens})
    row_labels = list(range(1, len(sentence_tokens) + 1))

    # Term frequencies: one dict of counts per sentence.
    rows = []
    for tokens in word_tokens:
        frequencies = dict.fromkeys(vocabulary, 0)
        for token in tokens:
            frequencies[token] += 1
        rows.append(frequencies)
    count = pd.DataFrame(rows, index = row_labels, columns = vocabulary)
    print("Bag of words without using built-in functions")
    print(count)

    # Document frequency and idf are computed once per term instead of once
    # per token occurrence. Every vocabulary term occurs in at least one
    # sentence, so the division never hits zero.
    document_frequency = (count != 0).sum(axis = 0)
    idf = (len(word_tokens) / document_frequency).map(math.log) + 1
    tfidf = count.mul(idf, axis = "columns").astype("float64")
    print("TF-IDF without using built-in functions")
    print(tfidf)

###### Performing preprocessing and vectorization on text strings obtained from 10 text files.

In [None]:
# Preprocess and vectorize every loaded document. Iterating over `texts`
# itself keeps this cell correct if the number of loaded files changes,
# instead of relying on a hard-coded count of 10.
for i, document in enumerate(texts):
    print(f"Text File {i+1}:")
    sentence_tokens, word_tokens = preprocess(document)
    autoVectorizer(sentence_tokens)

Text File 1:
Bag of words using Built in function
    acquisit  ad  advanc  agent  alreadi  also  among  amount  appeal  appear  \
1          0   0       0      0        0     0      0       0       0       0   
2          0   0       0      0        0     0      0       0       0       0   
3          0   0       0      0        0     0      0       0       0       0   
4          0   0       0      0        1     0      0       0       0       0   
5          0   0       0      0        0     0      0       0       0       0   
6          0   0       0      0        0     0      0       0       0       0   
7          0   0       0      0        0     0      0       0       0       0   
8          0   0       0      0        0     0      0       0       0       0   
9          0   0       0      0        0     0      0       0       0       0   
10         0   0       0      0        0     0      0       0       1       0   
11         1   0       0      0        0     0      0      



Bag of words using Built in function
    across  ad  also  antonio  back  bar  becam  belli  block  bodi  ...  \
1        0   0     0        0     0    0      0      0      0     1  ...   
2        0   0     0        0     0    0      0      0      0     0  ...   
3        0   0     0        0     0    0      0      0      0     0  ...   
4        0   0     0        0     0    0      0      1      0     0  ...   
5        0   0     0        0     0    0      0      1      0     0  ...   
6        0   0     0        0     0    0      0      0      0     0  ...   
7        0   0     0        0     0    0      0      0      0     0  ...   
8        0   2     0        0     0    0      0      0      0     0  ...   
9        0   0     0        0     0    0      0      0      0     0  ...   
10       0   0     0        0     0    0      0      0      0     0  ...   
11       0   0     0        0     0    0      0      0      0     0  ...   
12       0   0     0        0     0    0      0    



TF-IDF using Built in function
       adapt    almost      also    appear     appli      aris    arrang  \
1   0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
2   0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  3.564949   
3   0.000000  0.000000  0.000000  0.000000  3.564949  0.000000  0.000000   
4   3.564949  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
5   0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
6   0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
7   0.000000  0.000000  0.000000  3.564949  0.000000  3.564949  0.000000   
8   0.000000  3.564949  0.000000  0.000000  0.000000  0.000000  0.000000   
9   0.000000  0.000000  3.564949  0.000000  0.000000  0.000000  0.000000   
10  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
11  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
12  0.000000  0.000000  0.000000  0.000000  0.000000  0.0



    aberr  absenc  accompani  accomplish  advanc  albert  amaz  appear  \
1       0       0          0           0       0       0     0       0   
2       0       0          0           0       0       0     0       0   
3       0       0          0           0       0       0     0       0   
4       0       0          0           0       0       0     0       0   
5       0       0          0           0       0       0     0       0   
6       0       0          0           0       0       0     0       0   
7       0       0          0           0       0       0     0       0   
8       0       0          0           0       0       1     0       0   
9       1       0          1           0       1       0     1       0   
10      0       0          0           0       0       0     0       0   
11      0       1          0           1       0       0     0       0   
12      0       0          0           0       0       0     0       0   
13      0       0          0          



       abund    actual    almost      also  altogeth    amount     anoth  \
1   0.000000  0.000000  0.000000  3.079442  0.000000  0.000000  0.000000   
2   3.772589  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
3   0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
4   0.000000  0.000000  0.000000  3.079442  0.000000  0.000000  0.000000   
5   0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
6   0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
7   0.000000  0.000000  0.000000  0.000000  0.000000  3.772589  0.000000   
8   0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
9   0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  3.772589   
10  0.000000  0.000000  3.772589  0.000000  0.000000  0.000000  0.000000   
11  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000   
12  0.000000  0.000000  0.000000  0.000000  3.772589  0.000000  0.000000   
13  0.000000



    access  addit  address  advanc  allow  also  android  appl  applic  \
1        0      0        1       0      1     1        0     0       1   
2        0      0        0       0      0     0        0     0       0   
3        1      0        1       0      0     0        0     0       0   
4        1      0        0       0      0     0        0     0       0   
5        0      0        0       1      0     0        0     0       0   
6        0      0        0       0      0     0        0     0       0   
7        0      0        0       0      0     0        1     0       0   
8        0      0        0       0      0     0        0     1       0   
9        0      0        0       0      0     0        0     0       0   
10       0      0        0       0      0     0        0     0       0   
11       0      0        0       0      0     0        0     0       0   
12       2      1        0       0      0     0        0     0       0   
13       0      0        0       0    



###### Ans 4.
###### Accessing HTML text using the beautifulsoup4 package. The pre-processing and creation of feature vectors for the obtained text can be done using either of the two methods written above.

In [None]:
# Download the page, collect the text of every <div> whose class list
# contains "font-serif", then preprocess and vectorize that text manually.
url = "https://britannica.com"
# A timeout stops the cell from hanging indefinitely on a stalled connection;
# raise_for_status() fails loudly on an HTTP error instead of silently
# vectorizing an error page.
result = requests.get(url, timeout=30)
result.raise_for_status()
content = result.content
soup = BeautifulSoup(content, "lxml")
divs = soup.find_all("div")
# Tag.get returns the default for divs without a class attribute.
text = [x.text for x in divs if "font-serif" in x.get("class", [])]
s = " ".join(text)
print("HTML texts:")
sentence_tokens, word_tokens = preprocess(s)
manualVectorizer(sentence_tokens, word_tokens)

HTML texts:
Bag of words using Built in function
    form  hero  rank  go  languag  retir  helper  grew  surpris  greta  ...  \
1      0     0     0   0        0      0       0     0        0      0  ...   
2      0     0     0   0        0      0       0     0        0      0  ...   
3      0     0     0   0        0      0       0     0        0      0  ...   
4      0     0     0   0        0      0       0     0        0      0  ...   
5      0     0     0   0        0      0       0     0        0      0  ...   
6      0     0     0   0        0      0       0     0        0      0  ...   
7      0     0     0   0        0      0       0     1        0      1  ...   
8      1     0     0   0        0      0       0     0        0      0  ...   
9      0     0     1   0        0      0       0     0        1      0  ...   
10     0     0     0   0        0      0       0     0        0      0  ...   
11     0     0     0   0        0      0       0     0        0      0  ...   
12 