### 1. Import Libraries

In [None]:
# for text preprocessing
import re
import spacy

from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
import string

# import vectorizers
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# import numpy for matrix operation
import numpy as np

# import LDA from sklearn
from sklearn.decomposition import LatentDirichletAllocation

In [None]:
# to suppress warnings
from warnings import filterwarnings
filterwarnings('ignore')

In [None]:
nlp = spacy.load('en_core_web_sm')

In [None]:
import gensim
import nltk
from gensim import corpora
from nltk.corpus import stopwords
from nltk import WordNetLemmatizer
import string

In [None]:
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Apply Preprocessing on the Corpus
# stop Loss words
doc = open("/content/dataset.txt")
doc=doc.read()
stop = set (stopwords.words('english'))
# punctuation
exclude = set(string.punctuation)
# Lemmatization
lemma = WordNetLemmatizer()
corpus = nltk.tokenize.sent_tokenize(doc)
print(corpus)
# One function for all the steps:
def clean(doc):
    # convert text into Lower case + split into words
    stop_free =" ".join([i for i in doc.lower ().split() if i not in stop])
    # remove any stop words present
    punc_free =' '.join(ch for ch in stop_free if ch not in exclude)
    # remove punctuations + normalize the text
    normalized =" ".join(lemma.lemmatize(word) for word in punc_free.split())
    return normalized
# clean data stored in a new List
# clean_corpus = [clean(doc).split() for doc in corpus]

['Aerodynamics, from Greek ἀήρ aero (air) + δυναμική (dynamics), is the study of the motion of air, particularly when affected by a solid object, such as an airplane wing.', 'It involves topics covered in the field of fluid dynamics and its subfield of gas dynamics.The term aerodynamics is often used synonymously with gas dynamics, the difference being that "gas dynamics" applies to the study of the motion of all gases, and is not limited to air.', 'The formal study of aerodynamics began in the modern sense in the eighteenth century, although observations of fundamental concepts such as aerodynamic drag were recorded much earlier.', 'Most of the early efforts in aerodynamics were directed toward achieving heavier-than-air flight, which was first demonstrated by Otto Lilienthal in 1891.', '[1] Since then, the use of aerodynamics through mathematical analysis, empirical approximations, wind tunnel experimentation, and computer simulations has formed a rational basis for the development o

In [None]:
# the complete corpus as below:

corpus

['Aerodynamics, from Greek ἀήρ aero (air) + δυναμική (dynamics), is the study of the motion of air, particularly when affected by a solid object, such as an airplane wing.',
 'It involves topics covered in the field of fluid dynamics and its subfield of gas dynamics.The term aerodynamics is often used synonymously with gas dynamics, the difference being that "gas dynamics" applies to the study of the motion of all gases, and is not limited to air.',
 'The formal study of aerodynamics began in the modern sense in the eighteenth century, although observations of fundamental concepts such as aerodynamic drag were recorded much earlier.',
 'Most of the early efforts in aerodynamics were directed toward achieving heavier-than-air flight, which was first demonstrated by Otto Lilienthal in 1891.',
 '[1] Since then, the use of aerodynamics through mathematical analysis, empirical approximations, wind tunnel experimentation, and computer simulations has formed a rational basis for the developme

### 2. Text Preprocessing

Steps to preprocess text data:

1. Convert the text into lowercase
2. Split text into words
3. Remove the stop loss words
3. Remove the Punctuation, any symbols and special characters
4. Normalize the word (I'll be using Lemmatization for normalization)

In [None]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# Apply Preprocessing on the Corpus

# stop loss words 
stop = set(stopwords.words('english'))

# punctuation 
exclude = set(string.punctuation) 

# lemmatization
lemma = WordNetLemmatizer() 

# One function for all the steps:
def clean(doc):
    
    # convert text into lower case + split into words
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    
    # remove any stop words present
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)  
    
    # remove punctuations + normalize the text
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())  
    return normalized

# clean data stored in a new list
clean_corpus = [clean(doc).split() for doc in corpus]   

In [None]:
clean_corpus

[['aerodynamics',
  'greek',
  'ἀήρ',
  'aero',
  'air',
  'δυναμική',
  'dynamic',
  'study',
  'motion',
  'air',
  'particularly',
  'affected',
  'solid',
  'object',
  'airplane',
  'wing'],
 ['involves',
  'topic',
  'covered',
  'field',
  'fluid',
  'dynamic',
  'subfield',
  'gas',
  'dynamicsthe',
  'term',
  'aerodynamics',
  'often',
  'used',
  'synonymously',
  'gas',
  'dynamic',
  'difference',
  'gas',
  'dynamic',
  'applies',
  'study',
  'motion',
  'gas',
  'limited',
  'air'],
 ['formal',
  'study',
  'aerodynamics',
  'began',
  'modern',
  'sense',
  'eighteenth',
  'century',
  'although',
  'observation',
  'fundamental',
  'concept',
  'aerodynamic',
  'drag',
  'recorded',
  'much',
  'earlier'],
 ['early',
  'effort',
  'aerodynamics',
  'directed',
  'toward',
  'achieving',
  'heavierthanair',
  'flight',
  'first',
  'demonstrated',
  'otto',
  'lilienthal',
  '1891'],
 ['1',
  'since',
  'then',
  'use',
  'aerodynamics',
  'mathematical',
  'analysis',

### 3. Convert Text into Numerical Representation

Converting the clean preprocessed corpus to array

In [None]:
# Converting text into numerical representation
tf_idf_vectorizer = TfidfVectorizer(tokenizer=lambda doc: doc, lowercase=False)

# Converting text into numerical representation
cv_vectorizer = CountVectorizer(tokenizer=lambda doc: doc, lowercase=False)

In [None]:
# Array from TF-IDF Vectorizer 
tf_idf_arr = tf_idf_vectorizer.fit_transform(clean_corpus)

# Array from Count Vectorizer 
cv_arr = cv_vectorizer.fit_transform(clean_corpus)

In [None]:
# this is our converted text to numerical representation from the Tf-IDF vectorizer

tf_idf_arr

<6x90 sparse matrix of type '<class 'numpy.float64'>'
	with 102 stored elements in Compressed Sparse Row format>

In [None]:
# this is our converted text to numerical representation from the Count vectorizer
cv_arr

<6x90 sparse matrix of type '<class 'numpy.int64'>'
	with 102 stored elements in Compressed Sparse Row format>

The corpus has 52 columns and 5 rows corresponding to our document and 58 represents the unique Vocabulary present in our corpus.

In [None]:
# Creating vocabulary array which will represent all the corpus 
vocab_tf_idf = tf_idf_vectorizer.get_feature_names()

# get the vocb list
vocab_tf_idf

['1',
 '1891',
 'achieving',
 'aero',
 'aerodynamic',
 'aerodynamics',
 'affected',
 'air',
 'airplane',
 'although',
 'analysis',
 'applies',
 'approximation',
 'basis',
 'become',
 'began',
 'boundary',
 'century',
 'compressible',
 'computational',
 'computer',
 'concept',
 'covered',
 'demonstrated',
 'development',
 'difference',
 'directed',
 'drag',
 'dynamic',
 'dynamicsthe',
 'earlier',
 'early',
 'effort',
 'eighteenth',
 'empirical',
 'experimentation',
 'field',
 'first',
 'flight',
 'flow',
 'fluid',
 'focused',
 'formal',
 'formed',
 'fundamental',
 'gas',
 'greek',
 'heavierthanair',
 'increasingly',
 'involves',
 'issue',
 'layer',
 'lilienthal',
 'limited',
 'mathematical',
 'modern',
 'motion',
 'much',
 'nature',
 'number',
 'object',
 'observation',
 'often',
 'otto',
 'particularly',
 'rational',
 'recent',
 'recorded',
 'related',
 'sense',
 'simulation',
 'since',
 'solid',
 'study',
 'subfield',
 'synonymously',
 'technology',
 'term',
 'then',
 'topic',
 'towar

In [None]:
# Creating vocabulary array which will represent all the corpus 
vocab_cv = cv_vectorizer.get_feature_names()

# get the vocb list
vocab_cv

['1',
 '1891',
 'achieving',
 'aero',
 'aerodynamic',
 'aerodynamics',
 'affected',
 'air',
 'airplane',
 'although',
 'analysis',
 'applies',
 'approximation',
 'basis',
 'become',
 'began',
 'boundary',
 'century',
 'compressible',
 'computational',
 'computer',
 'concept',
 'covered',
 'demonstrated',
 'development',
 'difference',
 'directed',
 'drag',
 'dynamic',
 'dynamicsthe',
 'earlier',
 'early',
 'effort',
 'eighteenth',
 'empirical',
 'experimentation',
 'field',
 'first',
 'flight',
 'flow',
 'fluid',
 'focused',
 'formal',
 'formed',
 'fundamental',
 'gas',
 'greek',
 'heavierthanair',
 'increasingly',
 'involves',
 'issue',
 'layer',
 'lilienthal',
 'limited',
 'mathematical',
 'modern',
 'motion',
 'much',
 'nature',
 'number',
 'object',
 'observation',
 'often',
 'otto',
 'particularly',
 'rational',
 'recent',
 'recorded',
 'related',
 'sense',
 'simulation',
 'since',
 'solid',
 'study',
 'subfield',
 'synonymously',
 'technology',
 'term',
 'then',
 'topic',
 'towar

In [None]:
display(len(vocab_tf_idf))
display(len(vocab_cv))

90

90

### 4. Implementation of LDA

To implement LDA, pass the corpus: document-term matrix to the model. We had above obtained the unique words of vocabulary using both TF-IDF and Count Vectorizer. We can continue with either as have the same unique words in both the obtained vocabularies.

In [None]:
 # Implementation of LDA:
    
# Create object for the LDA class 
# Inside this class LDA: define the components:
lda_model = LatentDirichletAllocation(n_components = 6, max_iter = 20, random_state = 20)

# fit transform on model on our count_vectorizer : running this will return our topics 
X_topics = lda_model.fit_transform(tf_idf_arr)

# .components_ gives us our topic distribution 
topic_words = lda_model.components_

### 4a. Retrieve the Topics


In [None]:
#  Define the number of Words that we want to print in every topic : n_top_words
n_top_words = 7

for i, topic_dist in enumerate(topic_words):
    
    # np.argsort to sorting an array or a list or the matrix acc to their values
    sorted_topic_dist = np.argsort(topic_dist)
    
    # Next, to view the actual words present in those indexes we can make the use of the vocab created earlier
    topic_words = np.array(vocab_tf_idf)[sorted_topic_dist]
    
    # so using the sorted_topic_indexes we ar extracting the words from the vocabulary
    # obtaining topics + words
    # this topic_words variable contains the Topics  as well as the respective words present in those Topics
    topic_words = topic_words[:-n_top_words:-1]
    print ("Topic", str(i+1), topic_words)

Topic 1 ['gas' 'dynamic' 'air' 'study' 'aerodynamics' 'motion']
Topic 2 ['aerodynamics' 'heavierthanair' 'flight' 'involves' 'term' 'covered']
Topic 3 ['demonstrated' 'toward' 'otto' 'early' 'effort' 'lilienthal']
Topic 4 ['aerodynamics' 'heavierthanair' 'flight' 'involves' 'term' 'covered']
Topic 5 ['aerodynamics' 'heavierthanair' 'flight' 'involves' 'term' 'covered']
Topic 6 ['compressible' 'become' 'flow' 'increasingly' 'issue' 'layer']


Above is the words per topic. The result is not so accurate as the data was less. More data will give more accurate result.

### 4b. Annotating the topics the documents

In [None]:
# To view what topics are assigned to the douments:

doc_topic = lda_model.transform(tf_idf_arr)  

# iterating over ever value till the end value
for n in range(doc_topic.shape[0]):
    
    # argmax() gives maximum index value
    topic_doc = doc_topic[n].argmax()
    
    # document is n+1  
    print ("Document", n+1, " -- Topic:" ,topic_doc)

Document 1  -- Topic: 0
Document 2  -- Topic: 0
Document 3  -- Topic: 0
Document 4  -- Topic: 2
Document 5  -- Topic: 0
Document 6  -- Topic: 5


This is the final output which gives us the topic along with the documents.

-----------------------

In [None]:
import http.client
s1 = "['"
s1 = s1 + "',_'".join(vocab_cv)
s1 = s1 + "]"
conn = http.client.HTTPSConnection("google-search3.p.rapidapi.com")

headers = {
    'X-User-Agent': "desktop",
    'X-Proxy-Location': "EU",
    'X-RapidAPI-Host': "google-search3.p.rapidapi.com",
    'X-RapidAPI-Key': "f62bd26370msh21382210477831cp122bfcjsn463c8f5cc60a"
    }

conn.request("GET", "/api/v1/search/q="+s1, headers=headers)

res = conn.getresponse()
data = res.read()
answer1 = data.decode("utf-8")
print(answer1)

{"results":[{"title":"Ethernet Interface Module Hardware and Common Platform ...","link":"https://process.honeywell.com/bin/edam/getfileservlet?id=06IqFSggTa5PDWj/9KcMuVmZlUorFshGoC2cHPkRgZU3mCnQmKd9/xp0+481QqJiIiu7vFc471vfTyuoB4fqInawyyZc/yEj8QrPH0yQbxFVKbMcM7u1JMV+c5NWPtKGJr/m/bXgCbnwW44kCXD2EuYetekhXceaDBP/e6PFZTnwzVsayXn8PPrK0llgWpak0Zy8+RoXXAZdGXBEU6JgLcbC75APgH1mS58oEXuMATjq1R0lUD0wteI05Uk3vPl7Ar3SNmKY2zPWc06BPp/EbEVoH9coCd1IF0V7lFLGsQbmdYSY3EzBxgxsGUU2uuq/+Hv1LeHH89jWz2RQfEBOtdCaU+o1YgT1hNo0Tdfph0s2rmD/drMdS4e5B3A=","description":"7.1 EIM Module Configuration Rules. 35. 7.2 Creating an EIM block in Control Builder. 36. 7.2.1 Prerequisites. 37. 7.2.2 Create and configure nonredundant ...","additional_links":[{"text":"Ethernet Interface Module Hardware and Common Platform ...https://process.honeywell.com › bin › getfileservlet","href":"https://process.honeywell.com/bin/edam/getfileservlet?id=06IqFSggTa5PDWj/9KcMuVmZlUorFshGoC2cHPkRgZU3mCnQmKd9/xp0+481QqJiIiu7vFc471vfTyuoB4fqInawyy

In [None]:

import json
my_json = json.loads(answer1)
my_json

In [None]:
import csv
 
 
# Opening JSON file and loading the data
# into the variable data
with open('/content/answers.json','w') as json_file:
    json_file.write(answer1)
 
results = my_json['results']
 
# now we will open a file for writing
data_file = open('data_file.csv', 'w')
 
# create the csv writer object
csv_writer = csv.writer(data_file)
 
# Counter variable used for writing
# headers to the CSV file
count = 0
 
for resu in results:
    if count == 0:
 
        # Writing headers of CSV file
        header = resu.keys()
        csv_writer.writerow(header)
        count += 1
 
    # Writing data of CSV file
    csv_writer.writerow(resu.values())
 
data_file.close()