# Search Engine

## Imports :

In [1]:
import gensim
import nltk

# Scraping Documents

### BeautifulSoup used for Scraping the website 

In [2]:
from bs4 import BeautifulSoup

### Request are used to get the html content from the website

In [3]:
import requests

### Used during error handling

In [4]:
from time import sleep

### For mysql connectivity

In [5]:
import mysql.connector

### Class used for scraping the data

In [6]:
class ScrapeData :
    docs=[]
    links=[]
    titles=[]
    
    # For scraping data from technologyreview.com website
    def downloadData(self,link) :
        response=requests.get(link)
        soup = BeautifulSoup(response.content, "lxml")
        print(soup.title.string)
        self.titles.append(soup.title.string)
        article_body=soup.find(attrs={"class": "article-body__content"})
        pTags=article_body.findChildren('p')
        p=''
        for pTag in pTags :
            #print(pTag.get_text())
            p+=pTag.get_text()
            #print('\n') 
        self.docs.append(p)
        self.links.append(link)
        
    def processTechnologyReview(self,baseUrl,categoryUrl) :

        response=requests.get(baseUrl+categoryUrl)

        soup = BeautifulSoup(response.content, "lxml")

        liTags=soup.find('li',attrs={"class": "tech"})

        articleTag=soup.find(attrs={"class": "article"})

        mainTag=soup.find('main')

        if liTags :

            while liTags :
                link =baseUrl+liTags.findChild('a').get('href')
                try:
                    self.downloadData(link)
                except requests.exceptions.MissingSchema:
                    print("Invalid Url ..")
                    print("Let me sleep for 5 seconds")
                    print("ZZzzzz...")
                    sleep(5)
                    print("Was a nice sleep, now let me continue...")
                    continue
                liTags=liTags.findNextSibling()

        elif articleTag:

            h3Tags=articleTag.findAll('h3')

            for h3Tag in h3Tags :
                if h3Tag.find('a') :
                    link =h3Tag.find('a').get('href')
                    try:
                        self.downloadData(link)
                    except requests.exceptions.MissingSchema:
                        print("Invalid Url ..")
                        print("Let me sleep for 5 seconds")
                        print("ZZzzzz...")
                        sleep(5)
                        print("Was a nice sleep, now let me continue...")
                        continue
        elif mainTag:

            liClass=mainTag.find('li',attrs={'class':'nav-li nav-li--with-big-dropdown'})
            ulClass=liClass.findChild('ul')
            anchorTags=ulClass.findChild('a')
            while anchorTags :
                link =baseUrl+anchorTags.get('href')
                print(link)
                try:
                    self.downloadData(link)
                except requests.exceptions.MissingSchema:
                    print("Invalid Url ..")
                    print("Let me sleep for 5 seconds")
                    print("ZZzzzz...")
                    sleep(5)
                    print("Was a nice sleep, now let me continue...")
                    continue
                anchorTags=anchorTags.findNextSibling()


### Instantiate the class to scrape the data

In [7]:
scrapeData = ScrapeData()
scrapeData.processTechnologyReview('https://www.technologyreview.com','/lists/technologies/2017/')
scrapeData.processTechnologyReview('https://www.technologyreview.com','/s/609839/our-best-stories-of-2017/')
scrapeData.processTechnologyReview('https://www.technologyreview.com','/lists/innovators-under-35/2017/')

Reversing Paralysis: 10 Breakthrough Technologies 2017 - MIT Technology Review
Self-Driving Trucks: 10 Breakthrough Technologies 2017 - MIT Technology Review
Paying With Your Face: 10 Breakthrough Technologies 2017 - MIT Technology Review
Practical Quantum Computers: 10 Breakthrough Technologies 2017 - MIT Technology Review
The 360-Degree Selfie: 10 Breakthrough Technologies 2017 - MIT Technology Review
Hot Solar Cells: 10 Breakthrough Technologies 2017 - MIT Technology Review
Gene Therapy 2.0: 10 Breakthrough Technologies 2017 - MIT Technology Review
The Cell Atlas: 10 Breakthrough Technologies 2017 - MIT Technology Review
Botnets of Things: 10 Breakthrough Technologies 2017 - MIT Technology Review
Reinforcement Learning: 10 Breakthrough Technologies 2017 - MIT Technology Review
Invalid Url ..
Let me sleep for 5 seconds
ZZzzzz...
Was a nice sleep, now let me continue...
Engineering the Perfect Astronaut - MIT Technology Review
Growing Up with Alexa - MIT Technology Review
The Growing 

## Dummy Documents :

### Considered List of Strings as Documents :

In [8]:
"""raw_documents = ["I'm taking the show on the road.",
                 "My socks are a force multiplier.",
             "I am the barber who cuts everyone's hair who doesnt cut their own.",
             "Legend has it that the mind is a mad monkey.",
            "I make my own fun."]
print("Number of documents:",len(raw_documents))"""

'raw_documents = ["I\'m taking the show on the road.",\n                 "My socks are a force multiplier.",\n             "I am the barber who cuts everyone\'s hair who doesnt cut their own.",\n             "Legend has it that the mind is a mad monkey.",\n            "I make my own fun."]\nprint("Number of documents:",len(raw_documents))'

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_df=0.5, max_features=10000,
                                 min_df=2, stop_words='english',
                                 use_idf='use_idf')
X = vectorizer.fit_transform(scrapeData.docs)

In [19]:
from sklearn.cluster import KMeans
km = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1,
                verbose='--verbose')

NameError: name 'true_k' is not defined

### list to store the docs after removal of puntuations and stopwords

In [10]:
gen_docs=[]

### load nltk's English stopwords as variable called 'stopwords'

In [11]:
stopwords = nltk.corpus.stopwords.words('english')

### for removing puntuations( ' , . , etc)

In [12]:
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')

In [13]:
import re
from nltk.tokenize import word_tokenize

#for text in raw_documents :
for text in scrapeData.docs :
    word=''
    for w in word_tokenize(text) :
        if re.search('[a-zA-Z]', w.lower()):
            if w.lower() not in stopwords :
                word+=w.lower()
                word+=' '
    gen_docs.append((tokenizer.tokenize(word)))

### stemmer is used to convert singular words to plural

In [14]:
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")

### created a list to store the documents after stemming

In [15]:
stem_docs=[]

In [16]:
for text in gen_docs :
    stem_doc=[]
    for t in text :
        if len(t)>1 :
            stem_doc.append(stemmer.stem(t))
    stem_docs.append(stem_doc)
print(gen_docs)
print('\n')
print(stem_docs)



[['go', 'go', 'thought', 'race', 'grégoir', 'courtin', 'mind', 'the', 'french', 'neuroscientist', 'watch', 'macaqu', 'monkey', 'hunch', 'aggress', 'one', 'end', 'treadmil', 'team', 'use', 'blade', 'slice', 'halfway', 'anim', 'spinal', 'cord', 'paralyz', 'right', 'leg', 'courtin', 'want', 'prove', 'could', 'get', 'monkey', 'walk', 'colleagu', 'instal', 'record', 'devic', 'beneath', 'skull', 'touch', 'motor', 'cortex', 'sutur', 'pad', 'flexibl', 'electrod', 'around', 'anim', 'spinal', 'cord', 'injuri', 'wireless', 'connect', 'join', 'two', 'electron', 'devic', 'the', 'result', 'system', 'read', 'monkey', 'intent', 'move', 'transmit', 'immedi', 'form', 'burst', 'electr', 'stimul', 'spine', 'soon', 'enough', 'monkey', 'right', 'leg', 'began', 'move', 'extend', 'flex', 'extend', 'flex', 'hobbl', 'forward', 'monkey', 'think', 'boom', 'walk', 'recal', 'exult', 'courtin', 'professor', 'switzerland', 'école', 'polytechniqu', 'fédérale', 'de', 'lausann', 'in', 'recent', 'year', 'lab', 'anim', 

## Insert data into mysql

### Connectivity with mysql

In [17]:
try:
    cnx = mysql.connector.connect(user='root', password='',host='localhost',database='search-engine',port='3306')
except mysql.connector.Error as err:
    if err.errno == errorcode.ER_ACCESS_DENIED_ERROR:
        print("Something is wrong with your user name or password")
    elif err.errno == errorcode.ER_BAD_DB_ERROR:
        print("Database does not exist")
    else:
        print(err)

NameError: name 'errorcode' is not defined

### Cursor to manage the data

In [None]:
cursor = cnx.cursor()

### Insertion into table

In [None]:
i=0
while i<len(gen_docs) :
    keywords=' '
    for gen_doc in gen_docs[i] :
        keywords+=gen_doc
        keywords+=' '
        query = "INSERT INTO weblinks (url, keywords, title) VALUES ('%s',%r,'%s')" % (scrapeData.links[i],keywords, scrapeData.titles[i])
    try :
        cursor.execute(query)
    except mysql.connector.Error as err:
        print(err)
        i+=1
        continue
    i+=1

### Commit the changes made

In [None]:
cnx.commit()

### Close the connection

In [None]:
cnx.close()

### created dictonary of number of words

In [None]:
dictionary = gensim.corpora.Dictionary(stem_docs)
print(dictionary[1])
print(dictionary.token2id['take'])
print("Number of words in dictionary:",len(dictionary))
for i in range(len(dictionary)):
    print(i, dictionary[i])

### created a corpus which will contain the mapping of the word to the dictionary of each document.

In [None]:
corpus = [dictionary.doc2bow(gen_doc) for gen_doc in stem_docs]
print(corpus)

### Now we create a tf-idf model from the corpus. 
### num_nnz is the number of tokens.

In [None]:
tf_idf = gensim.models.TfidfModel(corpus)
print(tf_idf)
s = 0
for i in corpus:
    s += len(i)
print(s)

In [None]:
sims = gensim.similarities.Similarity('/home/nsaxena/Documents',tf_idf[corpus],
                                      num_features=len(dictionary))

print(sims)
print(type(sims))

In [None]:
query="hacking attacks"
word=''
for w in word_tokenize(query) :
        # include only words and in lower case
        if re.search('[a-zA-Z]', w.lower()):
            # for removing common words(the,i,etc)
            if w.lower() not in stopwords :
                word+=w.lower()
                word+=' '
query_doc=(tokenizer.tokenize(word))
query_stem_doc=[]
for t in query_doc :
    if len(t)>1 :
        query_stem_doc.append(stemmer.stem(t))
print(query_stem_doc)
query_doc_bow = dictionary.doc2bow(query_stem_doc)
print(query_doc_bow)
query_doc_tf_idf = tf_idf[query_doc_bow]
print(query_doc_tf_idf)

In [None]:
sims.num_best = 10
similar=sims[query_doc_tf_idf]

In [None]:
for sim in similar :
    print( scrapeData.titles[sim[0]])
    print( scrapeData.links[sim[0]])