# Indexer for Search Engine with Ranking

In [12]:
import pandas as pd 

## Import Websites from json file

In [13]:
websites = pd.read_json("data/table.json")
df = pd.DataFrame(websites)
df = df.reset_index()

## Create new Dictionary to store website on database with following schema

```
string: Website 
Website: {
  url: string,
  outgoingLinks: string[],
  incomingLinks: string[]
}
```

In [14]:
websites_dict = {}

for index, row in df.iterrows():
    url = row['url']
    if websites_dict.get(url) is None:
        websites_dict[url] = {
            "url": url,
            "outgoingLinks": list(set(row['metadata']['links'])),
            "outgoingLinksLen":len(row['metadata']['links']),
            "incomingLinks": [],
            "incomingLinksLen": 0 
        }

## Assign Incoming Links to the created dictionary

In [15]:
for website in websites_dict:
    links = websites_dict[website]["outgoingLinks"]
    for link in links:
        linkInDict = websites_dict.get(link)
        if linkInDict is not None:
            if not website in websites_dict[link]["incomingLinks"]: 
                websites_dict[link]["incomingLinks"].append(website)
                websites_dict[link]["incomingLinksLen"] = websites_dict[link]["incomingLinksLen"] + 1
            
            

website_pd = pd.DataFrame.from_dict(websites_dict)
website_pd.head(30).T

Unnamed: 0,url,outgoingLinks,outgoingLinksLen,incomingLinks,incomingLinksLen
https://nepal.gov.np,https://nepal.gov.np,[https://nepal.gov.np:8443/NationalPortal/NP?s...,3,[],0
https://moha.gov.np,https://moha.gov.np,"[https://moha.gov.np/#tabSecondact-regulation,...",114,[],0
https://p1.gov.np,https://p1.gov.np,"[http://moial.p1.gov.np/, https://p1.gov.np/no...",100,[],0
http://p2.gov.np,http://p2.gov.np,"[http://pga.p2.gov.np/, http://mowcys.p2.gov.n...",26,[],0
http://p3.gov.np,http://p3.gov.np,[],0,[],0
...,...,...,...,...,...
http://www.ugcnepal.edu.np/,http://www.ugcnepal.edu.np/,"[http://www.ugcnepal.edu.np/publications/1/11,...",118,[https://nepal.gov.np:8443/NationalPortal/view...,3
http://www.wecs.gov.np/,http://www.wecs.gov.np/,[http://www.wecs.gov.np/storage/listies/March2...,56,[https://nepal.gov.np:8443/NationalPortal/view...,5
http://www.nmc.org.np/,http://www.nmc.org.np/,"[https://nmc.org.np/chairman-s-message, https:...",33,[https://nepal.gov.np:8443/NationalPortal/view...,2
http://nhrc.gov.np/,http://nhrc.gov.np/,[https://nhrc.gov.np/trainings/training-worksh...,79,[https://nepal.gov.np:8443/NationalPortal/view...,6


## Save Dictionary Data to JSON file

In [8]:
website_pd.to_json("websites.json")

# Create List of Websites from dictionary

In [19]:
websites_list = list(websites_dict.values())


## Save Dictionary Data to MongoDB

### Initialize pymongo

In [6]:
import pymongo as pm
mongouri = "mongodb://root:prisma@localhost:27017/db_seven_sem_prj?authSource=admin"
client = pm.MongoClient(mongouri)
database = client.get_database()
print(database)

Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, authsource='admin'), 'db_seven_sem_prj')


In [7]:
website_collection = database['websites']

In [20]:
website_collection.insert_many(websites_list)

<pymongo.results.InsertManyResult at 0x7f26f6890cc0>

# Verify Data Inserted by showing the total rows in database

In [26]:
websites_in_db = database["websites"].find()
print(list(websites_in_db)[0])

{'_id': ObjectId('63da70ad1c267e8974c030b3'), 'url': 'https://nepal.gov.np', 'outgoingLinks': ['https://nepal.gov.np:8443/NationalPortal/NP?splashAction=business', 'https://nepal.gov.np:8443/NationalPortal/NP?splashAction=home', 'https://nepal.gov.np:8443/NationalPortal/NP?splashAction=citizen'], 'outgoingLinksLen': 3, 'incomingLinks': [], 'incomingLinksLen': 0}
