# C-More

### MongoDB

In [9]:
import pymongo

In [10]:
# Connection string of the local MongoDB server used throughout the notebook
MONGO_URI = 'mongodb://localhost:27017/'

client = pymongo.MongoClient(MONGO_URI)

#### 1. Create database

In [11]:
# Handle to the rep_analysis_test database; MongoDB only materialises it
# once something is written to it.
DB_NAME = 'rep_analysis_test'

db = client[DB_NAME]

In MongoDB, a **database** is not created until it gets content.

#### 2. Create collection

In [12]:
# Handle to the search_words collection (the MongoDB counterpart of an RDBMS table)
SEARCH_WORDS_COLLECTION = 'search_words'

search_words = db[SEARCH_WORDS_COLLECTION]

In [13]:
# List the collections that currently exist in the database; nothing has been
# written to search_words yet at this point.
db.list_collection_names()

[]

In MongoDB, a **collection** is not created until it gets content.

In [14]:
# insert search words defined by each client into collection search_words

new_search_words = [{"_id": 1, "company": "Vodafone", "words": ["vodafone", "5G"]},
                    {"_id": 2, "company": "Santander", "words": ["santander", "card", "account", "loan", "banking"]},
                    {"_id": 3, "company": "BP", "words": ["bp", "shell", "repsol", "galp", "prio"]}]

# The _id values are fixed, so running this cell a second time against the same
# database would fail with a duplicate-key BulkWriteError. Remove any earlier
# copies of these seed documents first so the cell can be re-run safely.
seed_ids = [doc["_id"] for doc in new_search_words]
search_words.delete_many({"_id": {"$in": seed_ids}})

# `result` is the InsertManyResult; its inserted_ids are displayed in the next cell
result = search_words.insert_many(new_search_words)

In [15]:
# _id values of the documents written by the insert_many call above
result.inserted_ids

[1, 2, 3]

In [16]:
# List the collections again now that search_words holds documents
db.list_collection_names()

['search_words']

#### 3. Select data from a collection

In [17]:
# Dump every document stored in search_words (no filter)
all_search_words = search_words.find()

for entry in all_search_words:
    print(entry)

{'_id': 1, 'company': 'Vodafone', 'words': ['vodafone', '5G']}
{'_id': 2, 'company': 'Santander', 'words': ['santander', 'card', 'account', 'loan', 'banking']}
{'_id': 3, 'company': 'BP', 'words': ['bp', 'shell', 'repsol', 'galp', 'prio']}


In [18]:
# Look up the search words registered for the company Vodafone
company_filter = {"company": "Vodafone"}
hide_id_and_company = {"_id": 0, "company": 0}  # exclusion projection

matches = search_words.find(company_filter, hide_id_and_company)

for match in matches:
    print(match['words'])

['vodafone', '5G']


#### 4. Check collections after running the .py script to get Twitter data

In [19]:
# List the collections present after the external .py script has been run
db.list_collection_names()

['search_words']

#### 5. Select data from the keywords collection

In [None]:
# NOTE(review): this cell has no execution count (it was never run) and the
# next cell rebinds `keywords` to db['keywords'] -- looks like a leftover;
# confirm and remove.
keywords = db['client_info']

In [20]:
# Handle to the collection queried in the rest of this section
KEYWORDS_COLLECTION = 'keywords'

keywords = db[KEYWORDS_COLLECTION]

In [21]:
# Show every document held in the keywords collection
keyword_docs = keywords.find()

for keyword_doc in keyword_docs:
    print(keyword_doc)

In [22]:
# total number of documents in the keywords collection
# (the empty filter {} matches every document)

keywords.count_documents({})

0

In [23]:
# Build a text index on the `text` field so that $text queries can be run.
# References:
# https://stackoverflow.com/questions/33541290/how-can-i-create-an-index-with-pymongo
# https://pymongo.readthedocs.io/en/stable/api/pymongo/collection.html#pymongo.collection.Collection.create_index

text_index_spec = [('text', pymongo.TEXT)]

# last expression of the cell: displays the name of the created index
keywords.create_index(text_index_spec)

'text_text'

In [24]:
# Full-text search for "vodafone", returning only the `text` field
text_filter = {"$text": {"$search": "vodafone"}}
text_only = {"text": 1, '_id': 0}

# n_docs stays 0 when the search matches nothing; otherwise it ends up
# holding the number of documents printed.
n_docs = 0
for n_docs, doc in enumerate(keywords.find(text_filter, text_only), start=1):
    print(doc['text'] + '\n' + '-----')

print('\n --> ' + str(n_docs) + ' documents in total')


 --> 0 documents in total


In [25]:
# Full-text search for "5G", returning only the `text` field
text_filter = {"$text": {"$search": "5G"}}
text_only = {"text": 1, '_id': 0}

# n_docs stays 0 when the search matches nothing; otherwise it ends up
# holding the number of documents printed.
n_docs = 0
for n_docs, doc in enumerate(keywords.find(text_filter, text_only), start=1):
    print(doc['text'] + '\n' + '-----')

print('\n --> ' + str(n_docs) + ' documents in total')


 --> 0 documents in total


In [26]:
# Count the "5G" text-search matches server-side with an aggregation pipeline
count_pipeline = [{'$match': {"$text": {"$search": "5G"}}}, {"$count": "Number of documents"}]

# $count emits no document at all when nothing reaches it, so fall back to an
# explicit zero instead of silently printing nothing.
count_docs = list(keywords.aggregate(count_pipeline)) or [{"Number of documents": 0}]

# A dedicated loop name is used so that `result` (the InsertManyResult created
# by the insert_many cell) is not overwritten by this cell.
for count_doc in count_docs:
    print(count_doc)

For the words 'vodafone' and '5G', the text searches return a total of 65 documents (5 + 60). However, the total number of documents we retrieved was 69.

To understand this behaviour, we can use regular expressions to retrieve all the documents we are interested in.

In [27]:
# Substring regex match; (V|v) accepts either capitalisation of the first letter
vodafone_regex = {"text": {"$regex": "(V|v)odafone"}}

# n_docs stays 0 when nothing matches; otherwise it ends up holding the
# number of documents printed.
n_docs = 0
for n_docs, doc in enumerate(keywords.find(vodafone_regex), start=1):
    print(doc['text'] + '\n' + '-----')

print('\n --> ' + str(n_docs) + ' documents in total')


 --> 0 documents in total


Using this regular expression, we get 6 documents instead of 5. This happens because one "extra" document is retrieved where Vodafone is not a whole word - @VodafoneUK.

To get only whole words we could use:

In [28]:
# Same pattern wrapped in \b word boundaries so only whole words match
vodafone_word_regex = {"text": {"$regex": "\\b(V|v)odafone\\b"}}

# n_docs stays 0 when nothing matches; otherwise it ends up holding the
# number of documents printed.
n_docs = 0
for n_docs, doc in enumerate(keywords.find(vodafone_word_regex), start=1):
    print(doc['text'] + '\n' + '-----')

print('\n --> ' + str(n_docs) + ' documents in total')


 --> 0 documents in total


We get the same 5 documents we got with our text search.

In [29]:
# Substring regex match for "5G" / "5g"
five_g_regex = {"text": {"$regex": "5(G|g)"}}

# n_docs stays 0 when nothing matches; otherwise it ends up holding the
# number of documents printed.
n_docs = 0
for n_docs, doc in enumerate(keywords.find(five_g_regex), start=1):
    print(doc['text'] + '\n' + '-----')

print('\n --> ' + str(n_docs) + ' documents in total')


 --> 0 documents in total


For the word 5G, we get 63 documents. Adding the 6 we got for Vodafone, we get the 69 documents we initially retrieved.

To get only whole words:

In [30]:
# "5G" / "5g" wrapped in \b word boundaries so only whole words match
five_g_word_regex = {"text": {"$regex": "\\b5(G|g)\\b"}}

# n_docs stays 0 when nothing matches; otherwise it ends up holding the
# number of documents printed.
n_docs = 0
for n_docs, doc in enumerate(keywords.find(five_g_word_regex), start=1):
    print(doc['text'] + '\n' + '-----')

print('\n --> ' + str(n_docs) + ' documents in total')


 --> 0 documents in total


This time we get 61 documents (and not the 60 we got with our text search...).

Alternatively, we can use Python's re module.

In [31]:
import re

In [32]:
# Case-insensitive substring match, using a compiled Python pattern
vodafone_pattern = re.compile('vodafone', re.IGNORECASE)
pattern_filter = {"text": {"$regex": vodafone_pattern}}

# n_docs stays 0 when nothing matches; otherwise it ends up holding the
# number of documents printed.
n_docs = 0
for n_docs, doc in enumerate(keywords.find(pattern_filter), start=1):
    print(doc['text'] + '\n' + '-----')

print('\n --> ' + str(n_docs) + ' documents in total')


 --> 0 documents in total


In [33]:
# Case-insensitive match restricted to whole words via \b boundaries
vodafone_word_pattern = re.compile('\\bvodafone\\b', re.IGNORECASE)
pattern_filter = {"text": {"$regex": vodafone_word_pattern}}

# n_docs stays 0 when nothing matches; otherwise it ends up holding the
# number of documents printed.
n_docs = 0
for n_docs, doc in enumerate(keywords.find(pattern_filter), start=1):
    print(doc['text'] + '\n' + '-----')

print('\n --> ' + str(n_docs) + ' documents in total')


 --> 0 documents in total


In [34]:
# Case-insensitive substring match for "5g", using a compiled Python pattern
five_g_pattern = re.compile('5g', re.IGNORECASE)
pattern_filter = {"text": {"$regex": five_g_pattern}}

# n_docs stays 0 when nothing matches; otherwise it ends up holding the
# number of documents printed.
n_docs = 0
for n_docs, doc in enumerate(keywords.find(pattern_filter), start=1):
    print(doc['text'] + '\n' + '-----')

print('\n --> ' + str(n_docs) + ' documents in total')


 --> 0 documents in total


In [35]:
# Case-insensitive match for "5g" restricted to whole words via \b boundaries
five_g_word_pattern = re.compile('\\b5g\\b', re.IGNORECASE)
pattern_filter = {"text": {"$regex": five_g_word_pattern}}

# n_docs stays 0 when nothing matches; otherwise it ends up holding the
# number of documents printed.
n_docs = 0
for n_docs, doc in enumerate(keywords.find(pattern_filter), start=1):
    print(doc['text'] + '\n' + '-----')

print('\n --> ' + str(n_docs) + ' documents in total')


 --> 0 documents in total
