# C-More

### MongoDB

In [1]:
import pymongo

In [2]:
client = pymongo.MongoClient('mongodb://localhost:27017/')

#### 1. Create database

In [3]:
# create database rep_analysis_new

db = client['rep_analysis_new']

In MongoDB, a **database** is not created until it gets content.

#### 2. Create collection

In [4]:
# create client_info collection (similar to an RDBMS table)

client_info = db['client_info']

In [10]:
db.list_collection_names()

[]

In MongoDB, a **collection** is not created until it gets content.

In [11]:
# insert info defined by each client into collection client_info

new_client = {
    "company_name": "McDonald's", 
    "alternative_names": ["McDonald", "McDonalds", "Mac"], 
    "handles": [
        {
            "type": "twitter",
            "name": "McDonalds"
        },
        {
            "type": "facebook",
            "name": "..."
        },
        {
            "type": "linkedin",
            "name": "..."
        }
    ], 
    "language": "en", 
    "search_terms": ["burger", "fries", "cheese", "bacon"], 
    "competitors": ["Burger King", "KFC"]
}

result = client_info.insert_one(new_client)

In [13]:
result.inserted_id

ObjectId('6332d3537f6127fd30d1e241')

**P1:** deixamos estes ids serem definidos automaticamente?

**R1:** Sim.

In [14]:
db.list_collection_names()

['client_info']

#### 3. Select data from a collection

In [15]:
for doc in client_info.find():
    print(doc)

{'_id': ObjectId('6332d3537f6127fd30d1e241'), 'company_name': "McDonald's", 'alternative_names': ['McDonald', 'McDonalds', 'Mac'], 'handles': [{'type': 'twitter', 'name': 'McDonalds'}, {'type': 'facebook', 'name': '...'}, {'type': 'linkedin', 'name': '...'}], 'language': 'en', 'search_terms': ['burger', 'fries', 'cheese', 'bacon'], 'competitors': ['Burger King', 'KFC']}


In [19]:
my_query = {"company_name": "McDonald's"}

for client in client_info.find(my_query, {"_id": 0, 
                                          "alternative_names": 0, 
                                          "handles": 0}):
    print(client)

{'company_name': "McDonald's", 'language': 'en', 'search_terms': ['burger', 'fries', 'cheese', 'bacon'], 'competitors': ['Burger King', 'KFC']}


In [20]:
# find search_terms for company McDonald's

my_query = {"company_name": "McDonald's"}

for client in client_info.find(my_query):
    print(client['search_terms'])

['burger', 'fries', 'cheese', 'bacon']


#### 4. Check collections after running the .py script to get twitter data

In [29]:
# we've decided to rename the collection to data_twitter (it was named twitter_info)

db['twitter_info'].rename('data_twitter')

{'ok': 1.0}

In [6]:
db.list_collection_names()

['data_twitter', 'client_info']

#### 5. Select data from the data_twitter collection

In [8]:
twitter_info = db['data_twitter']

In [32]:
for doc in twitter_info.find():
    print(doc)

{'_id': ObjectId('6332ded8239d27df11bd2017'), 'created_at': '2022-09-26T23:29:55.000Z', 'public_metrics': {'retweet_count': 0, 'reply_count': 0, 'like_count': 1, 'quote_count': 0}, 'text': 'Factory reset your circadian rhythm 101:\nEat something super unhealthy and inflammatory at 9pm like pizza/instant noodles/burger with tons of store bought mayo.\nSleep when you pass out.\n\nTA-DAAA !!', 'id': '1574541810738659328', 'lang': 'en', 'search_word': 'burger'}
{'_id': ObjectId('6332ded8239d27df11bd2018'), 'created_at': '2022-09-26T23:29:49.000Z', 'public_metrics': {'retweet_count': 0, 'reply_count': 0, 'like_count': 0, 'quote_count': 0}, 'text': '@Buffpup_ Me looking at that burger like... https://t.co/5P8HH2uhLs', 'id': '1574541785900384256', 'lang': 'en', 'search_word': 'burger'}
{'_id': ObjectId('6332ded8239d27df11bd2019'), 'created_at': '2022-09-26T23:29:48.000Z', 'public_metrics': {'retweet_count': 0, 'reply_count': 0, 'like_count': 1, 'quote_count': 0}, 'text': '@FearTheFloof As a f

In [33]:
# total number of documents

twitter_info.count_documents({})

1276

We can create a text index to perform $text queries (https://www.mongodb.com/docs/manual/text-search/#on-premises-text-search).

In [17]:
twitter_info.index_information()

{'_id_': {'v': 2, 'key': [('_id', 1)]}}

In [18]:
# create text index to perform $text queries
# https://stackoverflow.com/questions/33541290/how-can-i-create-an-index-with-pymongo
# https://pymongo.readthedocs.io/en/stable/api/pymongo/collection.html#pymongo.collection.Collection.create_index

twitter_info.create_index([('text', pymongo.TEXT)])

'text_text'

In [19]:
twitter_info.index_information()

{'_id_': {'v': 2, 'key': [('_id', 1)]},
 'text_text': {'v': 2,
  'key': [('_fts', 'text'), ('_ftsx', 1)],
  'weights': SON([('text', 1)]),
  'default_language': 'english',
  'language_override': 'language',
  'textIndexVersion': 3}}

In [21]:
cursor = twitter_info.find({"$text": {"$search": "McDonald's"}}, {"text": 1})

i=0

for x in cursor:
    i+=1
    print(x['text']+'\n'+'\n'+'-----')
    
print('\n --> ' + str(i) + ' documents in total')

I love me some reheated McDonald’s French fries

-----
@SjamaanN McDonald's Fries
Wendy's Baconator
Wendy's Frosty

-----
Jigen being a chronic smoker and only eating fries from McDonald's when they go is killing me

-----
McDonalds Hot Mustard &gt; Burger King Hot Mustard

-----
@Splenda @McDonalds Coffee with cream and a bacon, egg and cheese biscuit #NationalCoffeeDay

-----
@Splenda @McDonalds Coffee with cream and a bacon, egg and cheese biscuit #NationalCoffeeDay

-----
McDonald’s fries n a blunt&gt;&gt;&gt; 😩😩😩

-----
this orange soda at this cheese steak place is like a fanta equivalent for mcdonalds sprite

-----
@MadScientistFF Wendy’s baconator
McDonald’s fries
Cotton candy blizzard

-----
@Splenda @McDonalds Iced sugar free vanilla coffee with a Sausage egg and cheese biscuit.

-----
Just ate a 20 piece and 2 large fries from McDonald’s. Will I die?

-----
I love eating McDonalds double cheeseburger extra pickles with French fries extra salt

-----
@Splenda @McDonalds I alw

We get results for "McDonald's", "McDonald’s" and "McDonalds" (31 documents in total).

For a better understanding of the results we get, we can use regular expressions instead. We can also drop the index if we don't intend to use it.

In [22]:
twitter_info.drop_index("text_text")

In [23]:
twitter_info.index_information()

{'_id_': {'v': 2, 'key': [('_id', 1)]}}

In [24]:
import re

In [25]:
my_query = {"text": {"$regex": re.compile("mcdonald's", re.IGNORECASE)}}

cursor = twitter_info.find(my_query)

i=0

for x in cursor:
    i+=1
    print(x['text']+'\n'+'-----')
    
print('\n --> ' + str(i) + ' documents in total')

Now they want me to run to front of bus when they have pay station more then once on bus that have usb dumb mad I went to McDonald's to get a free burger I did not tell them to give out 1500 main points if u download app and free shit dummies retards man
-----
@JBPritzker Have you also noticed that retailers are replacing their cashiers with self service registers? Next will be McDonald's and Burger King, using Kiosks to replace their counter people.
-----
Went to Hardee's today just to check it out . It wasn't very good. The cheese burger was cold and the cheese was too and my Wife and I were the only ones in the restaurant. I had a chocolate shake and McDonald's better. I give Hardee's one star. Probably won't try it again.
-----
@SjamaanN McDonald's Fries
Wendy's Baconator
Wendy's Frosty
-----
Jigen being a chronic smoker and only eating fries from McDonald's when they go is killing me
-----
Went to Hardee's today just to check it out . It wasn't very good. The cheese burger was col

In [26]:
my_query = {"text": {"$regex": re.compile("mcdonalds", re.IGNORECASE)}}

cursor = twitter_info.find(my_query)

i=0

for x in cursor:
    i+=1
    print(x['text']+'\n'+'-----')
    
print('\n --> ' + str(i) + ' documents in total')

McDonalds Hot Mustard &gt; Burger King Hot Mustard
-----
mcdonalds (fries served moldy, inside of nuggets, burger served with raw meat, inside ice cream machine still in use) https://t.co/pLkgYouMWg
-----
ON OCTOBER 6TH, LET'S ALL GO TO MCDONALDS AND ASK FOR THE LUPIN BURGER

THE LOOK ON THE WORKERS FACE WILL BE AWESOME! https://t.co/7e5MCTELIr
-----
LUPIN MEAL AT MCDONALDS?!?!?! I'M GONNA CRY I NEED TO HAVE ONE.. I NEED TO GO TO JAPAN TO GET THE LUPIN BURGER https://t.co/7e5MCTELIr
-----
I love eating McDonalds double cheeseburger extra pickles with French fries extra salt
-----
@nathaliejacoby1 Nope. No way and no how. O'Rourke is a clown and not even qualified to ask "you want fries with that" at the local McDonalds. He will never be a Governor, nor does he deserve to.
-----
@Grubhub Hello, I just ordered food from the local @McDonalds thru the Grubhub app and it looks like the contents were tampered with. I can't receive a refund for total since I refunded the fries. This is quite 

In [27]:
my_query = {"text": {"$regex": re.compile("mcdonald’s", re.IGNORECASE)}}

cursor = twitter_info.find(my_query)

i=0

for x in cursor:
    i+=1
    print(x['text']+'\n'+'-----')
    
print('\n --> ' + str(i) + ' documents in total')

I hate my cravings when Mother Nature be in town. It’s always a double cheese burger from McDonald’s. I haven’t given in to these cravings since college. But right now?! It’s killing me 😭😭
-----
I love me some reheated McDonald’s French fries
-----
@MadScientistFF Wendy’s baconator
McDonald’s fries
Cotton candy blizzard
-----
Just ate a 20 piece and 2 large fries from McDonald’s. Will I die?
-----
McDonald’s fries n a blunt&gt;&gt;&gt; 😩😩😩
-----
I hate my cravings when Mother Nature be in town. It’s always a double cheese burger from McDonald’s. I haven’t given in to these cravings since college. But right now?! It’s killing me 😭😭
-----

 --> 6 documents in total


The 6 + 19 + 6 documents that we get add to 31 documents (the same number that we got with our text search).

**P2:** É preferível usar expressões regulares ou queremos usar o "text index" do MongoDB?

**R2:** Usar expressões regulares.

If we would like to use two conditions:

In [44]:
my_query = {"$and": [{"text": {"$regex": re.compile("mcdonald’s", re.IGNORECASE)}}, 
                     {"search_word": "burger"}]}

cursor = twitter_info.find(my_query)

i=0

for x in cursor:
    i+=1
    print(x['text']+'\n'+'-----')
    
print('\n --> ' + str(i) + ' documents in total')

I hate my cravings when Mother Nature be in town. It’s always a double cheese burger from McDonald’s. I haven’t given in to these cravings since college. But right now?! It’s killing me 😭😭
-----

 --> 1 documents in total


In [45]:
my_query = {"$and": [{"text": {"$regex": re.compile("mcdonald’s", re.IGNORECASE)}}, 
                     {"search_word": "fries"}]}

cursor = twitter_info.find(my_query)

i=0

for x in cursor:
    i+=1
    print(x['text']+'\n'+'-----')
    
print('\n --> ' + str(i) + ' documents in total')

I love me some reheated McDonald’s French fries
-----
@MadScientistFF Wendy’s baconator
McDonald’s fries
Cotton candy blizzard
-----
Just ate a 20 piece and 2 large fries from McDonald’s. Will I die?
-----
McDonald’s fries n a blunt&gt;&gt;&gt; 😩😩😩
-----

 --> 4 documents in total


In [46]:
my_query = {"$and": [{"text": {"$regex": re.compile("mcdonald’s", re.IGNORECASE)}}, 
                     {"search_word": "cheese"}]}

cursor = twitter_info.find(my_query)

i=0

for x in cursor:
    i+=1
    print(x['text']+'\n'+'-----')
    
print('\n --> ' + str(i) + ' documents in total')

I hate my cravings when Mother Nature be in town. It’s always a double cheese burger from McDonald’s. I haven’t given in to these cravings since college. But right now?! It’s killing me 😭😭
-----

 --> 1 documents in total


Este tweet é repetido, mas aparece duas vezes no total de 31 - uma vez com a palavra "burger" e outra com a palavra "cheese".

**P3:** Esta repetição pode ser problemática?

In [47]:
my_query = {"$and": [{"text": {"$regex": re.compile("mcdonald’s", re.IGNORECASE)}}, 
                     {"search_word": "bacon"}]}

cursor = twitter_info.find(my_query)

i=0

for x in cursor:
    i+=1
    print(x['text']+'\n'+'-----')
    
print('\n --> ' + str(i) + ' documents in total')


 --> 0 documents in total


#### 6. Load data from the data_twitter collection into a dataframe

In [15]:
# select tweets posted on 2022-09-26 after 23:29

my_query = {"created_at": {"$gt": "2022-09-26T23:29:00.000Z"}}
            
cursor = twitter_info.find(my_query)

for x in cursor:
    print(x)

{'_id': ObjectId('6332ded8239d27df11bd2017'), 'created_at': '2022-09-26T23:29:55.000Z', 'public_metrics': {'retweet_count': 0, 'reply_count': 0, 'like_count': 1, 'quote_count': 0}, 'text': 'Factory reset your circadian rhythm 101:\nEat something super unhealthy and inflammatory at 9pm like pizza/instant noodles/burger with tons of store bought mayo.\nSleep when you pass out.\n\nTA-DAAA !!', 'id': '1574541810738659328', 'lang': 'en', 'search_word': 'burger'}
{'_id': ObjectId('6332ded8239d27df11bd2018'), 'created_at': '2022-09-26T23:29:49.000Z', 'public_metrics': {'retweet_count': 0, 'reply_count': 0, 'like_count': 0, 'quote_count': 0}, 'text': '@Buffpup_ Me looking at that burger like... https://t.co/5P8HH2uhLs', 'id': '1574541785900384256', 'lang': 'en', 'search_word': 'burger'}
{'_id': ObjectId('6332ded8239d27df11bd2019'), 'created_at': '2022-09-26T23:29:48.000Z', 'public_metrics': {'retweet_count': 0, 'reply_count': 0, 'like_count': 1, 'quote_count': 0}, 'text': '@FearTheFloof As a f

In [16]:
import pandas as pd

In [27]:
cursor = twitter_info.find(my_query)

list_cursor = list(cursor)

df = pd.DataFrame(list_cursor)

In [28]:
df.head()

Unnamed: 0,_id,created_at,public_metrics,text,id,lang,search_word
0,6332ded8239d27df11bd2017,2022-09-26T23:29:55.000Z,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",Factory reset your circadian rhythm 101:\nEat ...,1574541810738659328,en,burger
1,6332ded8239d27df11bd2018,2022-09-26T23:29:49.000Z,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",@Buffpup_ Me looking at that burger like... ht...,1574541785900384256,en,burger
2,6332ded8239d27df11bd2019,2022-09-26T23:29:48.000Z,"{'retweet_count': 0, 'reply_count': 0, 'like_c...","@FearTheFloof As a fellow child of the 80s, I ...",1574541780405702670,en,burger
3,6332ded8239d27df11bd201a,2022-09-26T23:29:39.000Z,"{'retweet_count': 7, 'reply_count': 137, 'like...",Which 3 toppings are you adding to your Burger...,1574541743584088064,en,burger
4,6332ded8239d27df11bd201b,2022-09-26T23:29:27.000Z,"{'retweet_count': 0, 'reply_count': 2, 'like_c...",@Smollbrainarti2 @kawaii_blue368 You should al...,1574541692476174336,en,burger


In [29]:
df['retweets'] = df['public_metrics'].map(lambda x: x['retweet_count'])
df['replies'] = df['public_metrics'].map(lambda x: x['reply_count'])
df['likes'] = df['public_metrics'].map(lambda x: x['like_count'])
df['quotes'] = df['public_metrics'].map(lambda x: x['quote_count'])

df.drop('public_metrics', axis=1, inplace=True)

df.head()

Unnamed: 0,_id,created_at,text,id,lang,search_word,retweets,replies,likes,quotes
0,6332ded8239d27df11bd2017,2022-09-26T23:29:55.000Z,Factory reset your circadian rhythm 101:\nEat ...,1574541810738659328,en,burger,0,0,1,0
1,6332ded8239d27df11bd2018,2022-09-26T23:29:49.000Z,@Buffpup_ Me looking at that burger like... ht...,1574541785900384256,en,burger,0,0,0,0
2,6332ded8239d27df11bd2019,2022-09-26T23:29:48.000Z,"@FearTheFloof As a fellow child of the 80s, I ...",1574541780405702670,en,burger,0,0,1,0
3,6332ded8239d27df11bd201a,2022-09-26T23:29:39.000Z,Which 3 toppings are you adding to your Burger...,1574541743584088064,en,burger,7,137,79,26
4,6332ded8239d27df11bd201b,2022-09-26T23:29:27.000Z,@Smollbrainarti2 @kawaii_blue368 You should al...,1574541692476174336,en,burger,0,2,3,0


#### 7. Check collections after running the .py script to extract keywords

In [47]:
db.list_collection_names()

['data_twitter', 'client_info', 'kw_freq_weight']

#### 8. Select data from the kw_freq_weight collection

In [48]:
 kw_freq_weight = db['kw_freq_weight']

In [49]:
# keywords with term frequency

for doc in kw_freq_weight.find():
    print(doc)

{'_id': ObjectId('633eaf73c37500096947803b'), 'cheese': 728, 'burger': 276, 'bacon': 229, 'fries': 219, 'like': 150, 'mac': 80, 'chicken': 68, 'good': 65, 'eat': 58, 'get': 56, 'love': 53, '️': 53, 'cream': 49, 'go': 46, 'one': 45, '😂': 45, 'eating': 43, 'sandwich': 43, 'want': 42, 'grilled': 41, 'made': 40, 'got': 39, 'egg': 38, 'would': 37, 'n': 37, 'king': 36, 'time': 33, 'know': 33, 'dinner': 32, 'make': 32, '😭': 31, 'french': 31, 'also': 30, '2': 30, 'still': 30, 'pizza': 30, 'nothing': 29, '“': 28, 'sauce': 27, 'day': 27, '”': 27, 'fried': 26, 'think': 26, 'put': 25, 'say': 25, 'people': 24, 'meat': 24, 'really': 23, 'well': 23, 'best': 23, 'cheddar': 22, 'pepper': 22, 'need': 22, 'bread': 22, 'bagel': 21, 'even': 21, 'hot': 21, 'food': 21, 'coffee': 20, 'shit': 20, "i'm": 20, 'way': 20, 'ate': 20, '1': 19, 'new': 19, 'soup': 19, 'always': 19, 'better': 19, 'went': 19, 'two': 19, 'ham': 19, 'lol': 18, '☕': 18, 'ass': 18, 'top': 18, 'guy': 18, 'back': 18, 'could': 17, 'maybe': 17,

In [50]:
# drop collection

kw_freq_weight.drop()

In [51]:
db.list_collection_names()

['data_twitter', 'client_info']

We will now run the .py script to get the term weights.

In [53]:
db.list_collection_names()

['kw_freq_weight', 'data_twitter', 'client_info']

In [54]:
kw_freq_weight = db['kw_freq_weight']

In [55]:
# keywords with term weights (textrank)
for doc in kw_freq_weight.find():
    print(doc)

{'_id': ObjectId('633eb01f6ea77c9738476ffb'), 'cheese': 23.596248039592187, 'bacon': 7.179627618338529, 'fries': 6.995001923096951, 'burger': 6.554524076328362, 'amp': 3.120674068204275, 'mac': 3.107978258661805, 'burger king': 2.8467188239304897, 'french fries': 2.131670943025498, 'good': 1.7690656865241219, 'swiss cheese': 1.494800986126786, 'dick cheese': 1.3379525322003034, 'kevin bacon': 1.2977106730213792, 'cream cheese': 1.2731744802612175, 'people': 1.205777587425598, 'chuck e. cheese': 1.2019863785355152, 'american cheese': 1.151556513707664, 'cheese pizza': 1.080499604057313, 'chicken': 1.027264346856799, 'bacon egg': 1.012400277354275, 'fries tonight': 0.9999999999999998, 'te amo burger king': 0.9999999999999998, 'taco fries tonight': 0.9999999999999998, 'cottage cheese legs': 0.9999999999999998, 'fuck cream cheese': 0.9999999999999998, 'emotional support cheese': 0.9999999999999998, 'goat cheese soup': 0.9999999999999998, 'cottage cheese': 0.9718798021643436, 'dinner': 0.92