In [11]:
import os
from datetime import datetime
import pandas as pd

# Directory holding the downloaded caption .txt files. The location can be
# overridden with the NBA_INSTAGRAM_DATA_DIR environment variable so the
# notebook is not tied to one machine; the original path stays as the default.
# Joining with '' guarantees a trailing separator, so plain string
# concatenation (raw_data_path + filename) keeps working.
raw_data_path = os.path.join(
    os.environ.get('NBA_INSTAGRAM_DATA_DIR', '/Users/rajivmovva/data/instagram/nba/'),
    '',
)

In [40]:
'''
The directory contains lots of .txt files of the format:
    '2023-11-13_04-43-48_UTC.txt'

Get every file matching this format and read the contents
into a dataframe with two columns:
    1. 'date' - Date + time as datetime object
    2. 'text' - The content of the text file
'''

# Build the dataframe columns as plain lists and construct the frame once.
df_data = {'date': [], 'text': []}
for file in os.listdir(raw_data_path):
    if not file.endswith('.txt'):
        continue

    # The first two '_'-separated fields of the filename are the date and the
    # time, e.g. '2023-11-13_04-43-48_UTC.txt' -> '2023-11-13_04-43-48'.
    datetime_str = '_'.join(file.split('_')[:2])
    try:
        # Bind to a new name: assigning to `datetime` would replace the
        # imported class with an instance for the rest of the notebook.
        posted_at = datetime.strptime(datetime_str, '%Y-%m-%d_%H-%M-%S')
    except ValueError:
        # A .txt file that does not follow the timestamp naming scheme.
        continue

    # Captions contain emoji, so read with an explicit encoding instead of the
    # platform default. NOTE(review): assumes the files are UTF-8 - confirm.
    with open(os.path.join(raw_data_path, file), 'r', encoding='utf-8') as f:
        text = f.read()

    df_data['date'].append(posted_at)
    # Collapse every run of whitespace (newlines, tabs, ...) into one space.
    df_data['text'].append(' '.join(text.split()))

# One row per caption, ordered chronologically with a fresh 0..n-1 index.
df = pd.DataFrame(df_data).sort_values(by='date').reset_index(drop=True)

In [44]:
'''
Run spaCy entity extraction on each Instagram caption.

Track all entities as a new column in the dataframe.
But only keep entities with the following labels: {PERSON, GPE}
'''
def extract_entities_batch(captions, include_labels=frozenset({'PERSON', 'GPE'}), nlp=None):
    """Run spaCy entity extraction over captions, keeping selected labels.

    Args:
        captions: Iterable of caption strings (a list or a pandas Series).
        include_labels: Collection of spaCy entity labels to retain; any
            container supporting ``in`` works. Defaults to PERSON and GPE.
        nlp: Optional already-loaded spaCy pipeline to reuse. When omitted,
            the "en_core_web_sm" model is loaded on each call.

    Returns:
        A list with one element per caption, in input order. Each element is
        the list of entity texts whose label is in ``include_labels``.
    """
    if nlp is None:
        # Imported locally so this function does not depend on another cell
        # having imported spaCy first.
        import spacy
        nlp = spacy.load("en_core_web_sm")

    # Process captions in batch and filter entities by label in a single pass
    return [
        [ent.text for ent in doc.ents if ent.label_ in include_labels]
        for doc in nlp.pipe(captions)
    ]

# Extract the filtered entities for every caption, then store them as a new
# column (one list of entity strings per row).
caption_entities = extract_entities_batch(df['text'])
df['spaCy_entities'] = caption_entities


In [50]:
# Frequency table of the extracted spaCy entities (top 20)
from collections import Counter

# Flatten the per-caption entity lists into one long list
entities = []
for caption_entities in df['spaCy_entities']:
    entities.extend(caption_entities)

entity_counts = Counter(entities)
display(entity_counts.most_common(20))

[('@wemby', 52),
 ('LeBron', 42),
 ('KD', 40),
 ('Las Vegas', 40),
 ('@tyresehaliburton', 35),
 ('@shai', 31),
 ('Vegas', 31),
 ('LA', 23),
 ('Wemby', 21),
 ('Steph', 20),
 ('Milwaukee', 20),
 ('Group Play', 20),
 ('Joker', 17),
 ('🏆', 17),
 ('LeBron James', 17),
 ('NBAXmas', 17),
 ('Denver', 16),
 ('NBACelebRow', 16),
 ('Sacramento', 15),
 ('Luka', 13)]

In [42]:
import spacy

def extract_entities_batch(captions):
    """Return the (text, label) pair of every spaCy entity, per caption.

    Loads the "en_core_web_sm" model, runs all captions through it, and
    returns one list of (entity text, entity label) tuples per caption,
    in input order.
    """
    pipeline = spacy.load("en_core_web_sm")

    # Run the whole batch through the pipeline before collecting results
    parsed = list(pipeline.pipe(captions))

    entities_batch = []
    for doc in parsed:
        pairs = []
        for ent in doc.ents:
            pairs.append((ent.text, ent.label_))
        entities_batch.append(pairs)

    return entities_batch

# Sample window of captions (rows 1500-1599) to inspect by eye
instagram_captions = df['text'].iloc[1500:1600].tolist()

# Run entity extraction once over the whole sample
result_batch = extract_entities_batch(instagram_captions)

# Show each caption followed by its entities, one "text: label" per line
for i, (caption, entities) in enumerate(zip(instagram_captions, result_batch)):
    print(f"Entities for caption {i + 1}:")
    print(caption, '\n')
    for entity, label in entities:
        print(f"{entity}: {label}")
    print()


Entities for caption 1:
@theanthonyedwards_ ELEVATES for the big rejection! 🏆 NBA In-Season Tournament 🏀 West Group C action on NBA App 

ELEVATES: PERSON
NBA: ORG
West Group C: ORG
NBA App: ORG

Entities for caption 2:
That’s tough, @tyresehaliburton 😤 🏆 NBA In-Season Tournament 🏀 East Group A action on NBA App 

@tyresehaliburton 😤 🏆: PERSON
NBA: ORG
East Group: ORG
NBA App: ORG

Entities for caption 3:
Friday night vibes are high! 

Friday night: TIME

Entities for caption 4:
@fredvanvleet to @jalen ‼️ 🏆 NBA In-Season Tournament 🏀 West Group B action on NBA App 

@jalen ‼️ 🏆: PERSON
NBA: ORG
West Group: ORG
NBA App: ORG

Entities for caption 5:
Pregame half-court shot with Dad ❤️💙 

half: CARDINAL

Entities for caption 6:
@nyknicks clutch buckets LATE on ESPN! 

ESPN: ORG

Entities for caption 7:
@andrew.nembhard finds @brucebrown in transition! 🏆 NBA In-Season Tournament 🏀 East Group A action on the NBA App 

NBA: ORG
East Group: ORG

Entities for caption 8:
our PA announcer Troy P