In [6]:
# Connect to MongoDB.
#
# The connection URI is taken from the MONGO_URI environment variable rather
# than being typed into the cell, so credentials are never saved with the
# notebook or shown in its outputs.

import os

from pymongo import MongoClient

MONGO_URI = os.environ.get("MONGO_URI", "")  # set MONGO_URI before starting the kernel
if not MONGO_URI:
    # Fail early with an actionable message instead of a driver-level error.
    raise RuntimeError(
        "MONGO_URI is not set; export it in the environment before running this notebook."
    )

client = MongoClient(MONGO_URI)

# Display the database names visible to this connection.
client.list_database_names()

['raw-scrape-data', 'admin', 'local']

In [28]:
# Pick the database and collection that hold the scraped documents.
# Both names are deployment-specific - change them to yours.

db = client.get_database("raw-scrape-data")
collection = db.get_collection("tiktok-user-data")

# An empty filter matches every document, so this displays the collection size.
collection.count_documents({})


7415

In [29]:
# Load the data into pandas, keeping only the analysis fields
# (the fields related to the scrape run are excluded).
# No limit is applied - all records are loaded.

import pandas as pd

# Single source of truth for both the Mongo projection and the frame's columns.
ANALYSIS_FIELDS = [
    "username",
    "video_id",
    "caption",
    "timestamp",
    "duration_sec",
    "view_count",
    "like_count",
    "comment_count",
    "repost_count",
    "hashtags",
]

# Include each analysis field and drop "_id".
projection = {"_id": 0, **{field: 1 for field in ANALYSIS_FIELDS}}

cursor = collection.find({}, projection)

docs = list(cursor)

# columns= pins the schema: every expected column exists, in a fixed order,
# even when the collection is empty or a field is absent from every document.
# Without it, later cells that index these columns would raise KeyError.
df = pd.DataFrame(docs, columns=ANALYSIS_FIELDS)

df.head()

Unnamed: 0,username,video_id,caption,timestamp,duration_sec,view_count,like_count,comment_count,repost_count,hashtags
0,visitdubai,7601573860256894228,our beautiful beautiful city #Dubai #VisitDuba...,1769879340,21.0,1276,98,6,3,"dubai,visitdubai,fyp"
1,visitdubai,7600024923301530900,200km/h through the sky. Ciel Dubai Marina jus...,1769518701,51.0,1175,48,2,2,"xdubai,visitdubai,dubai,thefirstgroup"
2,visitdubai,7599717255353552148,Dubai but make it cinematic ‚ù§Ô∏è‚Äçüî• #VisitDubai #...,1769447068,15.0,1717,118,4,4,"visitdubai,dubai,fyp"
3,visitdubai,7597750555636813076,2016 ü§† #VisitDubai #Dubai #fyp,1768989160,60.0,51200,5977,42,497,"visitdubai,dubai,fyp"
4,visitdubai,7596781431444065556,lucky me ‚ò∫Ô∏è #VisitDubai #Dubai #fyp,1768763520,12.0,2215,109,5,5,"visitdubai,dubai,fyp"


In [30]:
# Convert the epoch-seconds "timestamp" column to datetimes and rename it
# to "posted_date_time".
#
# The cell is safe to re-run: once the column has been converted and renamed
# there is no "timestamp" column left, so the conversion is skipped instead of
# raising KeyError.

if "timestamp" in df.columns:
    df = (
        df
        # errors="coerce" turns unparseable values into NaT instead of raising.
        .assign(timestamp=pd.to_datetime(df["timestamp"], unit="s", errors="coerce"))
        .rename(columns={"timestamp": "posted_date_time"})
    )
elif "posted_date_time" not in df.columns:
    # Neither the source nor the converted column exists: surface it loudly.
    raise KeyError("timestamp")

df.head()


Unnamed: 0,username,video_id,caption,posted_date_time,duration_sec,view_count,like_count,comment_count,repost_count,hashtags
0,visitdubai,7601573860256894228,our beautiful beautiful city #Dubai #VisitDuba...,2026-01-31 17:09:00,21.0,1276,98,6,3,"dubai,visitdubai,fyp"
1,visitdubai,7600024923301530900,200km/h through the sky. Ciel Dubai Marina jus...,2026-01-27 12:58:21,51.0,1175,48,2,2,"xdubai,visitdubai,dubai,thefirstgroup"
2,visitdubai,7599717255353552148,Dubai but make it cinematic ‚ù§Ô∏è‚Äçüî• #VisitDubai #...,2026-01-26 17:04:28,15.0,1717,118,4,4,"visitdubai,dubai,fyp"
3,visitdubai,7597750555636813076,2016 ü§† #VisitDubai #Dubai #fyp,2026-01-21 09:52:40,60.0,51200,5977,42,497,"visitdubai,dubai,fyp"
4,visitdubai,7596781431444065556,lucky me ‚ò∫Ô∏è #VisitDubai #Dubai #fyp,2026-01-18 19:12:00,12.0,2215,109,5,5,"visitdubai,dubai,fyp"


In [31]:
# Snapshot the current frame as df_raw so it can be recovered later, then
# continue working on an independent copy bound to df.

df_raw = df.copy(deep=True)
df = df_raw.copy(deep=True)


In [32]:
# Turn the comma-separated "hashtags" string into a list column for later analysis.

# Normalise: missing values become "", everything is lower-cased, and all
# whitespace is stripped out.
hashtags_normalised = df["hashtags"].fillna("").astype(str).str.lower()
hashtags_normalised = hashtags_normalised.str.replace(r"\s+", "", regex=True)

# Split on commas, then discard the empty strings left by blank values or
# stray commas.
hashtags_split = hashtags_normalised.str.split(",")
df["hashtags_list"] = hashtags_split.apply(lambda parts: list(filter(None, parts)))

df.head()

Unnamed: 0,username,video_id,caption,posted_date_time,duration_sec,view_count,like_count,comment_count,repost_count,hashtags,hashtags_list
0,visitdubai,7601573860256894228,our beautiful beautiful city #Dubai #VisitDuba...,2026-01-31 17:09:00,21.0,1276,98,6,3,"dubai,visitdubai,fyp","[dubai, visitdubai, fyp]"
1,visitdubai,7600024923301530900,200km/h through the sky. Ciel Dubai Marina jus...,2026-01-27 12:58:21,51.0,1175,48,2,2,"xdubai,visitdubai,dubai,thefirstgroup","[xdubai, visitdubai, dubai, thefirstgroup]"
2,visitdubai,7599717255353552148,Dubai but make it cinematic ‚ù§Ô∏è‚Äçüî• #VisitDubai #...,2026-01-26 17:04:28,15.0,1717,118,4,4,"visitdubai,dubai,fyp","[visitdubai, dubai, fyp]"
3,visitdubai,7597750555636813076,2016 ü§† #VisitDubai #Dubai #fyp,2026-01-21 09:52:40,60.0,51200,5977,42,497,"visitdubai,dubai,fyp","[visitdubai, dubai, fyp]"
4,visitdubai,7596781431444065556,lucky me ‚ò∫Ô∏è #VisitDubai #Dubai #fyp,2026-01-18 19:12:00,12.0,2215,109,5,5,"visitdubai,dubai,fyp","[visitdubai, dubai, fyp]"
