In [1]:
import json
import os
import glob
import re
import difflib

import numpy as np
import pandas as pd

import re
from gensim.parsing.preprocessing import remove_stopwords, preprocess_string
import spacy
from gensim import corpora, models, similarities
import gensim
import nltk
import py_stringmatching as sm 
from py_stringmatching.similarity_measure import cosine as cos
import pyLDAvis
%matplotlib inline

import wikipedia
import wikipediaapi
from unidecode import unidecode

import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

#-------
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
import time
from cleantext import clean

### Step 1. Reading Brand/Influencer Profile files folders and Posts JSON files

In [3]:
def read_profile(files):

    # a function to read profile data from profile_files (brand and user)

    """
    [Name]   [Followers]   [Followees]   [Posts]   [URL]   [T/F (you can ignore this field ]   [Category]   [Bio]   [E-mail]   [Phone]   [Profile_pic]

    """


    profile_df = []

    #itarate through the profile files
    for file in files:

        try:


            head, username = os.path.split(file)  #get the filename /username  (teh file name is the username )
            #use pandas to read the file , map each field to its respective headers for eg: col1  represent teh field Name
            prof_info = pd.read_csv(file,delimiter='\t', usecols=[0,1,2,3,4,5,6,7,8,9,10], names=['Name', 'Followers',"Followees",'Posts','URL','T/F','Category','Bio','E-mail','Phone','Profile_pic'], header=None)
            prof_info['username'] = username #add the username to the dataframe
            profile_df.append(prof_info) # save the frame into a list
        except:
            pass




    return pd.concat(profile_df) #concat all the Dataframes and return a single Dataframe with all the profiles

In [4]:

def chunks(lst, n):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]






In [5]:
def process__User_nodes(node):

    #helper function to extract usernames from  edge_media_to_tagged_user & edge_media_to_sponsor_user nodes
    #

    res = []
    try:

        for edge in node['edges']:

            try:
                res.append(edge["node"]["user"]["username"])
            except:
                pass
    except:
        pass

    return ",".join(res)


def get_caption(node):

    #helper function to get the caption text
    res = []
    try:
        for edge in node['edges']:
            try:
                res.append(edge["node"]["text"])
            except:
                pass
    except:
        pass

    return ",".join(res)



In [6]:
def process_json(data):
    """

    a function to process the raw json data and normalize the data


    :param data:
    :return:
    """

    dataset = {}
    dataset['edge_media_to_tagged_user'] = process__User_nodes(data.get('edge_media_to_tagged_user',{})) # get a comma separated list of all tagged users
    dataset['edge_media_to_sponsor_user'] = process__User_nodes(data.get('edge_media_to_sponsor_user',{})) # get a comma separated list of all sponserd users
    dataset['edge_media_to_caption'] = get_caption(data.get('edge_media_to_caption',{})) # get the post caption text
    dataset['owner_full_name'] = data.get('owner',{}).get("full_name",None) # get post owners  full name if avilable, Defaults to None
    dataset['owner_username'] = data.get('owner',{}).get("username",None) # get post owners  username if avilable, Defaults to None
    dataset['owner_id'] = data.get('owner',{}).get("id",None) # get post owners id if avilable, Defaults to None
    dataset['edge_media_preview_like'] = data.get('edge_media_preview_like',{}).get("count",None)  # get total likes on the post if avilable, Defaults to None
    dataset['edge_media_to_comment'] = data.get('edge_media_to_comment',{}).get("count",None) # get the total comment count if avilable, Defaults to None
    dataset["Id"] = data.get("id",None) # get the post id if avilable, Defaults to None
    dataset["Is_ad"] = data.get("is_ad",False)  # get the is_ad value  if avilable, Defaults to False
    dataset["Is_video"] = data.get("is_video",False)  # get the is_video value if avilable, Defaults to False
    dataset["Location"] = data.get("location",None)  # get the location data if avilable, Defaults to None
    dataset["json_file"] = data.get("id","")+'.json'  # get the json_file name json_file = [postid].json
    return dataset


def process_post(chunk):
    """

    function to process the post.json files
    a list of postid.json file path is given as input


    :param chunk:
    :return:
    """

    #itarate through each postid.json file
    for post_file in chunk:

        post_data = json.loads(open(post_file,'r',encoding='utf-8-sig').read()) #read the json file into post_data variable
        try:
            yield process_json(post_data)
            # use process_json function to process the raw json data and yield the result from here
            # a generater is used here because we dont want to keep the large data dict in memory
        except Exception as e:
            print(e)

        post_data = None #memory optimization





converting profile and brand files to csv files  for faster read - you only need to run this one time
once you have th file in you directory  you may not need to run this  again -> uncomment the block bellow to run it


In [12]:
#get all the files in the users_brands_SPOD and users_influencers_SPOD folders
#read_profile function appends all the data and returns a Dataframe

profile_brands = read_profile(glob.glob('users_brands_SPOD//*')) # get
profile_influencers = read_profile(glob.glob('users_influencers_SPOD//*'))

#save the processed data to csv files for simplicity and time management
# read_profile is a heavey opration and must be avoided whenever we can
profile_brands.to_csv('profile_brands.csv',index=False)
profile_influencers.to_csv('profile_influencers.csv',index=False)

# you should keep this block commented out once you have full data in the csv file
# this block should only be uncommented if you have new profile data to add  and it should be commented back once you have the new csv







In [7]:
profile_brands = pd.read_csv('profile_brands.csv').add_prefix('Brand_') # load the brands csv file and give the coloumns a 'Brand_' prefix to identify between brand and  influencer profiles
profile_brands['Brand_username'] = '@'+profile_brands['Brand_username'].str.lower() #add @ to the username
profile_influencers = pd.read_csv('profile_influencers.csv').add_prefix('Influencer_')# load the influencers csv file and give the coloumns a 'Influencer_' prefix to identify between brand and  influencer profile


#### Add ID Columns

In [8]:
profile_brands.index.name
profile_brands.index.name = 'brand_id'
profile_brands = profile_brands.reset_index(level=0)
profile_brands

Unnamed: 0,brand_id,Brand_Name,Brand_Followers,Brand_Followees,Brand_Posts,Brand_URL,Brand_T/F,Brand_Category,Brand_Bio,Brand_E-mail,Brand_Phone,Brand_Profile_pic,Brand_username
0,0,Bahama Buck's,27917.0,50.0,1967.0,http://www.bahamabucks.mybigcommerce.com/,True,Restaurants,The Greatest Sno on Earth! Tag your photos wit...,fans@bahamabucks.com,1.888383e+10,https://scontent-lax3-1.cdninstagram.com/vp/94...,@bahamabucks
1,1,Erica Hoida • Fashioned|Chic,1039994.0,669.0,1364.0,http://www.fashionedchic.com/2018/10/04/classi...,True,Creators & Celebrities,👠 Shoe obsessed in San Diego 📧 Projects@Fashi...,projects@fashionedchic.com,,https://scontent-lax3-1.cdninstagram.com/vp/13...,@fashionedchicstyling
2,2,Body & Fit,83167.0,246.0,758.0,http://bodyenf.it/popeyes-pancakes,True,Personal Goods & General Merchandise Stores,"Jouw dagelijkse dosis motivatie, gezonde recep...",info@bodyenfitshop.nl,3.151368e+10,https://scontent-lax3-1.cdninstagram.com/vp/9b...,@bodyenfitshopnl
3,3,The Skin Agency,14734.0,77.0,315.0,http://theskinagency.com/,True,Personal Goods & General Merchandise Stores,| Laser Hair Removal | Injections | Facials ...,info@theskinagency.com,1.818309e+10,https://scontent-lax3-1.cdninstagram.com/vp/84...,@theskinagency
4,4,Carl Bembridge Celebrity Hair,47991.0,3332.0,1227.0,https://youtu.be/9EWVEJrOZdY,True,Lifestyle Services,▪️Celebrity Hairdresser & Wigmaker Extraordina...,info@carlbembridgehair.co.uk,,https://scontent-lax3-1.cdninstagram.com/vp/9d...,@carlbembridgehair
...,...,...,...,...,...,...,...,...,...,...,...,...,...
25276,25276,Le Lis Blanc,1379408.0,48.0,6965.0,http://bit.ly/lelis_shop,True,Personal Goods & General Merchandise Stores,Le Lis Blanc #lelisblanc | Atelier Le Lis #ate...,,5.511215e+11,https://scontent-lax3-1.cdninstagram.com/vp/f0...,@lelisblanc
25277,25277,Tielle Love Luxury,1894.0,815.0,541.0,https://www.tielleloveluxury.co.uk/blog/simple...,True,Personal Goods & General Merchandise Stores,Leading provider of luxury linens to the world...,enquiries@tielleloveluxury.co.uk,4.414428e+11,https://scontent-lax3-1.cdninstagram.com/vp/b2...,@tielleloveluxury
25278,25278,Wingman®,21984.0,2259.0,283.0,http://wingmanapp.com/,True,Content & Apps,It's not about you. Set up your single friend ...,info@awingman.com,,https://scontent-lax3-1.cdninstagram.com/vp/aa...,@wingman_app
25279,25279,Pirate's Booty,6888.0,60.0,574.0,http://bit.ly/2vdFjJz,True,Home Services,Ahoy matey! You’ve discovered a deliciously ba...,piratessocial@bgfoods.com,,https://scontent-lax3-1.cdninstagram.com/vp/a0...,@piratesbooty


In [9]:
profile_influencers.index.name
profile_influencers.index.name = 'influencer_id'
profile_influencers = profile_influencers.reset_index(level=0)
profile_influencers

Unnamed: 0,influencer_id,Influencer_Name,Influencer_Followers,Influencer_Followees,Influencer_Posts,Influencer_URL,Influencer_T/F,Influencer_Category,Influencer_Bio,Influencer_E-mail,Influencer_Phone,Influencer_Profile_pic,Influencer_username
0,0,@twogoodtoresist,21147,1522,1587,,False,,"😋Our motto: Come Hungry, Leave Happy📍Melbourne...",,,https://scontent-lax3-1.cdninstagram.com/vp/d6...,twogoodtoresist
1,1,Laura,35796,915,640,https://www.girlxdeparture.com/things-to-do-al...,True,Creators & Celebrities,"💫 Life lover, dog stalker & globetrotter 💌 Ma...",contact@girlxdeparture.com,,https://scontent-lax3-1.cdninstagram.com/vp/61...,girlxdeparture
2,2,Emma,76449,451,1141,https://emmaappletonphotography.carbonmade.com/,True,Creators & Celebrities,21✨🌿🍁 london • oxford 🎓 daily musings of an ox...,musingsofemma@gmail.com,,https://scontent-lax3-1.cdninstagram.com/vp/01...,musingsofemma
3,3,Lindsey Danielle Hitch,4087,1046,291,,True,General Interest,Jesus. Tennessee 🌸 be the light. follow my be...,lindsey_hitch@yahoo.com,,https://scontent-lax3-1.cdninstagram.com/vp/6d...,lindsey_hitch
4,4,🌼 Paola Bari⠀🌼,2205,1054,185,,True,Creators & Celebrities,Healthy Food & Vanity Blogger🍴👠💄 🇮🇹 Vicenza ⠀ ...,paola.bari@outlook.it,,https://scontent-lax3-1.cdninstagram.com/vp/4a...,paola_b_a_r_i_
...,...,...,...,...,...,...,...,...,...,...,...,...,...
38099,38099,Anna Blanch Rabe,1106,419,6146,https://linktr.ee/annie_rabe,True,Creators & Celebrities,🖋Writer. 🎤TEDx Speaker. ⚖️Attorney. 💼CEO. 💍Mil...,anna@annablanchrabe.com,1.575446e+10,https://scontent-lax3-1.cdninstagram.com/vp/a1...,annie_rabe
38100,38100,The Fashion Cherry Diary,22179,4453,608,https://thefashioncherrydiary.com/benvenuta-mi...,True,Creators & Celebrities,Isabella #Fashion&Lifestyle #over40 Image cons...,isabella@thefashioncherrydiary.com,,https://scontent-lax3-1.cdninstagram.com/vp/ae...,thefashioncherrydiary
38101,38101,ana.maria.darie (Charlotte),1685,856,1477,,False,,•Sales & Marketing; •ℙℝ Specialist •𝕋𝕍 ℙ𝕣odu...,,,https://scontent-lax3-1.cdninstagram.com/vp/6e...,ana_maria_darie
38102,38102,David 🇩🇪,2560,516,268,,True,General Interest,Deutsche Detailer Aeronautical engineering stu...,dar1898@thi.de,,https://scontent-lax3-1.cdninstagram.com/vp/02...,scalemodels_ge


#### Filter Brands that have > than 1.5M Followers and < than 1.6M Followers

In [10]:
profile_brands = profile_brands[profile_brands['Brand_Followers'].between(1500000,1600000)]
profile_brands

Unnamed: 0,brand_id,Brand_Name,Brand_Followers,Brand_Followees,Brand_Posts,Brand_URL,Brand_T/F,Brand_Category,Brand_Bio,Brand_E-mail,Brand_Phone,Brand_Profile_pic,Brand_username
257,257,bilou • made with love,1589014.0,12.0,226.0,http://www.bilou.de/,True,Personal Goods & General Merchandise Stores,🇩🇪 dm | Rossmann | Müller | Budni 🇦🇹 BIPA | dm...,kontakt@bilou.de,,https://scontent-lax3-1.cdninstagram.com/vp/9f...,@mybilou
1190,1190,OnePlus,1593500.0,98.0,2280.0,http://onepl.us/6T_launchin,True,Personal Goods & General Merchandise Stores,Take your photography to the next level with t...,shoton@oneplus.com,,https://scontent-lax3-1.cdninstagram.com/vp/82...,@oneplus
1264,1264,Rogue Fitness,1510628.0,240.0,3586.0,https://www.roguefitness.com/gear-apparel/new-...,True,Personal Goods & General Merchandise Stores,The leading provider of American made strength...,team@roguefitness.com,16143590000.0,https://scontent-lax3-1.cdninstagram.com/vp/91...,@roguefitness
1279,1279,Bergdorf Goodman,1529651.0,878.0,7668.0,http://like2b.uy/bergdorfs/,True,Personal Goods & General Merchandise Stores,There is only one Bergdorf Goodman. Shop @Berg...,,18009670000.0,https://scontent-lax3-1.cdninstagram.com/vp/fc...,@bergdorfs
1599,1599,McClure Twins - Ava and Alexis,1582795.0,415.0,553.0,https://youtu.be/K4wNuLJDaOw,False,,- Forbes Top 10 Influencer - inquiries: themc...,,,https://scontent-lax3-1.cdninstagram.com/vp/61...,@mccluretwins
2320,2320,Glossier,1502162.0,3491.0,2801.0,http://gls.sr/shop-Glossier-here,True,Personal Goods & General Merchandise Stores,Skin first. Makeup second. 😀👋,gteam@glossier.com,,https://scontent-lax3-1.cdninstagram.com/vp/9d...,@glossier
2406,2406,CELINE,1518436.0,0.0,43.0,http://CELINE.COM/,True,Business & Utility Services,,,,https://scontent-lax3-1.cdninstagram.com/vp/0b...,@celine
2836,2836,hotmiamistyles,1597934.0,1059.0,10842.0,http://www.HotMiamiStyles.com/,True,Personal Goods & General Merchandise Stores,The Official Instagram for Hot Miami Styles ®,info@hotmiamistyles.com,18664640000.0,https://scontent-lax3-1.cdninstagram.com/vp/9f...,@hotmiamistyles
3147,3147,Shawn Johnson East,1561500.0,2732.0,2628.0,https://youtu.be/EEMmN7miHAg,True,Creators & Celebrities,Wife to hottie @andrewdeast Olympian. Youtube...,shawnjohnsonmgmt@gmail.com,,https://scontent-lax3-1.cdninstagram.com/vp/11...,@shawnjohnson
3190,3190,Robbie Williams,1544725.0,40.0,1214.0,https://robbiewilliams.lnk.to/revealIN,True,Creators & Celebrities,Life Thru A Lens,help@robbiewilliams.com,,https://scontent-lax3-1.cdninstagram.com/vp/34...,@robbiewilliams


In [12]:
#use pandas to read the post_info.txt file , map each field to its respective headers for eg: col1  represent the field username
post_info = pd.read_csv('post_info.txt',delimiter='\t', usecols=[1,2,3], names=['username', 'Sponsorship_label',"json_file"], header=None)
# get all posts by  users specified in the 'usernames_to_check'  list
# keep only the posts where 'Sponsorship_label' equals '1'
post_info = post_info[(post_info['Sponsorship_label'] == 1)]


#### Read JSON files

In [18]:
#read json files from json_file folder
json_files = [f"json_file//{x.strip()}" for x in post_info['json_file'].tolist()]

with open('all_sponserd_posts.json','w',encoding='utf-8-sig') as myjsonfile:
    json.dump([x for x in process_post(json_files)],myjsonfile) #merge all the json files into one file for faster acces next time

# you should keep this block commented out once you have full data in the json file
# this block should only be uncommented if you have new post files to add  and it should be commented back once you have the all_sponserd_posts.json file


In [13]:
chunk_df = pd.DataFrame(json.loads(open('all_sponserd_posts.json','r',encoding='utf-8-sig').read()))#use the all_sponserd_posts.json file for faster access



In [14]:
chunk_df = chunk_df.merge(post_info,on="json_file",how='left')   # merge the new dataframe with post_info Dataframe based on the "json_file" coloumn present in both the dataframes


In [15]:
chunk_df 

Unnamed: 0,edge_media_to_tagged_user,edge_media_to_sponsor_user,edge_media_to_caption,owner_full_name,owner_username,owner_id,edge_media_preview_like,edge_media_to_comment,Id,Is_ad,Is_video,Location,json_file,username,Sponsorship_label
0,,,Got my #GrazeBox again from @grazeusa\nI love ...,,,322123222,914,3,1309041812857818435,False,False,,1309041812857818435.json,alisasia,1
1,,,@kraft_macandcheese takes only 10 min to prepa...,,,322123222,899,3,1476490497539529413,False,False,,1476490497539529413.json,alisasia,1
2,,,💐👛 @rosegal_official #ad,,,322123222,1112,0,1606115486206804899,False,False,,1606115486206804899.json,alisasia,1
3,,,,Alisa Sia,alisasia,322123222,2406,7,1606130952468007792,False,False,"{'address_json': '{""street_address"": """", ""zip_...",1606130952468007792.json,alisasia,1
4,,,Tnx @livingroyal #livingroyal #ad,,,322123222,1206,0,1651957573904970960,False,False,,1651957573904970960.json,alisasia,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
221705,,,"It's World Smile Day tomorrow, (Friday, Octobe...",,,2965172,1026,4,1883076195117984662,False,False,,1883076195117984662.json,thefashionpoet,1
221706,,,"Pre-social media days, pre-blog days, we were ...",,,2965172,1251,27,1884326074935014613,False,False,,1884326074935014613.json,thefashionpoet,1
221707,,,Love is the answer ♥️.Got this custom designed...,,,2965172,1082,17,1890259046011829896,False,False,,1890259046011829896.json,thefashionpoet,1
221708,,,Love my rainbow 🌈 bikini from @southmoonunder....,,,2965172,1527,38,1989242405811088071,False,False,,1989242405811088071.json,thefashionpoet,1


### Step 2. Finding Posts that mention selected Brands in Captions

In [16]:
def get_all_usernames_from_tokens(edge_media_to_caption_tokes):

    # check if token array is empty or not if empty  return null
    if len(edge_media_to_caption_tokes) == 0:
        return None


    matched_brands = []  #to hold matched brands

    #itarate through all the tokens
    for token in edge_media_to_caption_tokes:
        b = str(token).strip().lower() #convert each token to lowercase and strip whitespaces for a standerdized form
        if b.startswith('@'): # if a token starts with "@" we can  safely assume its a username
            matched_brands.append(token) # add the token to the matched list

    if len(matched_brands) == 0:
        # there is no matched brands then return None
        return None
    else:
        #remove duplicates and return matched list
        return list(set(matched_brands))

def tokenize_str(string):
    #split string by whitespace remove all special chars Except
    string = str(string).lower() # convert the text into lowercase for easier matching
    string = string.replace('#'," #").replace('\n'," \n") # replace all "#" values with " #" (whitespace + #) otherwise this would give a false match
    return re.sub("[^\w@ ]", "", string).split()  #split string by whitespace remove all special chars Except @

  return re.sub("[^\w@ ]", "", string).split()  #split string by whitespace remove all special chars Except @


In [17]:
chunk_df['edge_media_to_caption_tokes'] =  chunk_df['edge_media_to_caption'].apply(tokenize_str,lambda x:x)  # tokanize the edge_media_to_caption

In [18]:

chunk_df['brand_matched'] = chunk_df['edge_media_to_caption_tokes'].apply(lambda x:get_all_usernames_from_tokens(x)) # call get_all_usernames_from_tokens function to get all usernames from caption





In [19]:
chunk_df = chunk_df.explode('brand_matched')

In [20]:
chunk_df.dropna(subset=['brand_matched'],inplace=True) #remove all rows where brand_matched is null
del chunk_df['edge_media_to_caption_tokes'] # delete edge_media_to_caption_tokes - > we dont need this anymore


chunk_df = chunk_df[chunk_df['brand_matched'].isin(profile_brands['Brand_username'])] #check if the matched brand is in the brands_to_check list


### Step 3. Join Posts with Brand Profiles and Influencer Profiles

In [21]:
# join the new dataframe with profile_brands Dataframe,  keep all records where 'brand_matched' from the new df  equals 'Brand_username' on the profile_brands Dataframe
chunk_df = chunk_df.merge(profile_brands,left_on='brand_matched',right_on='Brand_username',how='left')

# join the new dataframe with profile_influencers Dataframe,  keep all records where 'username' from the new df  equals 'Influencer_username' on the profile_influencers Dataframe
chunk_df = chunk_df.merge(profile_influencers,left_on='username',right_on='Influencer_username',how='left')

In [22]:
chunk_df # print the final dataframe

Unnamed: 0,edge_media_to_tagged_user,edge_media_to_sponsor_user,edge_media_to_caption,owner_full_name,owner_username,owner_id,edge_media_preview_like,edge_media_to_comment,Id,Is_ad,...,Influencer_Followees,Influencer_Posts,Influencer_URL,Influencer_T/F,Influencer_Category,Influencer_Bio,Influencer_E-mail,Influencer_Phone,Influencer_Profile_pic,Influencer_username
0,,,New Playground 🌳 New Outfit 🏃🏼‍♀️ #makewavesmo...,,,2115809609,2539,22,1855948616494059747,False,...,436,559,https://www.savage-shop.com/shop/clothing/tops...,True,Creators & Celebrities,SALT in my eyes| WATER in my nose| CORAL in my...,maylakind@gmail.com,,https://scontent-lax3-1.cdninstagram.com/vp/43...,mayla_kind
1,,,💥 @amazon #amazonfashion #fashionblogger #revi...,,cherixxo,1600914211,131,7,1856137461031744929,False,...,925,743,http://www.cheriglowcosmetics.com/,False,,Eternal dreamer. Maktub مكتوب Paz y amor. ✨ 👻...,,,https://scontent-lax3-1.cdninstagram.com/vp/88...,cherixxo
2,,,🐊 @amazon #reviews #ad #spon,,cherixxo,1600914211,125,4,1856137870882288814,False,...,925,743,http://www.cheriglowcosmetics.com/,False,,Eternal dreamer. Maktub مكتوب Paz y amor. ✨ 👻...,,,https://scontent-lax3-1.cdninstagram.com/vp/88...,cherixxo
3,,,✨ @amazon #amazonfashion #fashionblogger #revi...,,,1600914211,154,5,1858196895417397646,False,...,925,743,http://www.cheriglowcosmetics.com/,False,,Eternal dreamer. Maktub مكتوب Paz y amor. ✨ 👻...,,,https://scontent-lax3-1.cdninstagram.com/vp/88...,cherixxo
4,,,✨ @amazon #amazonfashion #fashionblogger #styl...,,,1600914211,114,3,1858368831405513732,False,...,925,743,http://www.cheriglowcosmetics.com/,False,,Eternal dreamer. Maktub مكتوب Paz y amor. ✨ 👻...,,,https://scontent-lax3-1.cdninstagram.com/vp/88...,cherixxo
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1581,thebodyshop,,"Let’s talk about skin care, I have skin that i...",W I T H F A R R A R,withfarrar,1534888077,444,54,1871982045535989955,False,...,900,323,https://linktr.ee/withfarrar,True,Creators & Celebrities,K E I R A F A R R A R 👋🏼 🌵 You'll see a lot of...,withfarrar@gmail.com,,https://scontent-lax3-1.cdninstagram.com/vp/83...,withfarrar
1582,thebodyshop,,"🎄 ALLOW IT, it’s December! And the whole of De...",W I T H F A R R A R,withfarrar,1534888077,410,101,1926452264537680041,False,...,900,323,https://linktr.ee/withfarrar,True,Creators & Celebrities,K E I R A F A R R A R 👋🏼 🌵 You'll see a lot of...,withfarrar@gmail.com,,https://scontent-lax3-1.cdninstagram.com/vp/83...,withfarrar
1583,,,@shannan.rn - @thescoopie 🙆 is such a clever ...,,,206368670,54,1,2002404708060326850,False,...,182,2401,http://amzn.to/2MAVcBv,True,Personal Goods & General Merchandise Stores,💥Portable POWDER Dispenser 💪Simplify Supplemen...,jarredallen@thescoopie.com,1.512700e+10,https://scontent-lax3-1.cdninstagram.com/vp/4d...,thescoopie
1584,,,Ignite your senses with the oriental scent of ...,,,25552890,758,9,1799101971636188097,False,...,398,218,http://thedailyxoxot.blogspot.com/,True,Creators & Celebrities,You can’t always get what you want🥀 But if you...,email@me.com,,https://scontent-lax3-1.cdninstagram.com/vp/b5...,xoxo_tiana_xoxo


In [23]:
chunk_df.to_csv('final_results.csv',index=False)

In [24]:
df = pd.read_csv('final_results.csv')

In [25]:
df

Unnamed: 0,edge_media_to_tagged_user,edge_media_to_sponsor_user,edge_media_to_caption,owner_full_name,owner_username,owner_id,edge_media_preview_like,edge_media_to_comment,Id,Is_ad,...,Influencer_Followees,Influencer_Posts,Influencer_URL,Influencer_T/F,Influencer_Category,Influencer_Bio,Influencer_E-mail,Influencer_Phone,Influencer_Profile_pic,Influencer_username
0,,,New Playground 🌳 New Outfit 🏃🏼‍♀️ #makewavesmo...,,,2115809609,2539,22,1855948616494059747,False,...,436,559,https://www.savage-shop.com/shop/clothing/tops...,True,Creators & Celebrities,SALT in my eyes| WATER in my nose| CORAL in my...,maylakind@gmail.com,,https://scontent-lax3-1.cdninstagram.com/vp/43...,mayla_kind
1,,,💥 @amazon #amazonfashion #fashionblogger #revi...,,cherixxo,1600914211,131,7,1856137461031744929,False,...,925,743,http://www.cheriglowcosmetics.com/,False,,Eternal dreamer. Maktub مكتوب Paz y amor. ✨ 👻...,,,https://scontent-lax3-1.cdninstagram.com/vp/88...,cherixxo
2,,,🐊 @amazon #reviews #ad #spon,,cherixxo,1600914211,125,4,1856137870882288814,False,...,925,743,http://www.cheriglowcosmetics.com/,False,,Eternal dreamer. Maktub مكتوب Paz y amor. ✨ 👻...,,,https://scontent-lax3-1.cdninstagram.com/vp/88...,cherixxo
3,,,✨ @amazon #amazonfashion #fashionblogger #revi...,,,1600914211,154,5,1858196895417397646,False,...,925,743,http://www.cheriglowcosmetics.com/,False,,Eternal dreamer. Maktub مكتوب Paz y amor. ✨ 👻...,,,https://scontent-lax3-1.cdninstagram.com/vp/88...,cherixxo
4,,,✨ @amazon #amazonfashion #fashionblogger #styl...,,,1600914211,114,3,1858368831405513732,False,...,925,743,http://www.cheriglowcosmetics.com/,False,,Eternal dreamer. Maktub مكتوب Paz y amor. ✨ 👻...,,,https://scontent-lax3-1.cdninstagram.com/vp/88...,cherixxo
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1581,thebodyshop,,"Let’s talk about skin care, I have skin that i...",W I T H F A R R A R,withfarrar,1534888077,444,54,1871982045535989955,False,...,900,323,https://linktr.ee/withfarrar,True,Creators & Celebrities,K E I R A F A R R A R 👋🏼 🌵 You'll see a lot of...,withfarrar@gmail.com,,https://scontent-lax3-1.cdninstagram.com/vp/83...,withfarrar
1582,thebodyshop,,"🎄 ALLOW IT, it’s December! And the whole of De...",W I T H F A R R A R,withfarrar,1534888077,410,101,1926452264537680041,False,...,900,323,https://linktr.ee/withfarrar,True,Creators & Celebrities,K E I R A F A R R A R 👋🏼 🌵 You'll see a lot of...,withfarrar@gmail.com,,https://scontent-lax3-1.cdninstagram.com/vp/83...,withfarrar
1583,,,@shannan.rn - @thescoopie 🙆 is such a clever ...,,,206368670,54,1,2002404708060326850,False,...,182,2401,http://amzn.to/2MAVcBv,True,Personal Goods & General Merchandise Stores,💥Portable POWDER Dispenser 💪Simplify Supplemen...,jarredallen@thescoopie.com,1.512700e+10,https://scontent-lax3-1.cdninstagram.com/vp/4d...,thescoopie
1584,,,Ignite your senses with the oriental scent of ...,,,25552890,758,9,1799101971636188097,False,...,398,218,http://thedailyxoxot.blogspot.com/,True,Creators & Celebrities,You can’t always get what you want🥀 But if you...,email@me.com,,https://scontent-lax3-1.cdninstagram.com/vp/b5...,xoxo_tiana_xoxo


### Step4. Group by brand Names and keep columns with attributes useful for KG

In [26]:
g = df.groupby(['brand_matched']).apply(lambda x: x.nlargest(20,['Influencer_Followers'])).reset_index(drop=True)

In [27]:
df_final = g[['brand_id','Brand_Name','Brand_Followers','Brand_Followees','Brand_Posts','Brand_URL','Brand_Category','Id','edge_media_to_caption','edge_media_preview_like','edge_media_to_comment','influencer_id','Influencer_Name','Influencer_Followers','Influencer_Followees','Influencer_Posts','Influencer_Category']]
df_final

Unnamed: 0,brand_id,Brand_Name,Brand_Followers,Brand_Followees,Brand_Posts,Brand_URL,Brand_Category,Id,edge_media_to_caption,edge_media_preview_like,edge_media_to_comment,influencer_id,Influencer_Name,Influencer_Followers,Influencer_Followees,Influencer_Posts,Influencer_Category
0,3209,CREME PARA ESTRIAS,1525982.0,347.0,2533.0,http://www.100estrias.com.br/,Personal Goods & General Merchandise Stores,1877001634463122445,Eu e meu chamego de sempre passando pelo seu f...,46572,236,35112,Lorena Improta,5074442,1966,5127,Creators & Celebrities
1,3209,CREME PARA ESTRIAS,1525982.0,347.0,2533.0,http://www.100estrias.com.br/,Personal Goods & General Merchandise Stores,1891646163346665904,"Minha cara quando falam: ""Lore, tem sobremesa""...",54989,286,35112,Lorena Improta,5074442,1966,5127,Creators & Celebrities
2,3209,CREME PARA ESTRIAS,1525982.0,347.0,2533.0,http://www.100estrias.com.br/,Personal Goods & General Merchandise Stores,1916289352699275991,Tô de olho em você que ainda não seguiu minha ...,67286,400,35112,Lorena Improta,5074442,1966,5127,Creators & Celebrities
3,3209,CREME PARA ESTRIAS,1525982.0,347.0,2533.0,http://www.100estrias.com.br/,Personal Goods & General Merchandise Stores,1936582221779537401,Pq se nao tiver essa cara vocês sabem que não ...,61472,370,35112,Lorena Improta,5074442,1966,5127,Creators & Celebrities
4,3209,CREME PARA ESTRIAS,1525982.0,347.0,2533.0,http://www.100estrias.com.br/,Personal Goods & General Merchandise Stores,1961799656992243034,Minha cara para quem fala que ainda não compro...,38757,289,35112,Lorena Improta,5074442,1966,5127,Creators & Celebrities
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
437,7850,Wish,1589576.0,162.0,3923.0,https://www.wish.com/j/IG_UGC_wk23,Content & Apps,1893276039070046741,In case you wanted to see the amazing goodies ...,745,32,7392,Tiffany 🌹 - Beauty Influencer,19260,6305,184,Creators & Celebrities
438,7850,Wish,1589576.0,162.0,3923.0,https://www.wish.com/j/IG_UGC_wk23,Content & Apps,1826358290554904977,🦋WHITE CHERRY BLOSSOMS🦋\n»Werbung«\nMeinen Hoc...,1038,33,589,🌸 P I N V R 🌸,17938,895,147,Creators & Celebrities
439,7850,Wish,1589576.0,162.0,3923.0,https://www.wish.com/j/IG_UGC_wk23,Content & Apps,1893069380612570908,It’s double the happiness when I have two of m...,1099,75,32928,Annie Cho,16072,1825,531,Creators & Celebrities
440,7850,Wish,1589576.0,162.0,3923.0,https://www.wish.com/j/IG_UGC_wk23,Content & Apps,1895236499700952257,Sunday sunshine in my latest finds from @wish ...,637,61,12361,"Hi y’all, I’m Brandy G!",15444,1565,1088,Creators & Celebrities


In [28]:
df_final.rename(columns = {'Id':'post_id','edge_media_to_caption':'post_caption', 'edge_media_preview_like':'post_likes','edge_media_to_comment':'post_comments'}, inplace = True)
df_final

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final.rename(columns = {'Id':'post_id','edge_media_to_caption':'post_caption', 'edge_media_preview_like':'post_likes','edge_media_to_comment':'post_comments'}, inplace = True)


Unnamed: 0,brand_id,Brand_Name,Brand_Followers,Brand_Followees,Brand_Posts,Brand_URL,Brand_Category,post_id,post_caption,post_likes,post_comments,influencer_id,Influencer_Name,Influencer_Followers,Influencer_Followees,Influencer_Posts,Influencer_Category
0,3209,CREME PARA ESTRIAS,1525982.0,347.0,2533.0,http://www.100estrias.com.br/,Personal Goods & General Merchandise Stores,1877001634463122445,Eu e meu chamego de sempre passando pelo seu f...,46572,236,35112,Lorena Improta,5074442,1966,5127,Creators & Celebrities
1,3209,CREME PARA ESTRIAS,1525982.0,347.0,2533.0,http://www.100estrias.com.br/,Personal Goods & General Merchandise Stores,1891646163346665904,"Minha cara quando falam: ""Lore, tem sobremesa""...",54989,286,35112,Lorena Improta,5074442,1966,5127,Creators & Celebrities
2,3209,CREME PARA ESTRIAS,1525982.0,347.0,2533.0,http://www.100estrias.com.br/,Personal Goods & General Merchandise Stores,1916289352699275991,Tô de olho em você que ainda não seguiu minha ...,67286,400,35112,Lorena Improta,5074442,1966,5127,Creators & Celebrities
3,3209,CREME PARA ESTRIAS,1525982.0,347.0,2533.0,http://www.100estrias.com.br/,Personal Goods & General Merchandise Stores,1936582221779537401,Pq se nao tiver essa cara vocês sabem que não ...,61472,370,35112,Lorena Improta,5074442,1966,5127,Creators & Celebrities
4,3209,CREME PARA ESTRIAS,1525982.0,347.0,2533.0,http://www.100estrias.com.br/,Personal Goods & General Merchandise Stores,1961799656992243034,Minha cara para quem fala que ainda não compro...,38757,289,35112,Lorena Improta,5074442,1966,5127,Creators & Celebrities
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
437,7850,Wish,1589576.0,162.0,3923.0,https://www.wish.com/j/IG_UGC_wk23,Content & Apps,1893276039070046741,In case you wanted to see the amazing goodies ...,745,32,7392,Tiffany 🌹 - Beauty Influencer,19260,6305,184,Creators & Celebrities
438,7850,Wish,1589576.0,162.0,3923.0,https://www.wish.com/j/IG_UGC_wk23,Content & Apps,1826358290554904977,🦋WHITE CHERRY BLOSSOMS🦋\n»Werbung«\nMeinen Hoc...,1038,33,589,🌸 P I N V R 🌸,17938,895,147,Creators & Celebrities
439,7850,Wish,1589576.0,162.0,3923.0,https://www.wish.com/j/IG_UGC_wk23,Content & Apps,1893069380612570908,It’s double the happiness when I have two of m...,1099,75,32928,Annie Cho,16072,1825,531,Creators & Celebrities
440,7850,Wish,1589576.0,162.0,3923.0,https://www.wish.com/j/IG_UGC_wk23,Content & Apps,1895236499700952257,Sunday sunshine in my latest finds from @wish ...,637,61,12361,"Hi y’all, I’m Brandy G!",15444,1565,1088,Creators & Celebrities


In [29]:
df_final.to_csv('df_final.csv',index=False)

In [30]:
df_final = pd.read_csv('df_final.csv')

-------------------------------------------------------------------

### Create Brand Nodes

In [31]:
wikipedia.set_lang('en')
wikiApi = wikipediaapi.Wikipedia('en')

In [32]:
emoji_pattern  = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002500-\U00002BEF"  # chinese char
        u"\U00002702-\U000027B0"
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642" 
        u"\u2600-\u2B55"
        u"\u200d"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"  # dingbats
        u"\u3030"
                      "]+", re.UNICODE)

def remove_emoji(input_string):
    for ele in input_string:
        word = emoji_pattern.sub(r'', input_string) # No emoji
    return word 

In [33]:
def cleanListOfWords(listOfWords):
    listOfWords =  [remove_emoji(i).lower() for i in listOfWords if i != '']
    listOfWords = [re.sub('\S*@\S*\s?', '', word).lower() for word in listOfWords]
    listOfWords = [re.sub(r"[^a-zA-Z0-9 ]", " ", word) for word in listOfWords]
    listOfWords = [re.sub('\s+', ' ', word) for word in listOfWords]
    listOfWords = [re.sub("\'", "", word) for word in listOfWords]
    listOfWords = [re.sub(r'[^\w\s]', '', word) for word in listOfWords]
    listOfWords = [unidecode(word) for word in listOfWords]
    return listOfWords

  listOfWords = [re.sub('\S*@\S*\s?', '', word).lower() for word in listOfWords]
  listOfWords = [re.sub('\s+', ' ', word) for word in listOfWords]


In [34]:
brandNames = list(set(df_final["Brand_Name"]))
brandNames

['Pizza Hut 🍕',
 'FARSÁLI - Beauty with Benefits',
 'CREME PARA ESTRIAS',
 'Massimo Dutti',
 'ROXY',
 'Universal Studios Hollywood',
 'Barneys New York',
 'Rogue Fitness',
 'Philadelphia Eagles',
 'RubyRose_Oficial',
 'Universal Orlando Resort',
 'Moda',
 'The Body Shop Official',
 'Koton',
 'dm-drogerie markt Deutschland',
 'Shawn Johnson East',
 'Gymshark Women',
 'live lokai',
 'QUAY AUSTRALIA',
 'Glossier',
 'Rashida Jones',
 'OnePlus',
 'Bottega Veneta',
 'CELINE',
 'McClure Twins - Ava and Alexis',
 'Official Kylie Jenner Shop',
 'adidas London',
 'Nike Training Club Live',
 'Wish',
 'Bergdorf Goodman',
 'Burger King',
 'TEZENiS',
 'hotmiamistyles',
 'Ludovica Valli',
 'Amazon']

In [35]:
cleanedBrandNames = cleanListOfWords(brandNames)
cleanedBrandNames

['pizza hut ',
 'fars li beauty with benefits',
 'creme para estrias',
 'massimo dutti',
 'roxy',
 'universal studios hollywood',
 'barneys new york',
 'rogue fitness',
 'philadelphia eagles',
 'rubyrose oficial',
 'universal orlando resort',
 'moda',
 'the body shop official',
 'koton',
 'dm drogerie markt deutschland',
 'shawn johnson east',
 'gymshark women',
 'live lokai',
 'quay australia',
 'glossier',
 'rashida jones',
 'oneplus',
 'bottega veneta',
 'celine',
 'mcclure twins ava and alexis',
 'official kylie jenner shop',
 'adidas london',
 'nike training club live',
 'wish',
 'bergdorf goodman',
 'burger king',
 'tezenis',
 'hotmiamistyles',
 'ludovica valli',
 'amazon']

In [36]:
q3_token = sm.QgramTokenizer(qval=3)
cosine = cos.Cosine()
jaccard = sm.Jaccard()

In [37]:
phrasesList = []
for (idx, brand) in enumerate(cleanedBrandNames):
    #get rid of all unecessary words that could skew cosine similarity
    plainBrandName = brandNames[idx]
    phrase = '%20'.join([remove_stopwords(abst) for abst in brand.split(' ')])
    url = f'https://en.wikipedia.org/w/api.php?action=query&list=search&srsearch={phrase}&utf8=&format=json'
    jsonPhrase = pd.read_json(url, orient='records')
    
    scores = []
    
    for brandToToken in [brand + '(company)']:
        phraseToken = q3_token.tokenize(brandToToken)
        for result in jsonPhrase['query']['search']:            
            title = result['title'].lower()           
            q3TokenSample = q3_token.tokenize(title)
            cosScoreQ3 = cosine.get_sim_score(q3TokenSample, phraseToken)
            scores.append([result['title'], cosScoreQ3]) 
        
    if('roxy' in brand):
        scores = [["Quicksilver (company)", 1]]
    if('moda' == brand):
        scores = [["Modà", 1.1]]

    if(scores):
        maxScore = max(scores, key=lambda item:item[1])
        phrasesList.append([plainBrandName, maxScore[0]])    
    else:
        phrasesList.append([plainBrandName, ''])    

In [38]:
phrasesList

[['Pizza Hut 🍕', 'Pizza Hut'],
 ['FARSÁLI - Beauty with Benefits', 'Li Yundi'],
 ['CREME PARA ESTRIAS', ''],
 ['Massimo Dutti', 'Massimo Dutti'],
 ['ROXY', 'Quicksilver (company)'],
 ['Universal Studios Hollywood', 'Universal Studios Hollywood'],
 ['Barneys New York', 'Barneys New York'],
 ['Rogue Fitness', 'Rogue Fitness'],
 ['Philadelphia Eagles', 'Philadelphia Eagles'],
 ['RubyRose_Oficial', ''],
 ['Universal Orlando Resort', 'Universal Orlando'],
 ['Moda', 'Modà'],
 ['The Body Shop Official', 'The Body Shop'],
 ['Koton', 'Koton (company)'],
 ['dm-drogerie markt Deutschland', 'EBS Symposium'],
 ['Shawn Johnson East', 'Shawn Johnson East'],
 ['Gymshark Women', 'Gymshark'],
 ['live lokai', 'List of Tales from the Crypt episodes'],
 ['QUAY AUSTRALIA', 'Quay (restaurant)'],
 ['Glossier', 'Glossy ibis'],
 ['Rashida Jones', 'Rashida Jones'],
 ['OnePlus', 'OnePlus'],
 ['Bottega Veneta', 'Bottega Veneta'],
 ['CELINE', 'Celine (brand)'],
 ['McClure Twins - Ava and Alexis', 'McClure twins'],


In [39]:
def lemmatization(texts, nlp, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [40]:
def findKeywords(brandName, listOfWords, numWords):
    wikiPage = cleanListOfWords(listOfWords)
    wikiPage = [word for word in wikiPage if word.lower() not in brandName.lower().split(" ")]
    
    absGram = [remove_stopwords(abst) for abst in wikiPage]
    absGramSplit = [remove_stopwords(abst).split(' ') for abst in wikiPage]
    
    bigram = gensim.models.Phrases(absGramSplit, min_count=4, threshold=1000) 
    trigram = gensim.models.Phrases(bigram[absGramSplit], threshold=100)  

    # Faster way to get a sentence clubbed as a trigram/bigram
    bigram_mod = gensim.models.phrases.Phraser(bigram)
    trigram_mod = gensim.models.phrases.Phraser(trigram)

    def make_bigrams(texts):
        return [bigram_mod[doc] for doc in texts]

    def make_trigrams(texts):
        return [trigram_mod[bigram_mod[doc]] for doc in texts]

    data_words_trigrams = make_trigrams(absGramSplit)

    nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

    tri_lemmatized = lemmatization(data_words_trigrams, nlp, allowed_postags=['NOUN', 'ADJ', 'VERB'])

    id2word = corpora.Dictionary(tri_lemmatized)
    texts = tri_lemmatized
    corpus = [id2word.doc2bow(text) for text in texts]

    allTopics = []
    ldaModel = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=numWords, 
                                           random_state=133,
                                           update_every=10,
                                           chunksize=len(wikiPage),
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=False)           

    doc_lda = ldaModel[corpus]

    topics = []
    
    for idx, topic in ldaModel.show_topics(formatted=False, num_words= numWords):
        topics.extend([w[0] for w in topic])
    
    return topics

In [41]:
keyWordsList = []
for brand in phrasesList:
    topicsOfTopics = []
    if(brand[1] != ''):
        wikiTitle = brand[1]
        wikiPage = wikipedia.summary(title = wikiTitle, auto_suggest=False).split(' ')
        topics = findKeywords(brand[0], wikiPage, 10);
        topicsOfTopics = findKeywords(brand[0], topics, 2)
    
    keyWordsList.append([brand[0], topicsOfTopics])
    
print(keyWordsList)



[['Pizza Hut 🍕', ['serve', 'location', 'chain', 'large']], ['FARSÁLI - Beauty with Benefits', ['competition', 'chinese', 'young', 'know']], ['CREME PARA ESTRIAS', []], ['Massimo Dutti', ['multinational', 'luxury', 'pronunciation', 'spanish']], ['ROXY', ['quicksilver', 'gambling', 'amusement', 'machine']], ['Universal Studios Hollywood', ['park', 'offer', 'theme', 'city']], ['Barneys New York', ['brand', 'avenue', 'authentic', 'fifth']], ['Rogue Fitness', ['gym', 'base', 'equipment', 'box']], ['Philadelphia Eagles', ['football', 'appear', 'team', 'rivalry']], ['RubyRose_Oficial', []], ['Universal Orlando Resort', ['loew', 'royal', 'hotel', 'world']], ['Moda', ['silvestre', 'bass', 'guitarist', 'pop']], ['The Body Shop Official', ['natura', 'cosmetic', 'trading', 'sell']], ['Koton', ['store', 'retail', 'multinational', 'turkish']], ['dm-drogerie markt Deutschland', ['year', 'speaker', 'symposium', 'eb']], ['Shawn Johnson East', ['champion', 'world', 'team', 'balance']], ['Gymshark Women'

-----------------------

### Create Nodes and Relationships

Create Brand Nodes

In [42]:
nodes = []
for word in keyWordsList:
    brand = df_final[df_final['Brand_Name'] == word[0]].iloc[0]
    nodes.append([brand['brand_id'], "Brand", brand['Brand_Name'], word[1], brand['Brand_Followers'], \
                  brand['Brand_Followees'], brand['Brand_Posts'], brand["Brand_URL"], brand["Brand_Category"]])
        
brandNodeDf = pd.DataFrame(nodes, columns=['brand_ID', ':LABEL','Brand_Name','Keywords', "Number_Of_Followers", "Number_Of_Followees", "Number_Of_Posts", "Brand_Url", "Categories"])

In [43]:
brandNodeDf.head()

Unnamed: 0,brand_ID,:LABEL,Brand_Name,Keywords,Number_Of_Followers,Number_Of_Followees,Number_Of_Posts,Brand_Url,Categories
0,8209,Brand,Pizza Hut 🍕,"[serve, location, chain, large]",1510972.0,371.0,1444.0,http://www.PizzaHut.com/NFL,Restaurants
1,21273,Brand,FARSÁLI - Beauty with Benefits,"[competition, chinese, young, know]",1595583.0,82.0,4281.0,http://www.farsali.com/,Personal Goods & General Merchandise Stores
2,3209,Brand,CREME PARA ESTRIAS,[],1525982.0,347.0,2533.0,http://www.100estrias.com.br/,Personal Goods & General Merchandise Stores
3,24856,Brand,Massimo Dutti,"[multinational, luxury, pronunciation, spanish]",1576480.0,323.0,3002.0,http://mdutti.me/NorthWinds,Personal Goods & General Merchandise Stores
4,13263,Brand,ROXY,"[quicksilver, gambling, amusement, machine]",1511680.0,86.0,3789.0,http://roxy.com/,Publishers


Create Post Nodes

In [44]:
postNodeList = []
influNodeList = []
for index, row in df_final.iterrows():
    postNodeList.append([row['post_id'], 'Post', row['post_caption'], row['post_likes'], row['post_comments']])
    if row['Influencer_Name'] not in [inf[2] for inf in influNodeList]:
        influNodeList.append([row['influencer_id'], 'Influencer', row['Influencer_Name'], \
                              row['Influencer_Followers'], row['Influencer_Followees'], row['Influencer_Posts'], row['Influencer_Category']])
    
postNodesDf = pd.DataFrame(postNodeList, columns=['post_ID', 'LABEL', 'Caption', 'Number_of_Likes', 'Number_of_Comments'])
influNodesDf = pd.DataFrame(influNodeList, columns=['influencerID', 'LABEL', 'Influencer_Name', "Number_Of_Followers", "Number_of_Followees", \
                                                   "Number_Of_Posts", "Influencer_Category"])
postNodesDf.head()

Unnamed: 0,post_ID,LABEL,Caption,Number_of_Likes,Number_of_Comments
0,1877001634463122445,Post,Eu e meu chamego de sempre passando pelo seu f...,46572,236
1,1891646163346665904,Post,"Minha cara quando falam: ""Lore, tem sobremesa""...",54989,286
2,1916289352699275991,Post,Tô de olho em você que ainda não seguiu minha ...,67286,400
3,1936582221779537401,Post,Pq se nao tiver essa cara vocês sabem que não ...,61472,370
4,1961799656992243034,Post,Minha cara para quem fala que ainda não compro...,38757,289


In [50]:
influNodesDf.head()

Unnamed: 0,influencerID,LABEL,Influencer_Name,Number_Of_Followers,Number_of_Followees,Number_Of_Posts,Influencer_Category
0,35112,Influencer,Lorena Improta,5074442,1966,5127,Creators & Celebrities
1,13344,Influencer,Gabi Brandt,1782185,302,1803,Creators & Celebrities
2,29454,Influencer,CREME PARA ESTRIAS,555818,1951,2045,Lifestyle Services
3,18718,Influencer,Priscila Simões,389510,750,1337,Creators & Celebrities
4,4635,Influencer,MORGANA SANTANA,256163,509,2029,Creators & Celebrities


In [46]:
df_final.iloc[index]

brand_id                                                             7850
Brand_Name                                                           Wish
Brand_Followers                                                 1589576.0
Brand_Followees                                                     162.0
Brand_Posts                                                        3923.0
Brand_URL                              https://www.wish.com/j/IG_UGC_wk23
Brand_Category                                             Content & Apps
post_id                                               1905514232138759653
post_caption            Pretty sure this bracelet flask will be my fav...
post_likes                                                            607
post_comments                                                         183
influencer_id                                                       13646
Influencer_Name                                                     Mandy
Influencer_Followers                  

In [51]:
relbrandtopost = []
for index, row in df_final.iterrows():
    postId = postNodesDf[(postNodesDf['Caption'] == row['post_caption']) \
                         & (postNodesDf['Number_of_Likes'] == row['post_likes'])]['post_ID'].iloc[0]
    brandId = brandNodeDf[brandNodeDf['Brand_Name'] == row['Brand_Name']]['brand_ID'].iloc[0]
    relbrandtopost.append([brandId, "SPONSORED", postId])
edge_brandtopost = pd.DataFrame(relbrandtopost, columns=['brand_ID', 'TYPE', 'post_ID'])
edge_brandtopost.head()

Unnamed: 0,brand_ID,TYPE,post_ID
0,3209,SPONSORED,1877001634463122445
1,3209,SPONSORED,1891646163346665904
2,3209,SPONSORED,1916289352699275991
3,3209,SPONSORED,1936582221779537401
4,3209,SPONSORED,1961799656992243034


In [52]:
relinfltopost = []
for index, row in df_final.iterrows():
    if(str(row['Influencer_Name']) != 'nan'):
        postId = postNodesDf[(postNodesDf['Caption'] == row['post_caption']) \
                             & (postNodesDf['Number_of_Likes'] == row['post_likes'])]['post_ID'].iloc[0]
        influId = influNodesDf[influNodesDf['Influencer_Name'] == row['Influencer_Name']]['influencerID'].iloc[0]
        relinfltopost.append([influId, "POSTED", postId])
edge_relinfltopost = pd.DataFrame(relinfltopost, columns=['influencer_ID', 'TYPE', 'post_ID'])
edge_relinfltopost.head()

Unnamed: 0,influencer_ID,TYPE,post_ID
0,35112,POSTED,1877001634463122445
1,35112,POSTED,1891646163346665904
2,35112,POSTED,1916289352699275991
3,35112,POSTED,1936582221779537401
4,35112,POSTED,1961799656992243034


In [53]:
influNodesDf.to_csv("files/InfluencerNodes.csv", index=False)
postNodesDf.to_csv("files/PostNodes.csv", index=False)
brandNodeDf.to_csv("files/BrandNodes.csv", index=False)
edge_relinfltopost.to_csv("files/Edges_Infl.csv", index=False)
edge_brandtopost.to_csv("files/Edges_Brand.csv", index=False)
#relationList.to_csv("files/relationList.csv", index=False)

In [None]:
#./bin/neo4j-admin import --force --multiline-fields=true --nodes=./import/1_node.csv --relationships=./import/1_rel.csv

### Famous Person Webscrape Dataset

In [54]:
df_influencer = df_final["Influencer_Name"]
df_influencer

0                     Lorena Improta
1                     Lorena Improta
2                     Lorena Improta
3                     Lorena Improta
4                     Lorena Improta
                   ...              
437    Tiffany 🌹 - Beauty Influencer
438                    🌸 P I N V R 🌸
439                        Annie Cho
440          Hi y’all, I’m Brandy G!
441                            Mandy
Name: Influencer_Name, Length: 442, dtype: object

In [55]:
influencer_names = df_influencer.unique()

In [56]:
#remove emoji symbols from their names
emoji_pattern  = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002500-\U00002BEF"  # chinese char
        u"\U00002702-\U000027B0"
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"
        u"\u2640-\u2642" 
        u"\u2600-\u2B55"
        u"\u200d"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"  # dingbats
        u"\u3030"
                      "]+", re.UNICODE)
def remove_emoji(input_string):
    return emoji_pattern.sub(r'', input_string ) # No emoji

no_emoji =  [remove_emoji(i) if str(i) != 'nan' else i for i in influencer_names ]

print(no_emoji)

['Lorena Improta', 'Gabi Brandt', 'CREME PARA ESTRIAS', 'Priscila Simões', 'MORGANA SANTANA', 'F2Freestylers', 'Tom Daley', 'bradleysimmonds', 'TASHA GREEN', 'N S', 'Sophie Hellyer', 'Khloé', 'Vanessa Hudgens', 'Priyanka Chopra', 'Adriana Lima', 'Jessica Biel', 'Nia Sioux', 'Russell Wilson', 'Olivia Culpo', 'Dr. Holly Hatcher-Frazier', 'Kaycee Rice', 'Peter McKinnon', 'Sydney Leroux Dwyer', 'sung kang', 'Meghan King Edmonds', 'bri emery / designlovefest', 'Emily Luciano', 'Claire Godard', 'Farrah Davidson', 'KINDNESS • BEAUTY • LIFESTYLE', 'Franko Dean', 'THANIA | Fashion & Beauty Blog', 'The Perennial Style', 'Victoria Hui', 'Raysa Garcia', 'Mr.kool //Austin ™', 'TheSequinHanger', 'LaTonya', 'Carolyn', '| San Diego Blogger', 'T™ ', 'IDM', 'fᎪshᏁᏟuᏒᎥᎾus: #Fashion #Beauty', 'Opal Stewart', 'Alexandre | الكساندر', 'Amlul.com', 'ManhattanGirl', 'Tina Craig', 'THELIMITDOESNOTEXIST', 'yael steren', 'All The Pretty Birds', 'By Sophie and Charlotte ', nan, 'ELLEN COLE', 'SAVAGE IS THE KING ',

In [57]:
#clean up punctuation from list of influencer names

def remove_punc(string):
    punc = '''!()-[]{};:'"\,<>./?@#$%^&*_~•'''
    for ele in string:  
        if ele in punc:  
            string = string.replace(ele, " ") 
    return string
 
no_punc_name = [remove_punc(i) if str(i) != 'nan' else i for i in no_emoji]
no_punc_name = [i.strip() if str(i) != 'nan' else i  for i in no_punc_name]

print(no_punc_name[:5]) # cleaned list and spaces stripped

['Lorena Improta', 'Gabi Brandt', 'CREME PARA ESTRIAS', 'Priscila Simões', 'MORGANA SANTANA']


  punc = '''!()-[]{};:'"\,<>./?@#$%^&*_~•'''


In [58]:
clean_parens_text = [re.sub("[\(\[].*?[\)\]]", "", x)if str(x) != 'nan' else x for x in no_punc_name ]

clean_parens_text[:5]

  clean_parens_text = [re.sub("[\(\[].*?[\)\]]", "", x)if str(x) != 'nan' else x for x in no_punc_name ]


['Lorena Improta',
 'Gabi Brandt',
 'CREME PARA ESTRIAS',
 'Priscila Simões',
 'MORGANA SANTANA']

In [59]:
#clean up prefix titles
titles = ("MR","DR","MRS","PROF","MS")
ptrn = re.compile(fr"^({'|'.join(titles)})\.?\s+", flags=re.I)
clean_names = [ptrn.sub("", i) if str(i) != 'nan' else i for i in clean_parens_text]
print(clean_names[:5])


['Lorena Improta', 'Gabi Brandt', 'CREME PARA ESTRIAS', 'Priscila Simões', 'MORGANA SANTANA']


In [60]:
len(clean_names)

279

#### Scrape Famous person attributes data if it exists

In [64]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from webdriver_manager.chrome import ChromeDriverManager

In [None]:
name_list = []
day_list = []
year_list = []
age_list = []
birth_list = []


options = Options()
DRIVER_PATH = '/Users/squach/Downloads/chromedriver'
driver = webdriver.Chrome(options=options, executable_path=DRIVER_PATH)


# options.headless = True
options.add_argument("--window-size=1920,1200")
action = webdriver.ActionChains(driver)

xpath_name = '/html/body/div[1]/div[1]/div/div/div[2]/div/div/div[1]/div/div[1]/h1'
xpath_date =  '/html/body/div[1]/div[1]/div/div/div[2]/div/div/div[2]/div/div[1]/div/div[1]/div/a[1]'
xpath_year = '/html/body/div[1]/div[1]/div/div/div[2]/div/div/div[2]/div/div[1]/div/div[1]/div/a[2]'
xpath_age = '/html/body/div[1]/div[1]/div/div/div[2]/div/div/div[2]/div/div[1]/div/div[3]/div/a'
birthplace_path = '/html/body/div[1]/div[1]/div/div/div[2]/div/div/div[2]/div/div[1]/div/div[2]/div'

for names in clean_names:
    if str(names) != 'nan' and names.strip() != '':
        driver.get("https:www.famousbirthdays.com")

        inputElement = driver.find_element("id","main-search")
        inputElement.send_keys(names)

        #print("looking at this name", names)

        inputElement.send_keys(Keys.ENTER)

        #wait 4.5 seconds so the site doesn't ask to confirm that i'm not a bot
        time.sleep(4.5) 

        try:
            name = driver.find_element("xpath",xpath_name).text
            date = driver.find_element("xpath",xpath_date).text
            year = driver.find_element("xpath",xpath_year).text
            age = driver.find_element("xpath",xpath_age).text
            birthplace = driver.find_element("xpath",birthplace_path).text

            name_list.append(name)
            day_list.append(date)
            year_list.append(year)
            age_list.append(age)
            birth_list.append(birthplace)
        except:
            name_list.append(names)
            day_list.append('n/a')
            year_list.append('n/a')
            age_list.append('n/a')
            birth_list.append('n/a')
    else:
        name_list.append(names)
        day_list.append('n/a')
        year_list.append('n/a')
        age_list.append('n/a')
        birth_list.append('n/a')
        
        
        driver.back()
    
    
    driver.back()

driver.close()
driver.quit()

In [None]:
# it was easier to grab the Xpath to all the information under the Birthplace div class an just clean it up
# afterwards. Some only had the country listed, and for the City, States located separately and end up 
# combining it afterwards. 

birth_list_clean = [s.replace('BIRTHPLACE\n', '') for s in birth_list]
print(birth_list_clean[:10])

In [None]:
influencer_about_df = pd.DataFrame({'Unique_Influencer_Name':influencer_names})

In [None]:
# influencer_about_df

influencer_about_df['Clean_Name'] = name_list
influencer_about_df['Birthdate'] = day_list
influencer_about_df['Birth_Year'] = year_list
influencer_about_df['Age'] = age_list
influencer_about_df['Birthplace'] = birth_list_clean

In [None]:
influencer_about_df.to_csv("Influencer_Info_Scrape.csv", index=False)

In [None]:
influencer_about_df

In [None]:
influNodesDf

In [None]:
influNodesFinalDf = pd.merge(influNodesDf, influencer_about_df, how='inner', \
                             left_on = 'Influencer_Name', right_on = 'Unique_Influencer_Name')
influNodesFinalDf
influNodesFinalDf.head()

In [None]:
influNodesDf.to_csv("files/InfluencerNodes.csv", index=False)
postNodesDf.to_csv("files/PostNodes.csv", index=False)
brandNodeDf.to_csv("files/BrandNodes.csv", index=False)
edge_relinfltopost.to_csv("files/Edges_Infl.csv", index=False)
edge_brandtopost.to_csv("files/Edges_Brand.csv", index=False)
#relationList.to_csv("files/relationList.csv", index=False)