In [1]:
!pip install python-dotenv
!pip install pandas
!pip install faker
!pip install beautifulsoup4 rake-nltk



In [2]:
pip freeze > requirements.txt

Note: you may need to restart the kernel to use updated packages.


In [3]:
from appwrite.client import Client
from appwrite.services.databases import Databases
from appwrite.id import ID
from dotenv import load_dotenv
import os

load_dotenv()

True

##### Library for natural language processing

In [4]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/rafxtgt/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [5]:
client = Client()
client.set_endpoint(os.getenv('APPWRITE_API_ENDPOINT'))
client.set_project(os.getenv('APPWRITE_PROJECT'))
client.set_key(os.getenv('APPWRITE_API_KEY'))

<appwrite.client.Client at 0x7f7feffd9290>

In [6]:
databases = Databases(client)
database_name = 'fire-sale-db'

In [9]:
try:
    # Try to get the database to see if it exists
    database_list = databases.list()
    existing_db = next((db for db in database_list['databases'] if db['name'] == database_name), None)
    
    if existing_db:
        fireSaleDb = existing_db
        print(f"Database '{database_name}' already exists with ID: {existing_db['$id']}")
    else:
        # Create the database if it doesn't exist
        fireSaleDb = databases.create(
            database_id=ID.unique(),
            name=database_name
        )
        print(f"Created new database '{database_name}' with ID: {fireSaleDb['$id']}")
except Exception as e:
    print(f"Error initializing database: {e}")
    raise

Database 'fire-sale-db' already exists with ID: 6812c6490009447d68d4


In [10]:
fireSaleDb

{'$id': '6812c6490009447d68d4',
 'name': 'fire-sale-db',
 '$createdAt': '2025-05-01T00:54:33.779+00:00',
 '$updatedAt': '2025-05-01T00:54:33.779+00:00',
 'enabled': True,
 'policies': [],
 'archives': []}

#### User Profile Collectin

In [26]:
userProfileCollection = None

def prepare_user_profile_collection():
  global userProfileCollection
  try:
    collections = databases.list_collections(database_id=fireSaleDb['$id'])
    for collection in collections['collections']:
        if collection['name'] == 'user-profile':
            userProfileCollection = collection
            print("User profile collection already exists")
            return
  except Exception as e:
    print(f"Error checking for existing collection: {e}")
      
    
  userProfileCollection = databases.create_collection(
    database_id=fireSaleDb['$id'],
    collection_id=ID.unique(),
    name='user-profile'
  )

  databases.create_string_attribute(
    database_id=fireSaleDb['$id'],
    collection_id=userProfileCollection['$id'],
    key='user_id',
    size=255,
    required=True
  )

  databases.create_string_attribute(
    database_id=fireSaleDb['$id'],
    collection_id=userProfileCollection['$id'],
    key='name',
    size=255,
    required=False
  )

  databases.create_string_attribute(
    database_id=fireSaleDb['$id'],
    collection_id=userProfileCollection['$id'],
    key='email',
    size=255,
    required=False
  )
    
  databases.create_string_attribute(
    database_id=fireSaleDb['$id'],
    collection_id=userProfileCollection['$id'],
    key='label',
    size=255,
    required=True
  )
    
  databases.create_integer_attribute(
    database_id=fireSaleDb['$id'],
    collection_id=userProfileCollection['$id'],
    key='age',
    required=False,
    min=18,
    max=150
  )
    
  databases.create_datetime_attribute(
    database_id=fireSaleDb['$id'],
    collection_id=userProfileCollection['$id'],
    key='signup_date',
    required=True
  )
  print("User profile collection created successfully !")


#### Social Media Collection

In [36]:
socialMediaCollection = None

def prepare_social_media_collection():
    global socialMediaCollection

    try:
        collections = databases.list_collections(database_id=fireSaleDb['$id'])
        for collection in collections['collections']:
            if collection['name'] == 'social-media':
                socialMediaCollection = collection
                print("Social media collection already exists")
                return
    except Exception as e:
        print(f"Error checking for existing collection: {e}")
    
    # If collection doesn't exist, create it
    socialMediaCollection = databases.create_collection(
        database_id=fireSaleDb['$id'],
        collection_id=ID.unique(),
        name='social-media'
    )

    # Create all the attributes
    databases.create_string_attribute(
        database_id=fireSaleDb['$id'],
        collection_id=socialMediaCollection['$id'],
        key='guid',
        size=255,
        required=True
    )
        
    databases.create_string_attribute(
        database_id=fireSaleDb['$id'],
        collection_id=socialMediaCollection['$id'],
        key='user_id',
        size=255,
        required=False
    )

    databases.create_string_attribute(
        database_id=fireSaleDb['$id'],
        collection_id=socialMediaCollection['$id'],
        key='platform',
        size=255,
        required=False
    )

    databases.create_integer_attribute(
        database_id=fireSaleDb['$id'],
        collection_id=socialMediaCollection['$id'],
        key='post_count',
        required=False,
        min=0
    )  

    databases.create_integer_attribute(
        database_id=fireSaleDb['$id'],
        collection_id=socialMediaCollection['$id'],
        key='like_count',
        required=False,
        min=0
    )  

    databases.create_string_attribute(
        database_id=fireSaleDb['$id'],
        collection_id=socialMediaCollection['$id'],
        key='groups',
        required=False, 
        size=131072
    )    
        
    databases.create_integer_attribute(
        database_id=fireSaleDb['$id'],
        collection_id=socialMediaCollection['$id'],
        key='follower_count',
        required=False,
        min=0
    ) 
        
    databases.create_string_attribute(
        database_id=fireSaleDb['$id'],
        collection_id=socialMediaCollection['$id'],
        key='top_interests',
        required=False, 
        size=131072
    )  
        
    databases.create_datetime_attribute(
        database_id=fireSaleDb['$id'],
        collection_id=socialMediaCollection['$id'],
        key='last_active',
        required=True
    )
    
    print("Created new social media collection")

#### Communication Collection

In [31]:
communicationCollection = None

def prepare_communication_collection():
  global communicationCollection
  try:
    collections = databases.list_collections(database_id=fireSaleDb['$id'])
    for collection in collections['collections']:
        if collection['name'] == 'communication':
            communicationCollection = collection
            print("Communication collection already exists")
            return
  except Exception as e:
    print(f"Error checking for existing collection: {e}")
    

  communicationCollection = databases.create_collection(
    database_id=fireSaleDb['$id'],
    collection_id=ID.unique(),
    name='communication'
  )

  databases.create_string_attribute(
    database_id=fireSaleDb['$id'],
    collection_id=communicationCollection['$id'],
    key='guid',
    size=255,
    required=True
  )
    
  databases.create_string_attribute(
    database_id=fireSaleDb['$id'],
    collection_id=communicationCollection['$id'],
    key='user_id',
    size=255,
    required=False
  )

  databases.create_string_attribute(
    database_id=fireSaleDb['$id'],
    collection_id=communicationCollection['$id'],
    key='medium',
    size=255,
    required=False
  )
    
  databases.create_string_attribute(
    database_id=fireSaleDb['$id'],
    collection_id=communicationCollection['$id'],
    key='contacted_user_id',
    size=255,
    required=False
  )

  databases.create_integer_attribute(
    database_id=fireSaleDb['$id'],
    collection_id=communicationCollection['$id'],
    key='interaction_count',
    required=False,
    min=0
  )  

  databases.create_string_attribute(
    database_id=fireSaleDb['$id'],
    collection_id=communicationCollection['$id'],
    key='topics',
    required=False, 
    size=131072
  )    
    
  databases.create_datetime_attribute(
    database_id=fireSaleDb['$id'],
    collection_id=communicationCollection['$id'],
    key='last_contact_date',
    required=False
  )
  print("Communication collection created successfully !")


#### Location Collection

In [32]:
locationCollection = None

def prepare_location_collection():
  global locationCollection
  try:
    collections = databases.list_collections(database_id=fireSaleDb['$id'])
    for collection in collections['collections']:
        if collection['name'] == 'location':
            locationCollection = collection
            print("Location collection already exists")
            return
  except Exception as e:
    print(f"Error checking for existing collection: {e}")

    
  locationCollection = databases.create_collection(
    database_id=fireSaleDb['$id'],
    collection_id=ID.unique(),
    name='location'
  )

  databases.create_string_attribute(
    database_id=fireSaleDb['$id'],
    collection_id=locationCollection['$id'],
    key='guid',
    size=255,
    required=True
  )
    
  databases.create_string_attribute(
    database_id=fireSaleDb['$id'],
    collection_id=locationCollection['$id'],
    key='user_id',
    size=255,
    required=False
  )

  databases.create_string_attribute(
    database_id=fireSaleDb['$id'],
    collection_id=locationCollection['$id'],
    key='location_name',
    size=255,
    required=False
  )
    
  databases.create_float_attribute(
    database_id=fireSaleDb['$id'],
    collection_id=locationCollection['$id'],
    key='latitude',
    required=False
  )
    
  databases.create_float_attribute(
    database_id=fireSaleDb['$id'],
    collection_id=locationCollection['$id'],
    key='longitude',
    required=False
  )
   
  databases.create_datetime_attribute(
    database_id=fireSaleDb['$id'],
    collection_id=locationCollection['$id'],
    key='check_in_time',
    required=False
  )
    
  databases.create_string_attribute(
    database_id=fireSaleDb['$id'],
    collection_id=locationCollection['$id'],
    key='companions',
    required=False, 
    size=131072
  )
  print("Location collection created successfully !")

    

#### Behavioral Metadata

In [33]:
behaviorMetadataCollection = None

def prepare_behaviour_metadata_collection():
  global behaviorMetadataCollection
  try:
    collections = databases.list_collections(database_id=fireSaleDb['$id'])
    for collection in collections['collections']:
        if collection['name'] == 'behavioral-metadata':
            behaviorMetadataCollection = collection
            print("Behavioral metadata collection already exists")
            return
  except Exception as e:
    print(f"Error checking for existing collection: {e}")

  behaviorMetadataCollection = databases.create_collection(
    database_id=fireSaleDb['$id'],
    collection_id=ID.unique(),
    name='behavioral-metadata'
  )

  databases.create_string_attribute(
    database_id=fireSaleDb['$id'],
    collection_id=behaviorMetadataCollection['$id'],
    key='guid',
    size=255,
    required=True
  )
    
  databases.create_string_attribute(
    database_id=fireSaleDb['$id'],
    collection_id=behaviorMetadataCollection['$id'],
    key='user_id',
    size=255,
    required=False
  )

  databases.create_string_attribute(
    database_id=fireSaleDb['$id'],
    collection_id=behaviorMetadataCollection['$id'],
    key='device_type',
    size=255,
    required=False
  )

  databases.create_string_attribute(
    database_id=fireSaleDb['$id'],
    collection_id=behaviorMetadataCollection['$id'],
    key='active_hours',
    size=255,
    required=False
  )
      
  databases.create_float_attribute(
    database_id=fireSaleDb['$id'],
    collection_id=behaviorMetadataCollection['$id'],
    key='average_daily_screen_time',
    required=False
  )

  databases.create_string_attribute(
    database_id=fireSaleDb['$id'],
    collection_id=behaviorMetadataCollection['$id'],
    key='preferred_app_categories',
    required=False, 
    size=131072
  )
  print("Behavioral metadata collection created successfully !")


#### InferredProfile

In [34]:
inferredProfileCollection = None

def prepare_inferred_profile_collection():
  global inferredProfileCollection
  try:
    collections = databases.list_collections(database_id=fireSaleDb['$id'])
    for collection in collections['collections']:
        if collection['name'] == 'inferred-profile':
            inferredProfileCollection = collection
            print("Inferred profile collection already exists")
            return
  except Exception as e:
    print(f"Error checking for existing collection: {e}")

  inferredProfileCollection = databases.create_collection(
    database_id=fireSaleDb['$id'],
    collection_id=ID.unique(),
    name='inferred-profile'
  )
    
  databases.create_string_attribute(
    database_id=fireSaleDb['$id'],
    collection_id=inferredProfileCollection['$id'],
    key='user_id',
    size=255,
    required=False
  )

  databases.create_float_attribute(
    database_id=fireSaleDb['$id'],
    collection_id=inferredProfileCollection['$id'],
    key='influence_score',
    required=False
  )

    
  databases.create_float_attribute(
    database_id=fireSaleDb['$id'],
    collection_id=inferredProfileCollection['$id'],
    key='trust_score',
    required=False
  )


  databases.create_string_attribute(
    database_id=fireSaleDb['$id'],
    collection_id=inferredProfileCollection['$id'],
    key='hidden_roles',
    required=False, 
    size=131072
  )  

  databases.create_string_attribute(
    database_id=fireSaleDb['$id'],
    collection_id=inferredProfileCollection['$id'],
    key='hidden_communities',
    required=False, 
    size=131072
  )  
  print("Inferred profile collection created successfully !")


#### Create the database collections

In [44]:
def add_new_string_attribute(attribute_name: str, size: int = 255, required: bool = False):
    global socialMediaCollection
    
    if not socialMediaCollection:
        raise Exception("Social media collection not initialized. Call prepare_social_media_collection() first.")
    
    # Check if attribute already exists
    existing_attributes = databases.list_attributes(
        database_id=fireSaleDb['$id'],
        collection_id=socialMediaCollection['$id']
    )['attributes']
    
    if any(attr['key'] == attribute_name for attr in existing_attributes):
        print(f"Attribute '{attribute_name}' already exists")
        return
    
    # Create new string attribute
    databases.create_string_attribute(
        database_id=fireSaleDb['$id'],
        collection_id=socialMediaCollection['$id'],
        key=attribute_name,
        size=size,
        required=required
    )
    print(f"Added new attribute '{attribute_name}'")

In [45]:
add_new_string_attribute("work_exp", size=131072)

Added new attribute 'work_exp'


In [37]:
if __name__ == "__main__":
  prepare_user_profile_collection()
  prepare_social_media_collection()
  prepare_communication_collection()
  prepare_location_collection()
  prepare_behaviour_metadata_collection()
  prepare_inferred_profile_collection()

User profile collection already exists
Created new social media collection
Communication collection already exists
Location collection already exists
Behavioral metadata collection already exists
Inferred profile collection already exists


In [29]:
result = databases.delete(
    database_id = '680fc5380017320a8568'
)

# Data Processing

#### Create synthetic user profile data csv from the pseudo file

In [10]:
import pandas as pd
from datetime import datetime
from faker import Faker
import uuid

# Initialize Faker generator
fake = Faker()
Faker.seed(42)  # For reproducible results

# Read the CSV file
df = pd.read_csv('dataset/pseudo_facebook.csv')

# Calculate current year and user age
current_year = datetime.now().year
df['user_age'] = current_year - df['dob_year']
df['label'] = 'NODE'

# Generate unique synthetic data
num_rows = len(df)
unique_names = set()
unique_emails = set()

while len(unique_names) < num_rows:
    unique_names.add(fake.unique.name())

while len(unique_emails) < num_rows:
    unique_emails.add(fake.unique.email())

# Add the synthetic columns
df['name'] = list(unique_names)
df['email'] = list(unique_emails)
df['user_guid'] = [str(uuid.uuid4()) for _ in range(len(df))]

# Verify uniqueness
# print(f"Unique names generated: {len(df['name'].unique()) == len(df)}")
# print(f"Unique emails generated: {len(df['email'].unique()) == len(df)}")

# Display the DataFrame with new age column

selected_columns = ['user_guid', 'name', 'email', 'label', 'user_age']  
new_df = df[selected_columns]
print(new_df.head())
new_df.to_csv('dataset/synthetic_user_profile_data.csv', index=False)

print(f"New CSV created with columns: {selected_columns}")

                              user_guid            name  \
0  6c9799d2-dac3-4098-8fd3-1d0f7ae138bb  Jennifer Blair   
1  186648a5-0093-46a1-8932-b696aeb8ed09   Alfred Cooper   
2  1a3965d4-ccfb-4a8f-827a-8585a7ad1dc1     Debra Cohen   
3  c39101b7-5518-4105-9072-0ba3a1abb1ca   Helen Holland   
4  16741755-a8b9-4f5e-883b-4372fa0795bd   Monica Hansen   

                          email label  user_age  
0            troy75@example.org  NODE        26  
1          pamela72@example.com  NODE        26  
2      villamichael@example.org  NODE        26  
3  hendersonstephen@example.com  NODE        26  
4           mclarke@example.org  NODE        26  
New CSV created with columns: ['user_guid', 'name', 'email', 'label', 'user_age']


#### Create a smaller file for user interaction from facebook

In [4]:
import pandas as pd

# Replace 'your_file.csv' with your actual file path
file_path = 'dataset/fbpac-ads-en-US.csv'

# Read the CSV file
df = pd.read_csv(file_path)
headers = df.columns.tolist()

# Display the first 5 rows with headers
df.head()
print(headers)

['id', 'html', 'political', 'not_political', 'title', 'message', 'thumbnail', 'created_at', 'updated_at', 'lang', 'images', 'impressions', 'political_probability', 'targeting', 'suppressed', 'targets', 'advertiser', 'entities', 'page', 'lower_page', 'targetings', 'paid_for_by', 'targetedness', 'listbuilding_fundraising_proba']


In [6]:
import pandas as pd

# Read original CSV
file_path = 'dataset/fbpac-ads-en-US.csv'

df = pd.read_csv(file_path)

# Select columns you want to keep (adjust names as needed)
selected_columns = ['title', 'message', 'advertiser', 'entities', 'page']  

# Create new DataFrame with only selected columns
new_df = df[selected_columns]

# Save to new CSV
new_df.to_csv('dataset/compressed_ad_data.csv', index=False)

print(f"New CSV created with columns: {selected_columns}")

New CSV created with columns: ['title', 'message', 'advertiser', 'entities', 'page']


#### Twitter interaction data csv

In [47]:
import pandas as pd

# Read original CSV
file_path_ = 'dataset/twitter_dataset_1.csv'

# Try different encodings
try:
    df = pd.read_csv(file_path, encoding='utf-8')
except UnicodeDecodeError:
    try:
        df = pd.read_csv(file_path, encoding='latin1')
    except UnicodeDecodeError:
        df = pd.read_csv(file_path, encoding='ISO-8859-1')

# Rename columns
new_columns = ['label', 'user_id', 'post_date', 'type', 'user_name', 'tweet']
if len(df.columns) == len(new_columns):
    df.columns = new_columns  # Rename all columns at once
else:
    print("Warning: Column count does not match. Keeping original column names.")

selected_columns = ['tweet', 'label']  
tweet_df_1 = df[selected_columns]
print(tweet_df_1.head())
print("Total number of rows:", tweet_df_1.shape[0])

# # Display results
# print("\nFirst 5 rows:")
# print(df.head())

# print("\nColumn headers:")
# print(df.columns.tolist())

# print("\nNumber of columns:", len(df.columns))
# print("Total number of rows:", df.shape[0])

                                               tweet  label
0  is upset that he can't update his Facebook by ...      0
1  @Kenichan I dived many times for the ball. Man...      0
2    my whole body feels itchy and like its on fire       0
3  @nationwideclass no, it's not behaving at all....      0
4                      @Kwesidei not the whole crew       0
Total number of rows: 1599999


In [48]:
import pandas as pd

# Read original CSV
file_path_d1 = 'dataset/twitter_dataset_2.csv'
selected_columns = ['tweet', 'label']  

df1 = pd.read_csv(file_path_d1)
tweet_df_2 = df1[selected_columns]

# print(df1.head())
print(tweet_df_2.head())
print("Total number of rows:", tweet_df_2.shape[0])


                                               tweet  label
0  just had a real good moment. i missssssssss hi...      0
1         is reading manga  http://plurk.com/p/mzp1e      0
2  @comeagainjen http://twitpic.com/2y2lx - http:...      0
3  @lapcat Need to send 'em to my accountant tomo...      0
4      ADD ME ON MYSPACE!!!  myspace.com/LookThunder      0
Total number of rows: 10314


In [49]:
import pandas as pd

# Concatenate the two DataFrames vertically (row-wise)
combined_df = pd.concat([tweet_df_2, tweet_df_1], ignore_index=True)

# Save the combined DataFrame to a CSV file
combined_df.to_csv('dataset/combined_tweets.csv', index=False)  # index=False avoids saving an extra index column

print(f"Combined DataFrame shape: {combined_df.shape}")
print("CSV file saved as 'combined_tweets.csv'")

Combined DataFrame shape: (1610313, 2)
CSV file saved as 'combined_tweets.csv'


#### LinkedIn User profile and company data csv

In [2]:
import pandas as pd

# Read original CSV
file_path_d1 = 'dataset/LinkedIn_people_profiles_datasets.csv'

df1 = pd.read_csv(file_path_d1)

# print(df1.head())
print(df1.head())
print("Total number of rows:", df1.shape[0])
print("\nColumn headers:")
print(df1.columns.tolist())

    timestamp                            id  \
0  2023-01-10            catherinemcilkenny   
1  2022-12-17           margot-bon-51a04624   
2  2023-05-17            mike-dean-8509a193   
3  2022-05-29  giovanna-panarella-99a0a4167   
4  2022-12-06         steve-latimer-3364327   

                                     name                       city  \
0  Catherine Fitzpatrick (McIlkenny), B.A                     Canada   
1                              Margot Bon  The Randstad, Netherlands   
2                               Mike Dean    England, United Kingdom   
3                      Giovanna Panarella  Avellino, Campania, Italy   
4                           Steve Latimer            Ontario, Canada   

  country_code region     current_company:company_id  \
0           CA    NaN                            NaN   
1           NL     EU               gemeente-utrecht   
2           UK    NaN                   network-rail   
3           IT     EU                            NaN   
4   

In [9]:
df1['current_company_name'] = df1['current_company:name']
selected_columns = ['current_company_name', 'position', 'about', 'posts', 'groups', 'experience', 'educations_details']
linkedIn_df = df1[selected_columns]
print(linkedIn_df.head())
linkedIn_df.to_csv('dataset/linkedin_user_profile.csv', index=False)


            current_company_name  \
0                            NaN   
1               Gemeente Utrecht   
2                   Network Rail   
3                      Freelance   
4  Mid-Range Computer Group Inc.   

                                            position  \
0  Snr Business Analyst at Emploi et Développemen...   
1  Communicatieadviseur Corporate & Strategie Gem...   
2               Network Data Manager at Network Rail   
3                             Architetto (Freelance)   
4  Senior Account Executive at Mid-Range Computer...   

                                               about  \
0                                                NaN   
1  Allround Marketing & Communicatie Adviseur met...   
2  Experienced Data Manager with a demonstrated h...   
3                                                NaN   
4                                                NaN   

                                               posts  \
0  [{"attribution":"Liked by Catherine Fitzpatric... 

#### LinkedIn company profile

In [15]:
import pandas as pd

# Read original CSV
file_path_d1 = 'dataset/linkedin_company_profile.csv'

df1 = pd.read_csv(file_path_d1)

# print(df1.head())
# print(df1.head())
print("Total number of rows:", df1.shape[0])
print("\nColumn headers:")
print(df1.columns.tolist())
selected_columns = ['name', 'industries', 'specialties', 'about', 'organization_type']
linkedIn_comp_df = df1[selected_columns]
print(linkedIn_comp_df.head())
linkedIn_comp_df.to_csv('dataset/linkedin_comp_profile.csv', index=False)


Total number of rows: 1000

Column headers:
['timestamp', 'id', 'name', 'country_code', 'locations', 'formatted_locations', 'followers', 'employees_in_linkedin', 'about', 'specialties', 'company_size', 'organization_type', 'industries', 'website', 'crunchbase_url', 'founded', 'company_id', 'employees', 'headquarters', 'image', 'logo', 'similar', 'sphere', 'url', 'type', 'updates', 'slogan', 'affiliated', 'funding', 'stock_info', 'investors']
                                        name  \
0  Be Nijs * Business- & Concept Development   
1             Texas Deaf Chamber of Commerce   
2                                CellPraxis®   
3                        DIAMOND TOOLS PLANT   
4                            Thieves Kitchen   

                           industries  \
0    Business Consulting and Services   
1      Civic and Social Organizations   
2              Biotechnology Research   
3  Industrial Machinery Manufacturing   
4                    Media Production   

                  

### Data Aggregation

In [34]:
import pandas as pd

# Read the CSV file
user_profile_df = pd.read_csv('dataset/synthetic_user_profile_data.csv')
print(user_profile_df.head())
print("\nColumn headers:")
print(user_profile_df.columns.tolist())
print("Total number of rows:", user_profile_df.shape[0])


                              user_guid            name  \
0  6c9799d2-dac3-4098-8fd3-1d0f7ae138bb  Jennifer Blair   
1  186648a5-0093-46a1-8932-b696aeb8ed09   Alfred Cooper   
2  1a3965d4-ccfb-4a8f-827a-8585a7ad1dc1     Debra Cohen   
3  c39101b7-5518-4105-9072-0ba3a1abb1ca   Helen Holland   
4  16741755-a8b9-4f5e-883b-4372fa0795bd   Monica Hansen   

                          email label  user_age  
0            troy75@example.org  NODE        26  
1          pamela72@example.com  NODE        26  
2      villamichael@example.org  NODE        26  
3  hendersonstephen@example.com  NODE        26  
4           mclarke@example.org  NODE        26  

Column headers:
['user_guid', 'name', 'email', 'label', 'user_age']
Total number of rows: 99003


#### Push first 500 rows into user profile dataframe

In [36]:
from datetime import datetime
import pandas as pd

def upload_user_profiles(df, limit=500):
    # Take the first 500 rows
    subset = df.head(limit)
    
    # Prepare the collection if not already prepared
    if userProfileCollection is None:
        prepare_user_profile_collection()
    
    # Initialize counters
    success_count = 0
    error_count = 0
    
    # Iterate through each row and create document
    for index, row in subset.iterrows():
        try:
            # Map the CSV columns to our collection attributes
            document_data = {
                'user_id': row['user_guid'],
                'name': row['name'],
                'email': row['email'],
                'label': row['label'],
                'age': int(row['user_age']) if pd.notna(row['user_age']) else None,
                'signup_date': datetime.now().isoformat()  # Using current time as signup date
            }
            
            # Create the document in Appwrite
            result = databases.create_document(
                database_id=fireSaleDb['$id'],
                collection_id=userProfileCollection['$id'],
                document_id=ID.unique(),
                data=document_data
            )
            
            success_count += 1
            if success_count % 50 == 0:  # Print progress every 50 records
                print(f"Processed {success_count} records...")
                
        except Exception as e:
            error_count += 1
            print(f"Error processing row {index}: {str(e)}")
            continue
    
    print(f"\nUpload completed with {success_count} successes and {error_count} errors.")

In [37]:
# Load your CSV data
user_profile_df = pd.read_csv('dataset/synthetic_user_profile_data.csv')

# Upload the first 500 rows
upload_user_profiles(user_profile_df, 500)

Processed 50 records...
Processed 100 records...
Processed 150 records...
Processed 200 records...
Processed 250 records...
Processed 300 records...
Processed 350 records...
Processed 400 records...
Processed 450 records...
Processed 500 records...

Upload completed with 500 successes and 0 errors.


#### Link user profile to the facebook ad data.

In [20]:
import pandas as pd

# Read the CSV file
facebook_ad_df = pd.read_csv('dataset/synthetic_facebook_ad_data.csv')
print(facebook_ad_df.head())
print("\nColumn headers:")
print(facebook_ad_df.columns.tolist())
print("Total number of rows:", facebook_ad_df.shape[0])


                            title  \
0   League of Conservation Voters   
1               Indivisible Guide   
2  International Rescue Committee   
3    Covenant House International   
4              Planned Parenthood   

                                             message  \
0  <p>BREAKING: Trump’s Department of the Interio...   
1  <p>The Mueller investigation is over. Special ...   
2  <p>Zimbabwe is reeling from the impact of Cycl...   
3  <p>What more can you do in the final hours of ...   
4  <p>Say it loud, say it proud: Our rights, our ...   

                     advertiser  \
0                           NaN   
1                           NaN   
2                           NaN   
3  Covenant House International   
4                           NaN   

                                            entities  \
0  [{"entity": "Endangered Species Act", "entity_...   
1  [{"entity": "Americans", "entity_type": "Group...   
2  [{"entity": "Zimbabwe", "entity_type": "Region"}]   
3    

In [41]:
import random
from datetime import datetime, timedelta
import json
from bs4 import BeautifulSoup
from rake_nltk import Rake
import pandas as pd

def extract_message_text(html_content):
    """Extract text from HTML message content"""
    soup = BeautifulSoup(html_content, 'html.parser')
    return soup.get_text(separator=' ', strip=True)

def extract_keywords(text, num_keywords=3):
    """Extract keywords from text using RAKE"""
    r = Rake()
    r.extract_keywords_from_text(text)
    keywords = r.get_ranked_phrases()[:num_keywords]
    return keywords

def get_random_user_ids(count):
    """Get random user IDs from userProfileCollection"""
    user_ids = []
    try:
        # Get list of users (adjust limit as needed)
        users = databases.list_documents(
            database_id=fireSaleDb['$id'],
            collection_id=userProfileCollection['$id']
        )
        user_ids = [user['user_id'] for user in users['documents']]
    except Exception as e:
        print(f"Error fetching user IDs: {str(e)}")
    
    # If we couldn't fetch users, generate some dummy IDs (fallback)
    if not user_ids:
        print("Couldn't fetch user ids, so using dummy ones !!!")
        user_ids = [f"user_{i}" for i in range(1, 1001)]
    
    return random.choices(user_ids, k=count)

def upload_social_media_data(df, total_records=5000):
    """Upload random social media data with 1:5 user ratio"""
    # Prepare the collection if not already prepared
    if socialMediaCollection is None:
        prepare_social_media_collection()
    
    # Calculate number of users needed (1:5 ratio)
    num_users = total_records // 5
    user_ids = get_random_user_ids(num_users)
    
    # Initialize counters
    success_count = 0
    error_count = 0
    
    # Process records
    for i in range(total_records):
        try:
            # Pick a random row from the dataframe
            random_row = df.sample(n=1).iloc[0]
            
            # Extract message text from HTML
            message_text = extract_message_text(random_row['message'])
            
            # Extract keywords
            keywords = extract_keywords(message_text, random.randint(1, 3))
            
            # Process entities to extract groups and organizations
            entity_list = []
            try:
                entities = json.loads(random_row['entities'])
                for entity_obj in entities:
                    if entity_obj['entity_type'] in ['Group', 'Organization']:
                        entity_value = entity_obj['entity'].lower().strip()
                        if entity_value not in entity_list:  # Avoid duplicates
                            entity_list.append(entity_value)
            except (json.JSONDecodeError, KeyError, AttributeError) as e:
                print(f"Error processing entities for record {i}: {str(e)}")
                entity_list = ["facebook_group"]  # Fallback value
            
            # Create document data
            document_data = {
                'guid': ID.unique(),
                'user_id': user_ids[i // 5],  # Same user for 5 records
                'platform': "Facebook",
                'post_count': random.randint(1, 100),
                'like_count': random.randint(0, 5000),
                'groups': json.dumps({"associated_grps": entity_list}),
                'follower_count': random.randint(0, 10000),
                'top_interests': json.dumps({"keywords": keywords}),
                'last_active': (datetime.now() - timedelta(days=random.randint(0, 30))).isoformat()
            }
            
            # Create the document in Appwrite
            result = databases.create_document(
                database_id=fireSaleDb['$id'],
                collection_id=socialMediaCollection['$id'],
                document_id=ID.unique(),
                data=document_data
            )
            
            success_count += 1
            if success_count % 100 == 0:  # Print progress every 1000 records
                print(f"Processed {success_count} records...")
                
        except Exception as e:
            error_count += 1
            print(f"Error processing record {i}: {str(e)}")
            continue
    
    print(f"\nUpload completed with {success_count} successes and {error_count} errors.")

In [42]:
# Load your CSV data
facebook_ad_df = pd.read_csv('dataset/synthetic_facebook_ad_data.csv')

# Upload the data
upload_social_media_data(facebook_ad_df, 5000)

Error processing record 40: the JSON object must be str, bytes or bytearray, not float
Error processing record 50: the JSON object must be str, bytes or bytearray, not float
Error processing record 55: the JSON object must be str, bytes or bytearray, not float
Error processing record 91: the JSON object must be str, bytes or bytearray, not float
Processed 100 records...
Error processing record 124: the JSON object must be str, bytes or bytearray, not float
Processed 200 records...
Error processing record 222: the JSON object must be str, bytes or bytearray, not float
Error processing record 305: the JSON object must be str, bytes or bytearray, not float
Processed 300 records...
Error processing record 341: the JSON object must be str, bytes or bytearray, not float
Error processing record 356: the JSON object must be str, bytes or bytearray, not float
Processed 400 records...
Error processing record 418: the JSON object must be str, bytes or bytearray, not float
Error processing record 

#### Link user profile to the linkedIn data.

In [43]:
import pandas as pd

# Read the CSV file
linkedin_user_df = pd.read_csv('dataset/linkedin_user_profile.csv')
print(linkedin_user_df.head())
print("\nColumn headers:")
print(linkedin_user_df.columns.tolist())
print("Total number of rows:", linkedin_user_df.shape[0])


            current_company_name  \
0                            NaN   
1               Gemeente Utrecht   
2                   Network Rail   
3                      Freelance   
4  Mid-Range Computer Group Inc.   

                                            position  \
0  Snr Business Analyst at Emploi et Développemen...   
1  Communicatieadviseur Corporate & Strategie Gem...   
2               Network Data Manager at Network Rail   
3                             Architetto (Freelance)   
4  Senior Account Executive at Mid-Range Computer...   

                                               about  \
0                                                NaN   
1  Allround Marketing & Communicatie Adviseur met...   
2  Experienced Data Manager with a demonstrated h...   
3                                                NaN   
4                                                NaN   

                                               posts  \
0  [{"attribution":"Liked by Catherine Fitzpatric... 