In [3]:
import requests
import json

def fetch_data(api_url, token, page_size=1000, max_pages=10, data_key="posts", timeout=30):
    """Fetch paginated items from an API endpoint, de-duplicated by their 'id'.

    Pages are requested in order starting at 1. Fetching stops when a page
    yields no unseen items, when a non-200 status is returned, when an error
    occurs, or after ``max_pages`` pages; whatever was collected so far is
    returned in every case (errors are printed, not raised).

    Args:
        api_url: Endpoint URL; ``page`` and ``page_size`` are sent as query
            parameters.
        token: Value sent in the ``Flic-Token`` request header.
        page_size: Number of items requested per page.
        max_pages: Upper bound on the number of pages requested.
        data_key: Key of the JSON response that holds the list of items.
        timeout: Seconds to wait for each request before giving up.

    Returns:
        A list of the item dicts, in the order first seen, each 'id' at most once.
    """
    all_data = []  # Combined items from all pages
    seen_ids = set()  # IDs already collected from earlier pages
    headers = {"Flic-Token": token}  # Authorization header

    for page in range(1, max_pages + 1):
        try:
            # Let requests build the query string (safe even if api_url already
            # carries one) and never wait forever on a stalled connection.
            response = requests.get(
                api_url,
                headers=headers,
                params={"page": page, "page_size": page_size},
                timeout=timeout,
            )

            # Check for successful response
            if response.status_code != 200:
                print(f"Failed to fetch page {page}: Status code {response.status_code}")
                break

            # Parse the JSON response
            data = response.json()

            # A missing key on the very first page almost certainly means the
            # endpoint uses a different key; say so instead of returning [] silently.
            if page == 1 and isinstance(data, dict) and data_key not in data:
                print(f"Warning: response has no '{data_key}' key (available keys: {sorted(data)}).")

            # Collect unseen items, tracking IDs as we go so that duplicates
            # inside the same page are dropped as well.
            page_data = []
            page_ids = set()
            for post in data.get(data_key) or []:
                post_id = post['id']
                if post_id in seen_ids or post_id in page_ids:
                    continue
                page_ids.add(post_id)
                page_data.append(post)

            # Commit the page only after it was processed completely
            all_data.extend(page_data)
            seen_ids.update(page_ids)

            # Stop if no new data is fetched (last page reached)
            if not page_data:
                print("No more data to fetch.")
                break
        except Exception as e:
            # Best effort: report the problem and return what was collected so far.
            print(f"An error occurred on page {page}: {e}")
            break

    return all_data

# Define the API URL and read the token from the environment.
# The Flic token is a credential and must not live in the notebook source:
# export FLIC_TOKEN before starting the kernel.
# NOTE(review): the token that used to be embedded here has been exposed to
# anyone with access to this file and should be rotated.
import os

api_url = "https://api.socialverseapp.com/posts/view"
api_token = os.environ.get("FLIC_TOKEN")
if not api_token:
    raise RuntimeError("Set the FLIC_TOKEN environment variable to your Flic API token.")

# Fetch data
fetched_data = fetch_data(api_url, api_token)

# Save the data to a JSON file
with open('fetched_data.json', 'w', encoding='utf-8') as json_file:
    json.dump(fetched_data, json_file, indent=4)

print(f"Fetched {len(fetched_data)} posts.")


No more data to fetch.
Fetched 225 posts.


In [4]:
import requests
import json

def fetch_data(api_url, token, page_size=1000, max_pages=10, data_key="posts", timeout=30):
    """Fetch paginated items from an API endpoint, de-duplicated by their 'id'.

    Pages are requested in order starting at 1. Fetching stops when a page
    yields no unseen items, when a non-200 status is returned, when an error
    occurs, or after ``max_pages`` pages; whatever was collected so far is
    returned in every case (errors are printed, not raised).

    Args:
        api_url: Endpoint URL; ``page`` and ``page_size`` are sent as query
            parameters.
        token: Value sent in the ``Flic-Token`` request header.
        page_size: Number of items requested per page.
        max_pages: Upper bound on the number of pages requested.
        data_key: Key of the JSON response that holds the list of items.
        timeout: Seconds to wait for each request before giving up.

    Returns:
        A list of the item dicts, in the order first seen, each 'id' at most once.
    """
    all_data = []
    seen_ids = set()
    headers = {"Flic-Token": token}

    for page in range(1, max_pages + 1):
        try:
            # params= lets requests build the query string; timeout= keeps a
            # stalled connection from blocking the notebook indefinitely.
            response = requests.get(
                api_url,
                headers=headers,
                params={"page": page, "page_size": page_size},
                timeout=timeout,
            )
            if response.status_code != 200:
                print(f"Failed to fetch page {page}: Status code {response.status_code}")
                break
            data = response.json()

            # A missing key on the first page means the endpoint most likely
            # uses a different key; report it rather than returning [] silently.
            if page == 1 and isinstance(data, dict) and data_key not in data:
                print(f"Warning: response has no '{data_key}' key (available keys: {sorted(data)}).")

            # Track IDs while scanning so duplicates within one page are dropped too.
            page_data = []
            page_ids = set()
            for post in data.get(data_key) or []:
                post_id = post['id']
                if post_id in seen_ids or post_id in page_ids:
                    continue
                page_ids.add(post_id)
                page_data.append(post)

            all_data.extend(page_data)
            seen_ids.update(page_ids)
            if not page_data:
                print("No more data to fetch.")
                break
        except Exception as e:
            # Best effort: report the problem and return what was collected so far.
            print(f"An error occurred on page {page}: {e}")
            break
    return all_data

# Read the Flic token from the environment instead of embedding it in the
# notebook: export FLIC_TOKEN before starting the kernel.
# NOTE(review): the token that used to be embedded here has been exposed to
# anyone with access to this file and should be rotated.
import os

api_token = os.environ.get("FLIC_TOKEN")
if not api_token:
    raise RuntimeError("Set the FLIC_TOKEN environment variable to your Flic API token.")

# Output name -> endpoint. Each endpoint is dumped to "<name>_data.json".
api_urls = {
    "viewed_posts": "https://api.socialverseapp.com/posts/view",
    "liked_posts": "https://api.socialverseapp.com/posts/like",
    "inspired_posts": "https://api.socialverseapp.com/posts/inspire",
    "rated_posts": "https://api.socialverseapp.com/posts/rating",
    "all_posts": "https://api.socialverseapp.com/posts/summary/get",
    # NOTE(review): the recorded run fetched 0 items for all_users. fetch_data
    # is called here without naming a response key and looks under 'posts';
    # presumably this endpoint returns its list under another key - verify.
    "all_users": "https://api.socialverseapp.com/users/get_all"
}

for name, url in api_urls.items():
    print(f"Fetching data for {name}...")
    fetched_data = fetch_data(url, api_token)
    output_file = f"{name}_data.json"
    with open(output_file, 'w', encoding='utf-8') as json_file:
        json.dump(fetched_data, json_file, indent=4)
    print(f"Fetched {len(fetched_data)} items for {name}. Data saved to {output_file}.")


Fetching data for viewed_posts...
No more data to fetch.
Fetched 225 items for viewed_posts. Data saved to viewed_posts_data.json.
Fetching data for liked_posts...
No more data to fetch.
Fetched 64 items for liked_posts. Data saved to liked_posts_data.json.
Fetching data for inspired_posts...
No more data to fetch.
Fetched 30 items for inspired_posts. Data saved to inspired_posts_data.json.
Fetching data for rated_posts...
No more data to fetch.
Fetched 210 items for rated_posts. Data saved to rated_posts_data.json.
Fetching data for all_posts...
No more data to fetch.
Fetched 1140 items for all_posts. Data saved to all_posts_data.json.
Fetching data for all_users...
No more data to fetch.
Fetched 0 items for all_users. Data saved to all_users_data.json.


In [5]:
import pandas as pd
# Read the viewed-posts dump straight into a DataFrame and print its first rows
file_path = "viewed_posts_data.json"
df = pd.read_json(path_or_buf=file_path)
print(df.head(n=5))

     id                                           category  \
0  1382  {'id': 8, 'name': 'Bloom Scroll', 'count': 58,...   
1  1026  {'id': 2, 'name': 'Vible', 'count': 535, 'desc...   
2  1353  {'id': 8, 'name': 'Bloom Scroll', 'count': 58,...   
3  1363  {'id': 8, 'name': 'Bloom Scroll', 'count': 58,...   
4  2372  {'id': 11, 'name': 'Stop Scrolling', 'count': ...   

                                       slug  \
0  268bb7f9859c35868e35b1da17fbf03ce72cf12b   
1  e83989cff19e2dfc1fb7c359fcff4d784ed2fe49   
2  fea6118d73a756c12f254c18c7997873f5a7425a   
3  772922863fe980820a9c0dd719752f8df7ae6153   
4  49c0a9a45c3d66ce31cba5db374fccb533f0532d   

                                               title identifier  \
0  🐐 A mountain encounter with the king of the Alps.    _xtoUMf   
1  Do you believe in Jesus Christ Not everyone sa...    Wud4z0o   
2  Not one, but two firsts for me on this one The...    hU83THA   
3  Forest 🌳 is such an incredible sanctuary for r...    rAetjFY   
4        

In [7]:
from pandas import json_normalize

# Load raw JSON and normalize it
import json
# Parse the raw records, then flatten nested objects into dotted column names
with open(file_path) as raw_file:
    data = json.loads(raw_file.read())

df = pd.json_normalize(data=data)
df.head()

Unnamed: 0,id,slug,title,identifier,comment_count,upvote_count,view_count,exit_count,rating_count,average_rating,...,post_summary,category.id,category.name,category.count,category.description,category.image_url,baseToken.address,baseToken.name,baseToken.symbol,baseToken.image_url
0,1382,268bb7f9859c35868e35b1da17fbf03ce72cf12b,🐐 A mountain encounter with the king of the Alps.,_xtoUMf,0,1,29,469,3,23,...,[],8,Bloom Scroll,58,Scroll until you bloom,https://socialverse-assets.s3.us-east-1.amazon...,,,,
1,1026,e83989cff19e2dfc1fb7c359fcff4d784ed2fe49,Do you believe in Jesus Christ Not everyone sa...,Wud4z0o,0,50,83,0,1,100,...,[],2,Vible,535,All the best vibes!,https://assets.socialverseapp.com/categories/a...,,,,
2,1353,fea6118d73a756c12f254c18c7997873f5a7425a,"Not one, but two firsts for me on this one The...",hU83THA,0,24,82,185,2,35,...,[],8,Bloom Scroll,58,Scroll until you bloom,https://socialverse-assets.s3.us-east-1.amazon...,,,,
3,1363,772922863fe980820a9c0dd719752f8df7ae6153,Forest 🌳 is such an incredible sanctuary for r...,rAetjFY,0,8,107,92,3,33,...,[],8,Bloom Scroll,58,Scroll until you bloom,https://socialverse-assets.s3.us-east-1.amazon...,,,,
4,2372,49c0a9a45c3d66ce31cba5db374fccb533f0532d,post 11,puRM1er,0,0,6,434,0,97,...,[],11,Stop Scrolling,1,Scroll with the goal of stopping,https://assets.socialverseapp.com/categories/d...,,,,


In [9]:
# Flatten the records again, joining nested keys with '_' instead of '.'
df = pd.json_normalize(data=data, sep="_")
df.columns

Index(['id', 'slug', 'title', 'identifier', 'comment_count', 'upvote_count',
       'view_count', 'exit_count', 'rating_count', 'average_rating',
       'share_count', 'video_link', 'contract_address', 'chain_id',
       'chart_url', 'is_locked', 'created_at', 'first_name', 'last_name',
       'username', 'upvoted', 'bookmarked', 'thumbnail_url',
       'gif_thumbnail_url', 'following', 'picture_url', 'post_summary',
       'category_id', 'category_name', 'category_count',
       'category_description', 'category_image_url', 'baseToken_address',
       'baseToken_name', 'baseToken_symbol', 'baseToken_image_url'],
      dtype='object')

In [10]:
# Weighted engagement: each view counts 1, each upvote 2, each share 3
df["engagement_score"] = (
    df["view_count"]
    + df["upvote_count"] * 2
    + df["share_count"] * 3
)

In [12]:
# Total views, upvotes and shares per category name
engagement_columns = ["view_count", "upvote_count", "share_count"]
category_analysis = df.groupby("category_name")[engagement_columns].sum()
category_analysis


Unnamed: 0_level_0,view_count,upvote_count,share_count
category_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Bloom Scroll,3136,563,70
Bot,19,0,0
E/ACC,853,343,0
Flic,577,155,0
InstaRama,69,110,0
Pumptok,657,220,0
SolTok,4014,1325,15
Stop Scrolling,6,0,0
Super Feed,671,0,1
Vible,5559,2928,58


In [13]:
# Folder holding the JSON dumps. Set the VIDEO_DATASET_DIR environment variable
# to point the notebook at another location; when it is unset the original
# machine-specific path is used, so existing runs behave as before.
import os

directory = os.environ.get(
    'VIDEO_DATASET_DIR',
    '/Users/gauravsingh/VideoRecommendation/dataset',
)

In [14]:
import os
# Load every JSON file in `directory` and stack the records into one DataFrame.
# Frames are gathered in a list and concatenated once at the end: calling
# pd.concat inside the loop copies the accumulated frame on every iteration.
frames = []

# os.listdir() returns names in arbitrary order; sorting makes the row order
# (and therefore which duplicate a later drop_duplicates keeps) reproducible.
for filename in sorted(os.listdir(directory)):
    if not filename.endswith('.json'):
        continue
    filepath = os.path.join(directory, filename)
    with open(filepath, 'r', encoding='utf-8') as file:
        records = json.load(file)
    # Flatten nested objects into '_'-joined column names
    frame = pd.json_normalize(records, sep='_')
    # Files without any records contribute nothing, so leave them out
    if not frame.empty:
        frames.append(frame)

# pd.concat raises on an empty list, so fall back to an empty frame
combined_data = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()


In [15]:
# Keep only the first row seen for every 'id'
# NOTE(review): rows from all loaded files share this one key - confirm that
# 'id' values cannot collide between different kinds of records.
combined_data = combined_data.drop_duplicates(subset=['id'], keep='first')

In [16]:
# Report the number of missing values in each column
missing_per_column = combined_data.isna().sum()
print(missing_per_column)

# Treat absent view/upvote/share counters as zero; other columns are left untouched
zero_defaults = dict.fromkeys(['view_count', 'upvote_count', 'share_count'], 0)
combined_data = combined_data.fillna(value=zero_defaults)

id                                                        0
slug                                                      0
title                                                     0
identifier                                                0
comment_count                                             0
                                                       ... 
post_summary_entities_caregiver_role                   1167
post_summary_entities_baby_role                        1167
post_summary_topics_of_video_caregiver interactions    1167
post_summary_psycological_view_of_video_caregiver      1167
post_summary_entities_main_speaker_age_group           1167
Length: 1905, dtype: int64


In [17]:
# Parse 'created_at' as a count of milliseconds (unit='ms') into datetimes
created_raw = combined_data['created_at']
combined_data['created_at'] = pd.to_datetime(created_raw, unit='ms')

# Make sure the counter and rating columns carry numeric dtypes
numeric_columns = [
    'view_count',
    'upvote_count',
    'share_count',
    'exit_count',
    'rating_count',
    'average_rating',
]
numeric_part = combined_data[numeric_columns]
combined_data[numeric_columns] = numeric_part.apply(pd.to_numeric)

In [18]:
# Weighted engagement: each view counts 1, each upvote 2, each share 3
views = combined_data['view_count']
upvotes = combined_data['upvote_count']
shares = combined_data['share_count']
combined_data['engagement_score'] = views + upvotes * 2 + shares * 3


In [19]:
# Name of the weekday on which each row's 'created_at' timestamp falls
created_at = combined_data['created_at']
combined_data['day_of_week'] = created_at.dt.day_name()


In [20]:
# Columns carried over into the exported table
export_columns = [
    'id',
    'title',
    'category_name',
    'username',
    'view_count',
    'upvote_count',
    'share_count',
    'engagement_score',
    'created_at',
]
final_data = combined_data.loc[:, export_columns]


In [21]:
# Write the selected columns to CSV without the DataFrame index
final_data.to_csv(path_or_buf='processed_posts_data.csv', index=False)
