In [37]:
import os
import pandas as pd
import numpy as np
import heapq

# Number of chunks the raw reviews file is split into by split_reviews() and read
# back by iterate_over_split_reviews().
N_SPLIT = 10
# Project root that main() changes into before resolving the relative data paths.
# Defaults to the original author's checkout; set the IRTM_ROOT_PATH environment
# variable to run the notebook from another machine or location.
ROOT_PATH = os.environ.get(
    'IRTM_ROOT_PATH',
    '/Users/ruby0322/Projects/112-1/IRTM/term-project/irtm-final-project',
)

In [67]:
def read_network(input_filepath) -> dict[str, list[str]]:
    """
    Build an undirected adjacency list from a whitespace-separated edge-list file.

    :param input_filepath: Path of a text file holding one edge per line, written as
                           two node names separated by whitespace.
    :return: A dictionary mapping every node to the list of its neighbors. Each edge is
             recorded in both directions; an edge listed several times yields repeated
             neighbor entries.
    :raises ValueError: If a non-blank line does not consist of exactly two tokens.
    """
    G = {}
    with open(input_filepath) as file:
        for line_number, line in enumerate(file, start=1):
            tokens = line.split()
            if not tokens:
                # Blank / whitespace-only lines (e.g. a trailing empty line) hold no edge.
                continue
            if len(tokens) != 2:
                raise ValueError(
                    f'{input_filepath}:{line_number}: expected 2 node names, got {len(tokens)}'
                )
            n1, n2 = tokens
            # setdefault creates the neighbor list the first time a node is seen.
            G.setdefault(n1, []).append(n2)
            G.setdefault(n2, []).append(n1)
    return G

def dijkstra_unweighted(G, start):
    """
    Compute shortest paths from the start node to all other nodes in an unweighted graph.

    :param G: A dictionary representing the adjacency list of the graph.
                  Each key is a node, and its value is a list of its neighbors.
                  A neighbor that is not itself a key of G is treated as a node
                  without outgoing edges.
    :param start: The starting node; must be a key of G.
    :return: A dictionary of shortest distances (number of edges) from the start node
             to every node. Nodes that cannot be reached keep float('infinity').
    :raises KeyError: If the start node is not a key of G.
    """
    if start not in G:
        # Fail before doing any work, with a message that names the offending node.
        raise KeyError(f'start node {start!r} is not in the graph')

    unreachable = float('infinity')

    # Initialize distances as infinity and distance to start node as 0
    distances = {node: unreachable for node in G}
    distances[start] = 0

    # Priority queue to hold nodes and their current distances
    pq = [(0, start)]

    while pq:
        current_distance, current_node = heapq.heappop(pq)
        distance = current_distance + 1  # Each edge has a weight of 1

        # Explore neighbors; .get() lets nodes that only appear as neighbors act as leaves
        for neighbor in G.get(current_node, ()):
            # Update distance if a shorter path is found
            if distance < distances.get(neighbor, unreachable):
                distances[neighbor] = distance
                heapq.heappush(pq, (distance, neighbor))

    return distances

def get_subnetwork(G, mu, k=1):
    """
    Group the nodes around ``mu`` into layers by their distance from it.

    :param G: Adjacency-list dictionary of the graph.
    :param mu: The node the distances are measured from.
    :param k: Number of layers to produce.
    :return: A list of ``k`` lists; the list at index ``d`` holds the nodes whose
             shortest distance from ``mu`` is exactly ``d``, for ``d`` in ``0 .. k-1``.
    """
    # NOTE(review): depth k itself is excluded (range(k)), so the default k=1 yields
    # only the distance-0 layer -- confirm that an exclusive bound is intended.
    hops_from_mu = dijkstra_unweighted(G, mu)
    layers = []
    for depth in range(k):
        layers.append([node for node, hops in hops_from_mu.items() if hops == depth])
    return layers

def split_reviews(input_filepath, output_folder) -> None:
    """
    Split the raw reviews file into N_SPLIT chunks and save each chunk as a CSV file.

    The file is split on line breaks into N_SPLIT consecutive runs of lines. Each run
    is executed as Python code with an empty dict bound to the name ``reviews``; the
    dict the code fills in is turned into a DataFrame, transposed, and written to
    ``{output_folder}/reviews-{i}.csv`` without the index.

    SECURITY: the file content is passed to exec(), i.e. it runs as arbitrary Python
    code. Only call this on a trusted input file.

    :param input_filepath: Path of the raw reviews file.
    :param output_folder: Folder receiving the CSV files; created if it does not exist.
    """
    # Read everything up front so the input file is closed before the slow part starts.
    with open(input_filepath) as file:
        lines = file.read().split('\n')

    # to_csv() does not create missing directories, so make sure the target exists.
    if output_folder:
        os.makedirs(output_folder, exist_ok=True)

    chunk_size = (len(lines) // N_SPLIT) + 1
    for i in range(N_SPLIT):
        # chunk_size * N_SPLIT exceeds len(lines) and slicing clips at the end of the
        # list, so the last chunk needs no special-cased upper bound.
        chunk = lines[chunk_size * i:chunk_size * (i + 1)]
        print(f'[split-reviews] Parsing reviews split {i}...')
        reviews = dict()
        # WARNING: executes file content as code (see SECURITY note in the docstring).
        exec('\n'.join(chunk), { 'reviews': reviews })
        # NOTE(review): dumps the whole parsed chunk to stdout; consider removing.
        print(reviews)
        print(f'[split-reviews] Saving reviews split {i} into "reviews-{i}.csv"...')
        pd.DataFrame(reviews).transpose().to_csv(f'{output_folder}/reviews-{i}.csv', index=False)

def iterate_over_split_reviews(func):
    """
    Apply a row filter to every saved review split and stack the rows it keeps.

    Reads ``./reviews/reviews-{i}.csv`` for ``i`` in ``0 .. N_SPLIT-1`` (relative to the
    current working directory).

    :param func: a function that receives one split as a DataFrame and returns a list
                 (or Series) of booleans that indexes the reviews to keep
    :return: A DataFrame with the kept rows of all splits, in split order. Row labels
             are the ones each split had when it was read; they are not reset.
    """
    selected = []
    for i in range(N_SPLIT):
        df = pd.read_csv(f'./reviews/reviews-{i}.csv')
        selected.append(df[func(df)])
    if not selected:
        # pd.concat() rejects an empty list; keep returning an empty frame instead.
        return pd.DataFrame()
    # Concatenate once at the end instead of re-copying the accumulated frame per split.
    return pd.concat(selected)

def get_network_reviews(network: list[str]) -> pd.DataFrame:
    """
    Collect, across all review splits, the reviews whose ``user`` value is in ``network``.

    :param network: list of users in the network
    :return: the matching review rows as returned by iterate_over_split_reviews
    """
    return iterate_over_split_reviews(lambda reviews: reviews['user'].isin(network))

def get_subnetwork_reviews_by_mu(G, mu, k=1) -> pd.DataFrame:
    """
    Fetch the reviews written by the users in the subnetwork around ``mu``.

    NOTE(review): the previous body indexed each depth layer (a list) with the string
    keys 'user' and 'depth', which raises TypeError, and returned nothing. Returning
    the reviews tagged with a ``depth`` column is the presumed intent -- confirm.

    :param G: Adjacency-list dictionary of the graph.
    :param mu: The user at the center of the subnetwork.
    :param k: Number of depth layers requested from get_subnetwork.
    :return: The reviews of every user in those layers, with an added ``depth`` column
             holding the index of the layer the reviewing user belongs to.
    """
    layers = get_subnetwork(G, mu, k)
    # Layer index == distance from mu, so enumerate() recovers each user's depth.
    depth_of = {user: depth for depth, users in enumerate(layers) for user in users}
    reviews = get_network_reviews(list(depth_of))
    if 'user' not in reviews.columns:
        # Nothing was read (e.g. no splits), so there is no column to map depths from.
        return reviews
    return reviews.assign(depth=reviews['user'].map(depth_of))


In [88]:
def main():
    """
    Notebook entry point.

    Changes the working directory to ROOT_PATH, loads the graph from
    ./raw-data/edges.txt and prints the first two depth layers around user 'slash'.
    """
    os.chdir(ROOT_PATH)

    # One-off preprocessing step, kept disabled:
    # split_reviews('./raw-data/reviews.txt', 'reviews')

    network = read_network('./raw-data/edges.txt')

    # Exploratory calls, kept disabled. The second one predates get_subnetwork
    # returning a list and would fail on .items() as written.
    # print(get_network_reviews(G['slash'] + ['slash']))
    # print(pd.DataFrame(list(get_subnetwork(G, 'carterchristian1', 4).items())))

    layers_around_slash = get_subnetwork(network, 'slash', 2)
    print(layers_around_slash)

In [89]:
# Run main() only when this code executes as the top-level script/cell (where __name__
# is '__main__'), not when it is imported from another module.
if __name__ == '__main__':
    main()

            user  depth
0           Rodo      1
1         anehan      2
2      sevilemar      2
3         dingsi      2
4          slash      0
..           ...    ...
362  windlegends      2
363    Zimraphel      2
364   Ranunculus      2
365         ncla      2
366   TackyJulie      2

[367 rows x 2 columns]
