# This script processes a citation network text file 
And builds a dictionary mapping each paper's unique `#index` to a list of paper indices it references (i.e., citations). It includes logic to ensure that only papers with authors listed are considered, and references (`#%`) are only counted for such papers. The final mapping is saved in mapping_idx.pkl file


In [1]:
import pickle

def parse_txt_to_dict_streaming(input_file, output_file):
    data_dict = {}
    current_refs = []
    has_authors = False
    current_index = None

    with open(input_file, 'r', encoding='utf-8') as f:
        next(f)  # skip first line

        for line in f:
            line = line.strip()
            if line.startswith('#*'):
                current_refs = []
                has_authors = False
                current_index = None
            elif line.startswith('#@'):
                if line != '#@':
                    has_authors = True
            elif line.startswith('#%'):
                if has_authors:
                    ref = line[2:].strip()
                    if ref.isdigit():
                        current_refs.append(int(ref))
            elif line.startswith('#index'):
                if has_authors:
                    current_index = int(line[6:].strip())
                    data_dict[current_index] = current_refs

                    # Optional: periodically dump to disk to free memory
                    if len(data_dict) % 100000 == 0:
                        print(f"Processed {len(data_dict)} entries")

    # Save the final dictionary
    with open(output_file, 'wb') as f:
        pickle.dump(data_dict, f, protocol=pickle.HIGHEST_PROTOCOL)

    print(f"Saved {len(data_dict)} entries to {output_file}")

parse_txt_to_dict_streaming('citation-network2.txt', 'mapping_idx.pkl')


Processed 100000 entries
Processed 200000 entries
Processed 300000 entries
Processed 400000 entries
Processed 500000 entries
Processed 600000 entries
Processed 700000 entries
Processed 800000 entries
Processed 900000 entries
Processed 1000000 entries
Processed 1100000 entries
Processed 1200000 entries
Processed 1300000 entries
Saved 1329989 entries to mapping_idx.pkl


In [2]:
import pickle

# Load the dictionary from the pickle file
with open('mapping_idx.pkl', 'rb') as f:
    data_dict = pickle.load(f)

# Print top 10 items (key-value pairs)
for i, (key, value) in enumerate(data_dict.items()):
    print(f"{key}: {value}")
    if i == 9:
        break

0: []
1: [774794, 95940]
3: [858446, 435642, 1293715, 412124, 414766, 1301929, 1102537, 102223]
4: []
5: []
6: []
7: []
8: [378882]
9: [684494, 439170, 495494, 794270, 911369, 800580, 376413, 102495, 511257, 423180, 402729]
10: []


# Remove entries with empty value lists

In [3]:
import pickle

# Load the dictionary
with open("mapping_idx.pkl", "rb") as f:
    mapping_idx = pickle.load(f)

# Remove entries with empty value lists
filtered_mapping = {k: v for k, v in mapping_idx.items() if v}

# Save the filtered dictionary
with open("mapping_idx_filtered.pkl", "wb") as f:
    pickle.dump(filtered_mapping, f)

print(f"Filtered dictionary saved with {len(filtered_mapping)} entries.")


Filtered dictionary saved with 386497 entries.


In [4]:
# Print first 10 entries for verification
for i, (k, v) in enumerate(filtered_mapping.items()):
    print(f"{k}: {v}")
    if i == 9:
        break

1: [774794, 95940]
3: [858446, 435642, 1293715, 412124, 414766, 1301929, 1102537, 102223]
8: [378882]
9: [684494, 439170, 495494, 794270, 911369, 800580, 376413, 102495, 511257, 423180, 402729]
13: [1284256, 407367, 1294040, 429402, 354224, 124237, 290792]
24: [1288569, 806859, 628820, 636684, 1300010, 301407]
25: [629319, 123113, 775152, 647357, 622012, 894102, 775711, 645080]
27: [5032]
31: [166201, 988854, 109276, 783542, 124393, 292880, 152144, 392657, 434387, 311159, 106860]
34: [406408, 301680, 950201]


**processes citation network text file and creates a dictionary mapping each paper's unique `#index` to its list of authors from the `#@` field. Only entries with non-empty author fields are considered. The resulting dictionary is saved to a `author_mapping.pkl` file for later use.**


In [5]:
import pickle

def parse_authors_to_dict(file_path):
    data_dict = {}
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    # Ignore the first line
    lines = lines[1:]

    current_authors = None
    current_index = None

    for line in lines:
        line = line.strip()

        if line.startswith("#@"):
            current_authors = line[2:].strip()
        elif line.startswith("#index"):
            current_index = int(line[6:].strip())
            if current_authors:  # Only if authors exist
                data_dict[current_index] = current_authors
            # Reset for next record
            current_authors = None
            current_index = None

    return data_dict

# File path to your input text file
input_file = "citation-network2.txt"  
output_file = "author_mapping.pkl"

# Create dictionary and save to .pkl
author_dict = parse_authors_to_dict(input_file)
with open(output_file, 'wb') as f:
    pickle.dump(author_dict, f)

print(f"Saved {len(author_dict)} records to '{output_file}'")


Saved 1329989 records to 'author_mapping.pkl'


In [6]:
import pickle

# Load the dictionary from the pickle file
with open('author_mapping.pkl', 'rb') as f:
    data_dict = pickle.load(f)

# Print top 10 items (key-value pairs)
for i, (key, value) in enumerate(data_dict.items()):
    print(f"{key}: {value}")
    if i == 9:
        break


0: E. S. Cho,C. J. Kim,S. D. Kim,S. Y. Rhew
1: Lori M. Weber,Alysha Loumakis,James Bergman
3: Choong-Gyoo Lim
4: Jose Maria Perez,Felix Garcia,Jesus Carretero,Alejandro Calderon,Luis Miguel Sanchez
5: Jean Kumagai
6: Marek Rusinkiewicz,Dimitrios Georgakopoulos
7: Barton C. Massey,Evan Tick
8: Jan Ramon
9: Therapon Skotiniotis,Ji-en Morris Chang
10: V. Martin,K. Schwan


**Filters an existing author-to-paper mapping (`author_mapping.pkl`) using a secondary file (`AMiner-Author.txt`) that contains detailed author metadata including affiliation. Only authors who have non-empty affiliations (`#a`) are retained in the final dictionary. The result is saved to a new `remove_author_#a.pkl` file.**


In [7]:
import pickle

# Load author mapping dictionary
with open("author_mapping.pkl", "rb") as f:
    author_mapping = pickle.load(f)

# Path to the large .txt file containing paper and author info
txt_file_path = "AMiner-Author.txt"

# Preprocessing:
# We'll parse the txt file once and build a mapping: author_name -> affiliation (empty if none)
author_affiliation = {}

with open(txt_file_path, "r", encoding="utf-8") as f:
    current_author = None
    current_affiliation = None
    for line in f:
        line = line.strip()
        if line.startswith("#index"):
            # reset for new paper
            current_author = None
            current_affiliation = None
        elif line.startswith("#n "):
            current_author = line[3:].strip()
        elif line.startswith("#a "):
            current_affiliation = line[3:].strip()
            if current_author is not None:
                author_affiliation[current_author] = current_affiliation

# Now filter authors from author_mapping using author_affiliation
filtered_author_mapping = {}

for key, authors_str in author_mapping.items():
    authors = [a.strip() for a in authors_str.split(",") if a.strip()]
    filtered_authors = []
    for author in authors:
        aff = author_affiliation.get(author)
        if aff and aff.strip():
            filtered_authors.append(author)
    if filtered_authors:
        filtered_author_mapping[key] = ", ".join(filtered_authors)
    # else key is skipped (all authors removed)

# Save filtered author mapping
with open("remove_author_#a.pkl", "wb") as f:
    pickle.dump(filtered_author_mapping, f)

print(f"Filtered author mapping saved with {len(filtered_author_mapping)} entries.")


Filtered author mapping saved with 1036217 entries.


In [8]:
# Print first 10 entries for verification
for i, (k, v) in enumerate(filtered_author_mapping.items()):
    print(f"{k}: {v}")
    if i == 9:
        break

0: C. J. Kim, S. D. Kim
1: Lori M. Weber, Alysha Loumakis, James Bergman
3: Choong-Gyoo Lim
4: Felix Garcia, Jesus Carretero, Luis Miguel Sanchez
5: Jean Kumagai
6: Marek Rusinkiewicz
7: Evan Tick
8: Jan Ramon
9: Therapon Skotiniotis, Ji-en Morris Chang
10: V. Martin, K. Schwan


**performs a final filtering step by intersecting a paper-to-author index mapping (`mapping_idx_filtered.pkl`) with a filtered author list (`remove_author_#a.pkl`) that contains only authors with valid affiliations. The resulting dictionary keeps only those entries where both the paper and the authors are validated, and saves it as `final_mapping.pkl`.**


In [9]:
import pickle

# Load the dictionaries
with open("mapping_idx_filtered.pkl", "rb") as f:
    mapping_idx_filtered = pickle.load(f)

with open("remove_author_#a.pkl", "rb") as f:
    remove_author_a = pickle.load(f)

# Filter the dictionary
final_mapping = {}

for key, authors in mapping_idx_filtered.items():
    # Keep key only if it's present in remove_author_a (as a key)
    if key not in remove_author_a:
        continue

    # Filter author list to only those present in remove_author_a
    filtered_authors = [author for author in authors if author in remove_author_a]

    # If filtered list is not empty, keep it
    if filtered_authors:
        final_mapping[key] = filtered_authors

# Save the final filtered mapping
with open("final_mapping.pkl", "wb") as f:
    pickle.dump(final_mapping, f)

print(f"Final mapping saved with {len(final_mapping)} keys.")


Final mapping saved with 348882 keys.


In [10]:
for i, (k, v) in enumerate(final_mapping.items()):
    print(f"{k}: {v}")
    if i == 9:
        break
    
# print(data_dict[774794])
print(final_mapping[108893])

1: [774794, 95940]
3: [858446, 435642, 412124, 414766, 1301929, 1102537, 102223]
8: [378882]
9: [684494, 439170, 495494, 800580, 102495, 511257, 402729]
13: [407367, 1294040, 429402, 354224, 124237, 290792]
24: [1288569, 806859, 628820, 636684, 1300010, 301407]
25: [629319, 123113, 775152, 647357, 622012, 775711, 645080]
27: [5032]
31: [109276, 783542, 124393, 292880, 392657, 434387, 106860]
34: [406408, 301680, 950201]
[102229, 512523, 401309, 158212, 118593, 498294, 311269, 685389]


**performs a Depth-First Search (DFS) traversal on a citation graph stored in `final_mapping.pkl`, starting from each of the first `max_nodes` papers. It collects all reachable papers (multi-hop references) and stores the result in `multi_hop_first_11000.pkl`. This helps in understanding multi-hop citation chains across papers.**


In [11]:
import pickle

# Load the input graph
with open("final_mapping.pkl", "rb") as f:
    graph = pickle.load(f)

# Parameters
max_nodes = 11000  # Only process first max_nodes(<22000) nodes
print_interval = 1000  # Print progress every 1000 nodes
output_file = "multi_hop_first_11000.pkl"  # Single output file

results = {}
processed = 0

for start_key in list(graph.keys())[:max_nodes]:  # Only take first max_nodes(<22000) keys
    visited = set()
    stack = [start_key]
    result = set()

    while stack:
        node = stack.pop()
        if node in visited:
            continue
        visited.add(node)
        if node != start_key:
            result.add(node)

        for neighbor in graph.get(node, []):
            if neighbor not in visited:
                stack.append(neighbor)

    results[start_key] = sorted(result)
    processed += 1

    # Print progress
    if processed % print_interval == 0:
        print(f"Processed {processed} keys...")
    
    # Early exit if we've reached our limit
    if processed >= max_nodes:
        break

# Save all results to a single file
with open(output_file, "wb") as f:
    pickle.dump(results, f, protocol=pickle.HIGHEST_PROTOCOL)

print(f"\nDFS completed for {processed} keys. Results saved to '{output_file}'")
print(f"Total papers processed: {len(results)}")
print(f"Example result for first key: {list(results.items())[0] if results else 'No results'}")

Processed 1000 keys...
Processed 2000 keys...
Processed 3000 keys...
Processed 4000 keys...
Processed 5000 keys...
Processed 6000 keys...
Processed 7000 keys...
Processed 8000 keys...
Processed 9000 keys...
Processed 10000 keys...
Processed 11000 keys...

DFS completed for 11000 keys. Results saved to 'multi_hop_first_11000.pkl'
Total papers processed: 11000
Example result for first key: (1, [95940, 774794])


In [12]:
import pickle
with open("multi_hop_first_11000.pkl", "rb") as f:
    test = pickle.load(f)

# for i, (k, v) in enumerate(test.items()):
#     print(f"{k}: {v}")
#     if k == 9:
#         break
    
# print(data_dict[774794])
# print(test[1])

# Ensures a complete mapping of paper IDs to their first authors by:

1. **Loading existing mappings** from `mapping_idx.pkl`, which maps paper IDs to author strings.
2. **Extracting and storing the first author** from each author string.
3. **Parsing the original `citation-network2.txt` file** to find additional paper IDs not in the initial mapping.
4. **For each new paper**, it scans backwards from `#index` to find its `#@` author line and extracts the first author.
5. **Updates the mapping** with these new entries, resulting in a more complete `id_author_mapping`.

This is useful for ensuring every paper (with authors listed) has a known first author for further analysis.


In [13]:
import pickle

# Step 1: Load mapping_idx.pkl and extract first authors
with open("mapping_idx.pkl", "rb") as f:
    mapping_data = pickle.load(f)

id_author_mapping = {}

# Extract first author from existing mapping
for paper_id, authors in mapping_data.items():
    if isinstance(authors, str) and authors.strip():
        first_author = authors.split(",")[0].strip()
        id_author_mapping[int(paper_id)] = first_author

# Step 2: Open citation-network2.txt and add new IDs with first authors if not already present
with open("citation-network2.txt", "r", encoding="utf-8") as f:
    lines = f.readlines()

i = 0
while i < len(lines):
    line = lines[i].strip()

    if line.startswith("#index"):
        paper_id = int(line.replace("#index", "").strip())

        # Check if ID already exists
        if paper_id not in id_author_mapping:
            # Look for #@ line (authors)
            j = i - 1
            first_author = None

            # Walk backwards to find #@ line for current paper
            while j >= 0:
                author_line = lines[j].strip()
                if author_line.startswith("#@"):
                    author_data = author_line.replace("#@", "").strip()
                    if author_data:
                        first_author = author_data.split(",")[0].strip()
                    break
                j -= 1

            # Add if valid first author found
            if first_author:
                id_author_mapping[paper_id] = first_author

    i += 1

#  Final Output
print(" Total IDs collected:", len(id_author_mapping))



 Total IDs collected: 1329989


In [14]:
for i, (k, v) in enumerate(id_author_mapping.items()):
    print(f"{k}: {v}")
    if i == 9:
        break

import pickle

# Save the dictionary to id_author_mapping.pkl
with open("id_author_mapping.pkl", "wb") as f:
    pickle.dump(id_author_mapping, f)

print("id_author_mapping has been saved to id_author_mapping.pkl")


0: E. S. Cho
1: Lori M. Weber
3: Choong-Gyoo Lim
4: Jose Maria Perez
5: Jean Kumagai
6: Marek Rusinkiewicz
7: Barton C. Massey
8: Jan Ramon
9: Therapon Skotiniotis
10: V. Martin
id_author_mapping has been saved to id_author_mapping.pkl


# Processes multi-hop citation data and converts paper IDs into author names.

**Workflow:**
1. Loads a paper ID → author name mapping (`id_author_mapping.pkl`).
2. Loads a dictionary (`multi_hop_first_11000.pkl`) mapping each paper ID to a list of reachable paper IDs (multi-hop citations).
3. For each paper:
   - Maps the source paper and reachable papers to author names.
   - Excludes self-citations (same author).
   - Gathers a list of *unique* cited authors.
4. Saves the final mapping: `source_author → list of unique reachable authors` into `multi_hop_author_name_11000.pkl`.

**Output:** A cleaned author-level multi-hop influence map, filtering out missing or self-cited authors.


In [1]:
import pickle

# Load id to author name mapping
with open("id_author_mapping.pkl", "rb") as f:
    id_author_mapping = pickle.load(f)

# Single file to process (change this to your target file)
input_file = "multi_hop_first_11000.pkl"  
output_file = "multi_hop_author_name_11000.pkl"

print(f"Processing {input_file}...")

# Load the multi-hop data
with open(input_file, "rb") as f:
    multi_hop_data = pickle.load(f)

author_hop_data = {}

for paper_id, reachable_papers in multi_hop_data.items():
    # Skip if paper_id has no author mapping
    if paper_id not in id_author_mapping:
        continue
    
    source_author = id_author_mapping[paper_id]
    unique_authors = set()
    
    for cited_paper in reachable_papers:
        if cited_paper in id_author_mapping:
            cited_author = id_author_mapping[cited_paper]
            # Avoid self-citations
            if cited_author != source_author:
                unique_authors.add(cited_author)
    
    if unique_authors:
        author_hop_data[source_author] = list(unique_authors)

# Save the results
with open(output_file, "wb") as f:
    pickle.dump(author_hop_data, f)

print(f"Processing complete. Results saved to {output_file}")
print(f"Total authors processed: {len(author_hop_data)}")
print(f"Example mapping: {list(author_hop_data.items())[0] if author_hop_data else 'No results'}")

Processing multi_hop_first_11000.pkl...
Processing complete. Results saved to multi_hop_author_name_11000.pkl
Total authors processed: 9967
Example mapping: ('Lori M. Weber', ['James C. Witte', 'John P. Robinson'])


# Filters an author-level multi-hop citation influence map using two criteria:

- **Threshold on number of reachable authors**: Keeps only authors citing more than `threshold` other authors.
- **Fairness constraint**: Keeps only authors whose fairness score exceeds `fairness_threshold`.

**Inputs:**
- `multi_hop_author_name_11000.pkl`: Author-to-multi-hop-author map.
- `fairness_values.pkl`: Dictionary mapping author names to fairness scores.

**Output:**
- `filtered_author_hop_k(10)(.1).pkl`: Filtered author influence map.

**Useful Stats Printed:**
- Total original authors.
- Total authors after filtering.
- Number of authors removed.
- Example filtered entries.


In [2]:
import pickle

# Configuration
input_file = "multi_hop_author_name_11000.pkl"
fairness_file = "fairness_values.pkl"
output_file = "filtered_author_hop_k(10)(.1).pkl"
threshold = 10
fairness_threshold = 0.1

print(f"\n=== Filtering with threshold = {threshold} and fairness >= {fairness_threshold} ===")

# Load multi-hop author data
with open(input_file, "rb") as f:
    data = pickle.load(f)

# Load fairness values
with open(fairness_file, "rb") as f:
    fairness_scores = pickle.load(f)

# Filter step: apply both conditions
filtered_data = {
    key: val for key, val in data.items()
    if len(val) > threshold and fairness_scores.get(key, 0) > fairness_threshold
}

# Calculate stats
original_count = len(data)
filtered_count = len(filtered_data)
removed_count = original_count - filtered_count

# Save the filtered dictionary
with open(output_file, "wb") as f:
    pickle.dump(filtered_data, f)

# Print results
print(f"\nProcessing results for {input_file}:")
print(f"Original keys: {original_count}")
print(f"Filtered keys: {filtered_count}")
print(f"Keys removed: {removed_count}")
print(f"\nFiltered data saved to: {output_file}")
# print(f"Example of kept entries: {list(filtered_data.items())[:2] if filtered_data else 'None'}")



=== Filtering with threshold = 10 and fairness >= 0.1 ===

Processing results for multi_hop_author_name_11000.pkl:
Original keys: 9967
Filtered keys: 5897
Keys removed: 4070

Filtered data saved to: filtered_author_hop_k(10)(.1).pkl


In [3]:
import pickle

# Path to your single .pkl file
pkl_file = "filtered_author_hop_k(10)(.1).pkl"

unique_keys = set()
unique_values = set()

with open(pkl_file, "rb") as f:
    data = pickle.load(f)

unique_keys.update(data.keys())
for v_set in data.values():
    unique_values.update(v_set)

print(f"Total unique keys: {len(unique_keys)}")
print(f"Total unique values: {len(unique_values)}")


Total unique keys: 5897
Total unique values: 71532


# Assigns a unique integer ID to each author in the filtered multi-hop influence graph.

**Purpose:**
- Converts author names (keys) into unique integer IDs starting from 1.

**Inputs:**
- `filtered_author_hop_k(10)(.1).pkl`: A pickle file containing a dictionary of author-to-author influence data.

**Output:**
- `author_keys_to_id_mapping.pkl`: A dictionary mapping author names to unique integer IDs.

**Behavior:**
- Loads the filtered author influence map.
- Iterates over each author and assigns a unique ID.
- Saves the mapping as a `author_keys_to_id_mapping.pkl` file.



In [4]:
import pickle

# Path to single .pkl file
pkl_file = "filtered_author_hop_k(10)(.1).pkl"

author_to_id = {}
current_id = 1

# Load the single pickle file
with open(pkl_file, "rb") as f:
    data = pickle.load(f)

# Assign unique IDs to each author (key)
for key in data.keys():
    if key not in author_to_id:
        author_to_id[key] = current_id
        current_id += 1

print(f"Total unique author keys: {len(author_to_id)}")

# Save to file
with open("author_keys_to_id_mapping.pkl", "wb") as f:
    pickle.dump(author_to_id, f)

# Print first 10 entries
print("\n First 10 author-key-to-ID mappings:")
for i, (author, idx) in enumerate(author_to_id.items()):
    # print(f"{author}: {idx}")
    if idx==5921:
        print(author)
        break


Total unique author keys: 5897

 First 10 author-key-to-ID mappings:


In [5]:
for i, (author, idx) in enumerate(author_to_id.items()):
    # print(f"{author}: {idx}")
    if idx==5921:
        print(author)
        break

# Assigns a unique integer ID to each author appearing in the values of a filtered multi-hop influence map.

**Purpose:**
- Converts all co-cited or cited author names (i.e., values in the dictionary) into unique integer IDs starting from 1.

**Inputs:**
- `filtered_author_hop_k(10)(.1).pkl`: A pickle file containing a dictionary where:
  - Keys = author names (sources).
  - Values = lists of reachable/cited author names (targets).

**Output:**
- `author_values_to_id_mapping.pkl`: A dictionary mapping each unique cited author (from the values) to a unique ID.

**Behavior:**
- Iterates over all lists of cited authors.
- Assigns a unique ID to each cited author (ignores keys).
- Saves the result as a `author_values_to_id_mapping.pkl` file.
- Prints total number of unique authors.
- Prints the first 10 author-to-ID mappings for verification.



In [5]:
import pickle

# Path to your single .pkl file
pkl_file = "filtered_author_hop_k(10)(.1).pkl"

author_value_to_id = {}
current_id = 1

# Load the pickle file
with open(pkl_file, "rb") as f:
    data = pickle.load(f)

# Map each author in the values to a unique ID
for values in data.values():
    for author in values:
        if author not in author_value_to_id:
            author_value_to_id[author] = current_id
            current_id += 1

print(f"Total unique authors in values: {len(author_value_to_id)}")

# Save the mapping
with open("author_values_to_id_mapping.pkl", "wb") as f:
    pickle.dump(author_value_to_id, f)

# Print first 10 entries
print("\nFirst 10 author-value-to-ID mappings:")
for i, (author, idx) in enumerate(author_value_to_id.items()):
    print(f"{author}: {idx}")
    if i >= 9:
        break


Total unique authors in values: 71532

First 10 author-value-to-ID mappings:
O. J. Dahl: 1
Stephen P. Morse: 2
Victor H. Yngve: 3
Charles M. Eastman: 4
Gregor V. Bochmann: 5
James Blinn: 6
John Adams: 7
B. W. Arden: 8
Edward H. Friend: 9
L. C. Caruthers: 10


# Converts a filtered author-level multi-hop citation map from author names to unique integer IDs.

**Purpose:**
- Transforms the original mapping from author names to cited author names into a numeric ID-based dictionary.
- Maps each author key to its unique ID, and each cited author in the values to their unique IDs.
- Filters out self-citations (where key ID equals value ID).

**Inputs:**
- `filtered_author_hop_k(10)(.1).pkl`: Dictionary with author keys and lists of cited author names.
- `author_keys_to_id_mapping.pkl`: Dictionary mapping author keys (names) to unique IDs.
- `author_values_to_id_mapping.pkl`: Dictionary mapping cited author names (values) to unique IDs.

**Output:**
- `id_to_id_author_mapping.pkl`: Dictionary mapping author key IDs to lists of cited author IDs.

**Behavior:**
- For each author key, fetch its ID.
- For each cited author in the values, fetch their ID if different from the key’s ID.
- Store key ID mapped to the list of value IDs.
- Prints the first 10 mappings as a sample for verification.



In [6]:
import pickle

# Load the filtered author hop file
with open("filtered_author_hop_k(10)(.1).pkl", "rb") as f:
    filtered_data = pickle.load(f)

# Load the author key to ID mapping
with open("author_keys_to_id_mapping.pkl", "rb") as f:
    key_mapping = pickle.load(f)

# Load the author value to ID mapping
with open("author_values_to_id_mapping.pkl", "rb") as f:
    value_mapping = pickle.load(f)

# Final dictionary to store: key_id -> list of value_ids
id_to_id_dict = {}

for author_key, author_values in filtered_data.items():
    key_id = key_mapping.get(author_key)
    
    # Skip if the key author is not found in the mapping
    if key_id is None:
        continue

    value_ids = []
    for author in author_values:
        # Skip if author is same as key author
        if author == author_key:
            continue
        
        value_id = value_mapping.get(author)
        if value_id is not None and value_id != key_id:
            value_ids.append(value_id)

    id_to_id_dict[key_id] = value_ids

# Print a sample
# print("\n🔹 First 10 key-ID to value-IDs mappings:")
# for i, (k, v) in enumerate(id_to_id_dict.items()):
#     print(f"{k}: {v}")
#     if i >= 9:
#         break

# Save the final dictionary
with open("id_to_id_author_mapping.pkl", "wb") as f:
    pickle.dump(id_to_id_dict, f)




# Creates a mapping from author IDs to their fairness scores.

**Purpose:**
- Converts a mapping of author names to IDs into a mapping of author IDs to fairness scores.
- Facilitates analyses where fairness metrics are needed by author ID rather than by name.

**Inputs:**
- `fairness_values.pkl`: Dictionary mapping author names to their fairness scores.
- `author_keys_to_id_mapping.pkl`: Dictionary mapping author names to unique integer IDs.

**Output:**
- `id_fairness_mapping.pkl`: Dictionary mapping author IDs to their fairness scores.

**How it works:**
- Loads fairness values keyed by author names.
- Loads author-to-ID mapping.
- Creates a new dictionary mapping each author ID to the corresponding fairness score.



In [7]:
import pickle

# Load the fairness values
with open("fairness_values.pkl", "rb") as f:
    fairness_value = pickle.load(f)

# Load the author to ID mapping
with open("author_keys_to_id_mapping.pkl", "rb") as f:
    author_to_id_mapping = pickle.load(f)

# Create the ID to fairness mapping
id_fairness_mapping = {}

for author, id in author_to_id_mapping.items():
    # if author in fairness_value:
    id_fairness_mapping[id] = fairness_value[author]

# Save the new mapping to a file
with open("id_fairness_mapping.pkl", "wb") as f:
    pickle.dump(id_fairness_mapping, f)


# Generates combined vector representations and fairness scores for papers and their authors.

**Purpose:**
- Creates a one-hot vector representing each paper ID.
- Creates a multi-hot vector representing authors linked to each paper.
- Associates each paper with its fairness score.

**Inputs:**
- `id_fairness_mapping.pkl`: Mapping from author IDs to fairness scores.
- `id_to_id_author_mapping.pkl`: Mapping from paper IDs to lists of author IDs.
- `author_keys_to_id_mapping.pkl`: Mapping of paper authors (keys) to unique IDs (for one-hot vector size).
- `author_values_to_id_mapping.pkl`: Mapping of cited authors (values) to unique IDs (for multi-hot vector size).

**Outputs:**
- `paper_author_vectors.pkl`: Dictionary keyed by paper ID, each value containing:
  - `one_hot`: One-hot vector of the paper ID.
  - `multi_hot`: Multi-hot vector representing all authors connected to that paper.
  - `fairness`: Fairness score associated with the paper.

**How it works:**
- Loads all necessary mappings and fairness data.
- For each paper:
  - Creates a one-hot vector where only the paper ID index is set.
  - Creates a multi-hot vector indicating all associated authors.
  - Retrieves the fairness score for the paper.
- Saves the combined data to a single pickle file for downstream use.


In [None]:
import pickle
import numpy as np

# Load fairness values
with open("id_fairness_mapping.pkl", "rb") as f:
    id_fairness_mapping = pickle.load(f)

# Load mappings
with open("id_to_id_author_mapping.pkl", "rb") as f:
    paper_to_authors = pickle.load(f)

with open("author_keys_to_id_mapping.pkl", "rb") as f:
    onehot_mapping = pickle.load(f)

with open("author_values_to_id_mapping.pkl", "rb") as f:
    multihot_mapping = pickle.load(f)


# Vector sizes
onehot_size = len(onehot_mapping)
multihot_size = len(multihot_mapping)

# Final dictionary to store both vectors and fairness
paper_vectors = {}

for paper_id, authors in paper_to_authors.items():
    onehot_vec = np.zeros(onehot_size, dtype=np.uint8)
    multihot_vec = np.zeros(multihot_size, dtype=np.uint8)

    onehot_vec[paper_id - 1] = 1

    author_fairness_values = []
    for author_id in authors:
        multihot_index = author_id - 1
        multihot_vec[multihot_index] = 1

        # Get fairness score if available
        # fairness = id_fairness_mapping.get(author_id)
        # if fairness is not None:
        #     author_fairness_values.append(fairness)
    fairness=id_fairness_mapping[paper_id]
    # # Compute average fairness if any values are present
    # if author_fairness_values:
    #     avg_fairness = float(np.mean(author_fairness_values))
    # else:
    #     avg_fairness = 0.0  # Default if no known fairness scores

    paper_vectors[paper_id] = {
        "one_hot": onehot_vec,
        "multi_hot": multihot_vec,
        "fairness": fairness
    }

# Save to a single pickle file
with open("paper_author_vectors.pkl", "wb") as f:
    pickle.dump(paper_vectors, f)

print("Combined one-hot, multi-hot vectors and fairness values saved to paper_author_vectors.pkl")


# Influence Count Extraction Script

**Purpose:**
This script calculates the number of influenced authors for each paper ID from the `id_to_id_author_mapping.pkl` file and stores the result in a pickle file named `influence_counts_before_after.pkl`.



In [1]:
import pickle

# Load the mapping from 'id_to_id_author_mapping.pkl'
with open('id_to_id_author_mapping.pkl', 'rb') as f:
    id_to_author_mapping = pickle.load(f)

# Create the count dictionary: key → length of list of values
influence_counts = {k: len(v) for k, v in id_to_author_mapping.items()}

# Save the count dictionary to 'influence_counts_before_after.pkl'
with open('influence_counts_before_after.pkl', 'wb') as f:
    pickle.dump(influence_counts, f)

print("influence_counts_before_after.pkl created successfully.")


influence_counts_before_after.pkl created successfully.
