Access the dataset and fetch repository descriptions

In [4]:
import os
import re

import pandas as pd
import requests

# === CONFIG ===
# SECURITY: a GitHub personal access token used to be hardcoded on the line
# below. Treat that token as compromised and revoke it on GitHub. The token is
# now read from the GITHUB_TOKEN environment variable so it never lives in the
# notebook or its version history.
GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN", "")
if not GITHUB_TOKEN:
    print("GITHUB_TOKEN environment variable is not set; "
          "GitHub API requests will not be authenticated.")

# Input CSVs keyed by the label stored in the 'source' column of the output.
INPUT_FILES = {
    "top": "dataset/top_lib.csv",
    "middle": "dataset/middle_lib.csv",
    "bottom": "dataset/bottom_lib.csv"
}
# Destination of the combined library/source/description table.
OUTPUT_CSV = "descriptions_output.csv"

# === FUNCTIONS ===

def extract_owner_repo(url):
    """Extract 'owner/repo' from a GitHub URL.

    Args:
        url: A GitHub repository URL. Non-string values (for example the NaN
            that pandas produces for an empty CSV cell) are accepted and
            treated as "no repository".

    Returns:
        The 'owner/repo' string, or None when `url` is not a string or does
        not contain a github.com owner/repo path. Deeper path segments, a
        query string, a fragment and a trailing '.git' are not part of the
        result.
    """
    # read_csv yields float NaN for empty cells; re.search would raise on it.
    if not isinstance(url, str):
        return None
    # Stop each segment at '/', whitespace, '?' or '#' so that URL decorations
    # do not leak into the repository name used for the API call.
    match = re.search(r"github\.com/([^/\s?#]+/[^/\s?#]+)", url)
    if not match:
        return None
    owner_repo = match.group(1)
    # Clone URLs end in '.git', which is not part of the repository name.
    if owner_repo.endswith(".git"):
        owner_repo = owner_repo[:-len(".git")]
    return owner_repo

def fetch_description(owner_repo, timeout=10):
    """Fetch a repository's description from the GitHub REST API.

    Args:
        owner_repo: Repository identifier in 'owner/repo' form.
        timeout: Seconds to wait for the API before giving up. Without a
            timeout a stalled connection would block the notebook forever.

    Returns:
        On HTTP 200, the value of the 'description' field of the JSON
        response ('' if the field is absent; the value itself may be None).
        On any failure, a string starting with 'Error: ' followed by the HTTP
        status code or the name of the requests exception. Callers that want
        only real descriptions must filter these 'Error: ' strings out.
    """
    url = f"https://api.github.com/repos/{owner_repo}"
    headers = {"Accept": "application/vnd.github.v3+json"}
    # Only send credentials when a token is configured; an empty token would
    # produce a malformed Authorization header.
    if GITHUB_TOKEN:
        headers["Authorization"] = f"token {GITHUB_TOKEN}"
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        # Keep the existing "Error: ..." convention so that one unreachable
        # repository does not abort the whole batch of lookups.
        return f"Error: {type(exc).__name__}"
    if response.status_code == 200:
        return response.json().get("description", "")
    return f"Error: {response.status_code}"

# === LOAD AND COMBINE DATA ===

# One frame per input file, each tagged with the repository parsed from its
# 'Repository_URL' column and with the INPUT_FILES key it was loaded under.
all_dfs = [
    pd.read_csv(file_path).assign(
        owner_repo=lambda frame: frame['Repository_URL'].apply(extract_owner_repo),
        source=source,
    )
    for source, file_path in INPUT_FILES.items()
]

# Stack the frames and discard rows whose URL did not yield an owner/repo.
combined_df = (
    pd.concat(all_dfs, ignore_index=True)
    .dropna(subset=["owner_repo"])
)

# === FETCH DESCRIPTIONS ===

# Query the API once per distinct repository.
unique_repos = combined_df['owner_repo'].unique()
descriptions = {}
for repo in unique_repos:
    descriptions[repo] = fetch_description(repo)

# === PREPARE OUTPUT ===

# One row per (repository, source) pair, with the fetched description attached
# and the identifier column renamed to 'library'.
output_df = (
    combined_df[['owner_repo', 'source']]
    .drop_duplicates()
    .assign(description=lambda frame: frame['owner_repo'].map(descriptions))
    .rename(columns={"owner_repo": "library"})
)

# Clean library name (keep only the part after the last '/')
output_df['library'] = output_df['library'].astype(str).str.split('/').str[-1]

# === SAVE OUTPUT ===

output_df.to_csv(OUTPUT_CSV, index=False)
print(f"✅ Saved descriptions with source to {OUTPUT_CSV}")


Saved descriptions with source to descriptions_output.csv


Create a list containing only the descriptions

In [10]:
import pandas as pd

# Read back the combined table written to descriptions_output.csv
df = pd.read_csv("descriptions_output.csv")

# Project it down to the two columns the task list uses
task_df = df.loc[:, ["description", "source"]]

# Write the projection without the index column
task_df.to_csv("task_list.csv", index=False)

print("task_list.csv saved with only description and source.")


task_list.csv saved with only description and source.


Check the number of null descriptions

In [7]:
import pandas as pd

# Load the combined table (library, source, description)
df = pd.read_csv("descriptions_output.csv")

# Group by source, then count NaNs in 'description' column
nan_counts_by_source = df.groupby('source')['description'].apply(lambda x: x.isna().sum())

print("Number of NaN descriptions by source:")
print(nan_counts_by_source)

# Rows without a description carry no task text, so drop them
df_cleaned = df.dropna(subset=['description'])

# Save cleaned data. task_list.csv is written elsewhere in this notebook with
# only the 'description' and 'source' columns; keep the same two columns here
# so the file's schema does not depend on which cell happened to run last.
df_cleaned[["description", "source"]].to_csv("task_list.csv", index=False)

print(f"Removed {len(df) - len(df_cleaned)} rows with NaN description.")


Number of NaN descriptions by source:
source
bottom    20
middle     9
top       12
Name: description, dtype: int64
Removed 41 rows with NaN description.


Extract the library names from the dataset

In [9]:
import pandas as pd

# Read the combined table produced earlier in the notebook
df = pd.read_csv("descriptions_output.csv")

# Duplicate the 'library' column under the header 'lib_name'
df = df.assign(lib_name=df['library'])

# Export just the library name alongside its source label
output_df = df.loc[:, ['lib_name', 'source']]
output_df.to_csv("lib_name.csv", index=False)


In [None]:
import pandas as pd

# Read the combined table that this cell rewrites
df = pd.read_csv('descriptions_output.csv')

# Reduce each 'library' value to the text after its last '/'.
# Values are cast to str first, so a missing value becomes the text 'nan'.
library_as_text = df['library'].astype(str)
df['library'] = library_as_text.str.split('/').str[-1]

# Overwrite the file the data was loaded from
df.to_csv('descriptions_output.csv', index=False)
