In [4]:
import json
import pandas as pd

path = '/home/scott/projects/dfr_browser2/dist/data/wos_core_article_titles.xlsx'
df = pd.read_excel(path)
df.head()

Unnamed: 0,Authors,Article Title,Journal Title,Date,Year,Vol,Issue,Start Pg,End Pg,DOI Link,UT (Unique WOS ID),Web of Science Index
0,"Hermann, Erik; Puntoni, Stefano",Artificial intelligence and consumer behavior:...,JOURNAL OF BUSINESS RESEARCH,JUL,2024,180.0,,,,http://dx.doi.org/10.1016/j.jbusres.2024.114720,WOS:001244776600001,Social Science Citation Index (SSCI)
1,"Kemp, Ayenda",Competitive Advantage Through Artificial Intel...,ACADEMY OF MANAGEMENT REVIEW,JUL,2024,49.0,3.0,618.0,635.0,http://dx.doi.org/10.5465/amr.2020.0205,WOS:001343256800008,Social Science Citation Index (SSCI)
2,"van der Vlist, Fernando; Helmond, Anne; Ferrar...",Big AI: Cloud infrastructure dependence and th...,BIG DATA & SOCIETY,MAR,2024,11.0,1.0,,,http://dx.doi.org/10.1177/20539517241232630,WOS:001183730300001,Social Science Citation Index (SSCI)
3,"Wang, Qiang; Li, Yuanfan; Li, Rongrong","Ecological footprints, carbon emissions, and e...",HUMANITIES & SOCIAL SCIENCES COMMUNICATIONS,AUG 14,2024,11.0,1.0,,,http://dx.doi.org/10.1057/s41599-024-03520-5,WOS:001290718800005,Social Science Citation Index (SSCI); Arts & H...
4,"Li, Wanlu; Qin, Xin; Yam, Kai Chi; Deng, Huiru...",Embracing artificial intelligence (AI) with jo...,TOURISM MANAGEMENT,OCT,2024,104.0,,,,http://dx.doi.org/10.1016/j.tourman.2024.104935,WOS:001224036500001,Social Science Citation Index (SSCI)


In [17]:
def parse_authors(authors: str) -> str:
    """Parse the authors string into a list of authors."""
    author_list = [author.strip() for author in authors.split(';')]
    authors = []
    for author in author_list:
            try:
                family, given = author.split(',')
                author_dict = {
                    "family": family.strip(),
                    "given": given.strip()
                }
            except ValueError:
                author_dict = {
                    "literal": author.strip()
                }
            authors.append(author_dict)
    return authors

def parse_doi(doi: str) -> str:
    """Parse the DOI string into a URL."""
    doi_list = [doi.replace('http://dx.doi.org/', '') for doi in doi.split(',')]
    return [doi.strip() for doi in doi_list]

def parse_id(id: str) -> str:
    """Parse the ID string into a list of IDs."""
    return id.replace('WOS:', '')

def parse_date(date: str = None, year: str = None) -> str:
    """Parse the date string into a standard format."""
    month_map = {
        "JAN": 1,
        "FEB": 2,
        "MAR": 3,
        "APR": 4,
        "MAY": 5,
        "JUN": 6,
        "JUL": 7,
        "AUG": 8,
        "SEP": 9,
        "OCT": 10,
        "NOV": 11,
        "DEC": 12
    }
    if not year or year == "":
        year = "Unknown"
    if not date or date == "":
        return [year]
    else:
        # Remove the year
        date = str(date)
        if date.startswith(f"{year} "):
            date = date.replace(f"{year} ", "")
        # Format is "Month Day"
        if " " in date:
            month, day = date.split(" ")
            month = month_map[month]
            return [f"{year}-{month}-{day}"]
        # Format is "Month-Month"
        elif "-" in date:
            return [f"{year}-{month_map[x]}" for x in date.split("-")]
        else:
            return [year]

def parse_page(start: str, end: str = None) -> str:
    """Parse the start and end page strings into a single page range."""
    if end or end != "":
        return f"{start}-{end}"
    else:
        return start

In [19]:
data = df.to_dict(orient='records')
bibliography = []
for item in data:
    item_dict = {
        "id": parse_id(item['UT (Unique WOS ID)']),
        "type": "article-journal",
        "author": parse_authors(item['Authors']) if not pd.isna(item['Authors']) else "Unknown",
        "issued": {
            "date-parts": [parse_date(item['Date'], item['Year'])]
        },
        "title": item['Article Title'] if not pd.isna(item['Article Title']) else "Unknown",
        "container-title": item['Journal Title'].title() if not pd.isna(item['Journal Title']) else "Unknown",
        "volume": str(item['Vol']) if not pd.isna(item['Vol']) else None,
        "issue": str(item['Issue']) if not pd.isna(item['Issue']) else None,
        "DOI": parse_doi(item['DOI Link']),
    }
    if not pd.isna(item['Start Pg']):
        item_dict['page'] = parse_page(item['Start Pg'], item['End Pg'])
    bibliography.append(item_dict)

bibliography[0:2]
# with open('bibliography.json', 'w') as f:
#     json.dump(bibliography, f, indent=2)

[{'id': '001244776600001',
  'type': 'article-journal',
  'author': [{'family': 'Hermann', 'given': 'Erik'},
   {'family': 'Puntoni', 'given': 'Stefano'}],
  'issued': {'date-parts': [[2024]]},
  'title': 'Artificial intelligence and consumer behavior: From predictive to generative AI',
  'container-title': 'Journal Of Business Research',
  'volume': '180.0',
  'issue': None,
  'DOI': ['10.1016/j.jbusres.2024.114720']},
 {'id': '001343256800008',
  'type': 'article-journal',
  'author': [{'family': 'Kemp', 'given': 'Ayenda'}],
  'issued': {'date-parts': [[2024]]},
  'title': 'Competitive Advantage Through Artificial Intelligence: Toward a Theory of Situated AI',
  'container-title': 'Academy Of Management Review',
  'volume': '49.0',
  'issue': '3',
  'DOI': ['10.5465/amr.2020.0205'],
  'page': '618-635'}]

In [42]:
import re
# Import citeproc-py components (optional dependency)
try:
    from citeproc import (
        Citation,
        CitationItem,
        CitationStylesBibliography,
        CitationStylesStyle,
    )
    from citeproc.source.json import CiteProcJSON

    CITEPROC_AVAILABLE = True
except ImportError:
    CITEPROC_AVAILABLE = False


def format_citation_with_citeproc(csl_entry, style="chicago-author-date", debug=False):
    """
    Format a single CSL entry as a citation using citeproc-py.

    Args:
        csl_entry (dict): CSL entry to format
        style (str): Citation style to use
        debug (bool): Enable debug output

    Returns:
        str: Formatted citation string or None if formatting fails
    """
    if not CITEPROC_AVAILABLE:
        if debug:
            print(
                f"Warning: citeproc-py not available, skipping citation formatting for {csl_entry.get('id', 'unknown')}"
            )
        return None

    # Try to use actual citeproc-py formatting first
    try:
        if debug:
            print(
                f"Attempting citeproc-py formatting for {csl_entry.get('id', 'unknown')}"
            )

        # Create a bibliography source with just this entry
        bib_source = CiteProcJSON([csl_entry])

        # Load citation style
        try:
            bib_style = CitationStylesStyle(style, validate=False)
            if debug:
                print(f"  ✓ Loaded style: {style}")
        except Exception as e:
            if debug:
                print(f"  ❌ Could not load style '{style}': {e}")
                print("  ↳ Trying chicago-author-date as fallback")
            bib_style = CitationStylesStyle("chicago-author-date", validate=False)

        # Create bibliography
        bibliography = CitationStylesBibliography(bib_style, bib_source)

        # Correct method: Create Citation object with CitationItem
        citation = Citation([CitationItem(csl_entry["id"])])

        # Register the citation
        bibliography.register(citation)

        # Generate bibliography
        bibliography_items = bibliography.bibliography()

        # Generate the formatted citation
        if bibliography_items and len(bibliography_items) > 0:
            citation_str = str(bibliography_items[0]).strip()
            # Remove any HTML tags and decode HTML entities
            citation_str = re.sub(r"<[^>]+>", "", citation_str)
            citation_str = (
                citation_str.replace("&amp;", "&")
                .replace("&lt;", "<")
                .replace("&gt;", ">")
                .replace("&quot;", '"')
            )
            # Fix double periods (common in APA style after initials)
            citation_str = re.sub(r"\.\.+", ".", citation_str)
            if debug:
                print(f"  ✓ Generated citation: {citation_str}")
            return citation_str
        else:
            if debug:
                print("  ❌ No bibliography items generated")
            raise Exception("No bibliography items generated")

    except Exception as citeproc_error:
        if debug:
            print(f"  ❌ citeproc-py formatting failed: {citeproc_error}")
            print("  ↳ Using fallback citation format")

    # Fallback to simple citation format
    if debug:
        print(f"Creating fallback citation for {csl_entry.get('id', 'unknown')}")

    try:
        # Create a simple citation format as fallback
        citation_parts = []

        # Add authors
        if "author" in csl_entry and csl_entry["author"]:
            authors = csl_entry["author"]
            if isinstance(authors, list) and len(authors) > 0:
                if len(authors) == 1:
                    author = authors[0]
                    if "family" in author:
                        # Format as "Family, Given" for single author
                        author_name = author["family"]
                        if "given" in author:
                            author_name = f"{author['family']}, {author['given']}"
                        citation_parts.append(author_name)
                elif len(authors) <= 3:
                    author_names = []
                    for author in authors:
                        if "family" in author:
                            # Format as "Family, Given" for multiple authors
                            author_name = author["family"]
                            if "given" in author:
                                author_name = f"{author['family']}, {author['given']}"
                            author_names.append(author_name)
                    citation_parts.append("; ".join(author_names))
                else:
                    if "family" in authors[0]:
                        # Format first author with given name, then "et al."
                        first_author = authors[0]["family"]
                        if "given" in authors[0]:
                            first_author = (
                                f"{authors[0]['family']}, {authors[0]['given']}"
                            )
                        citation_parts.append(f"{first_author} et al.")

        # Add year
        if "issued" in csl_entry and csl_entry["issued"]:
            date_parts = csl_entry["issued"].get("date-parts", [])
            if date_parts and len(date_parts[0]) > 0:
                citation_parts.append(f"({date_parts[0][0]})")

        # Add title
        if "title" in csl_entry:
            citation_parts.append(f'"{csl_entry["title"]}"')

        # Add container
        if "container-title" in csl_entry:
            citation_parts.append(f"<em>{csl_entry['container-title']}</em>")

        fallback_citation = ". ".join(citation_parts) + "."

        if debug:
            print(f"Created fallback citation: {fallback_citation}")

        return fallback_citation

    except Exception as e:
        if debug:
            print(
                f"Error creating fallback citation for {csl_entry.get('id', 'unknown')}: {e}"
            )
        return None

In [43]:
formatted_bibliography = []
for entry in bibliography:
    formatted_citation = format_citation_with_citeproc(entry, style="chicago-author-date", debug=False)
    if formatted_citation:
        formatted_citation = formatted_citation.replace("(None). http", "http")
        entry['formatted-citation'] = formatted_citation
        entry['container-title'] = f"<em>{entry['container-title']}</em>"
        formatted_bibliography.append(entry)
formatted_bibliography[0:5]


  warn('The following arguments for {} are '.format(cls_name) +


[{'id': '001244776600001',
  'type': 'article-journal',
  'author': [{'family': 'Hermann', 'given': 'Erik'},
   {'family': 'Puntoni', 'given': 'Stefano'}],
  'issued': {'date-parts': [[2024]]},
  'title': 'Artificial intelligence and consumer behavior: From predictive to generative AI',
  'container-title': '<em>Journal Of Business Research</em>',
  'volume': '180.0',
  'issue': None,
  'DOI': ['10.1016/j.jbusres.2024.114720'],
  'formatted-citation': "Hermann, Erik, and Stefano Puntoni. 2024. “Artificial Intelligence and Consumer Behavior: From Predictive to Generative AI”. Journal Of Business Research 180.0 https://doi.org/['10.1016/j.jbusres.2024.114720']."},
 {'id': '001343256800008',
  'type': 'article-journal',
  'author': [{'family': 'Kemp', 'given': 'Ayenda'}],
  'issued': {'date-parts': [[2024]]},
  'title': 'Competitive Advantage Through Artificial Intelligence: Toward a Theory of Situated AI',
  'container-title': '<em>Academy Of Management Review</em>',
  'volume': '49.0',


In [44]:
with open('/home/scott/projects/dfr_browser2/dist/data/liu/bibliography.json', 'w') as f:
    json.dump(formatted_bibliography, f, indent=2)

In [80]:
path = '/home/scott/projects/dfr_browser2/dist/data/liu/wos_core_article_titles.csv'
df = pd.read_csv(path)
df.head()

Unnamed: 0,docNum,docName,title,author,year,journal,date,volume,issue,start_page,end_page
0,0,,Artificial intelligence and consumer behavior:...,"Hermann, Erik; Puntoni, Stefano",2024,JOURNAL OF BUSINESS RESEARCH,JUL,180.0,,,
1,0,,Competitive Advantage Through Artificial Intel...,"Kemp, Ayenda",2024,ACADEMY OF MANAGEMENT REVIEW,JUL,49.0,3.0,618.0,635.0
2,0,,Big AI: Cloud infrastructure dependence and th...,"van der Vlist, Fernando; Helmond, Anne; Ferrar...",2024,BIG DATA & SOCIETY,MAR,11.0,1.0,,
3,0,,"Ecological footprints, carbon emissions, and e...","Wang, Qiang; Li, Yuanfan; Li, Rongrong",2024,HUMANITIES & SOCIAL SCIENCES COMMUNICATIONS,AUG 14,11.0,1.0,,
4,0,,Embracing artificial intelligence (AI) with jo...,"Li, Wanlu; Qin, Xin; Yam, Kai Chi; Deng, Huiru...",2024,TOURISM MANAGEMENT,OCT,104.0,,,


In [82]:
df.docNum = [f"{x}" for x in range(0, len(df))]
df.docName = [f"Doc{x}" for x in range(1, len(df)+1)]
df.head()

Unnamed: 0,docNum,docName,title,author,year,journal,date,volume,issue,start_page,end_page
0,0,Doc1,Artificial intelligence and consumer behavior:...,"Hermann, Erik; Puntoni, Stefano",2024,JOURNAL OF BUSINESS RESEARCH,JUL,180.0,,,
1,1,Doc2,Competitive Advantage Through Artificial Intel...,"Kemp, Ayenda",2024,ACADEMY OF MANAGEMENT REVIEW,JUL,49.0,3.0,618.0,635.0
2,2,Doc3,Big AI: Cloud infrastructure dependence and th...,"van der Vlist, Fernando; Helmond, Anne; Ferrar...",2024,BIG DATA & SOCIETY,MAR,11.0,1.0,,
3,3,Doc4,"Ecological footprints, carbon emissions, and e...","Wang, Qiang; Li, Yuanfan; Li, Rongrong",2024,HUMANITIES & SOCIAL SCIENCES COMMUNICATIONS,AUG 14,11.0,1.0,,
4,4,Doc5,Embracing artificial intelligence (AI) with jo...,"Li, Wanlu; Qin, Xin; Yam, Kai Chi; Deng, Huiru...",2024,TOURISM MANAGEMENT,OCT,104.0,,,


In [83]:
journal = [x.title() for x in df.journal.values.tolist()]
df.journal = journal
df.head()

Unnamed: 0,docNum,docName,title,author,year,journal,date,volume,issue,start_page,end_page
0,0,Doc1,Artificial intelligence and consumer behavior:...,"Hermann, Erik; Puntoni, Stefano",2024,Journal Of Business Research,JUL,180.0,,,
1,1,Doc2,Competitive Advantage Through Artificial Intel...,"Kemp, Ayenda",2024,Academy Of Management Review,JUL,49.0,3.0,618.0,635.0
2,2,Doc3,Big AI: Cloud infrastructure dependence and th...,"van der Vlist, Fernando; Helmond, Anne; Ferrar...",2024,Big Data & Society,MAR,11.0,1.0,,
3,3,Doc4,"Ecological footprints, carbon emissions, and e...","Wang, Qiang; Li, Yuanfan; Li, Rongrong",2024,Humanities & Social Sciences Communications,AUG 14,11.0,1.0,,
4,4,Doc5,Embracing artificial intelligence (AI) with jo...,"Li, Wanlu; Qin, Xin; Yam, Kai Chi; Deng, Huiru...",2024,Tourism Management,OCT,104.0,,,


In [33]:
s = df.start_page.values.tolist()
e = df.end_page.values.tolist()
r = [f"{s[i]}-{e[i]}" if not pd.isna(e[i]) else f"{s[i]}" for i in range(len(s))]
r = [x.replace('nan', '') for x in r]
r[0:20]
df['page_range'] = r
df.head()

Unnamed: 0,docNum,docName,title,author,year,journal,date,volume,issue,start_page,end_page,page_range
0,0,Doc1,Artificial intelligence and consumer behavior:...,"Hermann, Erik; Puntoni, Stefano",2024,Journal Of Business Research,JUL,180.0,,,,
1,0,Doc2,Competitive Advantage Through Artificial Intel...,"Kemp, Ayenda",2024,Academy Of Management Review,JUL,49.0,3.0,618.0,635.0,618-635
2,0,Doc3,Big AI: Cloud infrastructure dependence and th...,"van der Vlist, Fernando; Helmond, Anne; Ferrar...",2024,Big Data & Society,MAR,11.0,1.0,,,
3,0,Doc4,"Ecological footprints, carbon emissions, and e...","Wang, Qiang; Li, Yuanfan; Li, Rongrong",2024,Humanities & Social Sciences Communications,AUG 14,11.0,1.0,,,
4,0,Doc5,Embracing artificial intelligence (AI) with jo...,"Li, Wanlu; Qin, Xin; Yam, Kai Chi; Deng, Huiru...",2024,Tourism Management,OCT,104.0,,,,


In [84]:
final_df = df.drop(columns=['start_page', 'end_page'])
final_df.head()

Unnamed: 0,docNum,docName,title,author,year,journal,date,volume,issue
0,0,Doc1,Artificial intelligence and consumer behavior:...,"Hermann, Erik; Puntoni, Stefano",2024,Journal Of Business Research,JUL,180.0,
1,1,Doc2,Competitive Advantage Through Artificial Intel...,"Kemp, Ayenda",2024,Academy Of Management Review,JUL,49.0,3.0
2,2,Doc3,Big AI: Cloud infrastructure dependence and th...,"van der Vlist, Fernando; Helmond, Anne; Ferrar...",2024,Big Data & Society,MAR,11.0,1.0
3,3,Doc4,"Ecological footprints, carbon emissions, and e...","Wang, Qiang; Li, Yuanfan; Li, Rongrong",2024,Humanities & Social Sciences Communications,AUG 14,11.0,1.0
4,4,Doc5,Embracing artificial intelligence (AI) with jo...,"Li, Wanlu; Qin, Xin; Yam, Kai Chi; Deng, Huiru...",2024,Tourism Management,OCT,104.0,


In [85]:
path = '/home/scott/projects/dfr_browser2/dist/data/liu/metadata.csv'
final_df.to_csv(path, index=False)


In [None]:
import citeproc as c
import citeproc_styles as cs
s = c.CitationStylesStyle(cs.get_style_filepath("chicago-author-date"))
r =   {
    "id": "001244776600001",
    "type": "article-journal",
    "author": [
      {
        "family": "Hermann",
        "given": "Erik"
      },
      {
        "family": "Puntoni",
        "given": "Stefano"
      }
    ],
    "issued": {
      "date-parts": [
        [
          2024
        ]
      ]
    },
    "title": "Artificial intelligence and consumer behavior: From predictive to generative AI",
    "container-title": "Journal Of Business Research",
    "volume": "180.0",
    "issue": None,
    "DOI": [
      "10.1016/j.jbusres.2024.114720"
    ]
  }


[]

In [None]:
from citeproc import CitationStylesStyle, CitationStylesBibliography, Citation, CitationItem
from citeproc.source.json import CiteProcJSON
from citeproc_styles import get_style_filepath
import re

def format_csl_to_html(csl_entry, style="chicago-author-date"):
    """
    Format a CSL entry as HTML with proper styling.

    Args:
        csl_entry: Dictionary with CSL JSON data
        style: Citation style name

    Returns:
        str: HTML-formatted citation with italicized journal title
    """
    # Create bibliography source
    bib_source = CiteProcJSON([csl_entry])

    # Load citation style
    bib_style = CitationStylesStyle(get_style_filepath(style), validate=False)

    # Create bibliography
    bibliography = CitationStylesBibliography(bib_style, bib_source)

    # Create and register citation
    citation = Citation([CitationItem(csl_entry["id"])])
    bibliography.register(citation)

    # Generate bibliography
    bib_items = bibliography.bibliography()

    if bib_items and len(bib_items) > 0:
        # Get the formatted citation as HTML string
        html_citation = str(bib_items[0]).strip()

        # The container-title should already be in italics from the style
        # But if you want to ensure it, you can wrap it:
        container = csl_entry.get("container-title", "")
        if container and f"<i>{container}</i>" not in html_citation:
            # Replace plain container title with italicized version
            html_citation = html_citation.replace(container, f"<em>{container}</em>")

        return html_citation.replace("<i>", "").replace("</i>", "")

    return None

# Use it with your r object:
formatted_html = format_csl_to_html(r)
print(formatted_html)

Hermann, Erik, and Stefano Puntoni. 2024. “Artificial Intelligence and Consumer Behavior: From Predictive to Generative AI”. <em>Journal Of Business Research</em> 180.0 (None). https://doi.org/['10.1016/j.jbusres.2024.114720'].


In [77]:
with open('/home/scott/projects/dfr_browser2/dist/data/liu/bibliography.json', 'r') as f:
    bibliography = json.load(f)

formatted_bibliography = []
for i, entry in enumerate(bibliography):
    e = {k: str(v) for k, v in entry.items() if k != 'formatted-citation'}
    dp = entry['issued']['date-parts']
    for part in dp:
        if isinstance(part[0], str) and "-" in part[0]:
            date = part[0].split("-")
            y = date[0]
            m = date[1] if len(date) > 1 else "1"
            d = date[2] if len(date) > 2 else "1"
            entry['issued'] = {"date-parts": [[int(y), int(m), int(d)]]}
    fc = format_csl_to_html(entry)
    entry['formatted-citation'] = fc
    formatted_bibliography.append(entry)
formatted_bibliography[0:5]

[{'id': '001244776600001',
  'type': 'article-journal',
  'author': [{'family': 'Hermann', 'given': 'Erik'},
   {'family': 'Puntoni', 'given': 'Stefano'}],
  'issued': {'date-parts': [[2024]]},
  'title': 'Artificial intelligence and consumer behavior: From predictive to generative AI',
  'container-title': '<em>Journal Of Business Research</em>',
  'volume': '180.0',
  'issue': None,
  'DOI': ['10.1016/j.jbusres.2024.114720'],
  'formatted-citation': "Hermann, Erik, and Stefano Puntoni. 2024. “Artificial Intelligence and Consumer Behavior: From Predictive to Generative AI”. <em>Journal Of Business Research</em> 180.0 (None). https://doi.org/['10.1016/j.jbusres.2024.114720']."},
 {'id': '001343256800008',
  'type': 'article-journal',
  'author': [{'family': 'Kemp', 'given': 'Ayenda'}],
  'issued': {'date-parts': [[2024]]},
  'title': 'Competitive Advantage Through Artificial Intelligence: Toward a Theory of Situated AI',
  'container-title': '<em>Academy Of Management Review</em>',
  '

In [78]:
with open('/home/scott/projects/dfr_browser2/dist/data/liu/bibliography.json', 'w') as f:
    json.dump(formatted_bibliography, f, indent=2)