In [1]:
import json
import pandas as pd
from datasets import Dataset, load_dataset
from huggingface_hub import HfApi, HfFolder

  from .autonotebook import tqdm as notebook_tqdm


In [46]:
# Fallback context string attached to custom Q&A records that have no
# 'context' of their own. Adjacent literals are joined at compile time.
context_text = (
    "This system provides tailored financial advice by analyzing individual "
    "risk tolerance and investment goals, offering optimized asset allocation "
    "strategies to balance risk and return. It helps users make informed "
    "decisions by considering current market conditions and their personal "
    "financial objectives"
)

In [47]:
# Read the hand-written Q&A records from disk.
# The encoding is pinned to UTF-8 (the JSON interchange encoding) so that
# non-ASCII characters such as typographic apostrophes decode the same way on
# every platform instead of depending on the locale's default encoding.
with open('custom_dataset.json', 'r', encoding='utf-8') as f:
    custom_data = json.load(f)

In [48]:
# Rename the raw 'input'/'output' keys to 'question'/'answer' and fill in the
# 'context', 'ticker' and 'filing' keys on records that lack them.
# The records are mutated in place.
for item in custom_data:
    for old_key, new_key in (('input', 'question'), ('output', 'answer')):
        # Skip only when the rename has already happened, so re-running this
        # cell is a no-op rather than a KeyError. A record that has neither
        # key still raises KeyError from pop().
        if old_key in item or new_key not in item:
            item[new_key] = item.pop(old_key)
    item.setdefault('context', context_text)  # shared fallback context
    item.setdefault('ticker', 'UNKNOWN')  # placeholder when no ticker is given
    # NOTE(review): pandas.read_csv treats the string 'N/A' as missing by
    # default, so this placeholder comes back as NaN if an exported CSV is
    # reloaded without keep_default_na=False.
    item.setdefault('filing', 'N/A')  # placeholder when no filing type is given


In [49]:
# Build a DataFrame from the list of normalized record dicts.
df_custom = pd.DataFrame(data=custom_data)

In [50]:
# Preview the first five custom records.
df_custom.head(n=5)

Unnamed: 0,question,answer,context,ticker,filing
0,I'm nearing retirement. What kind of investmen...,"For retirement, it's wise to focus on income-g...",This system provides tailored financial advice...,UNKNOWN,
1,How should I adjust my portfolio in response t...,Rising inflation can erode the value of fixed-...,This system provides tailored financial advice...,UNKNOWN,
2,Is it a good time to invest in tech stocks giv...,Tech stocks have been volatile recently due to...,This system provides tailored financial advice...,UNKNOWN,
3,I'm a risk-averse investor. How can I maximize...,"For a risk-averse investor, a diversified port...",This system provides tailored financial advice...,UNKNOWN,
4,What are some strategies for investing in a be...,"In a bear market, it's crucial to focus on pre...",This system provides tailored financial advice...,UNKNOWN,


In [51]:
# Load the 'train' split of the financial-qa-10K dataset from the Hugging Face Hub.
financial_qa_10K = load_dataset(
    path="virattt/financial-qa-10K",
    split="train",
)


In [52]:
# Convert the Hub dataset to a DataFrame with the library's own converter,
# rather than having pandas iterate the dataset one row dict at a time.
df_existing = financial_qa_10K.to_pandas()

In [53]:
# Preview the first five rows of the 10-K Q&A frame.
df_existing.head(n=5)

Unnamed: 0,question,answer,context,ticker,filing
0,What area did NVIDIA initially focus on before...,NVIDIA initially focused on PC graphics.,"Since our original focus on PC graphics, we ha...",NVDA,2023_10K
1,What are some of the recent applications of GP...,Recent applications of GPU-powered deep learni...,Some of the most recent applications of GPU-po...,NVDA,2023_10K
2,What significant invention did NVIDIA create i...,NVIDIA invented the GPU in 1999.,Our invention of the GPU in 1999 defined moder...,NVDA,2023_10K
3,How does NVIDIA's platform strategy contribute...,NVIDIA's platform strategy brings together har...,"NVIDIA has a platform strategy, bringing toget...",NVDA,2023_10K
4,What does NVIDIA's CUDA programming model enable?,NVIDIA's CUDA programming model opened the par...,With our introduction of the CUDA programming ...,NVDA,2023_10K


In [54]:
# Stack the 10-K Q&A rows and the custom rows into one frame with a fresh
# 0..n-1 index.
df_combined = pd.concat(
    objs=[df_existing, df_custom],
    ignore_index=True,
)


In [55]:
# Preview the first five rows of the combined frame.
df_combined.head(n=5)

Unnamed: 0,question,answer,context,ticker,filing
0,What area did NVIDIA initially focus on before...,NVIDIA initially focused on PC graphics.,"Since our original focus on PC graphics, we ha...",NVDA,2023_10K
1,What are some of the recent applications of GP...,Recent applications of GPU-powered deep learni...,Some of the most recent applications of GPU-po...,NVDA,2023_10K
2,What significant invention did NVIDIA create i...,NVIDIA invented the GPU in 1999.,Our invention of the GPU in 1999 defined moder...,NVDA,2023_10K
3,How does NVIDIA's platform strategy contribute...,NVIDIA's platform strategy brings together har...,"NVIDIA has a platform strategy, bringing toget...",NVDA,2023_10K
4,What does NVIDIA's CUDA programming model enable?,NVIDIA's CUDA programming model opened the par...,With our introduction of the CUDA programming ...,NVDA,2023_10K


In [56]:
# Preview the last five rows of the combined frame.
df_combined.tail(n=5)

Unnamed: 0,question,answer,context,ticker,filing
7045,What asset allocation should I use for long-te...,"For long-term wealth building, you might consi...",This system provides tailored financial advice...,UNKNOWN,
7046,I’m concerned about market volatility. How can...,"To mitigate market volatility, consider a 50/5...",This system provides tailored financial advice...,UNKNOWN,
7047,I’m a young investor with a high appetite for ...,"With a high risk tolerance and long horizon, a...",This system provides tailored financial advice...,UNKNOWN,
7048,How should I allocate my assets if I want to f...,"For capital preservation, a conservative alloc...",This system provides tailored financial advice...,UNKNOWN,
7049,I’m interested in a strategy that combines gro...,A balanced strategy might include 50% in equit...,This system provides tailored financial advice...,UNKNOWN,


In [57]:
# Wrap the combined frame as a Hugging Face Dataset.
hf_combined_dataset = Dataset.from_pandas(df=df_combined)

In [58]:
# Show the first five rows; slicing returns a dict of column name -> values.
hf_combined_dataset[0:5]

{'question': ['What area did NVIDIA initially focus on before expanding to other computationally intensive fields?',
  'What are some of the recent applications of GPU-powered deep learning as mentioned by NVIDIA?',
  'What significant invention did NVIDIA create in 1999?',
  "How does NVIDIA's platform strategy contribute to the markets it serves?",
  "What does NVIDIA's CUDA programming model enable?"],
 'answer': ['NVIDIA initially focused on PC graphics.',
  'Recent applications of GPU-powered deep learning include recommendation systems, large language models, and generative AI.',
  'NVIDIA invented the GPU in 1999.',
  "NVIDIA's platform strategy brings together hardware, systems, software, algorithms, libraries, and services to create unique value.",
  "NVIDIA's CUDA programming model opened the parallel processing capabilities of GPUs for general purpose computing."],
 'context': ['Since our original focus on PC graphics, we have expanded to several other large and important co

In [59]:
# Show the last five rows of the combined frame.
df_combined.tail(n=5)

Unnamed: 0,question,answer,context,ticker,filing
7045,What asset allocation should I use for long-te...,"For long-term wealth building, you might consi...",This system provides tailored financial advice...,UNKNOWN,
7046,I’m concerned about market volatility. How can...,"To mitigate market volatility, consider a 50/5...",This system provides tailored financial advice...,UNKNOWN,
7047,I’m a young investor with a high appetite for ...,"With a high risk tolerance and long horizon, a...",This system provides tailored financial advice...,UNKNOWN,
7048,How should I allocate my assets if I want to f...,"For capital preservation, a conservative alloc...",This system provides tailored financial advice...,UNKNOWN,
7049,I’m interested in a strategy that combines gro...,A balanced strategy might include 50% in equit...,This system provides tailored financial advice...,UNKNOWN,


In [60]:
# Read the second batch of records from disk.
# The encoding is pinned to UTF-8 (the JSON interchange encoding) so that
# non-ASCII characters such as typographic apostrophes decode the same way on
# every platform instead of depending on the locale's default encoding.
# Note: this rebinds `custom_data`, replacing whatever it held before.
with open('output.json', 'r', encoding='utf-8') as f:
    custom_data = json.load(f)

In [61]:
# Rename the raw 'user'/'system' keys to 'question'/'answer' and fill in the
# 'context', 'ticker' and 'filing' keys on records that lack them.
# The records are mutated in place.
for item in custom_data:
    for old_key, new_key in (('user', 'question'), ('system', 'answer')):
        # Skip only when the rename has already happened, so re-running this
        # cell is a no-op rather than a KeyError. A record that has neither
        # key still raises KeyError from pop().
        if old_key in item or new_key not in item:
            item[new_key] = item.pop(old_key)
    item.setdefault('context', context_text)  # shared fallback context
    item.setdefault('ticker', 'UNKNOWN')  # placeholder when no ticker is given
    # NOTE(review): pandas.read_csv treats the string 'N/A' as missing by
    # default, so this placeholder comes back as NaN if an exported CSV is
    # reloaded without keep_default_na=False.
    item.setdefault('filing', 'N/A')  # placeholder when no filing type is given

In [62]:
# Build a DataFrame from the list of normalized record dicts
# (rebinds `df_custom`).
df_custom = pd.DataFrame(data=custom_data)

In [63]:
# Preview the last five custom records.
df_custom.tail(n=5)

Unnamed: 0,question,answer,context,ticker,filing
832,I want to invest in global markets for diversi...,Here’s the summary of your globally diversifie...,This system provides tailored financial advice...,UNKNOWN,
833,I’m nearing retirement and want to minimize ri...,"Here’s the summary of your low-risk, growth-or...",This system provides tailored financial advice...,UNKNOWN,
834,I’m looking for a strategy that prioritizes in...,Here’s the summary of your income-focused stra...,This system provides tailored financial advice...,UNKNOWN,
835,I’m interested in ethical investing with a foc...,Here’s the summary of your growth-oriented ESG...,This system provides tailored financial advice...,UNKNOWN,
836,I’m looking to balance growth with income as I...,Here’s the summary of your balanced growth and...,This system provides tailored financial advice...,UNKNOWN,


In [64]:
# Append the rows of df_custom to the combined frame and renumber the index.
# NOTE: df_combined is both an input and the output here, so executing this
# cell more than once appends df_custom again each time.
df_combined = pd.concat(
    objs=[df_combined, df_custom],
    ignore_index=True,
)

In [65]:
# Preview the last five rows of the combined frame.
df_combined.tail(n=5)

Unnamed: 0,question,answer,context,ticker,filing
7882,I want to invest in global markets for diversi...,Here’s the summary of your globally diversifie...,This system provides tailored financial advice...,UNKNOWN,
7883,I’m nearing retirement and want to minimize ri...,"Here’s the summary of your low-risk, growth-or...",This system provides tailored financial advice...,UNKNOWN,
7884,I’m looking for a strategy that prioritizes in...,Here’s the summary of your income-focused stra...,This system provides tailored financial advice...,UNKNOWN,
7885,I’m interested in ethical investing with a foc...,Here’s the summary of your growth-oriented ESG...,This system provides tailored financial advice...,UNKNOWN,
7886,I’m looking to balance growth with income as I...,Here’s the summary of your balanced growth and...,This system provides tailored financial advice...,UNKNOWN,


In [66]:
# Write the combined frame to CSV without the index column.
df_combined.to_csv(
    path_or_buf='combined_financial_dataset.csv',
    index=False,
)

In [67]:
# Wrap the combined frame as a Hugging Face Dataset
# (rebinds `hf_combined_dataset`).
hf_combined_dataset = Dataset.from_pandas(df=df_combined)

In [68]:
# Row count of the dataset.
hf_combined_dataset.num_rows

7887

In [69]:
# Hub repository id in "<namespace>/<dataset name>" form.
dataset_repo_name = "/".join(
    ("shashankyadav03", "asset-investment-financial-dataset")
)


In [70]:
# Upload the dataset to the Hub repository named by dataset_repo_name.
hf_combined_dataset.push_to_hub(repo_id=dataset_repo_name)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]
Creating parquet from Arrow format: 100%|██████████| 8/8 [00:00<00:00, 301.21ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  1.44it/s]


CommitInfo(commit_url='https://huggingface.co/datasets/shashankyadav03/asset-investment-financial-dataset/commit/38d077b597a7e799a09f4e38d16e6a3000c535b3', commit_message='Upload dataset', commit_description='', oid='38d077b597a7e799a09f4e38d16e6a3000c535b3', pr_url=None, pr_revision=None, pr_num=None)