In [1]:
# GPU Test
import torch
# Ask torch once whether CUDA is usable and reuse the answer for both lines.
cuda_ready = torch.cuda.is_available()
print("CUDA available:", cuda_ready)

# Only query the device name when a CUDA device is actually present.
if cuda_ready:
    gpu_label = torch.cuda.get_device_name(0)
else:
    gpu_label = "No GPU"
print("GPU name:", gpu_label)

CUDA available: True
GPU name: Tesla T4


In [2]:
from google.colab import userdata
from huggingface_hub import login
# Authenticate this session with the Hugging Face Hub. The token is read from
# the Colab `userdata` store under the name 'HF_TOKEN' and passed straight to
# `login` so it is never bound to a notebook-level variable.
login(
    token=userdata.get('HF_TOKEN'),
)

In [3]:
from huggingface_hub import list_repo_files

# Enumerate every file stored in the dataset repository on the Hub.
repo_id = "ashtrayAI/Bangla_Financial_news_articles_Dataset"
files = list_repo_files(repo_id=repo_id, repo_type="dataset")

# Preview the first ten entries, numbered from 1.
print("First 10 files:")
for position, repo_path in enumerate(files[:10], start=1):
    print(f"{position}. {repo_path}")

print(f"\nTotal files: {len(files)}")

First 10 files:
1. .gitattributes
2. Bangla_fin_news_articles/1.csv
3. Bangla_fin_news_articles/10.csv
4. Bangla_fin_news_articles/100.csv
5. Bangla_fin_news_articles/1000.csv
6. Bangla_fin_news_articles/1001.csv
7. Bangla_fin_news_articles/1002.csv
8. Bangla_fin_news_articles/1003.csv
9. Bangla_fin_news_articles/1004.csv
10. Bangla_fin_news_articles/1005.csv

Total files: 7698


In [4]:
# Narrow the repository listing down to CSV files under one folder prefix.
folder = "Bangla_fin_news_articles"
folder_prefix = f"{folder}/"
csv_files = [
    path
    for path in files
    if path.endswith(".csv") and path.startswith(folder_prefix)
]

print(f"Found {len(csv_files)} CSV files in {folder}/ folder")
print("\nFirst 5 CSV files:")

# Number the preview from 1 to match the human-readable listing.
for rank, csv_path in enumerate(csv_files[:5], start=1):
    print(f"{rank}. {csv_path}")

Found 7695 CSV files in Bangla_fin_news_articles/ folder

First 5 CSV files:
1. Bangla_fin_news_articles/1.csv
2. Bangla_fin_news_articles/10.csv
3. Bangla_fin_news_articles/100.csv
4. Bangla_fin_news_articles/1000.csv
5. Bangla_fin_news_articles/1001.csv


In [5]:
from huggingface_hub import hf_hub_download
from pandas import DataFrame
import pandas as pd

# Fetch one CSV from the dataset repository so its structure can be inspected.
sample_file = csv_files[0]  # first entry of the filtered CSV listing
local_path = hf_hub_download(
    repo_id=repo_id,
    repo_type="dataset",
    filename=sample_file,
    token=userdata.get('HF_TOKEN'),
)

# Read the downloaded file and report its shape, columns and leading rows.
sample_df = pd.read_csv(local_path)
report_lines = [
    "Sample DataFrame Info:",
    f"Shape: {sample_df.shape}",
    f"Columns: {list(sample_df.columns)}",
    "\nFirst 3 rows:",
    str(sample_df.head(3)),
]
print("\n".join(report_lines))

1.csv: 0.00B [00:00, ?B/s]

Sample DataFrame Info:
Shape: (1, 5)
Columns: ['Serial', 'Title', 'Date', 'Author', 'News']

First 3 rows:
   Serial                                              Title        Date  \
0       1  শেয়ারবাজারে ব্যাংকগুলোর বিনিয়োগের হিসাব ক্রয়...  2022-08-05   

              Author                                               News  
0  ইত্তেফাক রিপোর্ট  এখন থেকে শেয়ারবাজারে ব্যাংকগুলোর বিনিয়োগের হ...  


In [6]:
# Analyze text lengths of the 'News' column in the sample file.
if 'News' in sample_df.columns:
    # Drop missing articles before measuring: with NaN present, .str.len()
    # switches to floats (so min/max print as e.g. "2854.0") and slicing a
    # NaN first row for the preview would raise a TypeError.
    articles = sample_df['News'].dropna().astype(str)
    text_lengths = articles.str.len()

    if articles.empty:
        # Nothing to summarize; avoid "nan" statistics and an IndexError on iloc[0].
        print("No non-missing articles found in the 'News' column.")
    else:
        print("Text length statistics:")
        print(f"Average: {text_lengths.mean():.0f} characters")
        print(f"Min: {text_lengths.min()} characters")
        print(f"Max: {text_lengths.max()} characters")

        # Show a sample article; append the ellipsis only when the text was
        # actually cut, so short articles are not shown as truncated.
        preview_chars = 200
        first_article = articles.iloc[0]
        ellipsis = "..." if len(first_article) > preview_chars else ""
        print("\nSample article:")
        print(first_article[:preview_chars] + ellipsis)
else:
    # Make the skipped analysis visible instead of silently printing nothing.
    print(f"Column 'News' not found; available columns: {list(sample_df.columns)}")

Text length statistics:
Average: 2854 characters
Min: 2854 characters
Max: 2854 characters

Sample article:
এখন থেকে শেয়ারবাজারে ব্যাংকগুলোর বিনিয়োগের হিসাব শেয়ারের ক্রয়মূল্যের ভিত্তিতে নির্ধারিত হবে। ফলে বাজারে শেয়ারের দাম বাড়ুক বা কমুক ব্যাংকের ধারণকৃত শেয়ার বিক্রির চাপ তৈরি হবে না। আবার বিনিয়োগও আইনি...
