In [2]:
import re
import pandas as pd
from transformers import AutoTokenizer
from qdrant_client import QdrantClient

# Initialize Qdrant client and tokenizer
client = QdrantClient(host='localhost')
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Load data
data = pd.read_csv("financial_news.csv")

# Text normalization function
def preprocess_text(text):
    text = re.sub(r'[^\w\s]', '', text)  # Remove special characters
    text = text.lower()  # Convert to lowercase
    return text

# Apply preprocessing
data['cleaned_text'] = data['text'].apply(preprocess_text)

# Tokenization and embedding
data['tokens'] = data['cleaned_text'].apply(tokenizer.tokenize)
embeddings = tokenizer(data['cleaned_text'], return_tensors="pt", padding=True, truncation=True)

# Store embeddings in Qdrant
for idx, embedding in enumerate(embeddings['input_ids']):
    client.upload_collection(name="finance_advisor", points=[
        {"id": idx, "vector": embedding.tolist(), "payload": {"text": data['cleaned_text'][idx]}}
    ])


SyntaxError: unmatched ']' (<ipython-input-2-9297da88a7bf>, line 32)

In [3]:
pip install qdrant_client

Collecting qdrant_client
  Downloading qdrant_client-1.12.0-py3-none-any.whl.metadata (10 kB)
Collecting grpcio-tools>=1.41.0 (from qdrant_client)
  Downloading grpcio_tools-1.66.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.3 kB)
Collecting httpx>=0.20.0 (from httpx[http2]>=0.20.0->qdrant_client)
  Downloading httpx-0.27.2-py3-none-any.whl.metadata (7.1 kB)
Collecting portalocker<3.0.0,>=2.7.0 (from qdrant_client)
  Downloading portalocker-2.10.1-py3-none-any.whl.metadata (8.5 kB)
Collecting protobuf<6.0dev,>=5.26.1 (from grpcio-tools>=1.41.0->qdrant_client)
  Downloading protobuf-5.28.2-cp38-abi3-manylinux2014_x86_64.whl.metadata (592 bytes)
Collecting grpcio>=1.41.0 (from qdrant_client)
  Downloading grpcio-1.66.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.9 kB)
Collecting httpcore==1.* (from httpx>=0.20.0->httpx[http2]>=0.20.0->qdrant_client)
  Downloading httpcore-1.0.6-py3-none-any.whl.metadata (21 kB)
Collecting h11<0.15,>=0

In [None]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments
import loralib as lora  # Hypothetical LoRA library for demonstration

# Load base model and modify layers for LoRA adaptation
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased")
model = lora.lora_model(model, rank=4)  # Applying LoRA with a specified rank

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    logging_dir='./logs',
    evaluation_strategy="epoch"
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=financial_train_dataset,
    eval_dataset=financial_eval_dataset
)

trainer.train()


In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Generate word cloud
text = " ".join(review for review in data['cleaned_text'])
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)

# Plot word cloud
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.title("Word Cloud of Financial Terms")
plt.show()


In [None]:
# Calculate prompt lengths
data['prompt_length'] = data['cleaned_text'].apply(lambda x: len(x.split()))

# Plot histogram of prompt lengths
plt.figure(figsize=(10, 5))
plt.hist(data['prompt_length'], bins=20, color='skyblue', edgecolor='black')
plt.title("Prompt Length Distribution")
plt.xlabel("Prompt Length (words)")
plt.ylabel("Frequency")
plt.show()
