In [41]:
import torch
import json
import http.client, urllib.parse
import faiss
import numpy as np
import pandas as pd

from datasets import load_metric
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline, AutoModel


In [48]:
#requirement 
# torch                    2.1.2
# pandas                   2.1.4'
# numpy                    1.26.3
# faiss-gpu                1.7.2
# transformers             4.36.2
# fastparquet              2023.10.1
#datasets

In [14]:
access_key = "df1c1027bc3dd38b6cddb5e53a1ec1da"

In [15]:
print(torch.__version__)

2.1.2+cu121


In [16]:
# model_id = "meta-llama/Llama-2-13b-chat-hf"
model_id = "mistralai/mistral-7b-instruct-v0.1"

In [17]:
if torch.cuda.is_available():
    device = torch.device("cuda") 
else:
    device = torch.device("cpu")

In [18]:
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [19]:
model = AutoModelForCausalLM.from_pretrained(model_id).to(device)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards: 100%|██████████| 2/2 [00:06<00:00,  3.33s/it]


In [20]:
hf_pipeline = pipeline(
            "text-generation",
            model=model.eval(),
            tokenizer=tokenizer,
            use_cache=True,
            max_new_tokens=1000,
            top_k=10,
            top_p=0.95,
            typical_p=0.95,
            do_sample=True,
            temperature=0.1,
            repetition_penalty=1.03,
            num_return_sequences=1,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.eos_token_id,
            device_map="auto",
            device="cuda"
        )


Both `device` and `device_map` are specified. `device` will override `device_map`. You will most likely encounter unexpected behavior. Please remove `device` and keep `device_map`.


In [34]:
conn = http.client.HTTPConnection('api.mediastack.com')

params = urllib.parse.urlencode({
    'access_key': access_key,
    'categories': '-general,-sports',
    'sort': 'published_desc',
    'limit': 10,
    'countries': 'us'
    })

conn.request('GET', '/v1/news?{}'.format(params))

res = conn.getresponse()
data = res.read()

In [35]:
data = data.decode('utf-8')
data = json.loads(data)


In [36]:
articles = data['data']

In [37]:
articles[0]

{'author': None,
 'title': 'DWS Strategic Municipal Income Trust declares $0.026 dividend',
 'description': 'DWS Strategic Municipal Income Trust declares $0.026 dividend',
 'url': 'https://seekingalpha.com/news/4054359-dws-strategic-municipal-income-trust-declares-0026-dividend?utm_source=feed_news_all&utm_medium=referral&feed_item_type=news',
 'source': 'Seeking Alpha',
 'image': None,
 'category': 'business',
 'language': 'en',
 'country': 'us',
 'published_at': '2024-01-11T14:16:39+00:00'}

In [25]:
knn_index = "data/knn.index"
wiki_en = "data/wikipedia-en.parquet"  
wiki_en_sentences = "data/wikipedia-en-sentences.parquet"

In [26]:
#The below code is adapted from the project openchatkit(https://github.com/togethercomputer/OpenChatKit) retrival agumentataion,

def mean_pooling(token_embeddings, mask):
    token_embeddings = token_embeddings.masked_fill(~mask[..., None].bool(), 0.)
    sentence_embeddings = token_embeddings.sum(dim=1) / mask.sum(dim=1)[..., None]
    return sentence_embeddings

def cos_sim_2d(x, y):
    norm_x = x / np.linalg.norm(x, axis=1, keepdims=True)
    norm_y = y / np.linalg.norm(y, axis=1, keepdims=True)
    return np.matmul(norm_x, norm_y.T)


class WikipediaIndex:
    def __init__(self):
        indexpath = knn_index
        wiki_sentence_path = wiki_en_sentences

        self._tokenizer = AutoTokenizer.from_pretrained('facebook/contriever-msmarco')
        self._contriever = AutoModel.from_pretrained('facebook/contriever-msmarco').to(device)

        self._df_sentences = pd.read_parquet(wiki_sentence_path, engine='fastparquet')

        self._wiki_index = faiss.read_index(indexpath, faiss.IO_FLAG_MMAP | faiss.IO_FLAG_READ_ONLY)


    def search(self, query, k=1, w=5, w_th=0.5):
        inputs = self._tokenizer(query, padding=True, truncation=True, return_tensors='pt').to(device)
        outputs = self._contriever(**inputs)
        embeddings = mean_pooling(outputs[0], inputs['attention_mask'])
        
        query_vector = embeddings.cpu().detach().numpy().reshape(1, -1)
        
        distances, indices = self._wiki_index.search(query_vector, k)  

        try:
            input_texts = self._df_sentences.iloc[indices[0]]['text_snippet'].values[0]

        except Exception as e:
            print(e)
                    
        return input_texts



In [38]:
wp = WikipediaIndex()

for article in articles:
    title = article['title']
    description = article['description']
    context = wp.search(title)
    prompt = "[INST] Give this description: " + description + " and this context: "+ context  + ": enhance the description with the provided context and generate a meanigful sentence [/INST] answer:"
    answer = hf_pipeline(prompt)
    print("title: ",  title)
    print("description: ", description)
    num = answer[0]['generated_text'].find("answer:")
    answer = answer[0]['generated_text'][num:]
    print("answer: ", answer)
    break
    
    

title:  DWS Strategic Municipal Income Trust declares $0.026 dividend
description:  DWS Strategic Municipal Income Trust declares $0.026 dividend
answer:  answer: The DWS Strategic Municipal Income Trust declared a final liquidating dividend of $0.026 per share, with a liquidating value of $46.34 per share. The remaining cash of $4,045,358 was distributed pro rata among the stockholders.


In [42]:
metric = load_metric("bleu")
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-base")

generated_tokens = tokenizer(answer, return_tensors="pt")
reference_tokens = tokenizer(description, return_tensors="pt")

  metric = load_metric("bleu")
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
Downloading builder script: 6.06kB [00:00, 5.04MB/s]                   
Downloading extra modules: 4.07kB [00:00, 3.98MB/s]                   
config.json: 100%|██████████| 1.72k/1.72k [00:00<00:00, 128kB/s]
vocab.json: 100%|██████████| 899k/899k [00:00<00:00, 3.36MB/s]
merges.txt: 100%|██████████| 456k/456k [00:00<00:00, 30.6MB/s]
tokenizer.json: 100%|██████████| 1.36M/1.36M [00:00<00:00, 4.20MB/s]


In [44]:
generated_tokens

{'input_ids': tensor([[    0, 27740,    35,    20,   211, 13691, 14152, 11660,  9628,  3101,
          2998,    10,   507,  6936,  1295,  2252,     9,    68,   288,     4,
         40521,   228,   458,     6,    19,    10,  6936,  1295,   923,     9,
            68,  3761,     4,  3079,   228,   458,     4,    20,  2405,  1055,
             9,    68,   306,     6, 39664,     6, 34392,    21,  7664,  1759,
           910,  2186,   566,     5,   388,  7509,     4,     2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [45]:
score = metric.compute(predictions=[generated_tokens.input_ids], references=[reference_tokens.input_ids])

In [46]:
score


{'bleu': 0.0,
 'precisions': [0.0, 0.0, 0.0, 0.0],
 'brevity_penalty': 2.2603294069810542e-06,
 'length_ratio': 0.07142857142857142,
 'translation_length': 1,
 'reference_length': 14}