# Combine Data and embed 

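    This notebook is a playground: it combines the Kaggle, Tiny Tales and Wikipedia summary CSVs into one dataframe, sets up a PGVector store for embeddings, and experiments with named-entity-recognition models on the combined text.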
    

In [1]:
import os
import statistics
import json
from pathlib import Path
import pandas as pd
import tiktoken
from dotenv import load_dotenv
load_dotenv('../.env') 

True

In [2]:
## For embeddings
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores.pgvector import PGVector

## Loaders
from langchain.document_loaders import DataFrameLoader

## Combine Data

In [3]:
# Data directory (relative to the notebook): the summary CSVs are read from it
# and the combined CSV is written back to it.
out_directory = Path("../data")

### Read files into dataframes

In [5]:
# Load 'kaggle_tilak_summaries.csv' (pipe-delimited, hence sep="|") and show its columns.
kaggle_data = pd.read_csv(out_directory/'kaggle_tilak_summaries.csv', sep="|")
kaggle_data.columns

Index(['book_number', 'book_name', 'chapter_name', 'title', 'commentary',
       'text', 'section_number', 'chunk_id', 'source', 'num_tokens'],
      dtype='object')

In [6]:
# Load 'tiny_tales_summaries.csv' (pipe-delimited, hence sep="|") and show its columns.
tinytales_data = pd.read_csv(out_directory/'tiny_tales_summaries.csv', sep="|")
tinytales_data.columns

Index(['text', 'section_number', 'title', 'chapter_number', 'chapter_name',
       'chunk_id', 'source', 'num_tokens'],
      dtype='object')

In [7]:
# Load the pipe-delimited Wikipedia parva summaries and drop the two
# chapter-range columns, which are unnecessary for the combined frame.
wikipedia_data = (
    pd.read_csv(out_directory / 'wikipedia_parva_summaries.csv', sep="|")
    .drop(columns=['start_chapter', 'end_chapter'])
)
wikipedia_data.columns

Index(['book', 'source', 'title', 'book_number', 'description', 'text',
       'chunk_id', 'num_tokens'],
      dtype='object')

### Combine the dataframes into one big dataframe

In [8]:
# Stack the three sources row-wise; columns missing from a source are filled
# with NaN. ignore_index=True gives the result a fresh 0..n-1 index, so row
# labels stay unique instead of repeating each source's own 0-based index
# (label-based lookups such as df_combined.loc[0] would otherwise return
# several rows).
df_combined = pd.concat(
    [kaggle_data, tinytales_data, wikipedia_data],
    ignore_index=True,
)
print(
    "Kaggle data dims",  kaggle_data.shape, "\n",
    "TinyTales data dims", tinytales_data.shape, "\n",
    "Wikipedia data dims", wikipedia_data.shape, "\n",
    "Final data dims", df_combined.shape)

print("Final data columns \n", df_combined.columns)

Kaggle data dims (2376, 10) 
 TinyTales data dims (200, 8) 
 Wikipedia data dims (19, 8) 
 Final data dims (2595, 13)
Final data columns 
 Index(['book_number', 'book_name', 'chapter_name', 'title', 'commentary',
       'text', 'section_number', 'chunk_id', 'source', 'num_tokens',
       'chapter_number', 'book', 'description'],
      dtype='object')


### Write the final dataframe into a csv file

In [9]:

# Persist the combined frame as a pipe-delimited CSV (same separator as the
# inputs), leaving the row index out of the file.
df_combined.to_csv(out_directory/'summaries_combined.csv', index=False, sep="|")
# Show the per-column dtypes of the combined frame.
print(df_combined.dtypes)

book_number       float64
book_name          object
chapter_name       object
title              object
commentary         object
text               object
section_number    float64
chunk_id           object
source             object
num_tokens          int64
chapter_number    float64
book               object
description        object
dtype: object


## Embed and persist into the PGVector store

### Load Vector store

In [54]:
text_embedding_model = "text-embedding-ada-002"

# Database name, user and password must be present in the environment
# (os.environ[...] raises KeyError otherwise). Host and port can be overridden
# with PGVECTOR_HOST / PGVECTOR_PORT and otherwise fall back to the previous
# hard-coded values, localhost:5432.
CONNECTION_STRING = PGVector.connection_string_from_db_params(
    driver="psycopg2",
    host=os.environ.get("PGVECTOR_HOST", "localhost"),
    port=os.environ.get("PGVECTOR_PORT", "5432"),
    database=os.environ["PGVECTOR_DATABASE"],
    user=os.environ["PGVECTOR_USER"],
    password=os.environ["PGVECTOR_PASSWORD"],
)

COLLECTION_NAME = "mh_embeddings_summaries"

embedding = OpenAIEmbeddings(model=text_embedding_model)

# Vector store handle bound to the collection and embedding model above.
store = PGVector(
    collection_name=COLLECTION_NAME,
    connection_string=CONNECTION_STRING,
    embedding_function=embedding,
)

### Load the dataframe into a loader

In [55]:
# Wrap the combined frame in a loader; the `text` column supplies each
# document's page content (the other columns are presumably carried along as
# metadata -- confirm against the DataFrameLoader documentation).
loader = DataFrameLoader(df_combined, page_content_column="text")

In [56]:
# Load the documents from the dataframe; a later cell reads docs[100].page_content.
docs = loader.load()

In [321]:
# docs[0].page_content

## Named Entity recognition 

## Helper functions

In [308]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline


In [315]:

## Helper function to combine sub-word tokens into names
def combine_tokens(ner_results):
    """Merge sub-word NER tokens into whole names.

    A token whose ``word`` starts with "▁" (the SentencePiece-style word-start
    marker) opens a new name; following tokens without the marker are appended
    to it, except for a few punctuation / possessive fragments that are
    dropped. Each name carries the ``entity`` label of the token that opened it.

    Args:
        ner_results: iterable of dicts with at least the keys ``'word'`` and
            ``'entity'`` (the per-token output of a token-classification
            pipeline).

    Returns:
        A list of ``{'name': str, 'entity': label}`` dicts in input order.
        Empty names are never emitted, so empty input yields ``[]``.
    """
    # Fragments that are never glued onto a name. Note that a bare 's'
    # sub-word is dropped as well, not only the possessive "'s".
    skipped = {',', "'", ".", 's', ";", "(", ")"}

    entities = []
    name = ""
    entity = None  # label of the name currently being built, if any

    for res in ner_results:
        word = res['word']
        if not word:
            # Nothing to append; also avoids indexing into an empty string.
            continue
        if word.startswith("▁"):
            # A new word starts: flush the previous name before replacing it.
            if name:
                entities.append({'name': name, 'entity': entity})
            name = word[1:]
            entity = res['entity']
        elif word not in skipped:
            if entity is None:
                # Continuation token with no opening marker seen yet (e.g. the
                # very first token): let it open the name with its own label.
                entity = res['entity']
            name += word

    # Flush the last name, if one is pending.
    if name:
        entities.append({'name': name, 'entity': entity})
    return entities

## Run a NER pipeline over a text
def recognise_named_entities(text, pipeline_model):
    """Call ``pipeline_model`` on ``text`` and return its result unchanged."""
    return pipeline_model(text)


In [1]:
# Sample passage used by the NER cells below. Requires `docs` from the loader
# cell to exist in the kernel (run the notebook top to bottom).
text = docs[100].page_content
# print(text)


NameError: name 'docs' is not defined

## Roberta Named Entity

In [317]:
## Roberta based NER

# One name for the checkpoint so the tokenizer and the model cannot drift apart.
roberta_checkpoint = "2rtl3/mn-xlm-roberta-base-named-entity"
roberta_tokenizer = AutoTokenizer.from_pretrained(roberta_checkpoint)
roberta_model = AutoModelForTokenClassification.from_pretrained(roberta_checkpoint)
nlp_roberta = pipeline("ner", model=roberta_model, tokenizer=roberta_tokenizer)
# Parameter count of the loaded model, shown as the cell output.
roberta_model.num_parameters()



277456901

In [318]:
# Tag the sample text with the RoBERTa pipeline, merge the sub-word tokens into
# names, and tabulate the result. `ner_results` and `entities` are overwritten
# by the next model's cell.
ner_results = recognise_named_entities(text, nlp_roberta)
entities = combine_tokens(ner_results)
df_roberta = pd.DataFrame(entities)

# print(text)
# for entity in entities:
#     print(entity)
# for res in ner_results:
#     print(res)


## Arbert NER model

In [319]:
# One name for the checkpoint so the tokenizer and the model cannot drift apart.
arbert_checkpoint = "ArBert/albert-base-v2-finetuned-ner"
arbert_tokenizer = AutoTokenizer.from_pretrained(arbert_checkpoint)
arbert_model = AutoModelForTokenClassification.from_pretrained(arbert_checkpoint)
nlp_arbert = pipeline("token-classification", model=arbert_model, tokenizer=arbert_tokenizer)
# Parameter count of the loaded model, shown as the cell output.
arbert_model.num_parameters()

11099913

In [320]:
# Tag the sample text with the ALBERT pipeline and merge sub-word tokens into names.
ner_results = recognise_named_entities(text, nlp_arbert)
entities = combine_tokens(ner_results)
# Keep only rows whose label differs from 'LABEL_0'
# (presumably the non-entity class -- confirm against the model's id2label).
df_arbert = pd.DataFrame(entities).loc[lambda frame: frame['entity'] != 'LABEL_0']

# print(text)
# for res in ner_results:
#     print(res)
# for entity in entities:
#     print(entity)


## IndicBert Model

In [274]:
# from transformers import AutoModel, AutoTokenizer
# import torch

# tokenizer = AutoTokenizer.from_pretrained('ai4bharat/indic-bert')
# model = AutoModel.from_pretrained('ai4bharat/indic-bert')

# inputs = tokenizer("After Abhimanyu's marriage, there was royal festival and everyone was pleased", return_tensors="pt")

# with torch.no_grad():
#     outputs = model(**inputs)

# outputs.pooler_output.squeeze()
