## Install required libraries

In [18]:
# Install the pinned requirements into the environment of the running kernel
%pip install -q -r requirements.txt

[0mNote: you may need to restart the kernel to use updated packages.


## Importing Dependencies

In [1]:
# Import dependencies

import json
import os

import pandas as pd
import torch
import transformers
from langchain import HuggingFacePipeline
from langchain import PromptTemplate, LLMChain
from langchain.chains import SimpleSequentialChain
from peft import PeftModel
from transformers import LlamaForCausalLM, LlamaTokenizer, pipeline

  warn("The installed version of bitsandbytes was compiled without GPU support. "


/data/anaconda3/envs/liveproject/lib/python3.8/site-packages/bitsandbytes/libbitsandbytes_cpu.so: undefined symbol: cadam32bit_grad_fp32


2023-10-30 13:43:16.753723: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-10-30 13:43:16.819462: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Create llm pipeline and load other required files

In [2]:
# Load the model and tokenizer from the Hugging Face Hub

def load_model(model_name, token):
    """Load a Llama causal-LM checkpoint together with its tokenizer.

    Args:
        model_name: Identifier handed unchanged to both ``from_pretrained`` calls.
        token: Hugging Face access token forwarded to both ``from_pretrained`` calls.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    # Use the same `token` keyword for both loaders; the model loader used to
    # receive the deprecated `use_auth_token` spelling while the tokenizer
    # already used `token`.
    model = LlamaForCausalLM.from_pretrained(model_name, token=token)
    tokenizer = LlamaTokenizer.from_pretrained(model_name, token=token)

    return model, tokenizer

In [3]:
# Wrap the loaded model in a LangChain-compatible text-generation pipeline

def create_pipeline(model, tokenizer):
    """Build a ``text-generation`` pipeline and wrap it for LangChain.

    Args:
        model: Model object passed to ``transformers.pipeline``.
        tokenizer: Tokenizer passed to the pipeline; its ``eos_token_id`` is
            also used as the pipeline's ``eos_token_id``.

    Returns:
        A ``HuggingFacePipeline`` wrapping the created pipeline.
    """
    # No `max_length` is set here (a 1500 cap is intentionally left disabled).
    generation_options = dict(
        model=model,
        tokenizer=tokenizer,
        torch_dtype=torch.bfloat16,
        trust_remote_code=True,
        device_map="auto",
        do_sample=True,
        top_k=10,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
    )
    text_generator = pipeline("text-generation", **generation_options)

    # NOTE(review): `temperature` 0.0 is requested here while the pipeline is
    # created with `do_sample=True`; whether the wrapper forwards this value to
    # generation is not visible in this notebook -- confirm the intended
    # decoding settings.
    return HuggingFacePipeline(pipeline=text_generator, model_kwargs={'temperature': 0.0})

In [4]:
# Load the Level 1 -> Level 2 category mapping; its keys are the Level 1 categories

# Read the JSON file as UTF-8 explicitly so the result does not depend on the
# platform's default text encoding.
with open('categories_json(Level2)', 'r', encoding='utf-8') as json_file:
    Level2_categories = json.load(json_file)

Level1_categories = list(Level2_categories.keys())

## Create prompts and llm_chains for Level 1 classification

In [14]:
def level1_chain(llm):
    """Build the two-step chain used for Level 1 classification.

    The first LLMChain asks the model to pick a category for the article; the
    second LLMChain maps that answer onto one entry of the category list. Both
    are combined in a SimpleSequentialChain.

    Args:
        llm: Language model object handed to both LLMChain instances.

    Returns:
        A ``(sequential_chain, formatting_chain)`` tuple: the combined
        two-step chain, and the second (output formatting) chain on its own.
    """
    # Step 1: class prediction prompt
    classify_template = """
    ###Instruction: Categorize the given article into one of the given IAB categories. Your response should consist of only the full, spelled-out category name from the following category list. If your output is different from the given list, map it to one of the categories of the list.
    category list: ['academic interests',
     'automotive',
     'books and literature',
     'business and finance',
     'careers',
     'education',
     'family and relationships',
     'fine art',
     'food and drink',
     'health and medical services',
     'healthy living',
     'hobbies and interests',
     'home and garden',
     'medical health',
     'movies',
     'music and audio',
     'news and politics',
     'personal finance',
     'pets',
     'pharmaceuticals, conditions, and symptoms',
     'pop culture',
     'real estate',
     'shopping',
     'sports',
     'style and fashion',
     'technology and computing',
     'television',
     'travel',
     'video gaming']
    
    Please provide the category name only and ensure it is spelled out in full format. Your response should be a single category name from the given category list, without any additional information.
    
    ###Question: Categorize the given article into one of the given IAB categories: {input}
    
    ###Answer:
    """

    classify_chain = LLMChain(
        prompt=PromptTemplate(template=classify_template, input_variables=['input']),
        llm=llm,
    )

    # Step 2: output formatting prompt (maps the step 1 answer onto the list)
    format_template = """
    ### Instruction: Map the input with the most related category from the categories list. Please provide the category name only, without any additional information or punctuation.
                    Categories list: ['academic interests', 'automotive', 'books and literature', 'business and finance', 'careers', 'education', 'family and relationships', 'fine art', 'food and drink', 'health and medical services', 'healthy living', 'hobbies and interests', 'home and garden', 'medical health', 'movies', 'music and audio', 'news and politics', 'personal finance','pets', 'pharmaceuticals, conditions, and symptoms', 'pop culture', 'real estate', 'shopping', 'sports', 'style and fashion', 'technology and computing', 'television', 'travel','video gaming']
                    
    Your response should be one of the given categories without any explanation and punctuation. Make sure your output belongs to the given category list and contains nothing other than the category name from the list.
    
    ### Question: Map the input to one of the categories in the categories list: {input}
    
    ### Answer:
    """

    format_chain = LLMChain(
        prompt=PromptTemplate(template=format_template, input_variables=['input']),
        llm=llm,
    )

    # Run prediction first, then feed its answer to the formatting step
    two_step_chain = SimpleSequentialChain(chains=[classify_chain, format_chain])

    return two_step_chain, format_chain

## Create prompts and llm_chains for Level 2 classification

<b>Level 2 prediction needs two inputs: the article text and the list of subcategories of the category predicted at Level 1. The prompt templates in this notebook are declared with a single <code>input</code> variable, so for this level the prompts are assembled with Python f-strings and then passed through a template that contains only that variable.</b>

In [15]:
def level2_chain(llm):
    """Build the pass-through chain used for both Level 2 prompts.

    The Level 2 prompts are assembled with f-strings once the Level 1 result
    is known, so the template here consists of nothing but the ``input``
    placeholder.

    Args:
        llm: Language model object handed to the LLMChain.

    Returns:
        An LLMChain whose prompt template only contains ``{input}``.
    """
    passthrough_prompt = PromptTemplate(
        template="\n    {input}\n    ",
        input_variables=['input'],
    )

    # One chain serves both the classification and the output formatting prompt
    return LLMChain(llm=llm, prompt=passthrough_prompt)

### Executing llm_chains

In Level 2 classification, some categories may not have any subcategories (e.g. "Shopping"); for those, the Level 1 category is reported as the Level 2 result as well.

In [16]:
def _normalize_label(label):
    """Lower-case *label*, keep only letters and whitespace, and collapse runs of whitespace."""
    kept = ''.join(char for char in label.lower().strip() if char.isalpha() or char.isspace())
    return ' '.join(kept.split())


def _match_category(answer, candidates):
    """Return the entry of *candidates* equal to *answer* after normalisation, else None."""
    wanted = _normalize_label(answer)
    if not wanted:
        # An empty answer must never count as a match.
        return None
    for candidate in candidates:
        if _normalize_label(str(candidate)) == wanted:
            return candidate
    return None


def _classify_level2(text, subcategories, llm_chain3):
    """Predict the Level 2 category of *text* among *subcategories* and return it lower-cased."""
    cat = str(subcategories)

    # The whitespace inside the two prompt literals below is part of the text
    # sent to the model; keep the continuation lines exactly as they are.

    # prompt 1 for Level2 classification
    level2prompt1 = f"""###Instruction: Your task is to categorize the given article into one of the given categories. Your response should be only the full, spelled-out category name from the following category list. If your output is different from the given list, map it to one of the related categories of the list: {cat} \nMake sure your output is one category from the above list.
            \n\n###Question: Categorize this given article into one of the given categories: {text}
            \n\n###Answer:
            """
    level2result1 = llm_chain3.run(level2prompt1)  # predicting Level 2 class

    # prompt 2 for Level2 output formatting
    level2prompt2 = f"""###Instruction: Your task is to find the category from the input sentence or phrase. Please provide the category name only, without any additional information or punctuation. If your output is different from the given list, map it to one of the related categories of the list: {cat} \nMake sure your output is one category from the above list.
            \n\n###Question: Find the category from the input sentence or phrase following the instructions: {level2result1}
            \n\n###Answer:
            """

    # Format the output, and try once more if the answer is not in the list.
    for _ in range(2):
        level2result2 = llm_chain3.run(level2prompt2)
        matched = _match_category(level2result2, subcategories)
        if matched is not None:
            return str(matched).strip().lower()

    # Neither attempt produced a listed subcategory: report the last answer as-is.
    return level2result2.strip().lower()


def predict(text, chain1, llm_chain2, llm_chain3):
    """Classify *text* into a Level 1 and a Level 2 category and print both.

    Prints one line of the form ``<level 1>  >  <level 2>``.

    Level 2 classification depends on a valid Level 1 category, so the Level 1
    answer is matched against ``Level1_categories`` first. If it does not
    match, it is passed through ``llm_chain2`` once to reformat it. If it still
    does not match, the raw Level 1 answer is printed with an empty Level 2
    result. A Level 1 category without subcategories is reported as its own
    Level 2 result.

    Args:
        text: Article text to classify.
        chain1: Chain whose ``run(text)`` returns the Level 1 answer.
        llm_chain2: Chain whose ``run(answer)`` reformats a Level 1 answer.
        llm_chain3: Chain whose ``run(prompt)`` answers a fully built Level 2 prompt.

    Returns:
        None. The result is printed.
    """
    result1 = chain1.run(text)  # predicting Level1 class

    # Compare with punctuation removed on BOTH sides, so that keys containing
    # commas (e.g. 'pharmaceuticals, conditions, and symptoms') can match.
    level1 = _match_category(result1, Level1_categories)
    if level1 is None:
        # Level 1 output is ill-formatted: try formatting it again
        level1 = _match_category(llm_chain2.run(result1), Level1_categories)

    if level1 is None:
        resLevel1 = result1.strip().lower()
        resLevel2 = ""
    else:
        resLevel1 = str(level1).strip().lower()
        # presumably a list of subcategory names -- verify against the JSON file
        subcategories = Level2_categories[level1]
        if subcategories:
            resLevel2 = _classify_level2(text, subcategories, llm_chain3)
        else:
            # No subcategories (the emptiness test is made on the container
            # itself, not on its string form, which is never empty).
            resLevel2 = resLevel1

    print(resLevel1 + "  >  " + resLevel2)

## Example Usage

In [8]:
# Example article used to exercise the classifier
text = """A group of banks that have been in talks to lend about $250 million each toward a syndicated loan of about $3.5 billion to refinance debt which the Adani Group took out to fund its purchase of Ambuja Cements Ltd. have received internal approval for the deal, according to people familiar with the matter.The three banks are Barclays Plc, Deutsche Bank AG and Standard Chartered Plc, said the people, asking not to be identified when discussing a private matter. They’re part of a larger consortium in talks for the syndicated loan to billionaire Gautam Adani’s conglomerate. Some institutions were in discussions to lend $400 million each, in what could become one of the biggest loan deals in Asia this year, Bloomberg reported last month citing people familiar with the matter. No new information was given on the status of that group of banks. The prospective deal adds to evidence that the group, with business interests stretching from ports to green energy, is closing the chapter on the allegations of malfeasance made by US shortseller Hindenburg Research earlier this year. The claims — repeatedly denied by Adani officials — caused a selloff in the stocks and bonds of group firms. Negotiations with global banks on debt refinancing stalled as some banks balked, Bloomberg reported in February. Spokespeople at Barclays, Deutsche and Standard Chartered all declined to comment when contacted by Bloomberg on Tuesday. A representative for Adani Group did not offer any immediate comment.Adani bought the India assets of Switzerland’s Holcim Ltd. in 2022, as the conglomerate looked to move beyond its core business of operating ports, power plants and coal mines and into areas like data centers, airports, digital services, retail and media. The transaction has not yet been finalized and the terms could still change."""
# Expected Level 1 category -> Business and Finance

In [17]:
# Preparations: load the model once and build the chains used by `predict`

modelName = "meta-llama/Llama-2-7b-chat-hf"

# Never commit access tokens to a notebook: read the token from the environment.
# NOTE: a token was previously committed in this cell; revoke it in the
# Hugging Face account settings.
token = os.environ.get("HF_TOKEN")
if not token:
    raise RuntimeError(
        "Set the HF_TOKEN environment variable to a Hugging Face access token "
        "that is allowed to download " + modelName
    )

model, tokenizer = load_model(modelName, token)

llm = create_pipeline(model, tokenizer)
chain_Level1, outputformat_chain = level1_chain(llm)
chain_Level2 = level2_chain(llm)

In [18]:
# Classification: prints "<Level 1 category>  >  <Level 2 category>" for the example article

predict(text, chain_Level1, outputformat_chain, chain_Level2)

business and finance  >  business
