In [1]:
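####### FOR THE SAKE OF TESTING (WITHOUT RE-CREATING THE EMBEDDINGS):
####### RUN THE IMPORTS CELL, THEN START FROM THE LOAD_INDEX() METHOD AND KEEP EXECUTING THE CELLS BELOW IT
####### NOTE: THOSE CELLS ALSO NEED openai.api_key TO BE SET AND THE `mappings` DICTIONARY (SAVED TO ./mapping.json) TO BE AVAILABLE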

In [32]:
import json
import os
import time
from sys import getsizeof

import numpy as np
import openai
import pandas as pd
import pinecone
import pyarrow
from openai.embeddings_utils import get_embedding
from tqdm.auto import tqdm
from transformers import GPT2TokenizerFast

In [3]:
# GPT-2 tokenizer; used further down to count the tokens of every Context string (n_tokens).
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")

In [4]:
############################### PREPARING THE CONTEXT AND CREATING VECTORS ################################

In [35]:
######## CREATING CONTEXT FOR EVERY COURSE AND MODULES ##########################
# Raw strings keep the Windows backslashes literal. In a normal string, sequences such as
# "\P", "\R", "\d" and "\C" are invalid escapes (a SyntaxWarning on recent Python versions).
# NOTE(review): these are machine-specific absolute paths — consider a configurable data directory.
df_Module_Description = pd.read_excel(r'C:\Projects\Recommendation Engine\Recommendation_Engine\data\Course_Module_New.xlsx')
df_Course_Description = pd.read_excel(r'C:\Projects\Recommendation Engine\Recommendation_Engine\data\CoursesAndModules.xlsx')

# One natural-language "Context" sentence per module row: course name + module name + summary.
df_Module_Description['Context'] = (
    "Name of Course offered is " + df_Module_Description.Courses
    + " and the module offered in the course is " + df_Module_Description.Modules
    + ". The details of the module is as follows. " + df_Module_Description.Summary
)

# Stack the module rows and the course rows into a single frame with a fresh 0..n-1 index.
# NOTE(review): the course sheet presumably already provides a 'Context' column — confirm,
# otherwise the token count below is applied to missing values.
final_df = pd.concat([df_Module_Description, df_Course_Description], axis=0, ignore_index=True)

# Number of GPT-2 tokens in every context string.
final_df['n_tokens'] = final_df.Context.apply(lambda x: len(tokenizer.encode(x)))

# Snapshot of the combined frame for manual inspection.
final_df.to_excel('../data/Temp.xlsx')

In [36]:
# Rough cost estimate: total token count priced at 0.02 per 1000 tokens.
total_tokens = sum(final_df.n_tokens)
print(total_tokens)
total_cost = (total_tokens * 0.02) / 1000
print("Total cost to train with the given summary is " , total_cost)

10679
Total cost to train with the given summary is  0.21358000000000002


In [37]:
######## RUNNING THIS CELL WILL COST MONEY (one OpenAI embedding request per row of final_df)

# Prefer the key from the OPENAI_API_KEY environment variable so a real key never has to be
# saved inside the notebook; the placeholder is only used when the variable is not set.
openai.api_key = os.environ.get('OPENAI_API_KEY', 'ENTER OPEN AI KEY')

REQUESTS_PER_BATCH = 50  # number of embedding requests sent before pausing
PAUSE_SECONDS = 62       # pause length; presumably keeps us under a per-minute rate limit — confirm

davinci_embed_array = []
total_rows = len(final_df)
# The row label is unused; it is bound to `_` so the loop does not overwrite the notebook-level
# `index` variable that holds the Pinecone index handle.
for position, (_, row) in enumerate(final_df.iterrows(), start=1):
    davinci_embed_array.append(get_embedding(row['Context'], engine='text-search-davinci-doc-001'))
    # Pause after every full batch, but not after the very last row.
    if position % REQUESTS_PER_BATCH == 0 and position < total_rows:
        time.sleep(PAUSE_SECONDS)

# Make the cell safe to re-run: replace an existing "embeddings" column instead of
# adding a second column with the same name.
if 'embeddings' in final_df.columns:
    final_df = final_df.drop(columns='embeddings')
final_df.insert(5, "embeddings", davinci_embed_array)

In [38]:
# Sequential string ids ("0", "1", ...), one per row.
final_df['id'] = [str(i) for i in range(len(final_df))]

# Persist the frame (including the embeddings) so it can be reloaded later.
# NOTE(review): user-specific absolute path — consider a configurable data directory.
final_df.to_parquet('C:\\Users\\shiva\\embd.parquet')

# Only the last expression of a cell is displayed, so a single preview is kept here.
final_df.head()

Unnamed: 0,Courses,Modules,Summary,Context,n_tokens,embeddings,id
0,Data Analytics,Introduction to Data Analytics,The Introduction to Data Analytics module prov...,Name of Course offered is Data Analytics and t...,174,"[0.006504377815872431, 0.0025779546704143286, ...",0
1,Data Analytics,Data Visualization,The Data Visualization module is designed to p...,Name of Course offered is Data Analytics and t...,167,"[0.0037720000836998224, 0.004168551880866289, ...",1
2,Data Analytics,Dimension Reduction,The Dimension Reduction module focuses on tack...,Name of Course offered is Data Analytics and t...,194,"[0.001655714469961822, 0.001211653114296496, -...",2
3,Data Analytics,Performance Evaluation,The Performance Evaluation module is dedicated...,Name of Course offered is Data Analytics and t...,190,"[-0.006592290475964546, 0.003556283889338374, ...",3
4,Data Analytics,Multiple Linear Regression,The Multiple Linear Regression module focuses ...,Name of Course offered is Data Analytics and t...,164,"[-0.0027855588123202324, 0.0035109149757772684...",4


In [39]:
# Connect to Pinecone. The key is read from the PINECONE_API_KEY environment variable;
# the placeholder is only used when that variable is not set.
pinecone.init(
    api_key=os.environ.get('PINECONE_API_KEY', 'ENTER PINECONE KEY'),
    environment='us-east1-gcp'
)

index_name = 'chatgpt-demo'

# Create the index on first use. Its dimension is the length of the first embedding in
# final_df (the frame built above; no variable named `df` exists in this notebook).
if index_name not in pinecone.list_indexes():
    pinecone.create_index(
        index_name, dimension=len(final_df['embeddings'].iloc[0]),
        metric='cosine'
    )

index = pinecone.Index(index_name)

In [40]:
batch_size = 32

# Send the vectors to the index in chunks of `batch_size` rows.
for start in tqdm(range(0, len(final_df), batch_size)):
    stop = min(start + batch_size, len(final_df))
    vectors = []
    for _, record in final_df.iloc[start:stop].iterrows():
        # Metadata stored with each vector; the Context text itself is not uploaded.
        metadata = {
            'course': record['Courses'],
            'module': record['Modules'],
            'n_tokens': record['n_tokens'],
        }
        # Each item is an (id, vector, metadata) tuple.
        vectors.append((record['id'], record['embeddings'], metadata))
    index.upsert(vectors=vectors)

  0%|          | 0/2 [00:00<?, ?it/s]

In [41]:
# Lookup table from row id to the full Context text of that row.
mappings = dict(zip(final_df['id'], final_df['Context']))

In [51]:
# Save the id -> Context lookup as JSON in the current working directory.
with open('./mapping.json', 'w') as mapping_file:
    mapping_file.write(json.dumps(mappings))

In [43]:
############################## IMPLEMENTATION OF THE RECOMMENDATION ##############

In [44]:
def load_index():
    """Connect to Pinecone and return a handle to the existing 'chatgpt-demo' index.

    The API key is read from the PINECONE_API_KEY environment variable; the
    placeholder string is used only when that variable is not set.

    Returns:
        The object returned by `pinecone.Index('chatgpt-demo')`.

    Raises:
        KeyError: if 'chatgpt-demo' is not among `pinecone.list_indexes()`.
    """
    pinecone.init(
        api_key=os.environ.get('PINECONE_API_KEY', 'ENTER PINECONE KEY'),  # app.pinecone.io
        environment='us-east1-gcp'
    )

    index_name = 'chatgpt-demo'

    # This function never creates the index; it only attaches to one that already exists.
    if index_name not in pinecone.list_indexes():
        raise KeyError(f"Index '{index_name}' does not exist.")

    return pinecone.Index(index_name)

In [45]:
# Handle to the existing Pinecone index; answer_question() below uses it as its default `index`.
index = load_index()

In [46]:
def _get_mappings():
    """Return the id -> Context lookup, loading ./mapping.json when `mappings` is not defined yet."""
    global mappings
    try:
        return mappings
    except NameError:
        # The kernel has not run the cell that builds `mappings` (e.g. the notebook was
        # started from load_index()): fall back to the lookup saved in ./mapping.json.
        with open('./mapping.json') as fp:
            mappings = json.load(fp)
        return mappings


def create_context(question, index, max_len=7750):
    """Build the prompt context for `question` from the best-matching stored contexts.

    The question is embedded, the 5 nearest vectors are fetched from `index`, and the
    Context texts of the matches are collected in the order returned by the query for
    as long as they fit into the token budget.

    Args:
        question: The user's question.
        index: Object with a `query(vector, top_k=..., include_metadata=...)` method whose
            result has a 'matches' list; each match needs 'id' and metadata 'n_tokens'.
        max_len: Token budget for the combined context.

    Returns:
        The selected context texts joined by "\\n\\n###\\n\\n" ("" when nothing was selected).

    Raises:
        KeyError: if a matched id is missing from the id -> Context lookup.
    """
    q_embed = get_embedding(question, engine='text-search-davinci-query-001')
    res = index.query(q_embed, top_k=5, include_metadata=True)
    id_to_context = _get_mappings()

    cur_len = 0
    contexts = []
    for match in res['matches']:
        # Each context is charged its token count plus 4 (presumably for the separator — confirm).
        cost = match['metadata']['n_tokens'] + 4
        if cur_len + cost < max_len:
            contexts.append(id_to_context[match['id']])
            cur_len += cost
        elif max_len - cur_len < 200:
            # This match does not fit and fewer than 200 tokens of budget remain: stop.
            # Otherwise the match is skipped and the next one is tried.
            break
    return "\n\n###\n\n".join(contexts)

In [47]:
# Prompt templates keyed by answer style. Each template is filled with
# str.format(context, question): {0} is the retrieved context, {1} is the question.
instructions = {
    "conservative Q&A": "Answer the question based on the context below, and if the question can't be answered based on the context, say \"I don't know\"\n\nContext:\n{0}\n\n---\n\nQuestion: {1}\nAnswer:",
    "paragraph about a question":"Write a paragraph, addressing the question, and use the text below to obtain relevant information\"\n\nContext:\n{0}\n\n---\n\nQuestion: {1}\nParagraph long Answer:",
    "bullet point": "Write a bullet point list of possible answers, addressing the question, and use the text below to obtain relevant information\"\n\nContext:\n{0}\n\n---\n\nQuestion: {1}\nBullet point Answer:",
    "summarize problems given a topic": "Write a summary of the problems addressed by the questions below\"\n\n{0}\n\n---\n\n",
    "just instruction": "{1} given the common questions and answers below \n\n{0}\n\n---\n\n",
    "summarize": "Write an elaborate, paragraph long summary about \"{1}\" given the questions and answers from a public forum on this topic\n\n{0}\n\n---\n\nSummary:",
    # The answer cue starts on its own line, like in the other templates (the previous
    # "\e" was an invalid escape that put a literal backslash into the prompt).
    "experiment":"Write precise answer of the question asked, in bullet points with all possible list of answers and use the text below to obtain relevant information\"\n\nContext:\n{0}\n\n---\n\nQuestion: {1}\nexact answer:"
}

In [48]:
def answer_question(
    index=index,
    fine_tuned_qa_model="text-davinci-003",
    question="What questions can you answer ?",
    instruction="Answer the question based on the context below, and if the question can't be answered based on the context, say \"Please provide correct question\"\n\nContext:\n{0}\n\n---\n\nQuestion: {1}\nAnswer:",
    max_len=7550,
    debug=False,
    max_tokens=400,
    stop_sequence=None,
):
    """
    Answer `question` with an OpenAI completion grounded in retrieved context.

    The context is built by create_context(question, index, max_len) and inserted,
    together with the question, into the `instruction` template ({0} = context,
    {1} = question).

    Args:
        index: Index handle passed on to create_context.
        fine_tuned_qa_model: Name of the completion model to call.
        question: The question to answer.
        instruction: Prompt template with {0} and {1} placeholders.
        max_len: Token budget passed to create_context.
        debug: When true, print the retrieved context before calling the model.
        max_tokens: Maximum number of tokens in the completion.
        stop_sequence: Optional stop sequence for the completion.

    Returns:
        The stripped text of the first completion choice, or "" when building the
        prompt or calling the completion API raises (the exception is printed).
    """
    context = create_context(question, index, max_len=max_len)
    if debug:
        print("Context:\n" + context)
        print("\n\n")

    try:
        # A model name whose second ':'-separated field starts with "ft" is treated as a
        # fine-tuned model and passed as `model`; any other name is passed as `engine`.
        uses_fine_tuned_model = (
            ":" in fine_tuned_qa_model
            and fine_tuned_qa_model.split(":")[1].startswith("ft")
        )
        if uses_fine_tuned_model:
            model_param = {"model": fine_tuned_qa_model}
        else:
            model_param = {"engine": fine_tuned_qa_model}

        prompt = instruction.format(context, question)
        response = openai.Completion.create(
            prompt=prompt,
            temperature=0,
            max_tokens=max_tokens,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0,
            stop=stop_sequence,
            **model_param,
        )
        answer = response["choices"][0]["text"]
        return answer.strip()
    except Exception as e:
        # Best effort: report the problem and hand back an empty answer.
        print(e)
        return ""

In [50]:
print("Chatbot: Hello! How can I assist you today?")

# Simple console chat: keep answering questions until the user types "exit".
while True:
    user_input = input("User: ")
    command = user_input.strip().lower()

    # Surrounding whitespace and letter case are ignored for the exit command.
    if command == "exit":
        print("Chatbot: Goodbye!")
        break

    # Nothing was typed: ask again instead of sending an empty question to the paid API.
    if not command:
        continue

    # Generate a response from the chatbot
    response = answer_question(index,question=user_input,instruction = instructions["experiment"], debug=False)
    print("Chatbot:", response)
    print(" ")

Chatbot: Hello! How can I assist you today?


User:  What are the courses offered in the program ? 


Chatbot: - Data Analytics 
- Data Visualization 
- Algorithms and Optimization 
- Data Management for Analytics 
- Sensor Analytics 
- Cyber Manufacturing Systems 
- Robotics and Automation 
- Ethics, Privacy and Cybersecurity
 


User:  Can you explain what is taught in Sensor Analytics ?


Chatbot: Sensor Analytics is a course that covers the fundamentals of sensor signal processing and its applications in modern manufacturing and Industry 4.0 contexts. The course consists of the following modules:

- Course Overview: Focuses on the role of sensors and sensor analytics in modern manufacturing and Industry 4.0.
- Measurement Systems: Provides a comprehensive understanding of measurement systems, encompassing sensor types, operational principles, and attributes.
- Basics of Signal Processing: Introduces participants to the various methods of representing signals in different domains.
- Time Series Signal Analysis and Prediction (I): Covers topics such as time domain analysis for sensor signals in both discrete and continuous forms, Fourier transform for frequency domain analysis, inverse Fourier transform for signal reconstruction and noise filtering, and the use of time-frequency domain analysis tools for signal processing.
- Time Series Signal Analysis and Prediction (II

User:  What is Regression and Correlation ? 


Chatbot: - Regression is a statistical technique used to analyze the relationship between two or more variables. It is used to predict the value of one variable based on the values of other variables.

- Correlation is a measure of the strength and direction of the relationship between two or more variables. It is used to assess the degree to which two variables are related.
 


User:  Can you explain in brief what is taught in regresssion and correlation ? 


Chatbot: - Regression and Correlation Analysis module focuses on the analysis of data using regression and correlation techniques. 
- Participants will learn about linear regression and least squares methods for solving bivariate and multivariate regression problems. 
- They will gain the knowledge and skills to apply these techniques to analyze relationships between variables and make predictions. 
- Additionally, the module covers the computation of correlations among multiple sensor data series, enabling participants to assess the strength and direction of relationships between different variables. 
- By the end of the module, participants will be equipped with the tools to perform regression analysis and compute correlations, allowing them to gain valuable insights from their data and make informed decisions based on the observed relationships.
 


User:  exit


Chatbot: Goodbye!
