In [1]:
import pandas as pd
import os
from langchain_community.vectorstores import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_ollama import OllamaEmbeddings
from langchain.schema import Document 

In [37]:
# Load the CSV file
df = pd.read_csv('brands_flattened_complete.csv', header=0, index_col=False, encoding='utf-8')

# Initialize an empty list to store the sentences
entire_text = []

# Iterate over each row in the DataFrame
for _, row in df.iterrows():
    # Start with the base sentence
    sentence = f"The {row['name-en']} brand has a model called {row['model-name-en']}, which is available as a {row['type-name-en']} type in the years {row['type-years']}."
    entire_text.append(sentence)

    # Add all features 
    for col in df.columns[df.columns.get_loc('Drive axle'):]:
        feature_value = row[col]
        if pd.notna(feature_value):  # Only include non-NaN values
            if row['type-name-en'] != "-": #if has type name 
                if feature_value is True:  
                    entire_text.append(f"the {row['type-name-en']} version of the {row['model-name-en']} model from the {row['name-en']} brand has {col}.")
                elif feature_value is False:  
                    entire_text.append(f"the {row['type-name-en']} version of the {row['model-name-en']} model from the {row['name-en']} brand doesn't have {col}.")
                else:  # For other values (e.g., strings, numbers)
                    entire_text.append(f"the {col} of the {row['type-name-en']} version of the {row['model-name-en']} model from the {row['name-en']} brand is {feature_value}.")
            else:
                if feature_value is True:  
                    entire_text.append(f"the {row['model-name-en']} model from the {row['name-en']} brand has {col}.")
                elif feature_value is False:  
                    entire_text.append(f"the {row['model-name-en']} model from the {row['name-en']} brand doesn't have {col}.")
                else:  # For other values (e.g., strings, numbers)
                    entire_text.append(f"the {col} of the {row['model-name-en']} model from the {row['name-en']} brand is {feature_value}. ")

# Print the first few sentences to verify
print(entire_text[:5])



['The audi brand has a model called Q4-e-tron, which is available as a - type in the years [2024].', 'the Drive axle of the Q4-e-tron model from the audi brand is Four-wheel drive. ', 'the Gearbox of the Q4-e-tron model from the audi brand is Single-speed automatic transmission. ', 'the Battery capacity of the Q4-e-tron model from the audi brand is 84.8. ', 'the Power of the Q4-e-tron model from the audi brand is 313 kW. ']


In [27]:
# Saving the list to a file
with open('entire_text.txt', 'w') as file:
    for item in entire_text:
        file.write(item + '\n')  # Write each string on a new line

In [3]:
len(entire_text)

1318

In [5]:
merged_text = []
for i in range(0, len(entire_text), 3):
    merged_text.append(" ".join(entire_text[i:i+3]))

print(merged_text[:5])
entire_text = merged_text

['The audi brand has a model called Q4-e-tron, which is available as a - type in the years [2024]. the Drive axle of the Q4-e-tron model from the audi brand is Four-wheel drive.  the Gearbox of the Q4-e-tron model from the audi brand is Single-speed automatic transmission. ', 'the Battery capacity of the Q4-e-tron model from the audi brand is 84.8.  the Power of the Q4-e-tron model from the audi brand is 313 kW.  the Speed of the Q4-e-tron model from the audi brand is 160. ', 'the Acceleration of the Q4-e-tron model from the audi brand is 6.8.  the Torque of the Q4-e-tron model from the audi brand is 472.  the Effective range of the Q4-e-tron model from the audi brand is 560. ', 'the Body type of the Q4-e-tron model from the audi brand is Small crossover size.  the Generation (Body code) of the Q4-e-tron model from the audi brand is Modular MEB.  the Length of the Q4-e-tron model from the audi brand is 4588. ', 'the Width of the Q4-e-tron model from the audi brand is 1865.  the Height 

In [7]:
len(entire_text)

440

In [19]:
import statistics

char_size = []
for text in merged_text:
    char_size.append(len(text))
print(f'max = {max(char_size)}, min = {min(char_size)}, mean = {statistics.mean(char_size)}')

max = 411, min = 86, mean = 280.4977272727273


In [21]:
token_size = []
for text in entire_text:
    tokens = text.split()
    token_size.append(len(tokens))
print(f'max = {max(token_size)}, min = {min(token_size)}, mean = {statistics.mean(token_size)}')

max = 77, min = 15, mean = 48.61363636363637


In [39]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=450, chunk_overlap=50)
documents = [Document(page_content=text) for text in entire_text]
chunks = text_splitter.split_documents(documents)

print(chunks[:3])
print("Number of chunks:", len(chunks))

[Document(metadata={}, page_content='The audi brand has a model called Q4-e-tron, which is available as a - type in the years [2024].'), Document(metadata={}, page_content='the Drive axle of the Q4-e-tron model from the audi brand is Four-wheel drive.'), Document(metadata={}, page_content='the Gearbox of the Q4-e-tron model from the audi brand is Single-speed automatic transmission.')]
Number of chunks: 1318


In [41]:
db_path = "db-3"  # Path to store the vector store

# Initialize the embedding model
embedding = OllamaEmbeddings(model="mxbai-embed-large")

# Create a Chroma vector store from the chunks
vector_store = Chroma.from_documents(
    documents=chunks, embedding=embedding, persist_directory=db_path
)
vector_store.persist()
print("Vector store created and persisted.")

Vector store created and persisted.


In [13]:
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
from langchain.prompts import PromptTemplate
from langchain_core.prompts import ChatPromptTemplate, MessagesPlaceholder
from langchain_ollama.llms import OllamaLLM

In [15]:
cached_llm = OllamaLLM(model="gemma3:1b")

In [29]:
def chat_with_llm(query):
    # Load the vector store and create the retriever
    vector_store = Chroma(persist_directory = db_path, embedding_function = embedding)
    retriever = vector_store.as_retriever(
        search_type="similarity_score_threshold",
        search_kwargs={"k": 15, "score_threshold": 0.4},
    )


    document_chain = create_stuff_documents_chain(cached_llm, raw_prompt)
    retrieval_chain = create_retrieval_chain(
        retriever=retriever,
        combine_docs_chain=document_chain,
    )
    # Get the answer and update chat history
    result = retrieval_chain.invoke({"input": query})    
    return {"answer": result["answer"]}


In [42]:
raw_prompt = PromptTemplate.from_template(
    """
    <s>[INST] You are an intelligent assistant specialized in providing detailed information about cars, their specifications, and features. 
    Your job is to answer user queries accurately and concisely based on the provided context. 
    If the context does not contain the required information, respond with "I'm sorry, I couldn't find the information in the provided context." 

    Examples:
    1. User Query: "What is the top speed of the Audi Q4-e-tron?"
       Context: "the Speed of the Q4-e-tron model from the audi brand is 160."
       Answer: "The top speed of the Audi Q4-e-tron is 160 km/h."

    2. User Query: "Does the Audi Q4-e-tron have Cruise control?"
       Context: "the Q4-e-tron model from the audi brand has Cruise control."
       Answer: "Yes, the Audi Q4-e-tron has Cruise control."

    3. User Query: "which cars don't have Cruise control?"
       Context: ["the Q4-e-tron model from the audi brand has Cruise control.", 
                   "the full version of the TT-convertible model from the audi brand doesn't have Cruise control.",
                   "the insignia model from the Opel brand doesn't have Cruise control.", 
                   "the Cruise control of the full version of the 4C model from the alfaromeo brand is custom.", ...]
       Answer: " the cars that don't have Cruise control are: 
       1- The full version of the TT-convertible model from the Audi brand.
       2- The insignia model from the Opel brand."

    4. User Query: "name the Alfa Romeo 4c types."
       Context: ["The alfaromeo brand has a model called 4c, which is available as a full type in the years [2016, 2015]",
                   "The alfaromeo brand has a model called 4c, which is available as a different type in the years [2016, 2015]",
                   "The alfaromeo brand has a model called mito, which is available as a full type in the years [2016, 2015]", ...]
       Answer: "The types of the Alfa Romeo 4C are as follows: 1. Full version, 2. Different version."

    5. User Query: "which brand make astra model?"
        Context: "The opel brand has a model called astra, ..."
        Answer: "The Opel brand makes the Astra model."

    6. User Query: "name models of the audi"
        Answer: "Here are the models of the Audi brand:
                1- Q4-e-tron
                2- Q5-e-tron
                3- Q5
                4- TT-convertible
                5- TT-coupe"
        
    [/INST]</s>
    [INST] User Query: {input}
           Relevant Context: {context}
           Your Answer:
    [/INST]
"""
)

In [45]:
def conversation():
    print("Starting conversation...\n")
    # Start asking questions based on the PDF content
    while True:
        query = input("\nYou: ")
        if query.lower() == "exit":
            break

        response = chat_with_llm(query)
        print("\nBot:", response["answer"])


In [47]:
conversation()

Starting conversation...




You:  what is the power of q5



Bot: The Power of the ۴۰ version of the Q5-e-tron model from the audi brand is 313.



You:  what is the rear wheel of mito



Bot: The rear wheel of the full version of the mito model from the alfaromeo brand is 235/35R19.



You:  exit
