<a href="https://colab.research.google.com/github/starss9/llmchatbot/blob/main/LLMgenai.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# Step 1: Installing Required Packages

# `-U` upgrades every listed package to its newest release and no versions are
# pinned, so pip may have to try several releases of a package to find a
# compatible set (the run output shows this for langchain-google-genai).
!pip install -U google-generativeai langchain langchain-community langchain-google-genai chromadb



# Step 2: Configuring Gemini API Key

import getpass
import os

import google.generativeai as genai

# SECURITY: never hard-code the API key in the notebook -- anything committed to
# a repository is effectively public, and a key that was ever committed should
# be revoked and replaced. Reuse GOOGLE_API_KEY if it is already set in the
# environment; otherwise prompt for it without echoing it to the cell output.
if not os.environ.get("GOOGLE_API_KEY"):
    os.environ["GOOGLE_API_KEY"] = getpass.getpass("Enter your Gemini API key: ")

# Later steps read the key back from os.environ["GOOGLE_API_KEY"].
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])


# Step 3: Uploading my CSV file / dataset on social media addiction.
# NOTE: the run output shows Colab saving a re-uploaded file under a suffixed
# name (e.g. "... (23).csv") when a file with the same name already exists.

from google.colab import files
uploaded = files.upload()  # the upload result is kept in `uploaded`


# Step 4: Reading and Preprocessing the CSV

import io

import pandas as pd
from langchain.docstore.document import Document

file_name = "Students Social Media Addiction.csv"

# Load the data BEFORE anything touches `df` (the original read the CSV only
# after the loop that iterates over it, which fails on a fresh runtime).
# Prefer the bytes returned by the upload step: Colab stores a re-uploaded file
# under a suffixed name, so reading `file_name` from disk can silently pick up
# a stale earlier copy.
try:
    csv_bytes = uploaded.get(file_name)
except NameError:  # the upload cell was skipped; fall back to the file on disk
    csv_bytes = None

if csv_bytes is not None:
    df = pd.read_csv(io.BytesIO(csv_bytes))
else:
    df = pd.read_csv(file_name)

# Header cells sometimes carry stray whitespace; normalise before lookup.
df.columns = df.columns.str.strip()


def _is_yes(value):
    """Return True when *value* reads as 'yes' (case-insensitive)."""
    return str(value).lower() == 'yes'


def _summarize_row(row):
    """Render one dataset row as an English paragraph for embedding.

    Raises KeyError if an expected column is missing and AttributeError if
    'Gender' is not a string (e.g. NaN for a missing value).
    """
    return (
        f"Student ID {row['Student_ID']} is a {row['Age']}-year-old {row['Gender'].lower()} student "
        f"at the {row['Academic_Level']} level from {row['Country']}. They use social media for an average of "
        f"{row['Avg_Daily_Usage_Hours']} hours per day, primarily on {row['Most_Used_Platform']}. "
        f"Social media {'does' if _is_yes(row['Affects_Academic_Performance']) else 'does not'} affect their academic performance. "
        f"They sleep about {row['Sleep_Hours_Per_Night']} hours per night and report a mental health score of {row['Mental_Health_Score']}. "
        f"Their relationship status is '{row['Relationship_Status']}', and they "
        f"{'have' if _is_yes(row['Conflicts_Over_Social_Media']) else 'have not'} experienced conflicts over social media use. "
        f"Their social media addiction score is {row['Addicted_Score']}."
    )


# Generating summary text per row
summaries = []
skipped_rows = 0

for index, row in df.iterrows():
    try:
        summaries.append(_summarize_row(row))
    except (KeyError, AttributeError, TypeError) as e:
        # Best effort: report and skip rows with missing or malformed data.
        print(f"Skipping row {index} due to error: {e}")
        skipped_rows += 1

documents = [Document(page_content=text) for text in summaries]
print(f"Prepared {len(documents)} documents.")
print(f" Skipped {skipped_rows} rows due to errors.")

if documents:
    print(" Sample:\n", documents[0].page_content[:500])
else:
    print("No documents were generated.")




# -----------------------------------
# 🔎 Step 5: Create Embeddings & Vector Store
# -----------------------------------
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain.vectorstores import Chroma

# Embedding client; the key is the one stored in the environment in Step 2.
embeddings = GoogleGenerativeAIEmbeddings(
    model="models/embedding-001",
    google_api_key=os.environ["GOOGLE_API_KEY"],
)

# Without documents there is nothing to index, so downstream steps get
# `retriever = None` and must check for it.
if not documents:
    print("Cannot create vector store as no documents were generated.")
    retriever = None
else:
    vectordb = Chroma.from_documents(documents, embeddings)
    retriever = vectordb.as_retriever()


# -----------------------------------
# 🤖 Step 6: Set Up LangChain Gemini Chat Model
# -----------------------------------
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.chains import RetrievalQA

llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash", temperature=0)

# Create the RetrievalQA chain. `retriever` is None when Step 5 had no
# documents to index; `qa_chain` is always bound so the code below can test it
# instead of hitting a NameError.
qa_chain = None
if retriever is not None:
    qa_chain = RetrievalQA.from_chain_type(
        llm,
        retriever=retriever,
        return_source_documents=True,
    )
    print("✅ RetrievalQA chain created.")
else:
    print("RetrievalQA chain was not created because no retriever is available.")

prompt = "make 5 questions related to social media addiction from this dataset "

# Only query the model when the chain actually exists.
if qa_chain is not None:
    response = qa_chain.invoke({'query': prompt})
    print(" Generated Questions:\n", response['result'])
else:
    print("Skipping question generation because the RetrievalQA chain is unavailable.")




Collecting langchain-google-genai
  Using cached langchain_google_genai-2.1.4-py3-none-any.whl.metadata (5.2 kB)
INFO: pip is looking at multiple versions of langchain-google-genai to determine which version is compatible with other requirements. This could take a while.
  Using cached langchain_google_genai-2.1.3-py3-none-any.whl.metadata (4.7 kB)
  Using cached langchain_google_genai-2.1.2-py3-none-any.whl.metadata (4.7 kB)
  Using cached langchain_google_genai-2.1.1-py3-none-any.whl.metadata (4.7 kB)
  Using cached langchain_google_genai-2.1.0-py3-none-any.whl.metadata (3.6 kB)
  Using cached langchain_google_genai-2.0.11-py3-none-any.whl.metadata (3.6 kB)


Saving Students Social Media Addiction.csv to Students Social Media Addiction (23).csv
Prepared 705 documents.
 Skipped 0 rows due to errors.
🔹 Sample:
 Student ID 1 is a 19-year-old female student at the Undergraduate level from Bangladesh. They use social media for an average of 5.2 hours per day, primarily on Instagram. Social media does affect their academic performance. They sleep about 6.5 hours per night and report a mental health score of 6. Their relationship status is 'In Relationship', and they have not experienced conflicts over social media use. Their social media addiction score is 8.
✅ RetrievalQA chain created.
 Generated Questions:
 Here are five questions related to social media addiction that can be answered using the provided dataset:

1. What is the difference in average daily social media usage between Student ID 574 and Student ID 125?

2. How does the reported mental health score correlate with the social media addiction score for the two students?

3. Does a hi