## Install libraries

In [1]:
!pip install --upgrade gspread oauth2client
!pip install pinecone
!pip install --upgrade torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
!pip install sentence-transformers
!pip install streamlit
!pip install streamlit pyngrok
!pip install gradio pyngrok sentence-transformers pandas
!pip install dotenv



Collecting pinecone
  Downloading pinecone-7.2.0-py3-none-any.whl.metadata (9.5 kB)
Collecting pinecone-plugin-assistant<2.0.0,>=1.6.0 (from pinecone)
  Downloading pinecone_plugin_assistant-1.7.0-py3-none-any.whl.metadata (28 kB)
Collecting pinecone-plugin-interface<0.0.8,>=0.0.7 (from pinecone)
  Downloading pinecone_plugin_interface-0.0.7-py3-none-any.whl.metadata (1.2 kB)
Downloading pinecone-7.2.0-py3-none-any.whl (524 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m524.3/524.3 kB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pinecone_plugin_assistant-1.7.0-py3-none-any.whl (239 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m240.0/240.0 kB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pinecone_plugin_interface-0.0.7-py3-none-any.whl (6.2 kB)
Installing collected packages: pinecone-plugin-interface, pinecone-plugin-assistant, pinecone
Successfully installed pinecone-7.2.0 pinecone-plugin-assistant-1.7.0 pinecone-p

## Import libraries

In [2]:
import gspread
from oauth2client.service_account import ServiceAccountCredentials
from google.colab import files
from datetime import datetime
import pandas as pd
import numpy as np
import random
from sentence_transformers import SentenceTransformer, util
import os
import pinecone
from pinecone import Pinecone
import gradio as gr
from sklearn.metrics.pairwise import cosine_similarity
from dotenv import load_dotenv

## Authenticate and connect to Google Sheets

Load the `.env` file that contains the required credentials.

In [3]:
from google.colab import files
uploaded = files.upload()       # Manually upload your .env file here

# files.upload() returns a dict keyed by the uploaded file name(s). Load the file
# that was actually uploaded (e.g. '.env') instead of assuming it is called 'env';
# fall back to 'env' when nothing was uploaded so an earlier upload is still used.
env_file = next(iter(uploaded), 'env')
if not load_dotenv(env_file):
    # load_dotenv() reports False when it did not set any variable.
    print(f"Warning: no environment variables were loaded from '{env_file}'.")

# Path of the service-account JSON key and the two addresses the sheet is shared with.
service_account_file = os.getenv('GOOGLE_SERVICE_ACCOUNT')
email1 = os.getenv('EMAIL_1')
email2 = os.getenv('EMAIL_2')

Saving env to env


In [4]:

# Upload the service-account key. Note that the credentials below are read from
# the path stored in `service_account_file`, not from the `uploaded` dict.
uploaded = files.upload()  # Manually upload your service_account.json file here

# OAuth scopes requested for the service account.
scope = [
    "https://spreadsheets.google.com/feeds",
    "https://www.googleapis.com/auth/drive",
]
creds = ServiceAccountCredentials.from_json_keyfile_name(
    service_account_file,
    scope,
)
client = gspread.authorize(creds)


# Function to create a new worksheet for each day
def create_daily_sheet():
    """Return today's worksheet, creating it with a header row when it is missing.

    Looks up a worksheet titled with the current date (``YYYY-MM-DD``) in the
    module-level ``spreadsheet``. When gspread reports that no such worksheet
    exists, a worksheet with 100 rows and 5 columns is added and the column
    headers are appended as its first row.

    Returns:
        The existing or newly created worksheet.
    """
    sheet_title = datetime.now().strftime("%Y-%m-%d")

    try:
        daily = spreadsheet.worksheet(sheet_title)
    except gspread.exceptions.WorksheetNotFound:
        daily = spreadsheet.add_worksheet(title=sheet_title, rows="100", cols="5")
        daily.append_row(
            ["Project Name", "Project ID", "Employee Names", "Updates", "Blockers/Queries"]
        )
        print(f"New worksheet created for {sheet_title}.")
    else:
        print(f"Worksheet for {sheet_title} already exists.")

    return daily

# Reuse the tracker when it already exists: client.create() makes a brand-new
# spreadsheet on every call, so re-running this cell would pile up spreadsheets
# that all share the same title.
try:
    spreadsheet = client.open("Employee Updates Tracker")
    spreadsheet_was_created = False
except gspread.exceptions.SpreadsheetNotFound:
    spreadsheet = client.create("Employee Updates Tracker")
    spreadsheet_was_created = True
    # Share only on creation so collaborators are not re-invited on every run;
    # addresses missing from the environment are skipped.
    for address in (email2, email1):
        if address:
            spreadsheet.share(address, perm_type='user', role='writer')

daily_sheet = create_daily_sheet()

# Rename the first worksheet unless it already carries the expected title.
worksheet = spreadsheet.get_worksheet(0)
if worksheet.title != "Daily Updates":
    worksheet.update_title("Daily Updates")

spreadsheet_status = "created" if spreadsheet_was_created else "reused"
print(f"Spreadsheet {spreadsheet_status}: {spreadsheet.url}")


Saving scrum-call-assistant-5c4f244a5730.json to scrum-call-assistant-5c4f244a5730.json
New worksheet created for 2025-06-27.
Spreadsheet created: https://docs.google.com/spreadsheets/d/1f9LiIB5Kn7eZdezV8X4E3vTgnYJtwfWs56zO6qBGQBs


## Sample entries for the day

In [5]:
import random
from datetime import datetime, timedelta
import gspread
from oauth2client.service_account import ServiceAccountCredentials

# Today's date (YYYY-MM-DD); used below as the title of the daily worksheet.
today_date = datetime.now().strftime("%Y-%m-%d")

# Authenticate with Google Sheets using the service-account key file.
scope = [
    "https://spreadsheets.google.com/feeds",
    "https://www.googleapis.com/auth/drive",
]
creds = ServiceAccountCredentials.from_json_keyfile_name(
    service_account_file,
    scope,
)
client = gspread.authorize(creds)

# Open the main spreadsheet by its title.
spreadsheet = client.open("Employee Updates Tracker")

# Function to get or create the daily sheet
def get_or_create_daily_sheet():
    """Fetch the worksheet titled ``today_date``, creating it when absent.

    Reads the module-level ``spreadsheet`` and ``today_date``. A missing
    worksheet is added with 100 rows and 5 columns and receives the header row.

    Returns:
        The existing or newly created worksheet.
    """
    try:
        sheet = spreadsheet.worksheet(today_date)
    except gspread.exceptions.WorksheetNotFound:
        column_titles = ["Project Name", "Project ID", "Employee Names", "Updates", "Blockers/Queries"]
        sheet = spreadsheet.add_worksheet(title=today_date, rows="100", cols="5")
        sheet.append_row(column_titles)
        print(f"New worksheet created for {today_date}.")
        return sheet

    print(f"Worksheet for {today_date} already exists.")
    return sheet

worksheet = get_or_create_daily_sheet()

# Pools from which each sample row's fields are drawn at random.
projects = ["Data Pipeline Enhancement", "Web App Redesign", "API Integration", "Machine Learning Model", "Backend Optimization"]
employees = ["Alice", "Bob", "Charlie", "Diana", "Evan"]
blockers = ["None", "Waiting for data access", "Dependency on API update", "Awaiting feedback", "Issue with deployment"]
updates = [
    "Completed initial setup.",
    "Working on API endpoints.",
    "Refactoring code for efficiency.",
    "Testing the latest model.",
    "Resolving deployment issues.",
    "Code review completed.",
    "Fixed bugs reported in QA.",
    "Integrating third-party API.",
    "Researching optimization techniques.",
    "Finalizing documentation."
]

# Number of random sample rows written per run.
NUM_SAMPLE_ROWS = 10

# Build every row first, then write them with a single append_rows() request.
# Calling append_row() once per row costs one Sheets API call each, which is
# slower and uses up the API request quota faster.
sample_rows = []
for _ in range(NUM_SAMPLE_ROWS):
    project = random.choice(projects)
    employee = random.choice(employees)
    update = random.choice(updates)
    blocker = random.choice(blockers)
    project_id = f"PID-{random.randint(1000, 9999)}"
    sample_rows.append([project, project_id, employee, update, blocker])

worksheet.append_rows(sample_rows)
for row in sample_rows:
    print(f"Added entry: {row}")

print(f"Entries successfully added to the worksheet for {today_date}.")


Worksheet for 2025-06-27 already exists.
Added entry: ['Backend Optimization', 'PID-6994', 'Charlie', 'Resolving deployment issues.', 'Awaiting feedback']
Added entry: ['Backend Optimization', 'PID-3911', 'Alice', 'Resolving deployment issues.', 'Waiting for data access']
Added entry: ['API Integration', 'PID-6847', 'Diana', 'Working on API endpoints.', 'None']
Added entry: ['Web App Redesign', 'PID-5798', 'Alice', 'Integrating third-party API.', 'Issue with deployment']
Added entry: ['Web App Redesign', 'PID-4259', 'Bob', 'Finalizing documentation.', 'None']
Added entry: ['Backend Optimization', 'PID-9043', 'Bob', 'Integrating third-party API.', 'Issue with deployment']
Added entry: ['Machine Learning Model', 'PID-7205', 'Diana', 'Integrating third-party API.', 'None']
Added entry: ['Web App Redesign', 'PID-5027', 'Alice', 'Researching optimization techniques.', 'None']
Added entry: ['Backend Optimization', 'PID-5283', 'Bob', 'Researching optimization techniques.', 'Dependency on API 

## Script to fetch data

In [6]:
# The triple-quoted string below is never executed: it is a bare string literal
# that keeps, for reference, the setup needed to open an already-created
# spreadsheet. The cell relies on `spreadsheet` defined by an earlier cell.
'''
COMPLETE CODE FOR A FILE ALREADY CREATED

import gspread
from oauth2client.service_account import ServiceAccountCredentials
from datetime import datetime
import pandas as pd

# Authenticate with Google Sheets
scope = ["https://spreadsheets.google.com/feeds", "https://www.googleapis.com/auth/drive"]
creds = ServiceAccountCredentials.from_json_keyfile_name(service_account_file, scope)
client = gspread.authorize(creds)

# Open the main spreadsheet
spreadsheet = client.open("Employee Updates Tracker")
'''

# Function to extract data from today's worksheet
def extract_today_data():
    """Read today's worksheet into a list of records and a DataFrame.

    The worksheet is looked up in the module-level ``spreadsheet`` by the
    current date formatted as ``YYYY-MM-DD``.

    Returns:
        tuple: ``(records, data_df)`` where ``records`` is the result of
        ``get_all_records()`` and ``data_df`` is a DataFrame built from it;
        ``([], None)`` when no worksheet is titled with today's date.
    """
    sheet_title = datetime.now().strftime("%Y-%m-%d")

    try:
        sheet = spreadsheet.worksheet(sheet_title)
        rows = sheet.get_all_records()  # one dictionary per data row
    except gspread.exceptions.WorksheetNotFound:
        print(f"No worksheet found for {sheet_title}.")
        return [], None

    print(f"Data for {sheet_title} extracted successfully.")
    table = pd.DataFrame(rows)  # same records as a DataFrame
    return rows, table

# Pull today's rows both as a list of dicts and as a DataFrame.
today_data_list, today_data_df = extract_today_data()

print("Today's Data List:", today_data_list, sep="\n")

# The DataFrame is None when today's worksheet does not exist.
if today_data_df is not None:
    print("\nToday's Data Table:", today_data_df, sep="\n")


Data for 2025-06-27 extracted successfully.
Today's Data List:
[{'Project Name': 'Backend Optimization', 'Project ID': 'PID-6994', 'Employee Names': 'Charlie', 'Updates': 'Resolving deployment issues.', 'Blockers/Queries': 'Awaiting feedback'}, {'Project Name': 'Backend Optimization', 'Project ID': 'PID-3911', 'Employee Names': 'Alice', 'Updates': 'Resolving deployment issues.', 'Blockers/Queries': 'Waiting for data access'}, {'Project Name': 'API Integration', 'Project ID': 'PID-6847', 'Employee Names': 'Diana', 'Updates': 'Working on API endpoints.', 'Blockers/Queries': 'None'}, {'Project Name': 'Web App Redesign', 'Project ID': 'PID-5798', 'Employee Names': 'Alice', 'Updates': 'Integrating third-party API.', 'Blockers/Queries': 'Issue with deployment'}, {'Project Name': 'Web App Redesign', 'Project ID': 'PID-4259', 'Employee Names': 'Bob', 'Updates': 'Finalizing documentation.', 'Blockers/Queries': 'None'}, {'Project Name': 'Backend Optimization', 'Project ID': 'PID-9043', 'Employee

## Loading Embedding Model and Connecting to a Vector Database

In [7]:
# Load SBERT model for embeddings
model = SentenceTransformer('all-MiniLM-L6-v2')

pinecone_api_key = os.getenv("PINECONE_API_KEY")
pinecone_environment = os.getenv("PINECONE_ENVIRONMENT")
# Force the index name for this session; this overrides any PINECONE_INDEX_NAME
# value that was loaded from the environment file.
os.environ["PINECONE_INDEX_NAME"] = "employee-updates-tracker"

# Initialize the Pinecone client object directly, without using init().
# The getenv default is the same "employee-updates-tracker" name set just above.
index_name = os.getenv("PINECONE_INDEX_NAME", "employee-updates-tracker")
pc = Pinecone(api_key=pinecone_api_key, environment=pinecone_environment)

# Access the index (a single handle is enough).
index = pc.Index(index_name)

#Chunking
def chunk_text(text, max_chunk_size=100):
    """Split ``text`` into consecutive slices of at most ``max_chunk_size`` characters.

    Slicing is purely positional, so a chunk boundary may fall inside a word.
    The final chunk holds whatever remains and may be shorter than the others.

    Args:
        text: The string to split.
        max_chunk_size: Maximum number of characters per chunk.

    Returns:
        list[str]: The chunks in order; an empty list for an empty string.
    """
    pieces = []
    for start in range(0, len(text), max_chunk_size):
        stop = start + max_chunk_size
        pieces.append(text[start:stop])
    return pieces

#Processing and embedding
def embed_daily_data(data_df, batch_size=100):
    """Embed every row of the daily updates table and upsert the vectors to Pinecone.

    Each row is rendered as ``"Project: ... | Update: ... | Blockers: ..."``,
    split with ``chunk_text`` into pieces of at most 100 characters, and each
    piece is embedded with the module-level ``model``. Vector IDs keep the
    ``"<Project ID>_<chunk text>"`` form, so the project ID can be recovered
    with ``id.split("_")[0]``.

    Args:
        data_df: DataFrame with 'Project Name', 'Project ID', 'Updates' and
            'Blockers/Queries' columns. ``None`` or an empty frame is a no-op.
        batch_size: Maximum number of vectors sent per upsert request.

    Returns:
        None. Prints how many vectors were upserted.
    """
    # Nothing to embed: avoid calling .iterrows() on None or upserting an empty list.
    if data_df is None or data_df.empty:
        print("0 vectors upserted to Pinecone index.")
        return

    vector_ids, texts, metadatas = [], [], []
    for _, row in data_df.iterrows():
        combined_text = f"Project: {row['Project Name']} | Update: {row['Updates']} | Blockers: {row['Blockers/Queries']}"
        for chunk in chunk_text(combined_text, max_chunk_size=100):
            vector_ids.append(f"{row['Project ID']}_{chunk}")
            texts.append(chunk)
            # Store the source text so queries using include_metadata=True get it back.
            metadatas.append({"project_id": str(row['Project ID']), "text": chunk})

    if not texts:
        print("0 vectors upserted to Pinecone index.")
        return

    # Encode all chunks in one call instead of one model invocation per chunk.
    embeddings = model.encode(texts)
    index_vectors = [
        {"id": vector_id, "values": embedding.tolist(), "metadata": metadata}
        for vector_id, embedding, metadata in zip(vector_ids, embeddings, metadatas)
    ]

    # Send the vectors in bounded batches so a large table does not become one oversized request.
    step = max(1, int(batch_size))
    for start in range(0, len(index_vectors), step):
        index.upsert(vectors=index_vectors[start:start + step])

    print(f"{len(index_vectors)} vectors upserted to Pinecone index.")

def query_employee_updates(query, top_k=10):
    """Return the rows of ``today_data_df`` whose project matches a Pinecone query.

    The query is split with ``chunk_text``; each chunk is embedded with the
    module-level ``model`` and searched in the module-level ``index``. The
    project ID is recovered from every match ID (the part before the first
    ``"_"``) and mapped back to rows of ``today_data_df``.

    Args:
        query: Free-text query.
        top_k: Number of matches requested from Pinecone per query chunk.

    Returns:
        pandas.DataFrame: The matching rows in match order, each source row at
        most once; an empty DataFrame when nothing matches or no data is loaded.
    """
    if today_data_df is None or today_data_df.empty:
        return pd.DataFrame()

    matches = []
    for chunk in chunk_text(query, max_chunk_size=100):
        embedding = model.encode(chunk)
        pinecone_results = index.query(vector=embedding.tolist(), top_k=top_k, include_metadata=True)
        matches.extend(pinecone_results['matches'])

    # Map each project ID to the positions of its rows once, instead of
    # re-filtering the whole frame for every match.
    positions_by_project = {}
    for position, known_id in enumerate(today_data_df['Project ID'].astype(str)):
        positions_by_project.setdefault(known_id, []).append(position)

    # De-duplicate by row position rather than DataFrame.drop_duplicates():
    # that method hashes cell values and raises on unhashable cells, such as
    # the numpy arrays stored in an 'embedding' column.
    selected_positions = []
    seen_positions = set()
    for match in matches:
        print("Match found:", match)  # Debug to see each match
        project_id = match['id'].split("_")[0]
        for position in positions_by_project.get(project_id, ()):
            if position not in seen_positions:
                seen_positions.add(position)
                selected_positions.append(position)

    if not selected_positions:
        return pd.DataFrame()
    return today_data_df.iloc[selected_positions]

# Push today's rows into the vector index before querying it.
embed_daily_data(today_data_df)

query_text = "Issue with deployment"
print("\nRunning query:", query_text)

relevant_data = query_employee_updates(query_text)

# Report the matching rows, or say so when the query matched nothing.
if relevant_data.empty:
    print("No relevant data found.")
else:
    print("\nRelevant Data:")
    print(relevant_data)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

14 vectors upserted to Pinecone index.

Running query: Issue with deployment
Match found: {'id': 'PID-3328_deployment', 'score': 0.8172, 'values': []}
Match found: {'id': 'PID-1681_Project: Backend Optimization | Update: Resolving deployment '
       'issues. | Blockers: Issue with deployme',
 'score': 0.57077986,
 'values': []}
Match found: {'id': 'PID-9142_Project: Backend Optimization | Update: Code review '
       'completed. | Blockers: Issue with deployment',
 'score': 0.532700837,
 'values': []}
Match found: {'id': 'PID-9945_Project: Backend Optimization | Update: Code review '
       'completed. | Blockers: Issue with deployment',
 'score': 0.532700837,
 'values': []}
Match found: {'id': 'PID-6076_Project: Backend Optimization | Update: Code review '
       'completed. | Blockers: Issue with deployment',
 'score': 0.532700837,
 'values': []}
Match found: {'id': 'PID-4772_Project: Backend Optimization | Update: Code review '
       'completed. | Blockers: Issue with deployment',

## Integrating Semantic Search and Gradio UI

In [8]:
from sentence_transformers import SentenceTransformer
import numpy as np

# Sentence-embedding model used for the in-memory similarity search.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Store one embedding per row, computed from the 'Updates' text, unless the
# DataFrame already has an 'embedding' column.
if 'embedding' in today_data_df.columns:
    print("Embeddings already exist in the DataFrame.")
else:
    today_data_df['embedding'] = today_data_df['Updates'].apply(model.encode)


In [9]:
from sklearn.metrics.pairwise import cosine_similarity

# Function to embed a query and find relevant data
def get_relevant_data(query_text, top_k=3):
    """Return the rows of ``today_data_df`` most similar to ``query_text``.

    The query is embedded with the module-level ``model`` and compared, by
    cosine similarity, with the vectors in the frame's 'embedding' column.

    Args:
        query_text: Free-text query.
        top_k: Maximum number of rows to return, most similar first.

    Returns:
        pandas.DataFrame: Up to ``top_k`` rows ordered by decreasing similarity;
        an empty DataFrame when there is no data or ``top_k`` is not positive.
    """
    # np.vstack raises on an empty sequence, so handle "no rows" up front.
    if today_data_df is None:
        return pd.DataFrame()
    if today_data_df.empty or top_k <= 0:
        return today_data_df.iloc[0:0]

    # Embed the query using the loaded model, as a single-row 2-D array
    query_embedding = model.encode(query_text).reshape(1, -1)

    # Calculate cosine similarity between the query and each row embedding
    df_embeddings = np.vstack(today_data_df['embedding'].values)
    similarities = cosine_similarity(query_embedding, df_embeddings).flatten()

    # Sort ascending, reverse to descending, then keep the first top_k positions.
    # Slicing after the reversal also behaves correctly for any positive top_k,
    # including values larger than the number of rows.
    top_indices = similarities.argsort()[::-1][:top_k]
    relevant_rows = today_data_df.iloc[top_indices]
    return relevant_rows


In [10]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import gradio as gr
# The triple-quoted string below is never executed: it is a bare string literal
# holding a hard-coded sample DataFrame that could stand in for `today_data_df`.
'''
# Define sample data
data = {
    "Project Name": [
        "API Integration", "Machine Learning Model", "Web App Redesign",
        "Backend Optimization", "Data Pipeline Enhancement"
    ],
    "Project ID": [
        "PID-4741", "PID-7014", "PID-6011", "PID-3843", "PID-3015"
    ],
    "Employee Names": [
        "Alice", "Bob", "Evan", "Diana", "Charlie"
    ],
    "Updates": [
        "Resolving deployment issues.", "Researching optimization techniques.",
        "Fixed bugs reported in QA.", "Completed initial setup.",
        "Integrating third-party API."
    ],
    "Blockers/Queries": [
        "Dependency on API update", "None", "Awaiting feedback",
        "Dependency on API update", "None"
    ]
}

# Create the DataFrame
today_data_df = pd.DataFrame(data)
'''
def chatbot_response(history, user_input):
    """Answer a chat message with the most relevant employee updates.

    Looks up rows with ``get_relevant_data`` and formats each one as a small
    Markdown block, or replies "No relevant data found." when nothing matches.

    Args:
        history: Current chat history as a list of ``[user_message,
            bot_message]`` pairs; ``None`` is treated as an empty history.
        user_input: The message typed by the user.

    Returns:
        tuple: ``(history, history)``, the updated history twice, because the
        UI wires this function to two outputs.
    """
    if history is None:
        history = []

    relevant_data = get_relevant_data(user_input)

    if relevant_data.empty:
        bot_reply = "No relevant data found."
    else:
        entries = []
        for _, row in relevant_data.iterrows():
            entries.append(
                f"**Project Name**: {row['Project Name']}\n"
                f"**Project ID**: {row['Project ID']}\n"
                f"**Employee Name**: {row['Employee Names']}\n"
                f"**Update**: {row['Updates']}\n"
                f"**Blockers/Queries**: {row['Blockers/Queries']}"
            )
        # A blank line separates the entries.
        bot_reply = "\n\n".join(entries).strip()

    # In the pair format each history entry is one turn: [user_message, bot_message].
    # Appending ["user", text] and ["bot", reply] as separate entries would show
    # the literal words "user"/"bot" as the user's messages.
    history.append([user_input, bot_reply])

    return history, history

#Gradio interface
with gr.Blocks() as demo:
    # Page title, written as centered HTML inside a Markdown component.
    gr.Markdown("<h1 style='text-align: center;'>Employee Updates Chatbot</h1>")

    chatbot = gr.Chatbot()
    msg = gr.Textbox(
        placeholder="Type a message...",
        label="Enter your query:",
    )
    submit = gr.Button("Send")

    # Clicking "Send" passes the current history and the typed message to
    # chatbot_response and writes its results back to the chat window.
    submit.click(
        fn=chatbot_response,
        inputs=[chatbot, msg],
        outputs=[chatbot, chatbot],
    )

# share=True asks Gradio for a public URL in addition to the local one.
demo.launch(share=True)


  chatbot = gr.Chatbot()


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://a3398100977b66cf4e.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


