# LAB 1: TOKEN ESTIMATOR

# ========================================
# STEP 1: INSTALL NECESSARY PACKAGES
# ========================================

In [None]:
print("üì¶ Installing packages...")
!pip install -q streamlit pyngrok

# ========================================
# STEP 2: CREATE THE STREAMLIT APP
# ========================================

In [None]:
# Progress message only: this cell just defines the helper functions below.
# The file 'lab1_app.py' itself is written by the later "Write the app to a file" cell.
print("üìù Creating application file...")

# Define helper functions
def count_words(text):
    """Return how many whitespace-separated words ``text`` contains."""
    return len(text.split())

def count_characters(text):
    """Return the length of ``text``; every character counts, whitespace included."""
    total_characters = len(text)
    return total_characters

def estimate_tokens(text, chars_per_token=4):
    """Estimate the token count of ``text`` from its length.

    Applies the rule of thumb that one token is roughly ``chars_per_token``
    characters and rounds the result down to a whole number of tokens.

    Args:
        text: The text to measure.
        chars_per_token: Average number of characters per token. Defaults to
            4, the ratio this lab uses everywhere else.

    Returns:
        int: The estimated number of tokens (0 for empty text).

    Raises:
        ValueError: If ``chars_per_token`` is not positive.
    """
    if chars_per_token <= 0:
        raise ValueError("chars_per_token must be positive")
    # Floor division stays exact for integer ratios of any size; int() keeps
    # the return type an int when a float ratio such as 3.5 is supplied.
    return int(len(text) // chars_per_token)

def check_character_limit(text, limit=4000):
    """Return True when ``text`` has more than ``limit`` characters, else False."""
    exceeds_limit = limit < count_characters(text)
    return exceeds_limit

In [None]:
# Define the main function
def main():
    """Render the Token Estimator Streamlit page.

    Shows an introduction, lets the user upload a text file, and reports its
    word count, character count and estimated token count, followed by a few
    derived statistics. An error banner is shown when the file is longer than
    the character limit. Decoding problems and any other exception raised
    while analysing the file are reported on the page instead of propagating.
    """
    # Single source of truth for the limit used in the check and its messages
    # (same value as check_character_limit's default).
    char_limit = 4000

    st.set_page_config(page_title="Token Estimator", page_icon="üìä", layout="wide")

    st.title("üìä Token Estimation Tool")
    st.markdown("""
    ### Upload and analyze your text file
    This tool:
    - Calculates word count
    - Calculates character count
    - Estimates token count (1 token ‚âà 4 characters)
    - Warns for 4000+ characters
    """)

    uploaded_file = st.file_uploader("Choose a text file", type=['txt', 'md', 'csv'])

    if uploaded_file is not None:
        try:
            # getvalue() returns the whole payload regardless of the buffer's
            # current read position; 'utf-8-sig' decodes plain UTF-8 and also
            # drops a leading byte-order mark so it is not counted as text.
            content = uploaded_file.getvalue().decode('utf-8-sig')
            st.success(f"‚úÖ File '{uploaded_file.name}' uploaded successfully!")

            col1, col2, col3 = st.columns(3)

            word_count = count_words(content)
            char_count = count_characters(content)
            token_estimate = estimate_tokens(content)

            with col1:
                st.metric(label="üìù Word Count", value=word_count)
            with col2:
                st.metric(label="üî§ Character Count", value=char_count)
            with col3:
                st.metric(label="üéØ Estimated Tokens", value=token_estimate)

            if check_character_limit(content, char_limit):
                st.error(f"‚ö†Ô∏è WARNING: This file exceeds {char_limit} characters!")
                st.warning(f"Your file contains {char_count} characters (limit: {char_limit})")
            else:
                st.info(f"‚úì File is within safe limits ({char_count}/{char_limit} characters)")

            with st.expander("üìÑ File Content Preview"):
                st.text_area("Content", content, height=300, disabled=True)

            st.markdown("---")
            st.subheader("üìà Additional Statistics")

            col4, col5, col6 = st.columns(3)

            with col4:
                # Average over the words themselves; dividing the total
                # character count by the word count would also count the
                # whitespace between words.
                letters_in_words = sum(len(word) for word in content.split())
                avg_word_length = letters_in_words / word_count if word_count > 0 else 0
                st.metric("Avg Word Length", f"{avg_word_length:.2f} chars")
            with col5:
                # splitlines() reports 0 lines for an empty file and does not
                # count a phantom extra line after a trailing newline.
                lines = len(content.splitlines())
                st.metric("Line Count", lines)
            with col6:
                avg_tokens_per_line = token_estimate / lines if lines > 0 else 0
                st.metric("Tokens Per Line", f"{avg_tokens_per_line:.1f}")

        except UnicodeDecodeError:
            st.error("‚ùå Error: Could not read file. Please upload a text-based file.")
        except Exception as e:
            st.error(f"‚ùå An error occurred: {str(e)}")

    st.markdown("---")
    st.markdown("""
    ### ‚ÑπÔ∏è How it works?

    **Token Estimation Formula:**
    ```
    Estimated Tokens = Character Count / 4
    ```

    **Note:** This is a simple estimation. Actual token count may vary based on:
    - The tokenizer used by the LLM
    - Language and character encoding
    - Special characters and code blocks
    """)

In [None]:
# main() refers to the module-level name `st`, and no earlier Lab 1 cell
# imports it (only the generated lab1_app.py does), so calling main() here
# raised NameError. Import it before the call.
# NOTE(review): in a notebook kernel this runs main() outside `streamlit run`;
# the real app is the one launched from lab1_app.py in Step 4.
import streamlit as st

if __name__ == "__main__":
    main()

In [None]:
# Write the app to a file
import inspect

# Interpolating a function object into an f-string only produces its repr
# ("<function main at 0x...>"), which is not valid Python. inspect.getsource()
# returns the actual text of each definition, so the generated file can be run.
app_functions = [count_words, count_characters, estimate_tokens, check_character_limit, main]

app_code = "import streamlit as st\n\n\n"
app_code += "\n\n".join(
    inspect.getsource(function).rstrip() + "\n" for function in app_functions
)
app_code += '\n\nif __name__ == "__main__":\n    main()\n'

# The function sources contain non-ASCII characters, so pin the encoding
# instead of relying on the platform default.
with open('lab1_app.py', 'w', encoding='utf-8') as f:
    f.write(app_code)

print("‚úÖ Application file created!")

# ========================================
# STEP 3: NGROK SETUP
# ========================================

In [None]:
print("\nüîß Setting up Ngrok...")
print("‚ö†Ô∏è Get your free ngrok token here: https://dashboard.ngrok.com/get-started/your-authtoken\n")

from pyngrok import ngrok
import getpass

# Pasted tokens often carry stray spaces or a trailing newline; strip them and
# fail here with a clear message rather than later when the tunnel is opened.
ngrok_token = getpass.getpass('Enter Ngrok token: ').strip()
if not ngrok_token:
    raise ValueError("No ngrok token entered. Re-run this cell and paste your authtoken.")
ngrok.set_auth_token(ngrok_token)

# ========================================
# STEP 4: START STREAMLIT IN BACKGROUND
# ========================================

In [None]:
print("\nüöÄ Starting Streamlit...")

import os
import socket
import subprocess
import sys
import time

# Stop the server started by an earlier run of this cell so port 8501 is free.
previous_server = globals().get('streamlit_process')
if previous_server is not None and previous_server.poll() is None:
    previous_server.terminate()
    try:
        previous_server.wait(timeout=10)
    except subprocess.TimeoutExpired:
        previous_server.kill()
        previous_server.wait()

# If something else is already listening on the port, a new server could not
# bind to it and the tunnel would silently expose the other process instead.
try:
    socket.create_connection(('localhost', 8501), timeout=1).close()
except OSError:
    pass  # Nothing is listening: the port is free.
else:
    raise RuntimeError(
        "Port 8501 is already in use, probably by a Streamlit server from an "
        "earlier run. Stop it (for example with `!pkill -f streamlit`) and "
        "re-run this cell."
    )

# Launch through the kernel's own interpreter and keep the process handle so
# start-up failures can be detected and the server can be stopped later.
streamlit_process = subprocess.Popen(
    [sys.executable, '-m', 'streamlit', 'run', 'lab1_app.py',
     '--server.port', '8501', '--server.headless', 'true'],
    stdout=subprocess.DEVNULL,
    stderr=subprocess.DEVNULL,
)

# Wait until the server accepts connections rather than sleeping a fixed time.
startup_deadline = time.monotonic() + 60
while True:
    if streamlit_process.poll() is not None:
        raise RuntimeError(
            f"Streamlit exited during start-up (exit code {streamlit_process.returncode}). "
            "Check that lab1_app.py exists and is valid Python."
        )
    try:
        socket.create_connection(('localhost', 8501), timeout=1).close()
        break
    except OSError:
        if time.monotonic() > startup_deadline:
            streamlit_process.terminate()
            raise TimeoutError("Streamlit did not start listening on port 8501 within 60 seconds.")
        time.sleep(0.5)

# ========================================
# STEP 5: CREATE NGROK TUNNEL
# ========================================

In [None]:
print("üåê Creating Ngrok tunnel...\n")

# Current pyngrok releases return an NgrokTunnel object from connect(), while
# disconnect() expects the public URL string. Passing the object made the
# disconnect a silent no-op, so keep the URL itself. getattr() also covers
# old releases where connect() already returned a string.
tunnel = ngrok.connect(8501)
public_url = getattr(tunnel, 'public_url', tunnel)

print("=" * 70)
print(f"\n‚úÖ SUCCESS! Your app is ready:\n")
print(f"   üîó {public_url}\n")
print("=" * 70)
print("\nüìå IMPORTANT:")
print("   1. Click the link above")
print("   2. Upload a text file")
print("   3. See the results!")
print("\n‚ö†Ô∏è Keep this cell running")

import threading

try:
    # Block forever; interrupting the cell is the intended way to stop.
    event = threading.Event()
    event.wait()
except KeyboardInterrupt:
    print("\nüëã Shutting down...")
finally:
    # Close the tunnel however the wait ends, not only on KeyboardInterrupt.
    ngrok.disconnect(public_url)

# LAB 2A: SIMPLE RAG - TF-IDF & COSINE SIMILARITY 

# ========================================
# STEP 1: INSTALL REQUIRED PACKAGES
# ========================================

In [None]:
print("üì¶ Installing packages...")
!pip install -q streamlit pyngrok

# ========================================
# STEP 2: CREATE THE STREAMLIT APP
# ========================================

In [None]:
# Progress message only: the code below just defines SimpleTFIDF and main().
# The file 'lab2a_app.py' itself is written by the later "Write the app to a file" cell.
print("üìù Creating application file...")

# Import required libraries
import streamlit as st
import math
from collections import Counter

class SimpleTFIDF:
    """TF-IDF implementation (without external libraries).

    Typical use: call ``add_document`` for every document, then
    ``build_vectors`` once, then ``find_most_relevant`` for each query.

    The whole class lives in a single cell: a cell that starts with an
    indented ``def`` cannot be executed on its own.

    Attributes are set in ``__init__``:
        documents: Document texts, in insertion order.
        document_names: Display names, parallel to ``documents``.
        vocabulary: Set of all terms seen by the last ``calculate_idf``.
        idf_scores: Mapping term -> IDF weight.
        document_vectors: TF-IDF vectors (dicts), parallel to ``documents``.
    """

    def __init__(self):
        self.documents = []
        self.document_names = []
        self.vocabulary = set()
        self.idf_scores = {}
        self.document_vectors = []

    def add_document(self, text, name):
        """Add a document to the collection.

        The text is stored as given so it can be displayed with its original
        casing; ``tokenize`` lower-cases on its own, so scores are unaffected.
        Call ``build_vectors`` after adding documents to (re)index them.
        """
        self.documents.append(text)
        self.document_names.append(name)

    def tokenize(self, text):
        """Split ``text`` into lower-case terms.

        The characters ``. , ! ? ; : " ( ) [ ] { }`` are treated as
        separators, then the text is split on whitespace.
        """
        for char in '.,!?;:\"()[]{}':
            text = text.replace(char, ' ')
        return text.lower().split()

    def calculate_tf(self, document):
        """Return the term frequencies of ``document``.

        Returns:
            dict: term -> occurrences / total number of terms. Empty when the
            document has no terms.
        """
        tokens = self.tokenize(document)
        token_count = len(tokens)
        return {term: count / token_count for term, count in Counter(tokens).items()}

    def calculate_idf(self):
        """Recompute the vocabulary and IDF weights from ``self.documents``.

        Uses the smoothed form ``log((1 + N) / (1 + df)) + 1``, which is
        always positive. The unsmoothed ``log(N / (1 + df))`` is zero for a
        term found in all but one document and negative for a term found in
        every document, so in a two-document collection every distinguishing
        term had weight 0 and no query could match.
        """
        num_documents = len(self.documents)
        # Start from scratch so terms of removed or replaced documents do not linger.
        self.vocabulary = set()
        self.idf_scores = {}
        df = Counter()
        for document in self.documents:
            df.update(set(self.tokenize(document)))
        self.vocabulary.update(df)
        for term, document_frequency in df.items():
            self.idf_scores[term] = math.log((1 + num_documents) / (1 + document_frequency)) + 1

    def calculate_tfidf_vector(self, document):
        """Return the TF-IDF vector of ``document`` over the current vocabulary.

        Returns:
            dict: one entry per vocabulary term (0 for absent terms). Terms
            outside the vocabulary are ignored.
        """
        tf = self.calculate_tf(document)
        return {
            term: tf.get(term, 0) * self.idf_scores.get(term, 0)
            for term in self.vocabulary
        }

    def build_vectors(self):
        """Build TF-IDF vectors for all documents.

        Safe to call repeatedly: previous vectors are replaced, not appended to.
        """
        self.calculate_idf()
        self.document_vectors = [
            self.calculate_tfidf_vector(document) for document in self.documents
        ]

    def cosine_similarity(self, vector1, vector2):
        """Return the cosine similarity of two term -> weight dicts.

        Returns 0.0 when either vector has zero magnitude. The result is
        clamped to [-1, 1] so floating-point rounding cannot yield values
        such as 1.0000000000000002.
        """
        # Only terms present in both vectors contribute, so walk the smaller one.
        if len(vector1) <= len(vector2):
            smaller, larger = vector1, vector2
        else:
            smaller, larger = vector2, vector1
        dot_product = sum(weight * larger.get(term, 0) for term, weight in smaller.items())
        magnitude1 = math.sqrt(sum(value ** 2 for value in vector1.values()))
        magnitude2 = math.sqrt(sum(value ** 2 for value in vector2.values()))
        if magnitude1 == 0 or magnitude2 == 0:
            return 0.0
        return max(-1.0, min(1.0, dot_product / (magnitude1 * magnitude2)))

    def find_most_relevant(self, query):
        """Rank all indexed documents against ``query``.

        Returns:
            list: ``(name, similarity, index)`` tuples sorted by descending
            similarity; ``index`` points into ``self.documents``. Empty when
            no vectors have been built.
        """
        query_vector = self.calculate_tfidf_vector(query.lower())
        similarities = [
            (self.document_names[i], self.cosine_similarity(query_vector, doc_vector), i)
            for i, doc_vector in enumerate(self.document_vectors)
        ]
        similarities.sort(key=lambda item: item[1], reverse=True)
        return similarities

In [None]:
def main():
    """Render the Simple RAG Streamlit page.

    Step 1 lets the user upload text files and index them with SimpleTFIDF
    (kept in ``st.session_state`` so it survives reruns). Step 2 takes a query
    and shows the best-matching document followed by the full ranking.

    The whole function lives in a single cell: a cell that starts with
    indented statements cannot be executed on its own.
    """
    st.set_page_config(page_title="Simple RAG", page_icon="üîç", layout="wide")

    st.title("üîç Simple RAG: Document Search System")
    st.markdown("""
    ### Find the most relevant document for your query
    This system uses **TF-IDF** and **Cosine Similarity** to find the
    best matching document (without using an LLM!).
    """)

    if 'tfidf' not in st.session_state:
        st.session_state.tfidf = SimpleTFIDF()
        st.session_state.documents_processed = False

    st.subheader("üìÅ Step 1: Upload Documents")
    uploaded_files = st.file_uploader("Upload multiple text files",
                                     type=['txt', 'md'], accept_multiple_files=True)

    if uploaded_files:
        if st.button("Process Documents"):
            st.session_state.tfidf = SimpleTFIDF()
            # Not "processed" again until at least one file has been indexed.
            st.session_state.documents_processed = False
            with st.spinner("Processing documents..."):
                for uploaded_file in uploaded_files:
                    try:
                        # getvalue() returns the whole payload regardless of
                        # the buffer's read position; 'utf-8-sig' also drops a
                        # leading byte-order mark.
                        content = uploaded_file.getvalue().decode('utf-8-sig')
                        st.session_state.tfidf.add_document(content, uploaded_file.name)
                    except Exception as e:
                        st.error(f"Error: {uploaded_file.name}: {str(e)}")
                # Count what was actually indexed; files that failed to decode
                # were reported above and must not be counted as processed.
                processed_count = len(st.session_state.tfidf.document_names)
                if processed_count > 0:
                    st.session_state.tfidf.build_vectors()
                    st.session_state.documents_processed = True

            if processed_count > 0:
                st.success(f"‚úÖ {processed_count} documents processed successfully!")

                st.subheader("üìä Document Summary")
                for i, name in enumerate(st.session_state.tfidf.document_names):
                    word_count = len(st.session_state.tfidf.tokenize(
                        st.session_state.tfidf.documents[i]))
                    st.write(f"- **{name}**: {word_count} words")

    if st.session_state.documents_processed:
        st.markdown("---")
        st.subheader("üîé Step 2: Enter Your Query")

        query = st.text_input("What are you looking for?",
                             placeholder="Example: Tell me about apples")

        if query:
            with st.spinner("Analyzing query..."):
                results = st.session_state.tfidf.find_most_relevant(query)

            st.subheader("üìà Results")

            if results:
                most_relevant = results[0]
                st.success(f"üéØ Most Relevant Document: **{most_relevant[0]}**")
                st.metric("Similarity Score", f"{most_relevant[1]:.4f}")

                doc_index = most_relevant[2]
                st.text_area("Document Content",
                           st.session_state.tfidf.documents[doc_index], height=200)

                st.markdown("---")
                st.subheader("üìä All Documents (Ranked)")

                for rank, (name, similarity, idx) in enumerate(results, 1):
                    with st.expander(f"#{rank} - {name} (Score: {similarity:.4f})"):
                        document_text = st.session_state.tfidf.documents[idx]
                        preview = document_text[:500]
                        # Only mark the preview as truncated when it really is.
                        if len(document_text) > 500:
                            preview += "..."
                        st.write(preview)

                st.markdown("---")
                st.subheader("üìâ Similarity Scores")
                for name, similarity, idx in results:
                    # st.progress rejects floats outside [0.0, 1.0]; rounding
                    # can push a cosine similarity marginally past 1.0.
                    bar_value = min(max(float(similarity), 0.0), 1.0)
                    st.progress(bar_value, text=f"{name}: {similarity:.4f}")
    else:
        st.info("üëÜ Please upload and process documents first.")

    st.markdown("---")
    st.markdown("""
    ### ‚ÑπÔ∏è How it Works

    **TF-IDF:**
    - **TF**: Frequency of the term in the document
    - **IDF**: Importance of the term across all documents
    - **TF-IDF**: Combined score of the two

    **Cosine Similarity:**
    ```
    similarity = (A ¬∑ B) / (||A|| √ó ||B||)
    ```
    0 = No similarity, 1 = Identical
    """)

In [None]:
# Calls main() when this code is executed as the top-level module.
# NOTE(review): a notebook kernel also runs cells with __name__ == "__main__",
# so executing this cell presumably renders the page inside the kernel rather
# than under `streamlit run` — confirm that is intended; the served app is the
# one started from lab2a_app.py in Step 4.
if __name__ == "__main__":
    main()

In [None]:
# Write the app to a file
import inspect

# Interpolating a class or function object into an f-string only produces its
# repr ("<class '__main__.SimpleTFIDF'>", "<function main at 0x...>"), which
# is not valid Python. Write the real source text instead.
try:
    tfidf_source = inspect.getsource(SimpleTFIDF)
except (OSError, TypeError):
    # inspect cannot always locate a class that was defined in a notebook
    # cell. Its methods are plain functions whose source is recoverable, so
    # rebuild the class from them; getsource() keeps their indentation, and
    # vars() preserves definition order.
    method_sources = [
        inspect.getsource(member).rstrip() + "\n"
        for member in vars(SimpleTFIDF).values()
        if inspect.isfunction(member)
    ]
    tfidf_source = (
        f"class {SimpleTFIDF.__name__}:\n"
        f"    {SimpleTFIDF.__doc__!r}\n\n"
        + "\n".join(method_sources)
    )

app_code = (
    "import streamlit as st\n"
    "import math\n"
    "from collections import Counter\n\n\n"
    + tfidf_source.rstrip() + "\n\n\n"
    + inspect.getsource(main).rstrip() + "\n\n\n"
    + 'if __name__ == "__main__":\n    main()\n'
)

# The sources contain non-ASCII characters, so pin the encoding instead of
# relying on the platform default.
with open('lab2a_app.py', 'w', encoding='utf-8') as f:
    f.write(app_code)

print("‚úÖ Application file created!")

# ========================================
# STEP 3: NGROK SETUP
# ========================================

In [None]:
print("\nüîß Setting up Ngrok...")
print("‚ö†Ô∏è Free token: https://dashboard.ngrok.com/get-started/your-authtoken\n")

from pyngrok import ngrok
import getpass

# Pasted tokens often carry stray spaces or a trailing newline; strip them and
# fail here with a clear message rather than later when the tunnel is opened.
ngrok_token = getpass.getpass('Enter your Ngrok token: ').strip()
if not ngrok_token:
    raise ValueError("No ngrok token entered. Re-run this cell and paste your authtoken.")
ngrok.set_auth_token(ngrok_token)

# ========================================
# STEP 4: START STREAMLIT
# ========================================

In [None]:
print("\nüöÄ Starting Streamlit...")

import os
import socket
import subprocess
import sys
import time

# Stop the server started by an earlier run of this cell so port 8501 is free.
previous_server = globals().get('streamlit_process')
if previous_server is not None and previous_server.poll() is None:
    previous_server.terminate()
    try:
        previous_server.wait(timeout=10)
    except subprocess.TimeoutExpired:
        previous_server.kill()
        previous_server.wait()

# If something else is already listening on the port, a new server could not
# bind to it and the tunnel would silently expose the other process instead.
try:
    socket.create_connection(('localhost', 8501), timeout=1).close()
except OSError:
    pass  # Nothing is listening: the port is free.
else:
    raise RuntimeError(
        "Port 8501 is already in use, probably by a Streamlit server from an "
        "earlier run. Stop it (for example with `!pkill -f streamlit`) and "
        "re-run this cell."
    )

# Launch through the kernel's own interpreter and keep the process handle so
# start-up failures can be detected and the server can be stopped later.
streamlit_process = subprocess.Popen(
    [sys.executable, '-m', 'streamlit', 'run', 'lab2a_app.py',
     '--server.port', '8501', '--server.headless', 'true'],
    stdout=subprocess.DEVNULL,
    stderr=subprocess.DEVNULL,
)

# Wait until the server accepts connections rather than sleeping a fixed time.
startup_deadline = time.monotonic() + 60
while True:
    if streamlit_process.poll() is not None:
        raise RuntimeError(
            f"Streamlit exited during start-up (exit code {streamlit_process.returncode}). "
            "Check that lab2a_app.py exists and is valid Python."
        )
    try:
        socket.create_connection(('localhost', 8501), timeout=1).close()
        break
    except OSError:
        if time.monotonic() > startup_deadline:
            streamlit_process.terminate()
            raise TimeoutError("Streamlit did not start listening on port 8501 within 60 seconds.")
        time.sleep(0.5)

# ========================================
# STEP 5: CREATE TUNNEL
# ========================================

In [None]:
print("üåê Creating Ngrok tunnel...\n")

# Current pyngrok releases return an NgrokTunnel object from connect(), while
# disconnect() expects the public URL string. Passing the object made the
# disconnect a silent no-op, so keep the URL itself. getattr() also covers
# old releases where connect() already returned a string.
tunnel = ngrok.connect(8501)
public_url = getattr(tunnel, 'public_url', tunnel)

print("=" * 70)
print(f"\n‚úÖ SUCCESS! Your app is ready:\n    üîó {public_url}\n")
print("=" * 70)
print("\nüìå IMPORTANT:")
print("    1. Click the link above")
print("    2. Upload multiple text files")
print("    3. Click 'Process Documents'")
print("    4. Enter a query to find the best match!")
print("\n‚ö†Ô∏è Keep this cell running to maintain the connection\n")

import threading

try:
    # Block forever; interrupting the cell is the intended way to stop.
    event = threading.Event()
    event.wait()
except KeyboardInterrupt:
    print("\nüëã Shutting down...")
finally:
    # Close the tunnel however the wait ends, not only on KeyboardInterrupt.
    ngrok.disconnect(public_url)