In [2]:
# Project Creator: Sim Wang

# If you want to see the code details, please kindly navigate to the src/ folder to review the code.
# I have imported all functions from my code here for your testing convenience

# If you are failing to run the code below, you might not have followed the guidelines in the README
# YOU MUST CREATE A CONDA ENVIRONMENT IN YOUR TERMINAL BEFORE YOU RUN ANYTHING IN THIS NOTEBOOK!!

import sys
import os

# Dynamically add the 'src' directory to the Python path
sys.path.append(os.path.join(os.getcwd(), 'src'))
!pip install -r requirements.txt


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
# This RAG chatbot can answer questions about 337 video games considered the best of all time.
# First, let's collect the information on these games into a CSV table for later reference.
# The function below produces the cleaned CSV file (best_video_games.csv), including each publisher's HQ country.
from get_table_HQ import get_best_games_table

get_best_games_table()

Scraping HQ addresses: 100%|██████████████████| 337/337 [00:32<00:00, 10.33it/s]

Saved 337 rows to best_video_games.csv





In [6]:
# Now that you have the info for all 337 games, you can come back to the CSV file later when you start asking the chatbot questions.
# Let's start building the Retrieval-Augmented Generation (RAG) chatbot!
# The first step is to extract the Wikipedia text descriptions of all 337 games (around 290k words).

from hyperlink_scraper import extract_game_links_and_genres, save_to_txt
# limit=337 asks for every game in the list; the result is then written to game.txt for the next step.
games = extract_game_links_and_genres(limit=337)
save_to_txt(games, output_file="game.txt")

Scraping games: 100%|███████████████████████| 337/337 [00:49<00:00,  6.76game/s]


In [None]:
# Now that we have game.txt, let's split it into chunks (700 words per chunk, with a 100-word overlap between consecutive chunks),
# extract keywords/tags for each chunk, store them in a local SQLite database, embed them as vectors, and save the vectors to a FAISS index file.

In [5]:
from rag import (
    detect_file_type,
    extract_text,
    split_text_into_chunks,
    generate_chinese_tags,
    create_db,
    file_to_db,
    load_all_from_db,
    build_faiss_index,
    load_vector_storage,
    search_similar_chunks,
    answer_question_with_prompt
)

In [8]:
# This step reads the game.txt file, splits its content into chunks, and extracts keywords/tags for each chunk using KeyBERT.
# Because it uses KeyBERT, it may take around 30 minutes to complete.
# Other keyword-extraction tools would be faster, but KeyBERT is very accurate and well suited to multilingual tasks.
# NOTE(review): re-running this cell may insert the same chunks again — confirm how file_to_db handles existing rows.

file_to_db('game.txt')

Processing file: game.txt
Text split into 19018 chunks


Extracting tags: 100%|████████████████████| 19018/19018 [27:30<00:00, 11.52it/s]
Saving to DB: 100%|███████████████████| 19018/19018 [00:00<00:00, 409287.52it/s]


Mission Complete! Data has been inserted with 19018 chunks


In [6]:
# YOU MUST HAVE YOUR OWN VALID OPENAI API KEY IN config.py TO RUN THIS STEP!

# Now we can embed all the content stored in our database as vectors and save them to a FAISS index file.
# This takes around 2 minutes to complete.
docs = load_all_from_db()
build_faiss_index(docs)

Building document list: 100%|█████████| 19018/19018 [00:00<00:00, 107086.98it/s]


Vectorizing contents...
Vector index has been saved to vector_index


In [None]:
# We’re all set to start chatting with the RAG chatbot!
# Make sure you’ve added your own valid OpenAI API key in config.py before running this.

# This project uses a Streamlit interface where you can type in any question
# related to the 337 games listed in the HTML table.

# You can use the best_video_games.csv we generated earlier as a helpful reference
# while asking your questions.

# Running this block will open a new browser tab.
# In that new tab, you can interact with the chatbot through the Streamlit UI.

# If you’re not sure what to ask, just check out the best_video_games.csv for ideas!


import subprocess
import os
import sys

# Launch the Streamlit UI and block until the app is stopped.
# - An argument list (no shell=True) avoids depending on a POSIX shell's `echo` and pipe,
#   so the same cell also works on Windows.
# - `sys.executable -m streamlit` runs Streamlit from the kernel's own environment instead of
#   whichever `streamlit` executable happens to be first on PATH.
# - input="\n" feeds the same single newline that `echo '' |` used to pipe into stdin
#   (presumably to skip Streamlit's interactive first-run prompt — confirm if this is changed).
app_path = os.path.join("src", "app.py")

subprocess.run(
    [sys.executable, "-m", "streamlit", "run", app_path],
    input="\n",
    text=True,
)


#!!! If this section doesn’t work for you !!!###
# (1) Open your terminal
# (2) Move to the src folder, where app.py is located
# (3) Enter `streamlit run app.py` in your terminal
# Then it should work!


# This is the end of my RAG Chatbot Project
# Thank you very much for testing my code!
# Project Creator: Sim Wang


  You can now view your Streamlit app in your browser.

  Local URL: http://localhost:8501
  Network URL: http://192.168.1.69:8501




>> from langchain.vectorstores import FAISS

with new imports of:

>> from langchain_community.vectorstores import FAISS
You can use the langchain cli to **automatically** upgrade many imports. Please see documentation here <https://python.langchain.com/docs/versions/v0_2/>
  from langchain.vectorstores import FAISS
