diff --git a/.github/workflows/run_tests.yml b/.github/workflows/run_tests.yml
index 07062257a3..06c1c6f6ea 100644
--- a/.github/workflows/run_tests.yml
+++ b/.github/workflows/run_tests.yml
@@ -26,7 +26,6 @@ jobs:
         with:
           args: --fix-only --exit-non-zero-on-fix
         continue-on-error: true
-
       - name: Fail Workflow if Ruff Fix Failed
         if: steps.ruff_fix.outcome == 'failure'
         run: |
@@ -120,3 +119,28 @@ jobs:
           cache: "poetry"
       - name: Run setup.py build
         run: python setup.py build
+
+  integration_test:
+    name: Run Integration Tests
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["3.9"]
+    steps:
+      - uses: actions/checkout@v4
+      - name: Set up python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+          cache: "pip"
+      - run: pip install -r requirements.txt
+      - run: pip install -r requirements-dev.txt
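+      # openai stays below 1.0 here, presumably because dspy.OpenAI targets the
+      # legacy pre-1.0 client API rather than the newer openai>=1.0 interface.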
+      - run: pip install openai==0.28.1
+      - name: Add directory to Python path
+        run: echo "PYTHONPATH=$(python -c 'import sys; print(":".join(sys.path))'):$(pwd)" >> $GITHUB_ENV
+      - name: Set up cache directory
+        run: |
+          cd ./cache
+          echo "DSP_NOTEBOOK_CACHEDIR=$(pwd)" >> $GITHUB_ENV
+      - name: Run tests with pytest
+        run: pytest -c tests_integration/pytest.ini tests_integration/
diff --git a/pyproject.toml b/pyproject.toml
index 7557fdce05..8bf893bdfc 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -284,6 +284,8 @@ ignore = [
     "E731",
     # Sometimes we need List and Tuple
     "UP006",
+    # Allow assert statements (needed by the tests)
+    "S101",
 ]
 
 # Allow fix for all enabled rules (when `--fix`) is provided.
diff --git a/requirements-dev.txt b/requirements-dev.txt
new file mode 100644
index 0000000000..d769cde648
--- /dev/null
+++ b/requirements-dev.txt
@@ -0,0 +1,8 @@
+black==24.2.0
+pre-commit==3.7.0
+pytest==8.2.1
+pytest-env==1.1.3
+pytest-mock==3.12.0
+ruff==0.3.0
+torch==2.2.1
+transformers==4.38.2
diff --git a/tests/pytest.ini b/tests_integration/pytest.ini
similarity index 58%
rename from tests/pytest.ini
rename to tests_integration/pytest.ini
index c24fe5bb9e..250e022ee5 100644
--- a/tests/pytest.ini
+++ b/tests_integration/pytest.ini
@@ -1,3 +1,6 @@
 [pytest]
 filterwarnings =
     ignore::DeprecationWarning
+
+env =
+    DSP_NOTEBOOK_CACHEDIR=./cache
diff --git a/tests_integration/test_intro.py b/tests_integration/test_intro.py
new file mode 100644
index 0000000000..218f5ebac9
--- /dev/null
+++ b/tests_integration/test_intro.py
@@ -0,0 +1,357 @@
+import dspy
+
+
+class TestIntroIntegration:
+    def test_dspy_workflow(self) -> None:
+        self.setup_dspy()
+
+        dev_example, dev_set, training_set = self.assert_dataset_loading()
+
+        self.assert_basic_qa(dev_example)
+
+        self.assert_retrieval(dev_example)
+
+        self.assert_compilation(dev_set, training_set)
+
+    def assert_compilation(self, devset, trainset) -> None:
+        class GenerateAnswer(dspy.Signature):
+            """Answer questions with short factoid answers."""
+
+            context = dspy.InputField(desc="may contain relevant facts")
+            question = dspy.InputField()
+            answer = dspy.OutputField(desc="often between 1 and 5 words")
+
+        class RAG(dspy.Module):
+            def __init__(self, num_passages=3):
+                super().__init__()
+
+                self.retrieve = dspy.Retrieve(k=num_passages)
+                self.generate_answer = dspy.ChainOfThought(GenerateAnswer)
+
+            def forward(self, question):
+                context = self.retrieve(question).passages
+                prediction = self.generate_answer(context=context, question=question)
+                return dspy.Prediction(context=context, answer=prediction.answer)
+
+        from dspy.teleprompt import BootstrapFewShot
+
+        # Validation logic: check that the predicted answer is correct.
+        # Also check that the retrieved context actually contains that answer.
+        def validate_context_and_answer(example, pred, trace=None):  # noqa
+            answer_em = dspy.evaluate.answer_exact_match(example, pred)
+            answer_pm = dspy.evaluate.answer_passage_match(example, pred)
+            return answer_em and answer_pm
+
+        # Set up a basic teleprompter, which will compile our RAG program.
+        teleprompter = BootstrapFewShot(metric=validate_context_and_answer)
+        # Compile the RAG model
+        compiled_rag = teleprompter.compile(RAG(), trainset=trainset)
+
+        # Test the compiled RAG model with a question
+        my_question = "What castle did David Gregory inherit?"
+        pred = compiled_rag(my_question)
+
+        # Assertions to verify the compiled RAG model
+        assert f"Question: {my_question}" == "Question: What castle did David Gregory inherit?"
+        assert f"Predicted Answer: {pred.answer}" == "Predicted Answer: Kinnairdy Castle"
+        assert (
+            f"Retrieved Contexts (truncated): {[c[:10] + '...' for c in pred.context]}"
+            == "Retrieved Contexts (truncated): ['David Greg...', 'Gregory Ta...', 'David Greg...']"
+        )
+
+        # Verify compiled model's parameters
+        named_predictors_dict = {}
+        for name, parameter in compiled_rag.named_predictors():
+            named_predictors_dict[name] = {
+                "context": parameter.demos[0].context,
+                "answer": parameter.demos[0].answer,
+                "question": parameter.demos[0].question,
+                "rationale": parameter.demos[0].rationale,
+            }
+
+        assert named_predictors_dict == {
+            "generate_answer": {
+                "context": [
+                    "Tae Kwon Do Times | Tae Kwon Do Times is a magazine devoted to the martial art of "
+                    "taekwondo, and is published in the United States of America. While the title "
+                    "suggests that it focuses on taekwondo exclusively, the magazine also covers other "
+                    'Korean martial arts. "Tae Kwon Do Times" has published articles by a wide range of '
+                    "authors, including He-Young Kimm, Thomas Kurz, Scott Shaw, and Mark Van "
+                    "Schuyver.",
+                    "Kwon Tae-man | Kwon Tae-man (born 1941) was an early Korean hapkido "
+                    "practitioner and a pioneer of the art, first in Korea and then in the "
+                    "United States. He formed one of the earliest dojang's for hapkido in "
+                    "the United States in Torrance, California, and has been featured in "
+                    "many magazine articles promoting the art.",
+                    "Hee Il Cho | Cho Hee Il (born October 13, 1940) is a prominent Korean-American "
+                    'master of taekwondo, holding the rank of 9th "dan" in the martial art. He has '
+                    "written 11 martial art books, produced 70 martial art training videos, and has "
+                    "appeared on more than 70 martial arts magazine covers. Cho won several national "
+                    "and international competitions as a taekwondo competitor, and has appeared in "
+                    'several films, including "Fight to Win", "Best of the Best", "Bloodsport II", '
+                    'and "Bloodsport III". He founded the Action International Martial Arts '
+                    "Association (AIMAA) in 1980, and is its President. Cho is a member of both "
+                    "\"Black Belt\" magazine's Hall of Fame and \"Tae Kwon Do Times\" magazine's "
+                    "Hall of Fame.",
+                ],
+                "answer": "Tae Kwon Do Times",
+                "question": "Which magazine has published articles by Scott "
+                "Shaw, Tae Kwon Do Times or Southwest Art?",
+                "rationale": 'produce the answer. We know from the context that "Tae Kwon Do Times" is a '
+                "magazine that covers taekwondo and other Korean martial arts. It has published "
+                "articles by authors like Scott Shaw. On the other hand, there is no information "
+                "about Southwest Art magazine in the context.",
+            },
+        }
+
+        from dspy.evaluate.evaluate import Evaluate
+
+        # Set up the evaluation function
+        evaluate_on_hotpotqa = Evaluate(devset=devset, num_threads=1, display_progress=True, display_table=5)
+        # Evaluate the compiled RAG program with the exact match metric
+        metric = dspy.evaluate.answer_exact_match
+        evaluate_on_hotpotqa(compiled_rag, metric=metric)
+
+        def gold_passages_retrieved(example, pred, trace=None):  # noqa
+            gold_titles = set(map(dspy.evaluate.normalize_text, example["gold_titles"]))
+            found_titles = set(map(dspy.evaluate.normalize_text, [c.split(" | ")[0] for c in pred.context]))
+            return gold_titles.issubset(found_titles)
+
+        compiled_rag_retrieval_score = evaluate_on_hotpotqa(compiled_rag, metric=gold_passages_retrieved)
+
+        class GenerateSearchQuery(dspy.Signature):
+            """Write a simple search query that will help answer a complex question."""
+
+            context = dspy.InputField(desc="may contain relevant facts")
+            question = dspy.InputField()
+            query = dspy.OutputField()
+
+        from dsp.utils import deduplicate
+
+        class SimplifiedBaleen(dspy.Module):
+            def __init__(self, passages_per_hop=3, max_hops=2):
+                super().__init__()
+
+                self.generate_query = [dspy.ChainOfThought(GenerateSearchQuery) for _ in range(max_hops)]
+                self.retrieve = dspy.Retrieve(k=passages_per_hop)
+                self.generate_answer = dspy.ChainOfThought(GenerateAnswer)
+                self.max_hops = max_hops
+
+            def forward(self, question):
+                context = []
+
+                for hop in range(self.max_hops):
+                    query = self.generate_query[hop](context=context, question=question).query
+                    passages = self.retrieve(query).passages
+                    context = deduplicate(context + passages)
+
+                pred = self.generate_answer(context=context, question=question)
+                return dspy.Prediction(context=context, answer=pred.answer)
+
+        # Test the SimplifiedBaleen model with a question
+        my_question = "How many storeys are in the castle that David Gregory inherited?"
+        uncompiled_baleen = SimplifiedBaleen()
+        pred = uncompiled_baleen(my_question)
+
+        # Assertions to verify the SimplifiedBaleen model
+        assert (
+            f"Question: {my_question}" == "Question: How many storeys are in the castle that David Gregory inherited?"
+        )
+        assert f"Predicted Answer: {pred.answer}" == "Predicted Answer: five"
+
+        # Retrieved Contexts (truncated)
+        assert [c[:10] for c in pred.context] == [
+            "David Greg",
+            "The Boleyn",
+            "Gregory of",
+            "Kinnairdy ",
+            "Kinnaird H",
+            "Kinnaird C",
+        ]
+
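+        # Multi-hop metric: on top of the answer and passage checks, require every
+        # hop query to stay under 100 characters, and reject queries that mostly
+        # repeat an earlier hop (>= 0.8 overlap via answer_exact_match_str).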
+        def validate_context_and_answer_and_hops(example, pred, trace=None):
+            if not dspy.evaluate.answer_exact_match(example, pred):
+                return False
+            if not dspy.evaluate.answer_passage_match(example, pred):
+                return False
+
+            hops = [example.question] + [outputs.query for *_, outputs in trace if "query" in outputs]
+
+            if max([len(h) for h in hops]) > 100:
+                return False
+            if any(
+                dspy.evaluate.answer_exact_match_str(hops[idx], hops[:idx], frac=0.8) for idx in range(2, len(hops))
+            ):
+                return False
+
+            return True
+
+        teleprompter = BootstrapFewShot(metric=validate_context_and_answer_and_hops)
+        compiled_baleen = teleprompter.compile(
+            SimplifiedBaleen(),
+            teacher=SimplifiedBaleen(passages_per_hop=2),
+            trainset=trainset,
+        )
+        uncompiled_baleen_retrieval_score = evaluate_on_hotpotqa(uncompiled_baleen, metric=gold_passages_retrieved)
+        compiled_baleen_retrieval_score = evaluate_on_hotpotqa(compiled_baleen, metric=gold_passages_retrieved)
+
+        # Assertions for the retrieval scores
+        assert f"## Retrieval Score for RAG: {compiled_rag_retrieval_score}" == "## Retrieval Score for RAG: 26.0"
+        assert (
+            f"## Retrieval Score for uncompiled Baleen: {uncompiled_baleen_retrieval_score}"
+            == "## Retrieval Score for uncompiled Baleen: 36.0"
+        )
+        assert (
+            f"## Retrieval Score for compiled Baleen: {compiled_baleen_retrieval_score}"
+            == "## Retrieval Score for compiled Baleen: 60.0"
+        )
+        assert compiled_baleen("How many storeys are in the castle that David Gregory inherited?") is not None
+
+    def assert_retrieval(self, dev_example) -> None:
+        retrieve = dspy.Retrieve(k=3)
+        top_k_passages = retrieve(dev_example.question).passages
+
+        # Assertions to verify the retrieval functionality
+        assert retrieve.k == 3
+        assert (
+            dev_example.question
+            == "What is the nationality of the chef and restaurateur featured in Restaurant: Impossible?"
+        )
+        assert (
+            top_k_passages[0]
+            == "Restaurant: Impossible | Restaurant: Impossible is an American reality television series, featuring "
+            "chef and restaurateur Robert Irvine, that aired on Food Network from 2011 to 2016."
+        )
+        assert (
+            top_k_passages[1]
+            == "Jean Joho | Jean Joho is a French-American chef and restaurateur. He is chef/proprietor of Everest in "
+            "Chicago (founded in 1986), Paris Club Bistro & Bar and Studio Paris in Chicago, The Eiffel Tower "
+            "Restaurant in Las Vegas, and Brasserie JO in Boston."
+        )
+        assert top_k_passages[2] == (
+            "List of Restaurant: Impossible episodes | This is the list of the episodes for "
+            'the American cooking and reality television series "Restaurant Impossible", '
+            "produced by Food Network. The premise of the series is that within two days and "
+            "on a budget of $10,000, celebrity chef Robert Irvine renovates a failing "
+            "American restaurant with the goal of helping to restore it to profitability and "
+            "prominence. Irvine is assisted by a designer (usually Taniya Nayak, "
+            "Cheryl Torrenueva, or Lynn Keagan, but sometimes Vanessa De Leon, "
+            "Krista Watterworth, Yvette Irene, or Nicole Faccuito), along with general "
+            "contractor Tom Bury, who sometimes does double duty as both general contractor "
+            "and designer. After assessing the problems with the restaurant, Robert Irvine "
+            "typically creates a plan for the new decor, oversees the cleaning of the "
+            "restaurant, reduces the size of the menu and improves the food, develops a "
+            "promotional activity, educates the restaurant's owners, or trains the staff, "
+            "as needed by each restaurant."
+        )
+
+        retrieved_value = retrieve("When was the first FIFA World Cup held?").passages[0]
+        assert retrieved_value[:30] == "History of the FIFA World Cup "
+
+    def assert_basic_qa(self, dev_example) -> None:
+        class BasicQA(dspy.Signature):
+            """Answer questions with short factoid answers."""
+
+            question = dspy.InputField()
+            answer = dspy.OutputField(desc="often between 1 and 5 words")
+
+        # Define the predictor
+        generate_answer = dspy.Predict(BasicQA)
+        # Call the predictor on a particular input
+        pred = generate_answer(question=dev_example.question)
+
+        # Assertions to verify the basic QA functionality
+        assert (
+            f"Question: {dev_example.question}"
+            == "Question: What is the nationality of the chef and restaurateur featured in Restaurant: Impossible?"
+        )
+        assert f"Predicted Answer: {pred.answer}" == "Predicted Answer: American"
+        assert (
+            self.turbo.inspect_history().strip()
+            == "Answer questions with short factoid answers.\n\n---\n\nFollow the following format.\n\nQuestion: ${"
+            "question}\nAnswer: often between 1 and 5 words\n\n---\n\nQuestion: What is the nationality of the "
+            "chef and restaurateur featured in Restaurant: Impossible?\nAnswer:\x1b[32m American\x1b[0m"
+        )
+        # Define the predictor with chain of thought
+        generate_answer_with_chain_of_thought = dspy.ChainOfThought(BasicQA)
+        # Call the predictor on the same input
+        pred = generate_answer_with_chain_of_thought(question=dev_example.question)
+
+        # Assertions to verify the chain of thought functionality
+        assert (
+            f"Question: {dev_example.question}"
+            == "Question: What is the nationality of the chef and restaurateur featured in Restaurant: Impossible?"
+        )
+        assert (
+            f"Thought: {pred.rationale.split('.', 1)[1].strip()}"
+            == "Thought: We know that the chef and restaurateur featured in Restaurant: Impossible is Robert Irvine."
+        )
+        assert f"Predicted Answer: {pred.answer}" == "Predicted Answer: British"
+
+    def assert_dataset_loading(self) -> tuple:
+        from dspy.datasets import HotPotQA
+
+        # Load the dataset
+        dataset = HotPotQA(train_seed=1, train_size=20, eval_seed=2023, dev_size=50, test_size=0)
+
+        # Prepare the datasets for training and development
+        trainset = [x.with_inputs("question") for x in dataset.train]
+        devset = [x.with_inputs("question") for x in dataset.dev]
+        train_example = trainset[0]
+
+        # Assertions to verify the dataset loading
+        assert (
+            f"Question: {train_example.question}"
+            == "Question: At My Window was released by which American singer-songwriter?"
+        )
+        assert f"Answer: {train_example.answer}" == "Answer: John Townes Van Zandt"
+        dev_example = devset[18]
+        assert (
+            f"Question: {dev_example.question}"
+            == "Question: What is the nationality of the chef and restaurateur featured in Restaurant: Impossible?"
+        )
+        assert f"Answer: {dev_example.answer}" == "Answer: English"
+        assert "Restaurant: Impossible" in dev_example.gold_titles
+        assert "Robert Irvine" in dev_example.gold_titles
+        assert (
+            f"For this dataset, training examples have input keys {train_example.inputs().keys()} and label keys "
+            f"{train_example.labels().keys()}"
+            == "For this dataset, training examples have input keys ['question'] and label keys ['answer']"
+        )
+        assert (
+            f"For this dataset, dev examples have input keys {dev_example.inputs().keys()} and label keys "
+            f"{dev_example.labels().keys()}"
+            == "For this dataset, dev examples have input keys ['question'] and label keys ['answer', 'gold_titles']"
+        )
+        return dev_example, devset, trainset
+
+    def setup_dspy(self) -> None:
+        self.turbo = dspy.OpenAI(model="gpt-3.5-turbo")
+        colbertv2_wiki17_abstracts = dspy.ColBERTv2(url="http://20.102.90.50:2017/wiki17_abstracts")
+        dspy.settings.configure(lm=self.turbo, rm=colbertv2_wiki17_abstracts)