<br>

<h1 style="text-align:center;"> Step 1: Preparing a Dataset with Embeddings </h1>

<br>

In [1]:
# Import the libraries
import os

import openai
import pandas as pd
import requests
from dateutil.parser import parse

In [2]:
# OpenAI API Key
# Read the key from the environment so the secret never lands in the notebook or its history.
# A KeyError here means OPENAI_API_KEY is not set in the kernel's environment.
# NOTE(review): a literal key was previously committed in this cell; it should be revoked.
openai.api_key = os.environ["OPENAI_API_KEY"]

<br>

## Load the Data

---

In [3]:
# Fetch Wikipedia page regarding "2023 Turkey–Syria earthquake"
resp = requests.get(
    "https://en.wikipedia.org/w/api.php",
    params={
        "action": "query",
        "prop": "extracts",
        "exlimit": 1,
        "titles": "2023_Turkey–Syria_earthquake",
        "explaintext": 1,
        "formatversion": 2,
        "format": "json",
    },
    # requests applies no timeout by default; without one a stalled connection blocks the kernel
    timeout=30,
)

# Fail loudly on an HTTP error instead of trying to decode an error page as JSON
resp.raise_for_status()

# Convert to json
response_dict = resp.json()
response_dict

{'batchcomplete': True,
 'query': {'normalized': [{'fromencoded': False,
    'from': '2023_Turkey–Syria_earthquake',
    'to': '2023 Turkey–Syria earthquake'}],
  'pages': [{'pageid': 72956318,
    'ns': 0,
    'title': '2023 Turkey–Syria earthquake',

In [4]:
# Take the first page of the query result and split its plain-text extract into lines
first_page = response_dict["query"]["pages"][0]
text_data = first_page["extract"].split("\n")

# Preview the first four lines
text_data[:4]

['On 6 February 2023, at 04:17 TRT (01:17 UTC), a Mw 7.8 earthquake struck southern and central Turkey and northern and western Syria. The epicenter was 37 km (23 mi) west–northwest of Gaziantep. The earthquake had a maximum Mercalli intensity of XII (Extreme) in parts of Antakya in Hatay Province. It was followed by a Mw\u202f7.7 earthquake at 13:24. This earthquake was centered 95 km (59 mi) north-northeast from the first. There was widespread damage and tens of thousands of fatalities.',
 'The Mw\u202f7.8 earthquake is the largest in Turkey since the 1939 Erzincan earthquake of the same magnitude, and jointly the second-strongest recorded in the history of the country, after the 1668 North Anatolia earthquake. It is also one of the strongest earthquakes ever recorded in the Levant. It was felt as far as Egypt and the Black Sea coast of Turkey. There were more than 10,000 aftershocks in the three weeks that followed. The seismic sequence was the result of shallow strike-slip faulting

In [5]:
# Build a single-column dataframe from the extracted lines, then drop
# empty lines and lines that start with "==" and renumber the rows from 0
df = (
    pd.DataFrame(text_data, columns=["text"])
    .loc[lambda frame: (frame["text"].str.len() > 0) & (~frame["text"].str.startswith("=="))]
    .reset_index(drop=True)
)

df.head()

Unnamed: 0,text
0,"On 6 February 2023, at 04:17 TRT (01:17 UTC), ..."
1,The Mw 7.8 earthquake is the largest in Turkey...
2,There was widespread damage in an area of abou...
3,"Damaged roads, winter storms, and disruption t..."
4,Central southern Turkey and northwestern Syria...


In [6]:
# Data wrangling; Don't worry too much about the details here; data wrangling is different for every dataset!

# Remove empty lines and headings
df = df[(df["text"].str.len() > 0) & (~df["text"].str.startswith("=="))]

# In some cases dates are used as headings instead of being part of the text sample; adjust so dated text samples start with dates
# The new texts are collected in a list and assigned back in one step: writing to the
# row object yielded by DataFrame.iterrows() is not guaranteed to modify the dataframe.
prefix = ""
prefixed_texts = []
for text in df["text"]:

    # If the text already has " – " (en dash), it already has the needed date prefix
    if " – " in text:
        prefixed_texts.append(text)
        continue

    try:
        parse(text)
    except (ValueError, OverflowError):
        # The text isn't a date: add the current prefix
        # (only parse failures are caught, so unrelated errors and interrupts still surface)
        prefixed_texts.append(prefix + " – " + text)
    else:
        # The text is a date: remember it as the new prefix and leave the row unchanged
        prefix = text
        prefixed_texts.append(text)

df = df.assign(text=prefixed_texts)

# Keep only rows that have " – " in them (this drops the bare date rows)
df = df[df["text"].str.contains(" – ")].reset_index(drop=True)

df.head()


Unnamed: 0,text
0,"– On 6 February 2023, at 04:17 TRT (01:17 UTC..."
1,– The Mw 7.8 earthquake is the largest in Tur...
2,– There was widespread damage in an area of a...
3,"– Damaged roads, winter storms, and disruptio..."
4,– Central southern Turkey and northwestern Sy...


<br>

## Create the Embeddings Index

---


In [7]:
# Name of the OpenAI embedding model; passed as `engine` to openai.Embedding.create in the cells below
EMBEDDING_MODEL_NAME = "text-embedding-ada-002"

In [None]:
# Embed only the first row's text with OpenAI and display the raw response
first_text = df["text"][0]
response = openai.Embedding.create(
    engine=EMBEDDING_MODEL_NAME,
    input=[first_text],
)
response

In [9]:
# Create embeddings for all rows using OpenAI
# The texts are sent in batches: the embeddings endpoint limits how many inputs a single
# request may carry, so one call with every row fails once the dataset grows.
BATCH_SIZE = 100

texts = df["text"].tolist()
embeddings = []
for start in range(0, len(texts), BATCH_SIZE):
    response = openai.Embedding.create(
        input=texts[start:start + BATCH_SIZE],
        engine=EMBEDDING_MODEL_NAME,
    )

    # Extract embeddings
    embeddings.extend(data["embedding"] for data in response["data"])

# Every row must have exactly one embedding before the column is attached
assert len(embeddings) == len(df), "number of embeddings does not match number of rows"

# Add embeddings to dataframe
df["embedding"] = embeddings

df.head()

Unnamed: 0,text,embedding
0,"– On 6 February 2023, at 04:17 TRT (01:17 UTC...","[-0.005470085423439741, -0.01460933592170477, ..."
1,– The Mw 7.8 earthquake is the largest in Tur...,"[-0.00437286589294672, -0.02579004317522049, -..."
2,– There was widespread damage in an area of a...,"[-0.002312745898962021, -0.02046130783855915, ..."
3,"– Damaged roads, winter storms, and disruptio...","[-0.016192838549613953, -0.013547341339290142,..."
4,– Central southern Turkey and northwestern Sy...,"[-0.00781626533716917, -0.009998817928135395, ..."


In [10]:
# Write the dataframe (text plus embedding columns) to embeddings.csv, omitting the index column
df.to_csv(path_or_buf="embeddings.csv", index=False)