In [1]:
import openai
from dotenv import load_dotenv
import os

# Load environment variables (including VOC_OPENAI_API_KEY) from a local .env file
load_dotenv()

# Module-level defaults for the openai SDK. The 1.x SDK used in this notebook
# reads `openai.base_url`; the legacy `openai.api_base` attribute is ignored by it.
openai.base_url = "https://openai.vocareum.com/v1"
openai.api_key = os.getenv("VOC_OPENAI_API_KEY")

In [2]:
import requests

# Get the Wikipedia page for the 2023 Turkey–Syria earthquakes as a plain-text extract
params = {
    "action": "query",
    "prop": "extracts",
    "exlimit": 1,
    "titles": "2023_Turkey–Syria_earthquakes",
    "explaintext": 1,
    "formatversion": 2,
    "format": "json"
}

# Add a User-Agent header to identify your application
headers = {
    "User-Agent": "LearningProject/1.0 (Educational purposes; your-email@example.com)"
}

# Send a single request that carries the User-Agent header; the timeout keeps
# the cell from hanging forever if the server does not answer.
resp = requests.get(
    "https://en.wikipedia.org/w/api.php",
    params=params,
    headers=headers,
    timeout=30,
)
# Surface HTTP errors (e.g. 403/429) here rather than as a confusing JSON decode failure
resp.raise_for_status()
response_dict = resp.json()
response_dict

{'batchcomplete': True,
 'query': {'normalized': [{'fromencoded': False,
    'from': '2023_Turkey–Syria_earthquakes',
    'to': '2023 Turkey–Syria earthquakes'}],
  'pages': [{'pageid': 72956318,
    'ns': 0,
    'title': '2023 Turkey–Syria earthquakes',

In [7]:
# Take the first page in the API response and split its plain-text extract into lines
first_page = response_dict["query"]["pages"][0]
text_data = first_page["extract"].split("\n")

In [8]:
import pandas as pd

# Build a single-column dataframe holding one line of page text per row
df = pd.DataFrame({"text": text_data})

# Drop rows that are empty strings or "=="-prefixed section headings,
# then renumber the surviving rows from zero
is_blank = df["text"].str.len() == 0
is_heading = df["text"].str.startswith("==")
df = df[~(is_blank | is_heading)].reset_index(drop=True)
df.head()

Unnamed: 0,text
0,"On 6 February 2023, at 04:17:35 TRT (01:17:35 ..."
1,The Mw 7.8 earthquake is the largest to strike...
2,"Damaged roads, winter storms, and disruption t..."
3,There was widespread damage in an area of abou...
4,"The confirmed death toll in Turkey was 53,537;..."


In [9]:
# Show the (rows, columns) size of the cleaned dataframe
df.shape

(114, 1)

In [13]:
from openai import OpenAI

EMBEDDING_MODEL_NAME = "text-embedding-ada-002"

# Client aimed at the Vocareum endpoint; the key is read from the environment
client = OpenAI(
    base_url="https://openai.vocareum.com/v1",
    api_key=os.getenv("VOC_OPENAI_API_KEY"),
)

# Embed only the first row of text as a quick check
first_text = df["text"][0]
response = client.embeddings.create(
    model=EMBEDDING_MODEL_NAME,
    input=[first_text],
)

# Report the vector length and preview its leading 20 components
first_item_embedding = response.data[0].embedding
print(f"Embedding dimension: {len(first_item_embedding)}")
print(f"First 20 values: {first_item_embedding[:20]}")

Embedding dimension: 1536
First 20 values: [-0.00921398401260376, -0.022343911230564117, -0.011348105035722256, -0.031083645299077034, 0.005071078427135944, 0.024837106466293335, -0.034715037792921066, -0.011720729060471058, -0.00036436732625588775, -0.03143594413995743, 0.019105466082692146, 0.054172806441783905, -0.01566377282142639, -0.022140661254525185, 0.01348900143057108, -0.007865761406719685, 0.010508006438612938, -0.017357518896460533, 0.006859675515443087, -0.00728988740593195]


In [16]:
# Request embeddings for every row of text in one call
all_texts = df["text"].tolist()
response = client.embeddings.create(
    model=EMBEDDING_MODEL_NAME,
    input=all_texts,
)

# Pull the vector out of each item in the response, in the order returned
embeddings = [item.embedding for item in response.data]

total_count = len(embeddings)
vector_size = len(embeddings[0])
print(f"Total embeddings created: {total_count}")
print(f"Embedding dimension: {vector_size}")

Total embeddings created: 114
Embedding dimension: 1536


In [17]:
# Attach the embedding vectors as a new column, then write the frame
# (index included) to embeddings.csv
df = df.assign(embeddings=embeddings)
df.to_csv(path_or_buf="embeddings.csv")

In [18]:
# Preview five randomly chosen rows; no random_state is passed, so the
# selection differs from run to run
df.sample(5)

Unnamed: 0,text,embeddings
113,"""Kahramanmaraş Supersite science page"". Group ...","[0.0036635908763855696, -0.007788894232362509,..."
2,"Damaged roads, winter storms, and disruption t...","[-0.01818825490772724, -0.014495408162474632, ..."
30,"According to Kandilli Observatory, the maximum...","[-0.002625927561894059, -0.0055386340245604515..."
24,The second M>7 earthquake initiated on a separ...,"[-0.011603017337620258, -0.007839692756533623,..."
19,The USGS source model for the Mw 7.7 earthquak...,"[-0.0024659144692122936, -0.00112019176594913,..."
