#### **Read the JSON file as a Dataframe**

In [17]:
# Load the multi-line JSON file into a Spark DataFrame and preview it.
json_reader = spark.read.option("multiline", "true")
df = json_reader.json("Files/bing-news.json")
display(df)

StatementMeta(, 0443f73a-7332-47bc-bc2a-876b08a2edd5, 19, Finished, Available)

SynapseWidget(Synapse.DataFrame, 08874fbe-dcd6-44d2-ba60-b1407dbc5a96)

#### **Select just the "value" column from the DataFrame – it includes all the JSON data we need**

In [18]:
# Keep only the "value" column; every other column of the response is dropped.
value_column = df["value"]
df = df.select(value_column)
display(df)

StatementMeta(, 0443f73a-7332-47bc-bc2a-876b08a2edd5, 20, Finished, Available)

SynapseWidget(Synapse.DataFrame, 810900f3-20d0-4f33-b500-ad4f9e7124e9)

#### **Explode the "value" column from a single-row structure into a multi-row structure (one row per array element)**

In [19]:
from pyspark.sql.functions import explode

# Turn each element of the "value" array into its own row, exposed as "json_object".
exploded_column = explode(df["value"]).alias("json_object")
df_exploded = df.select(exploded_column)
display(df_exploded)

StatementMeta(, 0443f73a-7332-47bc-bc2a-876b08a2edd5, 21, Finished, Available)

SynapseWidget(Synapse.DataFrame, 841867e9-369e-441c-8026-5d32108d13be)

#### **Convert the exploded DataFrame into a list of JSON strings**

In [20]:
# Serialise every exploded row to a JSON string, then collect them into a Python list.
json_rdd = df_exploded.toJSON()
json_list = json_rdd.collect()

# Print the JSON structure of all the news articles in the list.
print(json_list)

# To inspect the JSON structure of a single article instead, use:
#   print(json_list[0])

StatementMeta(, 0443f73a-7332-47bc-bc2a-876b08a2edd5, 22, Finished, Available)

['{"json_object":{"about":[{"name":"Sadiq Khan","readLink":"https://api.bing.microsoft.com/api/v7/entities/d7862dc2-4c03-1dc8-3412-7fe03041fdcb"},{"name":"Lee Anderson","readLink":"https://api.bing.microsoft.com/api/v7/entities/c94f2fa6-9853-e53f-af90-7aec244e4382"},{"name":"Politics","readLink":"https://api.bing.microsoft.com/api/v7/entities/b8f26db0-d3b7-4fb5-fbb0-9ea469754434"}],"datePublished":"2024-02-27T16:00:38.0000000Z","description":"Lee Anderson has insisted he will stand at the next general election but said whether that will be as a Tory candidate is “out of my hands”. The former deputy chairman of the Conservative Party lost the Tory whip after failing to apologise for claiming “Islamists” had “got control” of Sadiq Khan, the Mayor of London.","image":{"thumbnail":{"contentUrl":"https://www.bing.com/th?id=OVFT._aOhpc7Ui5gGTOhtmWMfTC&pid=News","height":366,"width":700}},"mentions":[{"name":"Sadiq Khan"},{"name":"Lee Anderson"},{"name":"Politics"}],"name":"Politics latest ne

#### **To work with the information, it's easy to convert a JSON string from the list into a Python dictionary. This is how to do it using json.loads()**

In [21]:
import json

# Parse the first JSON string of the list into a Python dictionary and show it.
first_article_json = json_list[0]
article = json.loads(first_article_json)

print(article)

StatementMeta(, 0443f73a-7332-47bc-bc2a-876b08a2edd5, 23, Finished, Available)

{'json_object': {'about': [{'name': 'Sadiq Khan', 'readLink': 'https://api.bing.microsoft.com/api/v7/entities/d7862dc2-4c03-1dc8-3412-7fe03041fdcb'}, {'name': 'Lee Anderson', 'readLink': 'https://api.bing.microsoft.com/api/v7/entities/c94f2fa6-9853-e53f-af90-7aec244e4382'}, {'name': 'Politics', 'readLink': 'https://api.bing.microsoft.com/api/v7/entities/b8f26db0-d3b7-4fb5-fbb0-9ea469754434'}], 'datePublished': '2024-02-27T16:00:38.0000000Z', 'description': 'Lee Anderson has insisted he will stand at the next general election but said whether that will be as a Tory candidate is “out of my hands”. The former deputy chairman of the Conservative Party lost the Tory whip after failing to apologise for claiming “Islamists” had “got control” of Sadiq Khan, the Mayor of London.', 'image': {'thumbnail': {'contentUrl': 'https://www.bing.com/th?id=OVFT._aOhpc7Ui5gGTOhtmWMfTC&pid=News', 'height': 366, 'width': 700}}, 'mentions': [{'name': 'Sadiq Khan'}, {'name': 'Lee Anderson'}, {'name': 'Politics

#### **Get information from a specific element, e.g. "description"**

In [22]:
# Every article field lives under the top-level "json_object" key.
article_fields = article["json_object"]
print(article_fields["description"])

StatementMeta(, 0443f73a-7332-47bc-bc2a-876b08a2edd5, 24, Finished, Available)

Lee Anderson has insisted he will stand at the next general election but said whether that will be as a Tory candidate is “out of my hands”. The former deputy chairman of the Conservative Party lost the Tory whip after failing to apologise for claiming “Islamists” had “got control” of Sadiq Khan, the Mayor of London.


#### **Let's get more information from the JSON dictionary**

In [23]:
# name
# description
# url
# image
# provider
# datePublished

# Tip: use an online JSON parser to see the structure of the JSON string - copy and paste the full string into the online tool.

StatementMeta(, 0443f73a-7332-47bc-bc2a-876b08a2edd5, 25, Finished, Available)

In [24]:
# Look up the nested article object once, then print each field of interest.
news_item = article["json_object"]

print(news_item["name"])
print(news_item["description"])
print(news_item["url"])
# The thumbnail URL is nested two levels below "image".
print(news_item["image"]["thumbnail"]["contentUrl"])
# "provider" is a list; only the first provider's name is shown.
print(news_item["provider"][0]["name"])
print(news_item["datePublished"])

StatementMeta(, 0443f73a-7332-47bc-bc2a-876b08a2edd5, 26, Finished, Available)

Politics latest news: Ex-minister apologises after claims of religious ‘no-go’ areas
Lee Anderson has insisted he will stand at the next general election but said whether that will be as a Tory candidate is “out of my hands”. The former deputy chairman of the Conservative Party lost the Tory whip after failing to apologise for claiming “Islamists” had “got control” of Sadiq Khan, the Mayor of London.
https://www.msn.com/en-us/news/world/politics-latest-news-ex-minister-apologises-after-claims-of-religious-no-go-areas/ar-BB1iYvVc
https://www.bing.com/th?id=OVFT._aOhpc7Ui5gGTOhtmWMfTC&pid=News
The Daily Telegraph on MSN.com
2024-02-27T16:00:38.0000000Z


#### **Now loop through all the JSON strings in the list and extract the data**

In [34]:
# Initialise one empty list per output column. The lists are kept index-aligned:
# position i in every list belongs to the same article.
name = []
description = []
url = []
image = []
provider = []
datePublished = []

# Process each JSON string in the list
for json_str in json_list:
    try:
        # Parse the JSON string into a dictionary
        article = json.loads(json_str)
        news_item = article["json_object"]

        # Optional guard: data coming from the Bing API can be inconsistent, so
        # skip articles that have no title or no thumbnail URL.
        thumbnail_url = news_item.get("image", {}).get("thumbnail", {}).get("contentUrl")
        if not (news_item.get("name") and thumbnail_url):
            continue

        # Resolve every field BEFORE touching the output lists. If any lookup
        # fails (missing key, empty provider list), nothing has been appended
        # yet, so the lists cannot end up with different lengths and get
        # misaligned when they are zipped together later.
        row = (
            news_item["name"],
            news_item["description"],
            news_item["url"],
            thumbnail_url,
            news_item["provider"][0]["name"],
            news_item["datePublished"],
        )
    except Exception as e:
        # Best effort: report the bad article and move on to the next one.
        print(f"Error processing JSON object: {e}")
        continue

    # All fields resolved: append the whole article in one go.
    for column_values, value in zip(
        (name, description, url, image, provider, datePublished), row
    ):
        column_values.append(value)

StatementMeta(, 0443f73a-7332-47bc-bc2a-876b08a2edd5, 36, Finished, Available)

In [37]:
# Inspect the list of article URLs collected by the extraction loop.
url

StatementMeta(, 0443f73a-7332-47bc-bc2a-876b08a2edd5, 39, Finished, Available)

['https://www.msn.com/en-us/news/world/politics-latest-news-ex-minister-apologises-after-claims-of-religious-no-go-areas/ar-BB1iYvVc',
 'https://www.nfl.com/news/nfl-news-roundup-latest-league-updates-from-tuesday-feb-27',
 'https://www.msn.com/en-us/sports/nfl/chargers-news-latest-on-free-agent-austin-ekeler-s-status-with-la/ar-BB1j0iIM',
 'https://www.msn.com/en-us/sports/tennis/wales-v-france-six-nations-2024-kick-off-time-how-to-watch-and-latest-news/ar-BB1iXHYf',
 'https://www.msn.com/en-ae/news/other/will-your-iphone-support-ios-18-the-latest-rumors-could-bring-good-news/ar-BB1iZRYz',
 'https://www.msn.com/en-us/money/technology/google-is-paying-news-outlets-to-unleash-an-avalanche-of-ai-slop/ar-BB1j08dZ',
 'https://www.msn.com/en-sg/sport/football/liverpool-injury-update-wataru-endo-mohamed-salah-and-ryan-gravenberch-latest-news-and-return-dates/ar-BB1iTy12',
 'https://www.msn.com/en-us/money/topstocks/tesla-stock-rises-here-s-the-latest-ev-news/ar-BB1iY6uG',
 'https://www.mirro

#### **Combine all the lists and create a DataFrame with a defined schema**

In [38]:
from pyspark.sql.types import StructType, StructField, StringType

# Zip the per-column lists into one tuple per article (row-wise records)
data = list(zip(name, description, url, image, provider, datePublished))

# Every column is a nullable string, so derive the schema from the column names
column_names = ("name", "description", "url", "image", "provider", "datePublished")
schema = StructType(
    [StructField(column_name, StringType(), True) for column_name in column_names]
)

# Build the Spark DataFrame from the records using the explicit schema
df_cleaned = spark.createDataFrame(data, schema=schema)

StatementMeta(, 0443f73a-7332-47bc-bc2a-876b08a2edd5, 40, Finished, Available)

In [39]:
# Preview the DataFrame built from the extracted article fields.
display(df_cleaned)

StatementMeta(, 0443f73a-7332-47bc-bc2a-876b08a2edd5, 41, Finished, Available)

SynapseWidget(Synapse.DataFrame, e0ae906a-ae6b-426b-b2b9-45eedaaaaa90)

#### **Transform the "datePublished" column from a timestamp string into a date string formatted as dd-MM-yyyy**

In [40]:
from pyspark.sql.functions import to_date, date_format

# Parse the timestamp text to a date, then render it back as a "dd-MM-yyyy" string.
# Note: date_format returns a string, so the column's type stays string.
published_date = to_date("datePublished")
df_cleaned_final = df_cleaned.withColumn(
    "datePublished",
    date_format(published_date, "dd-MM-yyyy"),
)

StatementMeta(, 0443f73a-7332-47bc-bc2a-876b08a2edd5, 42, Finished, Available)

In [42]:
# Preview the final DataFrame after the datePublished reformatting.
display(df_cleaned_final)
# Alternative preview limited to the first five rows:
# display(df_cleaned_final.limit(5))

StatementMeta(, 0443f73a-7332-47bc-bc2a-876b08a2edd5, 44, Finished, Available)

SynapseWidget(Synapse.DataFrame, ee0c258f-913e-4983-9c70-d487bb881eda)

#### **Writing the final dataframe to the lakehouse db in Delta format**

In [43]:
# Persist the final DataFrame as a Delta table in the lakehouse database.
# The save mode is spelled out: "errorifexists" is Spark's default, so this cell
# raises if the table already exists (e.g. when the notebook is re-run).
# NOTE(review): switch to "overwrite" or "append" if re-runs should succeed - confirm intent.
delta_writer = df_cleaned_final.write.format("delta").mode("errorifexists")
delta_writer.saveAsTable("bing_lake_db.tbl_latest_news")

StatementMeta(, 0443f73a-7332-47bc-bc2a-876b08a2edd5, 45, Finished, Available)