Import the libraries

In [13]:
import json
import os
from azure.identity import ClientSecretCredential
from azure.mgmt.resource import ResourceManagementClient
from azure.mgmt.databricks import AzureDatabricksManagementClient
from pyspark.sql import SparkSession
from mongomock import MongoClient
import mongomock
# from azure.mgmt.storage import StorageManagementClient
# from azure.mgmt.storage.models import StorageAccountCreateParameters, Sku, Kind

# from azure.storage.blob import BlobServiceClient
import uuid
import csv
from datetime import datetime

from databricks import sql
import subprocess

import requests
from requests.auth import HTTPBasicAuth
import pandas as pd
import matplotlib.pyplot as plt
from pyspark.sql.functions import col, explode, desc, split, trim

In [2]:
import random

# Fixed deployment settings consumed by the Azure / Databricks cells further down.
resource_group_name, managed_resource_group_name = "Test", "test-rg"
location = "eastus"
cluster_name = "my-databricks-cluster"

# A random four-digit suffix keeps the workspace name different between runs.
random_number = random.randint(1000, 9999)
workspace_name = f"demoworkspacesi{random_number}"

Loading Azure Credentials

Executes the process of loading Azure credentials from a JSON file and returning an authenticated credential object.

This function reads the azure_credentials.json file (or a specified file path), extracts required authentication details (tenant_id, client_id, client_secret, and subscription_id), and validates their presence. It then authenticates using ClientSecretCredential, allowing secure access to Azure resources.

In [3]:
def load_azure_credentials(credentials_file: str = "azure_credentials.json"):
    """
    Load service-principal credentials from a JSON file and build an Azure credential.

    The file must hold a JSON object with non-empty ``tenant_id``, ``client_id``,
    ``client_secret`` and ``subscription_id`` entries.

    Args:
        credentials_file (str): Path to the JSON file containing Azure credentials.

    Returns:
        tuple: A tuple containing:
            - ClientSecretCredential object for authentication
            - Subscription ID as read from the file

    Raises:
        FileNotFoundError: If ``credentials_file`` does not exist.
        ValueError: If the file does not contain a JSON object, or if any
            required field is missing or empty (the message names the fields).

    Side effects:
        Rebinds the module-level names ``tenant_id``, ``client_id``,
        ``client_secret`` and ``subscription_id``. This is kept so that code
        reading those globals keeps working; prefer the return value.
    """
    global tenant_id, client_id, client_secret, subscription_id

    # Ensure the file exists
    if not os.path.exists(credentials_file):
        raise FileNotFoundError(f"The credentials file '{credentials_file}' does not exist.")

    # Explicit encoding so the result does not depend on the platform default.
    with open(credentials_file, "r", encoding="utf-8") as file:
        creds = json.load(file)

    # A list/string/number at the top level would otherwise fail later with AttributeError.
    if not isinstance(creds, dict):
        raise ValueError(
            f"The credentials file '{credentials_file}' must contain a JSON object, "
            f"not {type(creds).__name__}."
        )

    # Extract required fields
    tenant_id = creds.get("tenant_id")
    client_id = creds.get("client_id")
    client_secret = creds.get("client_secret")
    subscription_id = creds.get("subscription_id")

    # Validate required fields and report exactly which ones are absent or empty.
    required = {
        "tenant_id": tenant_id,
        "client_id": client_id,
        "client_secret": client_secret,
        "subscription_id": subscription_id,
    }
    missing = [field for field, value in required.items() if not value]
    if missing:
        listed = ", ".join(repr(field) for field in missing)
        raise ValueError(f"The credentials file is missing required field(s): {listed}.")

    # Authenticate using ClientSecretCredential
    credential = ClientSecretCredential(
        tenant_id=tenant_id, client_id=client_id, client_secret=client_secret
    )

    return credential, subscription_id

In [4]:
# Authenticate once; `credential` and `subscription_id` are reused by the Azure cells below.

credential, subscription_id = load_azure_credentials("azure_credentials.json")

Create MongoDB Resources

In [14]:
def _load_csv_into_collection(collection, csv_path):
    """Insert every row of the CSV at ``csv_path`` into ``collection``; return the row count."""
    records = pd.read_csv(csv_path).to_dict("records")
    # PyMongo-style clients reject an empty batch, so skip the insert for a header-only file.
    if records:
        collection.insert_many(records)
    return len(records)


def create_Mongo_Resources(titles_csv="Titles.csv", credits_csv="Credits.csv", db_name="movie_database"):
    """
    Build an in-memory (mongomock) database and load the titles and credits CSV files into it.

    Args:
        titles_csv (str): Path of the titles CSV. Defaults to ``"Titles.csv"``.
        credits_csv (str): Path of the credits CSV. Defaults to ``"Credits.csv"``.
        db_name (str): Name of the simulated database. Defaults to ``"movie_database"``.

    Returns:
        tuple: ``(titles_collection, credits_collection)`` — the ``"titles"`` and
        ``"credits"`` collections of the simulated database.
    """
    # Initialize mongomock client to simulate MongoDB
    mongo_client = mongomock.MongoClient()
    db = mongo_client[db_name]

    titles_collection = db["titles"]
    credits_collection = db["credits"]

    _load_csv_into_collection(titles_collection, titles_csv)
    _load_csv_into_collection(credits_collection, credits_csv)

    # Verify insertion by printing counts of documents in each collection
    print(f"Titles count: {titles_collection.count_documents({})}")
    print(f"Credits count: {credits_collection.count_documents({})}")

    return titles_collection,credits_collection

In [15]:
# Build the simulated MongoDB collections; the counts printed below confirm the load.
titles_collection,credits_collection = create_Mongo_Resources()

Titles count: 209
Credits count: 300


Data Extraction from MongoDB

In [16]:
def extract_data_from_mongodb(titles_collection,credits_collection):
    """
    Read both collections into pandas DataFrames.

    Titles keep only ``id``, ``title``, ``type``, ``genres``, ``imdb_score`` and
    ``tmdb_score``; credits keep every field except Mongo's ``_id``.

    Returns:
        tuple: ``(titles_df, credits_df)``.
    """
    title_projection = {
        "_id": 0,
        "id": 1,
        "title": 1,
        "type": 1,
        "genres": 1,
        "imdb_score": 1,
        "tmdb_score": 1,
    }
    title_docs = list(titles_collection.find({}, title_projection))
    credit_docs = list(credits_collection.find({}, {"_id": 0}))
    return pd.DataFrame(title_docs), pd.DataFrame(credit_docs)

In [17]:
# Pull both collections into pandas and preview the first five rows of each frame.
titles_df, credits_df = extract_data_from_mongodb(titles_collection,credits_collection)
print("Titles Data:")
print(titles_df.head(5))
print("\nCredits Data:")
print(credits_df.head(5))

Titles Data:
         id                                title   type  \
0  ts300399  Five Came Back: The Reference Films   SHOW   
1   tm84618                          Taxi Driver  MOVIE   
2  tm154986                          Deliverance  MOVIE   
3  tm127384      Monty Python and the Holy Grail  MOVIE   
4  tm120801                      The Dirty Dozen  MOVIE   

                                 genres  imdb_score  tmdb_score  
0                         documentation         NaN         NaN  
1                         drama , crime         8.2       8.179  
2  drama , action , thriller , european         7.7       7.300  
3             fantasy , action , comedy         8.2       7.811  
4                          war , action         7.7       7.600  

Credits Data:
   person_id       id             name                character   role
0       3748  tm84618   Robert De Niro            Travis Bickle  ACTOR
1      14658  tm84618     Jodie Foster            Iris Steensma  ACTOR
2       

Replace missing values in imdb_score and tmdb_score with their respective mean values

In [18]:
def fill_mean_value(titles_df):
    """
    Return a copy of ``titles_df`` with missing scores replaced by the column mean.

    Only ``imdb_score`` and ``tmdb_score`` are filled; missing values in other
    columns are left alone. The input frame is not modified, so the unfilled
    data stays available to the caller.

    Args:
        titles_df (pandas.DataFrame): Frame containing ``imdb_score`` and
            ``tmdb_score`` columns.

    Returns:
        pandas.DataFrame: New frame with the two score columns filled.
    """
    score_columns = ("imdb_score", "tmdb_score")
    # mean() skips NaN by default, so each fill value is the mean of the known scores.
    fill_values = {column: titles_df[column].mean() for column in score_columns}
    return titles_df.fillna(value=fill_values)

In [19]:
# Fill the missing scores, then use info() to confirm both score columns have no nulls left.
titles_df = fill_mean_value(titles_df)
titles_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209 entries, 0 to 208
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   id          209 non-null    object 
 1   title       207 non-null    object 
 2   type        209 non-null    object 
 3   genres      209 non-null    object 
 4   imdb_score  209 non-null    float64
 5   tmdb_score  209 non-null    float64
dtypes: float64(2), object(4)
memory usage: 9.9+ KB


Retrieve only the titles from the titles_df DataFrame where both imdb_score and tmdb_score are greater than 8

In [20]:
def filter_df_score_morethan_8(titles_df):
    """
    Keep only the rows whose ``imdb_score`` and ``tmdb_score`` are both strictly above 8.

    Returns:
        pandas.DataFrame: The matching rows, with their original index labels.
    """
    high_imdb = titles_df["imdb_score"].gt(8)
    high_tmdb = titles_df["tmdb_score"].gt(8)
    return titles_df.loc[high_imdb & high_tmdb]

In [21]:
# NOTE: this rebinds `titles_df` to the filtered subset; every later cell sees only these rows.
titles_df = filter_df_score_morethan_8(titles_df)
titles_df.shape

(16, 6)

Creating Databricks Workspace and Retrieving Access Token

Creates an Azure Databricks workspace and returns its name. The access token itself is not generated here; it is supplied in a later step.

This function initializes the Azure Resource Management Client to ensure the resource group exists, then uses the Azure Databricks Management Client to create a Databricks workspace in the specified location. The workspace is configured with a managed resource group and a "premium" SKU. Once created, the function returns the workspace name.

In [22]:


def create_databricks_workspace_and_get_token(
    credential: "ClientSecretCredential",
    subscription_id: str,
    resource_group_name: str,
    managed_resource_group_name: str,
    workspace_name: str,
    location: str
) -> str:
    """
    Ensure the resource group exists, create a Databricks workspace in it, and return the workspace name.

    Despite the function name, no access token is generated or returned here.

    Args:
        credential: Azure credential object passed to the management clients.
        subscription_id (str): Azure subscription ID.
        resource_group_name (str): Name of the Azure resource group that holds the workspace.
        managed_resource_group_name (str): Name of the managed resource group for Databricks.
        workspace_name (str): Name of the Databricks workspace.
        location (str): Azure region for the resource group and the workspace.

    Returns:
        str: ``workspace_name``, returned unchanged once the create operation has finished.
    """
    # Initialize Resource Management Client
    resource_client = ResourceManagementClient(credential, subscription_id)

    # Create the resource group if it does not exist
    print(f"Ensuring resource group '{resource_group_name}' exists in '{location}'...")
    resource_client.resource_groups.create_or_update(
        resource_group_name,
        {"location": location}
    )
    print(f"Resource group '{resource_group_name}' is ready.")

    # Initialize Databricks Management Client
    databricks_client = AzureDatabricksManagementClient(credential, subscription_id)

    # Build the managed resource group ID from the caller-supplied name. Previously this
    # parameter was ignored and "<resource_group_name>-managed" was used instead.
    managed_resource_group_id = (
        f"/subscriptions/{subscription_id}/resourceGroups/{managed_resource_group_name}"
    )

    # Create the Databricks workspace; .result() blocks until the long-running operation ends.
    print(f"Creating Databricks workspace '{workspace_name}' in '{location}'...")
    databricks_client.workspaces.begin_create_or_update(
        resource_group_name=resource_group_name,
        workspace_name=workspace_name,
        parameters={
            "location": location,
            "managed_resource_group_id": managed_resource_group_id,
            "sku": {"name": "premium"}
        }
    ).result()

    print(f"Databricks workspace '{workspace_name}' created successfully.")

    return workspace_name

In [23]:
# Provision the workspace; this call blocks until Azure reports the operation as finished.
workspace_name = create_databricks_workspace_and_get_token(
        credential,
        subscription_id,
        resource_group_name,
        managed_resource_group_name,
        workspace_name,
        location
    )

Ensuring resource group 'Test' exists in 'eastus'...
Resource group 'Test' is ready.
Creating Databricks workspace 'demoworkspacesi7486' in 'eastus'...
Databricks workspace 'demoworkspacesi7486' created successfully.


Retrieving Databricks HTTP Path

Fetches the Databricks workspace URL using the Azure Management API.
Retrieves the Databricks workspace properties, including the workspace URL.
Authenticates and generates an access token for secure API requests.
Queries the available SQL Warehouses within the Databricks workspace.
Extracts the HTTP path from the first available warehouse.
Returns the Databricks server hostname and the HTTP path for database connections.

In [24]:
def get_databricks_http_path(credentials_file="azure_credentials.json"):
    """
    Look up the workspace hostname and the HTTP path of its first SQL warehouse.

    Reads the notebook-level ``credential``, ``subscription_id``,
    ``resource_group_name`` and ``workspace_name`` variables, so those cells
    must have been run first.

    Args:
        credentials_file (str): Currently unused; kept so existing calls that
            pass it keep working.

    Returns:
        tuple: (Databricks Server Hostname, HTTP Path)

    Raises:
        Exception: If either HTTP call returns a non-200 status, or if the
            workspace has no SQL warehouse.
    """
    # Without a timeout, requests can wait indefinitely on an unresponsive endpoint.
    request_timeout_seconds = 30

    # Get Databricks workspace URL
    workspace_url = f"https://management.azure.com/subscriptions/{subscription_id}/resourceGroups/{resource_group_name}/providers/Microsoft.Databricks/workspaces/{workspace_name}?api-version=2018-04-01"

    access_token = credential.get_token("https://management.azure.com/.default").token

    headers = {
        "Authorization": f"Bearer {access_token}",
        "Content-Type": "application/json"
    }

    response = requests.get(workspace_url, headers=headers, timeout=request_timeout_seconds)
    if response.status_code != 200:
        raise Exception(f"Failed to retrieve Databricks workspace: {response.text}")

    workspace_data = response.json()
    databricks_host = workspace_data["properties"]["workspaceUrl"]

    # Get SQL Warehouses from Databricks
    sql_endpoint_url = f"https://{databricks_host}/api/2.0/sql/warehouses"
    db_access_token = credential.get_token("2ff814a6-3304-4ab8-85cb-cd0e6f879c1d/.default").token  # Databricks token

    db_headers = {
        "Authorization": f"Bearer {db_access_token}",
        "Content-Type": "application/json"
    }

    sql_response = requests.get(sql_endpoint_url, headers=db_headers, timeout=request_timeout_seconds)
    if sql_response.status_code != 200:
        raise Exception(f"Failed to retrieve SQL Warehouses: {sql_response.text}")

    # Use .get so a response without a "warehouses" key reaches the explicit
    # error below instead of raising a bare KeyError.
    warehouses = sql_response.json().get("warehouses", [])
    if not warehouses:
        raise Exception("No SQL Warehouses found in Databricks.")

    warehouse = warehouses[0]  # Selecting the first available warehouse
    http_path = warehouse["odbc_params"]["path"]

    return databricks_host, http_path

In [25]:
# Resolve the workspace hostname and the SQL warehouse HTTP path used by the connection cells below.
databricks_url, databricks_http_path = get_databricks_http_path()
print(databricks_url)
databricks_http_path

adb-134158506596936.16.azuredatabricks.net


'/sql/1.0/warehouses/59e48023dd645b8d'

Create a Databricks personal access token in the workspace UI and provide it in the next cell.

In [26]:
# Read the personal access token from the environment so the secret never lands in the notebook file.
# Any token that was previously committed here should be revoked in the Databricks workspace.
databricks_token = os.environ.get("DATABRICKS_TOKEN")
if not databricks_token:
    raise RuntimeError(
        "Set the DATABRICKS_TOKEN environment variable to a Databricks personal access token "
        "before running this cell."
    )

Creating a Databricks Cluster

Constructs the Databricks API endpoint for cluster creation.
Formats the workspace URL to ensure proper API request formatting.
Sets up authentication headers using the provided access token.
Defines cluster configuration, including the name, Spark version, node type, and auto-termination settings.
Sends a POST request to the Databricks API to create the cluster.
Handles API responses, printing success messages or error details based on the response status.

In [27]:
import requests
import json

def create_databricks_cluster(
    workspaceurl,
    access_token: str,
    cluster_name: str = "democluster",
    spark_version: str = "16.1.x-scala2.12",
    node_type_id: str = "Standard_DS3_v2",
    autotermination_minutes: int = 30,
    num_workers: int = 2,
    timeout: float = 30,
):
    """
    Ask the Databricks REST API to create a cluster and print the outcome.

    Args:
        workspaceurl (str): Base workspace URL including the scheme; a trailing
            slash is tolerated.
        access_token (str): Token sent as a ``Bearer`` authorization header.
        cluster_name (str): Name for the new cluster.
        spark_version (str): Databricks runtime version string.
        node_type_id (str): VM type for the cluster nodes.
        autotermination_minutes (int): Idle minutes before auto-termination.
        num_workers (int): Number of worker nodes.
        timeout (float): Seconds to wait for the HTTP call before giving up.

    Returns:
        None. Success and failure are reported with ``print``; connection
        errors (including timeouts) are caught and printed rather than raised.
    """
    # API endpoint for cluster creation
    api_endpoint = "/api/2.1/clusters/create"
    api_url = workspaceurl.rstrip("/") + api_endpoint  # Ensure no trailing slash in URL

    # Headers for authentication
    headers = {
        "Authorization": f"Bearer {access_token}",
        "Content-Type": "application/json",
    }

    # Cluster configuration
    cluster_config = {
        "cluster_name": cluster_name,
        "spark_version": spark_version,
        "node_type_id": node_type_id,
        "autotermination_minutes": autotermination_minutes,
        "num_workers": num_workers
    }

    try:
        # Send POST request to create the cluster; the timeout keeps the cell from hanging forever.
        response = requests.post(api_url, headers=headers, json=cluster_config, timeout=timeout)

        if response.status_code == 200:
            print("Cluster created successfully!")
            print("Cluster Details:", response.json())
        else:
            print(f"Failed to create cluster. HTTP Status Code: {response.status_code}")
            print("Error Response:", response.text)

    except requests.exceptions.RequestException as e:
        print("An error occurred while connecting to Databricks API:")
        print(e)


In [28]:
# The REST API needs a full URL, so prefix the bare workspace hostname with the scheme.
access_token = databricks_token
workspaceurl = "https://"+databricks_url
create_databricks_cluster(workspaceurl, access_token)

Cluster created successfully!
Cluster Details: {'cluster_id': '0204-193804-q9nzsuk3'}


Establishing a Databricks SQL Connection

In [29]:
import os

def get_databricks_connection(server_hostname=None, http_path=None, token=None):
    """
    Establish and return a Databricks SQL connection.
    Ensure to close the connection after use.

    Any argument left as ``None`` falls back to the notebook-level variable of
    the same purpose: ``workspaceurl``, ``databricks_http_path`` and
    ``access_token`` respectively.

    Args:
        server_hostname (str, optional): Workspace hostname or URL. A leading
            ``http://`` / ``https://`` and trailing slashes are removed so the
            connector receives a bare hostname.
        http_path (str, optional): HTTP path of the SQL warehouse.
        token (str, optional): Databricks access token.

    Returns:
        conn (sql.Connection): Databricks SQL connection object.
    """
    host = workspaceurl if server_hostname is None else server_hostname
    path = databricks_http_path if http_path is None else http_path
    secret = access_token if token is None else token

    # `workspaceurl` carries an "https://" prefix for the REST calls; the SQL
    # connector's server_hostname is documented as the hostname only.
    host = host.strip()
    for scheme in ("https://", "http://"):
        if host.lower().startswith(scheme):
            host = host[len(scheme):]
            break
    host = host.rstrip("/")

    # Establish connection
    conn = sql.connect(
        server_hostname=host,
        http_path=path,
        access_token=secret
    )

    return conn

In [30]:
# Open the Databricks SQL connection for the cells below; call conn.close() when finished.
# (A `global` statement is a no-op at notebook top level, so none is needed here.)
conn = get_databricks_connection()
conn

<databricks.sql.client.Connection at 0x753b12d49930>

Create Spark Dataframe

In [31]:
def create_spark_dataframe(titles_df, credits_df):
    """
    Convert the titles and credits pandas frames into Spark DataFrames.

    A Spark session named "DataAnalysis" is obtained with ``getOrCreate``.

    Returns:
        tuple: ``(titles_spark_df, credits_spark_df)``.
    """
    session = SparkSession.builder.appName("DataAnalysis").getOrCreate()

    # Titles are converted first, then credits.
    converted = [session.createDataFrame(frame) for frame in (titles_df, credits_df)]
    titles_spark_df, credits_spark_df = converted
    return titles_spark_df, credits_spark_df

In [32]:
# Hand the pandas frames over to Spark for the transformation steps below.
titles_spark_df, credits_spark_df = create_spark_dataframe(titles_df, credits_df)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/02/04 19:43:48 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


Data Transformation
Processing genre column data

In [33]:
def get_titles_genres_separated_spark_df(titles_spark_df):
    """
    Expand the comma-separated ``genres`` column into one row per genre.

    Each genre value is trimmed before the placeholder filter runs, so
    whitespace-padded placeholders (for example ``" NA"``) are dropped as well.

    Args:
        titles_spark_df: Spark DataFrame with ``id``, ``title``, ``type``,
            ``genres``, ``imdb_score`` and ``tmdb_score`` columns.

    Returns:
        Spark DataFrame with columns ``id``, ``title``, ``type``, ``genre``,
        ``imdb_score`` and ``tmdb_score``.
    """
    # Step 1: Split the genres column into an array (the regex absorbs spaces around commas)
    df_titles = titles_spark_df.withColumn("genres_array", split(col("genres"), "\\s*,\\s*"))

    # Step 2: Explode the array into multiple rows (kept as its own step; generators cannot be nested)
    df_exploded = df_titles.withColumn("genre", explode(col("genres_array")))

    # Step 3: Trim first — the split regex does not strip whitespace at the very start or end
    # of the original string, and the filter below compares exact values.
    df_trimmed = df_exploded.withColumn("genre", trim(col("genre")))

    # Step 4: Remove 'NA', empty strings, and empty lists ("[]")
    df_filtered = df_trimmed.filter(
        (col("genre") != "NA") & (col("genre") != "") & (col("genre") != "[]")
    )

    titles_genres_separated_spark_df = df_filtered.select(
        "id", "title", "type", "genre", "imdb_score", "tmdb_score"
    )
    return titles_genres_separated_spark_df

In [34]:
# Expand genres to one row per (title, genre) pair and preview the result.
titles_genres_separated_spark_df = get_titles_genres_separated_spark_df(titles_spark_df)
titles_genres_separated_spark_df.show()

                                                                                

+--------+--------------------+-----+--------+----------+----------+
|      id|               title| type|   genre|imdb_score|tmdb_score|
+--------+--------------------+-----+--------+----------+----------+
| tm84618|         Taxi Driver|MOVIE|   drama|       8.2|     8.179|
| tm84618|         Taxi Driver|MOVIE|   crime|       8.2|     8.179|
| ts22164|Monty Python's Fl...| SHOW|  comedy|       8.8|     8.306|
| ts22164|Monty Python's Fl...| SHOW|european|       8.8|     8.306|
| tm81728|            The Land|MOVIE|   drama|       8.1|       8.5|
| ts20681|            Seinfeld| SHOW|  comedy|       8.9|     8.301|
|tm155787|          GoodFellas|MOVIE|   drama|       8.7|     8.463|
|tm155787|          GoodFellas|MOVIE|   crime|       8.7|     8.463|
| tm22327|   Full Metal Jacket|MOVIE|     war|       8.3|       8.1|
| tm22327|   Full Metal Jacket|MOVIE|   drama|       8.3|       8.1|
|tm180542|Once Upon a Time ...|MOVIE|   crime|       8.3|     8.453|
|tm180542|Once Upon a Time ...|MOV

In [35]:
# Row count after the genre expansion.
titles_genres_separated_spark_df.count()

48

Merge the titles_genres_separated_spark_df with the credits DataFrame to include actor details

In [36]:
def get_transformed_data(titles_genres_separated_spark_df,credits_spark_df):
    """
    Inner-join the per-genre titles with the credits on the title ``id``.

    Returns:
        Spark DataFrame with columns ``id``, ``title``, ``type``, ``genre``,
        ``actor_name`` (the credits ``name`` column), ``role``, ``imdb_score``
        and ``tmdb_score``.
    """
    titles = titles_genres_separated_spark_df
    cast = credits_spark_df

    # Match each title/genre row with every credit row that has the same title ID.
    same_title = titles.id == cast.id
    joined = titles.join(cast, same_title, "inner")

    output_columns = [
        titles.id,
        titles.title,
        titles.type,
        titles.genre,
        cast.name.alias("actor_name"),
        cast.role,
        titles.imdb_score,
        titles.tmdb_score,
    ]
    return joined.select(*output_columns)

In [37]:
# Attach the credit rows to every title/genre row and preview the joined result.
transformed_data = get_transformed_data(titles_genres_separated_spark_df,credits_spark_df)
transformed_data.show()

+-------+-----------+-----+-----+------------------+-----+----------+----------+
|     id|      title| type|genre|        actor_name| role|imdb_score|tmdb_score|
+-------+-----------+-----+-----+------------------+-----+----------+----------+
|tm84618|Taxi Driver|MOVIE|drama|    Robert De Niro|ACTOR|       8.2|     8.179|
|tm84618|Taxi Driver|MOVIE|drama|      Jodie Foster|ACTOR|       8.2|     8.179|
|tm84618|Taxi Driver|MOVIE|drama|     Albert Brooks|ACTOR|       8.2|     8.179|
|tm84618|Taxi Driver|MOVIE|drama|     Harvey Keitel|ACTOR|       8.2|     8.179|
|tm84618|Taxi Driver|MOVIE|drama|   Cybill Shepherd|ACTOR|       8.2|     8.179|
|tm84618|Taxi Driver|MOVIE|drama|       Peter Boyle|ACTOR|       8.2|     8.179|
|tm84618|Taxi Driver|MOVIE|drama|    Leonard Harris|ACTOR|       8.2|     8.179|
|tm84618|Taxi Driver|MOVIE|drama|    Diahnne Abbott|ACTOR|       8.2|     8.179|
|tm84618|Taxi Driver|MOVIE|drama|       Gino Ardito|ACTOR|       8.2|     8.179|
|tm84618|Taxi Driver|MOVIE|d

In [38]:
# Number of rows produced by the titles/credits join.
transformed_data.count()

84