In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import count, col, concat_ws, row_number, desc
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.types import StructType, StructField, StringType, TimestampType

In [3]:
# Initialize SparkSession
spark = SparkSession.builder \
    .appName('postgresql_connection') \
    .getOrCreate()

In [4]:
# PostgreSQL connection parameters
database = "global_development"
user = "postgres"
password = "password"
url = f"jdbc:postgresql://global-db:5432/{database}"

In [5]:
# Read table names from PostgreSQL metadata
table_names = spark.read \
    .format("jdbc") \
    .option("url", url) \
    .option("user", user) \
    .option("password", password) \
    .option("driver", "org.postgresql.Driver") \
    .option("dbtable", "(SELECT table_name FROM information_schema.tables WHERE table_schema = 'public') as tables") \
    .load()

In [6]:


# Convert the DataFrame to a list of table names
table_list = table_names.select("table_name").rdd.flatMap(lambda x: x).collect()

# Print the list of table names
print("Available Tables:")
for table_name in table_list:
    print(table_name)

# Now you can use this list to access each table individually

Available Tables:
ar_internal_metadata
companies
company_annual_revenues
company_customers
company_events
company_funding_round_investors
company_funding_rounds
company_headcounts
company_locations
company_names
company_sectors
company_social_urls
company_stock_tickers
people
person_customers
person_educations
person_employments
person_social_urls
schema_migrations
processed_data


In [7]:
# Function to load data from a table into a DataFrame
def load_table(table_name):
    df = spark.read \
        .format("jdbc") \
        .option("url", url) \
        .option("dbtable", table_name) \
        .option("user", user) \
        .option("password", password) \
        .option("driver", "org.postgresql.Driver") \
        .load()
    return df

In [None]:
# Load data from the people related tables into a DataFrame
people_df = load_table("people")
person_customer_df = load_table("person_customers")
person_educations_df = load_table("person_educations")
person_employments_df = load_table("person_employments")
person_social_urls_df = load_table("person_social_urls")


In [None]:
#function to check the uniqueness of the dataframe
def check_uniqueness(df, column_name):
    # Add a new column 'is_duplicate' that flags if column_name is duplicated
    df_duplicates_check = df.withColumn('is_duplicate', count(column_name).over(Window.partitionBy(column_name)) > 1)

    # If any 'is_duplicate' is True, then DataFrame is not unique based on column_name
    if df_duplicates_check.filter(col('is_duplicate')).count() > 0:
        print(f"DataFrame is not unique based on {column_name}")
    else:
        print(f"DataFrame is unique based on {column_name}")

In [None]:
check_uniqueness(people_df, 'id')

In [None]:
# Select specific columns from the 'people_df' DataFrame, rename the 'id' column to 'person_id',
# and concatenate 'address', 'city', 'region', 'postal_code', 'country' columns with a comma separator
people_selected_df = people_df.select('id', 'name', 'address', 'city', 'region', 'postal_code', 'country', 'headline','description', 'created_at', 'updated_at')\
                              .withColumnRenamed('id', 'person_id')\
                              .withColumn('address', concat_ws(', ', 'address', 'city', 'region', 'postal_code', 'country'))

people_selected_df = people_selected_df.select('person_id', 'name', 'address','headline','description', 'created_at', 'updated_at')

# people_selected_df.show()

***Transformation for the "person_customer" table***

In [None]:
person_customer_df.show()

In [None]:
# Call the 'check_uniqueness' function on 'person_customer_df' DataFrame to check if 'person_id' is unique
check_uniqueness(person_customer_df,'person_id')

In [None]:
# Group by 'person_id' and count the number of occurrences of each 'person_id'
duplicate_rows = person_customer_df.groupBy('person_id').agg(count('*').alias('count'))

# Filter the rows where 'count' is greater than 1 (i.e., 'person_id' is duplicated)
duplicate_rows = duplicate_rows.filter(duplicate_rows['count'] > 1)

# # Show the duplicate rows
# duplicate_rows.show()

In [None]:
# Define a window partitioned by person_id and ordered by updated_at in descending order
window = Window.partitionBy("person_id").orderBy(desc("updated_at"))

# Add a row_number column to the DataFrame
person_customer_df = person_customer_df.withColumn("rn", row_number().over(window))

# Filter the DataFrame to keep only the rows with rn = 1 (i.e., the latest updated_at for each person_id)
person_customer_df = person_customer_df.filter(person_customer_df.rn == 1)

# Drop the rn column
person_customer_df = person_customer_df.drop("rn")

# # Print the DataFrame to verify the result
# person_customer_df.show()

In [None]:
# Call the 'check_uniqueness' function on 'person_customer_df' DataFrame to check if 'person_id' is unique
check_uniqueness(person_customer_df,'person_id')

In [None]:
# Select 'person_id' and 'customer_id' columns from 'person_customer_df' DataFrame
people_selected_customer_df = person_customer_df.select('person_id', 'customer_id')

# Join 'people_selected_df' with 'people_selected_customer_df' on 'person_id'
people_person_customer_df = people_selected_df.join(people_selected_customer_df, on='person_id', how='inner')

# people_person_customer_df.show()

***Transformation for "Person Education" table***

In [None]:
person_educations_df.show()

In [None]:
# Define a struct for each institute
institute_struct = F.struct(
    "institution_id", "institution_name", "degree", "subject", "started_on", "ended_on"
)

# Group by person_id and collect list of institute details as structs
grouped_educations_df = person_educations_df.groupBy("person_id").agg(
    F.collect_list(institute_struct).alias("education")
    # ("education - (institution_id, institution_name, degree, subject, started_on, ended_on)")
)

person_education_group_df = grouped_educations_df


# Show the result
person_education_group_df.show(truncate=False)


In [None]:
check_uniqueness(person_education_group_df, 'person_id')

In [None]:
# Joining person_education_group_df and people_person_customer_df DataFrames on column named 'person_id'

people_person_customer_education_df = people_person_customer_df.join(person_education_group_df, on='person_id') 

people_person_customer_education_df.show()

In [None]:
# Group by person_id and collect list of URL details
grouped_urls_df = person_social_urls_df.groupBy("person_id").agg(
    F.collect_list(
        F.struct("url_type", "url")
    ).alias("urls")
)

# Show the result
grouped_urls_df.show(truncate=False)


In [None]:
check_uniqueness(grouped_urls_df, 'person_id')

In [None]:
# Joining person_education_group_df and people_person_customer_df DataFrames on column named 'person_id'

people_person_customer_education_social_urls_df = people_person_customer_education_df.join(grouped_urls_df, on='person_id') 

people_person_customer_education_social_urls_df.show()


In [None]:
people_person_customer_education_social_urls_df.show()

In [None]:
# Assuming that df is your DataFrame
df = people_person_customer_education_social_urls_df

# Convert DataFrame to JSON
json_df = df.toJSON().collect()


In [1]:
import json

# Assuming json_df is your data
with open('final_result/person_final_output.json', 'w') as f:
    json.dump(json_df, f)


NameError: name 'json_df' is not defined