In [1]:
from pyspark.sql import SparkSession

In [2]:
# Initialize SparkSession
spark = SparkSession.builder \
    .appName('postgresql_connection') \
    .getOrCreate()

In [3]:
# PostgreSQL connection parameters
database = "skyminyr_development"
user = "postgres"
password = "password"
url = f"jdbc:postgresql://global-db:5432/{database}"

In [4]:
# Read table names from PostgreSQL metadata
table_names = spark.read \
    .format("jdbc") \
    .option("url", url) \
    .option("user", user) \
    .option("password", password) \
    .option("driver", "org.postgresql.Driver") \
    .option("dbtable", "(SELECT table_name FROM information_schema.tables WHERE table_schema = 'public') as tables") \
    .load()

In [5]:


# Convert the DataFrame to a list of table names
table_list = table_names.select("table_name").rdd.flatMap(lambda x: x).collect()

# Print the list of table names
print("Available Tables:")
for table_name in table_list:
    print(table_name)

# Now you can use this list to access each table individually

Available Tables:
ar_internal_metadata
companies
company_annual_revenues
company_customers
company_events
company_funding_round_investors
company_funding_rounds
company_headcounts
company_locations
company_names
company_sectors
company_social_urls
company_stock_tickers
people
person_customers
person_educations
person_employments
person_social_urls
schema_migrations


In [6]:
# Function to load data from a table into a DataFrame
def load_table(table_name):
    df = spark.read \
        .format("jdbc") \
        .option("url", url) \
        .option("dbtable", table_name) \
        .option("user", user) \
        .option("password", password) \
        .option("driver", "org.postgresql.Driver") \
        .load()
    return df

# Load data from the "people" table into a DataFrame
people_df = load_table("people")

people_df.show(5)
people_df = people_df.withColumnRenamed("address", "address_ppl")
people_df = people_df.withColumnRenamed("id", "person_id_ppl")
people_df = people_df.withColumnRenamed("created_at", "created_at_ppl")
people_df = people_df.withColumnRenamed("updated_at", "updated_at_ppl")
people_df.show(5)

+---------+-----------------+--------------------+--------------------+-------+-----------+--------------+-----------+-------+------------+---------------+--------------------+--------------------+--------------------+----------+----------+----------+--------------------+--------------------+
|       id|             name|        linkedin_url|           photo_url|address|       city|        region|postal_code|country|company_name|seniority_level|         description|               title|            headline|started_on|  ended_on|company_id|          created_at|          updated_at|
+---------+-----------------+--------------------+--------------------+-------+-----------+--------------+-----------+-------+------------+---------------+--------------------+--------------------+--------------------+----------+----------+----------+--------------------+--------------------+
|693540132|  Lika Razac-Ince|https://www.linke...|https://media-exp...|   NULL|       NULL|          NULL|       NULL|

In [7]:
person_customer_df = load_table("person_customers")
person_customer_df = person_customer_df.withColumnRenamed("person_id", "person_id_pc")
person_customer_df = person_customer_df.withColumnRenamed("created_at", "created_at_pc")
person_customer_df = person_customer_df.withColumnRenamed("updated_at", "updated_at_pc")
person_customer_df = person_customer_df.withColumnRenamed("id", "id_pc")
person_customer_df.show(5)

+-----+------------+--------------------+--------------------+--------------------+
|id_pc|person_id_pc|         customer_id|       created_at_pc|       updated_at_pc|
+-----+------------+--------------------+--------------------+--------------------+
|    1|   693540132|14b76a2f-4545-4fb...|2023-12-18 11:12:...|2023-12-18 11:12:...|
|    2|    52762066|7538cb11-1c23-4c7...|2024-01-08 12:16:...|2024-01-08 12:16:...|
|    3|     1567835|7538cb11-1c23-4c7...|2024-01-16 13:22:...|2024-01-16 13:22:...|
|    4|    16135235|7538cb11-1c23-4c7...|2024-01-16 13:26:...|2024-01-16 13:26:...|
|    5|    48171849|7538cb11-1c23-4c7...|2024-01-16 13:29:...|2024-01-16 13:29:...|
+-----+------------+--------------------+--------------------+--------------------+
only showing top 5 rows



In [8]:
from pyspark.sql.functions import count
#filt_df = person_customer_df.where(person_customer_df.person_id_pc == 693540132)
#filt_df.show()

In [9]:
# Group by person_id and count the occurrences
grouped_df = person_customer_df.groupBy("person_id_pc").agg(count("*").alias("count"))


# Filter rows with count > 1
filtered_df = grouped_df.filter(grouped_df["count"] > 1)
filtered_df.show(5)

# Join with original DataFrame to get the rows
result_df = person_customer_df.join(filtered_df, "person_id_pc", "inner")

# Show the result
result_df.show()

+------------+-----+
|person_id_pc|count|
+------------+-----+
|   693540132|    2|
+------------+-----+

+------------+-----+--------------------+--------------------+--------------------+-----+
|person_id_pc|id_pc|         customer_id|       created_at_pc|       updated_at_pc|count|
+------------+-----+--------------------+--------------------+--------------------+-----+
|   693540132|    6|7538cb11-1c23-4c7...|2024-01-19 10:13:...|2024-01-19 10:13:...|    2|
|   693540132|    1|14b76a2f-4545-4fb...|2023-12-18 11:12:...|2023-12-18 11:12:...|    2|
+------------+-----+--------------------+--------------------+--------------------+-----+



In [10]:
person_customer_df.show(5)
#filt_df = person_customer_df.where(person_customer_df.person_id_pc == 693540132)
#filt_df.show()

+-----+------------+--------------------+--------------------+--------------------+
|id_pc|person_id_pc|         customer_id|       created_at_pc|       updated_at_pc|
+-----+------------+--------------------+--------------------+--------------------+
|    1|   693540132|14b76a2f-4545-4fb...|2023-12-18 11:12:...|2023-12-18 11:12:...|
|    2|    52762066|7538cb11-1c23-4c7...|2024-01-08 12:16:...|2024-01-08 12:16:...|
|    3|     1567835|7538cb11-1c23-4c7...|2024-01-16 13:22:...|2024-01-16 13:22:...|
|    4|    16135235|7538cb11-1c23-4c7...|2024-01-16 13:26:...|2024-01-16 13:26:...|
|    5|    48171849|7538cb11-1c23-4c7...|2024-01-16 13:29:...|2024-01-16 13:29:...|
+-----+------------+--------------------+--------------------+--------------------+
only showing top 5 rows



In [11]:
from pyspark.sql import functions as F

# Group by "person_id" and concatenate values of other columns
unique_person_customer_df = person_customer_df.groupBy("person_id_pc").agg(
    F.concat_ws(",", F.collect_list("created_at_pc")).alias("created_at_pc"),
    F.concat_ws(",", F.collect_list("customer_id")).alias("customer_id_pc"),
    F.concat_ws(",", F.collect_list("id_pc")).alias("id_pc"),
    F.concat_ws(",", F.collect_list("updated_at_pc")).alias("updated_at_pc")
)

# # Show the unique DataFrame
unique_person_customer_df.show(5)

+------------+--------------------+--------------------+-----+--------------------+
|person_id_pc|       created_at_pc|      customer_id_pc|id_pc|       updated_at_pc|
+------------+--------------------+--------------------+-----+--------------------+
|      101255|2024-02-07 11:24:...|7538cb11-1c23-4c7...|    7|2024-02-07 11:24:...|
|      124469|2024-02-07 11:24:...|7538cb11-1c23-4c7...|    8|2024-02-07 11:24:...|
|      149312|2024-02-07 11:24:...|7538cb11-1c23-4c7...|    9|2024-02-07 11:24:...|
|      175688|2024-02-07 11:24:...|7538cb11-1c23-4c7...|   10|2024-02-07 11:24:...|
|      225814|2024-02-07 11:24:...|7538cb11-1c23-4c7...|   11|2024-02-07 11:24:...|
+------------+--------------------+--------------------+-----+--------------------+
only showing top 5 rows



In [12]:
# # Filter the unique DataFrame for person_id 693540132
# specific_person_df = unique_person_customer_df.filter(unique_person_customer_df["person_id"] == "693540132")

# # Show the DataFrame for person_id 693540132
# specific_person_df.show(truncate=False)

In [13]:
# Count the distinct occurrences of person_id
distinct_count = unique_person_customer_df.select("person_id_pc").distinct().count()

# Count the total number of rows in the DataFrame
total_rows = unique_person_customer_df.count()

# Check if the DataFrame is unique based on person_id
if distinct_count == total_rows:
    print("DataFrame is unique based on person_id_pc")
else:
    print("DataFrame is not unique based on person_id_pc")


DataFrame is unique based on person_id_pc


In [14]:
unique_person_customer_df.show(5)

+------------+--------------------+--------------------+-----+--------------------+
|person_id_pc|       created_at_pc|      customer_id_pc|id_pc|       updated_at_pc|
+------------+--------------------+--------------------+-----+--------------------+
|      101255|2024-02-07 11:24:...|7538cb11-1c23-4c7...|    7|2024-02-07 11:24:...|
|      124469|2024-02-07 11:24:...|7538cb11-1c23-4c7...|    8|2024-02-07 11:24:...|
|      149312|2024-02-07 11:24:...|7538cb11-1c23-4c7...|    9|2024-02-07 11:24:...|
|      175688|2024-02-07 11:24:...|7538cb11-1c23-4c7...|   10|2024-02-07 11:24:...|
|      225814|2024-02-07 11:24:...|7538cb11-1c23-4c7...|   11|2024-02-07 11:24:...|
+------------+--------------------+--------------------+-----+--------------------+
only showing top 5 rows



In [15]:
people_df.show(5)

+-------------+-----------------+--------------------+--------------------+-----------+-----------+--------------+-----------+-------+------------+---------------+--------------------+--------------------+--------------------+----------+----------+----------+--------------------+--------------------+
|person_id_ppl|             name|        linkedin_url|           photo_url|address_ppl|       city|        region|postal_code|country|company_name|seniority_level|         description|               title|            headline|started_on|  ended_on|company_id|      created_at_ppl|      updated_at_ppl|
+-------------+-----------------+--------------------+--------------------+-----------+-----------+--------------+-----------+-------+------------+---------------+--------------------+--------------------+--------------------+----------+----------+----------+--------------------+--------------------+
|    693540132|  Lika Razac-Ince|https://www.linke...|https://media-exp...|       NULL|       

In [16]:
# Join people_df and unique_person_customer_df based on person_id of unique_person_customer_df and id of people_df
person_customer_joined_df = unique_person_customer_df.join(
    people_df,
    unique_person_customer_df["person_id_pc"] == people_df["person_id_ppl"],
    "inner"
)

# Show the joined DataFrame
person_customer_joined_df.show(5)


+------------+--------------------+--------------------+-----+--------------------+-------------+-----------------+--------------------+--------------------+-----------+------+------+-----------+---------+----------------+---------------+--------------------+--------------------+--------------------+----------+----------+----------+--------------------+--------------------+
|person_id_pc|       created_at_pc|      customer_id_pc|id_pc|       updated_at_pc|person_id_ppl|             name|        linkedin_url|           photo_url|address_ppl|  city|region|postal_code|  country|    company_name|seniority_level|         description|               title|            headline|started_on|  ended_on|company_id|      created_at_ppl|      updated_at_ppl|
+------------+--------------------+--------------------+-----+--------------------+-------------+-----------------+--------------------+--------------------+-----------+------+------+-----------+---------+----------------+---------------+--------

In [17]:
person_educations_df = load_table("person_educations")
person_educations_df.show(5)
person_educations_df =person_educations_df.withColumnRenamed("person_id","person_id_ped")
person_educations_df.show(5)

+-----+---------+--------------+--------------------+--------------------+------------------+----------+----------+--------------------+----------+
|   id|person_id|institution_id|    institution_name|              degree|           subject|started_on|  ended_on|          created_at|updated_at|
+-----+---------+--------------+--------------------+--------------------+------------------+----------+----------+--------------------+----------+
|12091| 64876372|          NULL|University Of Win...|Bachelors;Bachelo...|           Geology|1980-01-01|1984-01-01|2023-12-11 06:18:...|      NULL|
|12092| 64876372|          NULL|University Of Win...|Master Of Science...|           Geology|1984-01-01|1986-01-01|2023-12-11 06:18:...|      NULL|
|12093| 64876372|          NULL| La Trobe University|Doctorates;Doctor...|     Earth Science|1988-01-01|1992-01-01|2023-12-11 06:18:...|      NULL|
|12094| 67121188|       8522590|Keller Graduate S...|                NULL|Project Management|2013-01-01|2015-01-

In [18]:
from pyspark.sql import functions as F

# Define a struct for each institute
institute_struct = F.struct(
    "institution_id", "institution_name", "degree", "subject", "started_on", "ended_on"
)

# Group by person_id and collect list of institute details as structs
grouped_educations_df = person_educations_df.groupBy("person_id_ped").agg(
    F.collect_list(institute_struct).alias("institute"), 
    # ("institute - (institution_id, institution_name, degree, subject, started_on, ended_on)"),
    F.max("created_at").alias("latest_edu_created_at"),
    F.max("updated_at").alias("latest_edu_updated_at")
)

# Show the result
grouped_educations_df.show(5,truncate=False)


+-------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+---------------------+
|person_id_ped|institute                                                                                                                                                                                                                                                                  |latest_edu_created_at     |latest_edu_updated_at|
+-------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+---------------------+
|

In [19]:
# Count the distinct occurrences of person_id
distinct_count = grouped_educations_df.select("person_id_ped").distinct().count()

# Count the total number of rows in the DataFrame
total_rows = grouped_educations_df.count()

# Check if the DataFrame is unique based on person_id
if distinct_count == total_rows:
    print("DataFrame is unique based on person_id")
else:
    print("DataFrame is not unique based on person_id")


DataFrame is unique based on person_id


In [20]:
from pyspark.sql.functions import col
# Set the option to display without truncation
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)

# Filter the DataFrame to select rows where person_id_ped = 64876372
filtered_df = grouped_educations_df.filter(col("person_id_ped") == 64876372)

# Show the DataFrame
filtered_df.show(truncate=False)

+-------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------+---------------------+
|person_id_ped|institute                                                                                                                                                                                                                                                                                      |latest_edu_created_at     |latest_edu_updated_at|
+-------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [21]:


# Join people_people_customer_joined_df and grouped_educations_df based on person_id
person_customer_education_joined_df = person_customer_joined_df.join(
    grouped_educations_df,
    person_customer_joined_df.person_id_pc == grouped_educations_df.person_id_ped,
    "inner"
)

# # Add filter to remove rows where Institute column is null
person_customer_education_joined_df = person_customer_education_joined_df.filter(
    col("institute").isNotNull()
).drop(grouped_educations_df.person_id_ped)

# Show the result
person_customer_education_joined_df.show(5)


+------------+--------------------+--------------------+-----+--------------------+-------------+---------------+--------------------+--------------------+-----------+-------+--------+-----------+---------+----------------+---------------+--------------------+--------------------+--------------------+----------+----------+----------+--------------------+--------------------+--------------------+---------------------+---------------------+
|person_id_pc|       created_at_pc|      customer_id_pc|id_pc|       updated_at_pc|person_id_ppl|           name|        linkedin_url|           photo_url|address_ppl|   city|  region|postal_code|  country|    company_name|seniority_level|         description|               title|            headline|started_on|  ended_on|company_id|      created_at_ppl|      updated_at_ppl|           institute|latest_edu_created_at|latest_edu_updated_at|
+------------+--------------------+--------------------+-----+--------------------+-------------+---------------+-

In [22]:
# Drop the specified columns
person_customer_education_joined_df = person_customer_education_joined_df.drop("linkedin_url", "photo_url")

# Show the trimmed DataFrame
person_customer_education_joined_df.show(5, truncate=False)


+------------+--------------------------+------------------------------------+-----+--------------------------+-------------+---------------+-----------+-------+--------+-----------+---------+----------------+---------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------+-----------------------------------------------------------------+----------+----------+----------+-------------------------+--------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [23]:
person_employments_df = load_table("person_employments")
person_employments_df.show(2)

+-----+---------+----------+--------------------+---------------+--------------------+----------+----------+--------------------+----------+
|   id|person_id|company_id|        company_name|seniority_level|               title|started_on|  ended_on|          created_at|updated_at|
+-----+---------+----------+--------------------+---------------+--------------------+----------+----------+--------------------+----------+
|78981|693540132|      NULL|                NULL|           NULL|Professional Phot...|2018-04-01|      NULL|2023-12-11 06:18:...|      NULL|
|78982|693540132|   8911699|Interventional Pa...|    Head of ...|     Head of Digital|2006-11-01|2012-04-01|2023-12-11 06:18:...|      NULL|
+-----+---------+----------+--------------------+---------------+--------------------+----------+----------+--------------------+----------+
only showing top 2 rows



In [24]:
from pyspark.sql import functions as F

# Define a struct for each employment
employment_struct = F.struct(
    "company_id", "company_name", "seniority_level", "title", "started_on", "ended_on"
)

# Group by person_id and collect list of employment details as structs
grouped_employments_df = person_employments_df.groupBy("person_id").agg(
    F.collect_list(employment_struct).alias("employments"),
    F.max("created_at").alias("latest_emp_created_at"),
    F.max("updated_at").alias("latest_emp_updated_at")
)

grouped_employments_df = grouped_employments_df.withColumnRenamed("person_id","person_id_emp")
# Show the result
grouped_employments_df.show(5,truncate=False)


+-------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [25]:
# Join person_customer_education_joined_df and grouped_employments_df based on person_id
final_df = person_customer_education_joined_df.join(
    grouped_employments_df,
    person_customer_education_joined_df.person_id_pc == grouped_employments_df.person_id_emp,
    "inner"
).drop(grouped_employments_df.person_id_emp)

# Show the result
final_df.show(5)


+------------+--------------------+--------------------+-----+--------------------+-------------+---------------+-----------+-------+--------+-----------+---------+----------------+---------------+--------------------+--------------------+--------------------+----------+----------+----------+--------------------+--------------------+--------------------+---------------------+---------------------+--------------------+---------------------+---------------------+
|person_id_pc|       created_at_pc|      customer_id_pc|id_pc|       updated_at_pc|person_id_ppl|           name|address_ppl|   city|  region|postal_code|  country|    company_name|seniority_level|         description|               title|            headline|started_on|  ended_on|company_id|      created_at_ppl|      updated_at_ppl|           institute|latest_edu_created_at|latest_edu_updated_at|         employments|latest_emp_created_at|latest_emp_updated_at|
+------------+--------------------+--------------------+-----+------

In [26]:
person_social_urls_df = load_table("person_social_urls")
person_social_urls_df.show(5)

+-----+---------+--------------------+--------+--------------------+----------+
|   id|person_id|                 url|url_type|          created_at|updated_at|
+-----+---------+--------------------+--------+--------------------+----------+
|12791|693540132|https://www.linke...|linkedin|2023-12-11 06:18:...|      NULL|
|12792|118418234|https://www.linke...|linkedin|2023-12-11 06:18:...|      NULL|
|12793|118418234|facebook.com/kenn...|facebook|2023-12-11 06:18:...|      NULL|
|12794| 64876372|https://www.linke...|linkedin|2023-12-11 06:18:...|      NULL|
|12795| 64876372|facebook.com/docl...|facebook|2023-12-11 06:18:...|      NULL|
+-----+---------+--------------------+--------+--------------------+----------+
only showing top 5 rows



In [27]:
from pyspark.sql import functions as F

# Group by person_id and collect list of URL details
grouped_urls_df = person_social_urls_df.groupBy("person_id").agg(
    F.collect_list(
        F.struct("url_type", "url")
    ).alias("urls"),
    F.max("created_at").alias("latest_url_created_at"),
    F.max("updated_at").alias("latest_url_updated_at")
)

# Show the result

grouped_urls_df = grouped_urls_df.withColumnRenamed("person_id","person_id_url")
grouped_urls_df.show(5,truncate=False)


+-------------+------------------------------------------------------------------------------------------------------------------------------------------+-------------------------+---------------------+
|person_id_url|urls                                                                                                                                      |latest_url_created_at    |latest_url_updated_at|
+-------------+------------------------------------------------------------------------------------------------------------------------------------------+-------------------------+---------------------+
|101255       |[{linkedin, https://www.linkedin.com/in/richardshusterman}, {crunchbase, https://crunchbase.com/person/richard-shusterman}]               |2023-12-11 06:18:57.46109|NULL                 |
|124469       |[{linkedin, https://www.linkedin.com/in/drazvan}, {crunchbase, https://crunchbase.com/person/razvan-dinu}]                                |2023-12-11 06:18:57.46109|NULL    

In [28]:
#grouped_urls_df = grouped_urls_df.drop("urls")
#grouped_urls_df.show(5,truncate=False)

In [29]:
# Join final_df and grouped_urls_df based on person_id
result_df = final_df.join(
    grouped_urls_df,
    final_df.person_id_pc == grouped_urls_df.person_id_url,
    "inner"
).drop(grouped_urls_df.person_id_url)

# Show the result
result_df.show(5)


+------------+--------------------+--------------------+-----+--------------------+-------------+---------------+-----------+-------+--------+-----------+---------+----------------+---------------+--------------------+--------------------+--------------------+----------+----------+----------+--------------------+--------------------+--------------------+---------------------+---------------------+--------------------+---------------------+---------------------+--------------------+---------------------+---------------------+
|person_id_pc|       created_at_pc|      customer_id_pc|id_pc|       updated_at_pc|person_id_ppl|           name|address_ppl|   city|  region|postal_code|  country|    company_name|seniority_level|         description|               title|            headline|started_on|  ended_on|company_id|      created_at_ppl|      updated_at_ppl|           institute|latest_edu_created_at|latest_edu_updated_at|         employments|latest_emp_created_at|latest_emp_updated_at|    

In [30]:
# Count the distinct occurrences of person_id
distinct_count = result_df.select("person_id_pc").distinct().count()

# Count the total number of rows in the DataFrame
total_rows = result_df.count()

# Check if the DataFrame is unique based on person_id
if distinct_count == total_rows:
    print("DataFrame is unique based on person_id")
else:
    print("DataFrame is not unique based on person_id")

DataFrame is unique based on person_id


In [31]:
# Define the columns to show
columns_to_show = ["person_id_pc", "name", "description", "title", "headline", "city", "region", "postal_code", "country","employments", "institute","urls"]

# Select the specified columns and show the result
result_df.select(columns_to_show).show(5)


+------------+---------------+--------------------+--------------------+--------------------+-------+--------+-----------+---------+--------------------+--------------------+--------------------+
|person_id_pc|           name|         description|               title|            headline|   city|  region|postal_code|  country|         employments|           institute|                urls|
+------------+---------------+--------------------+--------------------+--------------------+-------+--------+-----------+---------+--------------------+--------------------+--------------------+
|   949256266|    Vivian Weng|                NULL|Vice President, I...|Product Design, S...|   NULL|    NULL|       NULL|SINGAPORE|[{15861, DBS Bank...|[{NULL, Universit...|[{linkedin, https...|
|    48171849|Courtney Turner|The search for so...|    Creative Trainer|                NULL| Toledo|    Ohio|       NULL|       US|[{1660, Apple, NU...|[{NULL, The Unive...|[{linkedin, https...|
|    74707094|  Kati

In [32]:
# Select the specified columns and show the first row
result_df.select(columns_to_show).first()

Row(person_id_pc=949256266, name='Vivian Weng', description=None, title='Vice President, Innovation Group', headline='Product Design, Strategy & Managment', city=None, region=None, postal_code=None, country='SINGAPORE', employments=[Row(company_id=15861, company_name='DBS Bank', seniority_level='VP', title='Vice President, Innovation Group', started_on=datetime.date(2016, 7, 1), ended_on=datetime.date(2019, 4, 1)), Row(company_id=7962172, company_name='Frog', seniority_level='Director', title='Associate Strategy Director, Innovation Strategy Group', started_on=datetime.date(2012, 2, 1), ended_on=datetime.date(2016, 6, 1)), Row(company_id=4007, company_name='McKinsey & Company', seniority_level=None, title='Business Analyst', started_on=datetime.date(2004, 1, 1), ended_on=datetime.date(2007, 1, 1)), Row(company_id=4007, company_name='McKinsey & Company', seniority_level=None, title='Senior Associate, Corporate Finance', started_on=datetime.date(2009, 9, 1), ended_on=datetime.date(2011, 

In [33]:
result_df.show(5)
# paruest_ds = result_df.write.parquet("output.parquet")

+------------+--------------------+--------------------+-----+--------------------+-------------+---------------+-----------+-------+--------+-----------+---------+----------------+---------------+--------------------+--------------------+--------------------+----------+----------+----------+--------------------+--------------------+--------------------+---------------------+---------------------+--------------------+---------------------+---------------------+--------------------+---------------------+---------------------+
|person_id_pc|       created_at_pc|      customer_id_pc|id_pc|       updated_at_pc|person_id_ppl|           name|address_ppl|   city|  region|postal_code|  country|    company_name|seniority_level|         description|               title|            headline|started_on|  ended_on|company_id|      created_at_ppl|      updated_at_ppl|           institute|latest_edu_created_at|latest_edu_updated_at|         employments|latest_emp_created_at|latest_emp_updated_at|    

In [34]:
rows = result_df.select(columns_to_show).collect()

# Loop through each row and perform some operation
for row in rows:
    # Your operation goes here
    # For example, let's print each row as a string
    print(row.asDict())

{'person_id_pc': 949256266, 'name': 'Vivian Weng', 'description': None, 'title': 'Vice President, Innovation Group', 'headline': 'Product Design, Strategy & Managment', 'city': None, 'region': None, 'postal_code': None, 'country': 'SINGAPORE', 'employments': [Row(company_id=15861, company_name='DBS Bank', seniority_level='VP', title='Vice President, Innovation Group', started_on=datetime.date(2016, 7, 1), ended_on=datetime.date(2019, 4, 1)), Row(company_id=7962172, company_name='Frog', seniority_level='Director', title='Associate Strategy Director, Innovation Strategy Group', started_on=datetime.date(2012, 2, 1), ended_on=datetime.date(2016, 6, 1)), Row(company_id=4007, company_name='McKinsey & Company', seniority_level=None, title='Business Analyst', started_on=datetime.date(2004, 1, 1), ended_on=datetime.date(2007, 1, 1)), Row(company_id=4007, company_name='McKinsey & Company', seniority_level=None, title='Senior Associate, Corporate Finance', started_on=datetime.date(2009, 9, 1), en

In [35]:
result_df.printSchema()

root
 |-- person_id_pc: long (nullable = true)
 |-- created_at_pc: string (nullable = false)
 |-- customer_id_pc: string (nullable = false)
 |-- id_pc: string (nullable = false)
 |-- updated_at_pc: string (nullable = false)
 |-- person_id_ppl: long (nullable = true)
 |-- name: string (nullable = true)
 |-- address_ppl: string (nullable = true)
 |-- city: string (nullable = true)
 |-- region: string (nullable = true)
 |-- postal_code: string (nullable = true)
 |-- country: string (nullable = true)
 |-- company_name: string (nullable = true)
 |-- seniority_level: string (nullable = true)
 |-- description: string (nullable = true)
 |-- title: string (nullable = true)
 |-- headline: string (nullable = true)
 |-- started_on: date (nullable = true)
 |-- ended_on: date (nullable = true)
 |-- company_id: long (nullable = true)
 |-- created_at_ppl: timestamp (nullable = true)
 |-- updated_at_ppl: timestamp (nullable = true)
 |-- institute: array (nullable = false)
 |    |-- element: struct (con

In [38]:
#result_df = result_df.drop("created_at")
# Write the DataFrame to a Parquet file, overwriting if it already exists
result_df.write.mode("overwrite").parquet("final_output/df.parquet")



In [41]:
# Read the parquet file and create a new DataFrame
new_df = spark.read.parquet("final_output/df.parquet")

# Display a row
new_df.show(10)

+------------+--------------------+--------------------+-----+--------------------+-------------+----------------+-----------+-------------+----------+-----------+---------+--------------------+---------------+--------------------+--------------------+--------------------+----------+----------+----------+--------------------+--------------------+--------------------+---------------------+---------------------+--------------------+---------------------+---------------------+--------------------+---------------------+---------------------+
|person_id_pc|       created_at_pc|      customer_id_pc|id_pc|       updated_at_pc|person_id_ppl|            name|address_ppl|         city|    region|postal_code|  country|        company_name|seniority_level|         description|               title|            headline|started_on|  ended_on|company_id|      created_at_ppl|      updated_at_ppl|           institute|latest_edu_created_at|latest_edu_updated_at|         employments|latest_emp_created_at|

In [42]:
print(type(new_df))
new_df.printSchema()

<class 'pyspark.sql.dataframe.DataFrame'>
root
 |-- person_id_pc: long (nullable = true)
 |-- created_at_pc: string (nullable = true)
 |-- customer_id_pc: string (nullable = true)
 |-- id_pc: string (nullable = true)
 |-- updated_at_pc: string (nullable = true)
 |-- person_id_ppl: long (nullable = true)
 |-- name: string (nullable = true)
 |-- address_ppl: string (nullable = true)
 |-- city: string (nullable = true)
 |-- region: string (nullable = true)
 |-- postal_code: string (nullable = true)
 |-- country: string (nullable = true)
 |-- company_name: string (nullable = true)
 |-- seniority_level: string (nullable = true)
 |-- description: string (nullable = true)
 |-- title: string (nullable = true)
 |-- headline: string (nullable = true)
 |-- started_on: date (nullable = true)
 |-- ended_on: date (nullable = true)
 |-- company_id: long (nullable = true)
 |-- created_at_ppl: timestamp (nullable = true)
 |-- updated_at_ppl: timestamp (nullable = true)
 |-- institute: array (nullable =

In [43]:


# Assuming your DataFrame is named 'new_df'
# Select the columns you want to concatenate into a single string
columns_to_combine = ["person_id_pc", "created_at_pc", "customer_id_pc", "id_pc", "updated_at_pc", "person_id_ppl",
                      "name", "address_ppl", "city", "region", "postal_code", "country", "company_name",
                      "seniority_level", "description", "title", "headline", "started_on", "ended_on", "company_id",
                      "created_at_ppl", "updated_at_ppl", "institute", "latest_edu_created_at", "latest_edu_updated_at",
                      "employments", "latest_emp_created_at", "latest_emp_updated_at", "urls", "latest_url_created_at",
                      "latest_url_updated_at"]

# Combine selected columns into a single column
combined_column = concat_ws(", ", *[col(col_name).cast("string") for col_name in columns_to_combine])

# Add the combined column to the DataFrame
df_with_combined = new_df.withColumn("combined_row", combined_column)

# Show the DataFrame with the combined row
df_with_combined.select("combined_row").show(1,truncate=False)



NameError: name 'concat_ws' is not defined

In [46]:
'''from pyspark.sql.functions import col, explode

# Flatten the 'urls' array of structs
urls_df = new_df.select(
    col("person_id_pc"),
    explode("urls").alias("url_struct")
).select(
    col("person_id_pc"),
    col("url_struct.url_type").alias("url_type"),
    col("url_struct.url").alias("url")
)

# Flatten the 'employments' array of structs
employments_df = new_df.select(
    col("person_id_pc"),
    explode("employments").alias("employment_struct")
).select(
    col("person_id_pc"),
    col("employment_struct.company_id").alias("company_id_emp"),
    col("employment_struct.company_name").alias("company_name_emp"),
    col("employment_struct.seniority_level").alias("seniority_level_emp"),
    col("employment_struct.title").alias("employment_title_emp"),
    col("employment_struct.started_on").alias("employment_started_on_emp"),
    col("employment_struct.ended_on").alias("employment_ended_on_emp")
)

# Flatten the 'institute' array of structs
institute_df = new_df.select(
    col("person_id_pc"),
    explode("institute").alias("institute_struct")
).select(
    col("institute_struct.institution_id").alias("institution_id_inst"),
    col("institute_struct.institution_name").alias("institution_name_inst"),
    col("institute_struct.degree").alias("degree_inst"),
    col("institute_struct.subject").alias("subject_inst"),
    col("institute_struct.started_on").alias("institute_started_on_inst"),
    col("institute_struct.ended_on").alias("institute_ended_on_inst")
)

# Join all the flattened DataFrames
flattened_df = new_df.join(urls_df, on="person_id_pc", how="left") \
    .join(employments_df, on=["person_id_pc"], how="left") \
    .join(institute_df, on=["person_id_pc"], how="left")

# Write flattened DataFrame to CSV
flattened_df.coalesce(1).write.csv("output.csv", header=True)'''



'from pyspark.sql.functions import col, explode\n\n# Flatten the \'urls\' array of structs\nurls_df = new_df.select(\n    col("person_id_pc"),\n    explode("urls").alias("url_struct")\n).select(\n    col("person_id_pc"),\n    col("url_struct.url_type").alias("url_type"),\n    col("url_struct.url").alias("url")\n)\n\n# Flatten the \'employments\' array of structs\nemployments_df = new_df.select(\n    col("person_id_pc"),\n    explode("employments").alias("employment_struct")\n).select(\n    col("person_id_pc"),\n    col("employment_struct.company_id").alias("company_id_emp"),\n    col("employment_struct.company_name").alias("company_name_emp"),\n    col("employment_struct.seniority_level").alias("seniority_level_emp"),\n    col("employment_struct.title").alias("employment_title_emp"),\n    col("employment_struct.started_on").alias("employment_started_on_emp"),\n    col("employment_struct.ended_on").alias("employment_ended_on_emp")\n)\n\n# Flatten the \'institute\' array of structs\ninst