In [5]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import count, col, concat_ws, row_number, desc
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.types import StructType, StructField, StringType, TimestampType
import pandas as pd
import os
import json

In [6]:
# Initialize SparkSession
spark = SparkSession.builder \
    .appName('aws_connection') \
    .getOrCreate()

In [8]:
# Define the base path
base_path = '/home/jovyan/work/data_files/'

# Read CSV files into DataFrames
people_df = pd.read_csv(base_path + 'people.csv')
#person_customer_df = pd.read_csv(base_path + 'person_customers.csv')
person_educations_df = pd.read_csv(base_path + 'person_educations.csv')
person_employments_df = pd.read_csv(base_path + 'person_employments.csv')
#person_social_urls_df = pd.read_csv(base_path + 'person_social_urls.csv')
company_sectors_df = pd.read_csv(base_path + 'company_sectors.csv')
company_headcounts_df = pd.read_csv(base_path + 'company_headcounts.csv')
company_annual_revenues_df = pd.read_csv(base_path + 'company_annual_revenues.csv')
company_stock_tickers_df = pd.read_csv(base_path + 'company_stock_tickers.csv')
#company_funding_rounds_df = pd.read_csv(base_path + 'company_funding_rounds.csv')


In [11]:
people_spark_df=spark.createDataFrame(people_df)
#person_customer_spark_df=spark.createDataFrame(person_customer_df)
person_educations_spark_df=spark.createDataFrame(person_educations_df)
person_employments_spark_df=spark.createDataFrame(person_employments_df)
#person_social_urls_spark_df=spark.createDataFrame(person_social_urls_df)
company_sectors_spark_df=spark.createDataFrame(company_sectors_df)
company_headcounts_spark_df=spark.createDataFrame(company_headcounts_df)
company_annual_revenues_spark_df=spark.createDataFrame(company_annual_revenues_df)
company_stock_tickers_spark_df=spark.createDataFrame(company_stock_tickers_df)
#company_funding_rounds_spark_df=spark.createDataFrame(company_funding_rounds_df)

In [10]:
people_spark_df.show(1,truncate=False)

+---------+---------------+---------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------+----+------+-----------+-------+------------+---------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------+----------------------------------+----------+--------+----------+-------------------------+----------------------+
|id       |name           |linkedin_url                                       |photo_url                                                           

In [7]:
person_customer_spark_df.show(1,truncate=False)

+---+---------+------------------------------------+--------------------------+--------------------------+
|id |person_id|customer_id                         |created_at                |updated_at                |
+---+---------+------------------------------------+--------------------------+--------------------------+
|1  |693540132|14b76a2f-4545-4fb7-92e9-ecd3a5f49a4b|2023-12-18 11:12:06.004224|2023-12-18 11:12:06.004224|
+---+---------+------------------------------------+--------------------------+--------------------------+
only showing top 1 row



In [12]:
person_educations_spark_df.show(1,truncate=False)

+-----+---------+--------------+---------------------+-----------------------------+-------+----------+----------+--------------------------+----------+
|id   |person_id|institution_id|institution_name     |degree                       |subject|started_on|ended_on  |created_at                |updated_at|
+-----+---------+--------------+---------------------+-----------------------------+-------+----------+----------+--------------------------+----------+
|12091|64876372 |NaN           |University Of Windsor|Bachelors;Bachelor Of Science|Geology|1980-01-01|1984-01-01|2023-12-11 06:18:37.445885|NaN       |
+-----+---------+--------------+---------------------+-----------------------------+-------+----------+----------+--------------------------+----------+
only showing top 1 row



In [9]:
person_employments_spark_df.show(1,truncate=False)

+-----+---------+----------+------------+---------------+-------------------------+----------+--------+----------+
|id   |person_id|company_id|company_name|seniority_level|title                    |started_on|ended_on|Unnamed: 8|
+-----+---------+----------+------------+---------------+-------------------------+----------+--------+----------+
|78981|693540132|NaN       |NaN         |NaN            |Professional Photographer|01-04-2018|NaN     |NaN       |
+-----+---------+----------+------------+---------------+-------------------------+----------+--------+----------+
only showing top 1 row



In [10]:
person_social_urls_spark_df.show(1,truncate=False)

+-----+---------+---------------------------------------------------+--------+-------------------------+----------+
|id   |person_id|url                                                |url_type|created_at               |updated_at|
+-----+---------+---------------------------------------------------+--------+-------------------------+----------+
|12791|693540132|https://www.linkedin.com/in/lika-razac-ince-2044385|linkedin|2023-12-11 06:18:57.46109|NaN       |
+-----+---------+---------------------------------------------------+--------+-------------------------+----------+
only showing top 1 row



In [11]:
company_sectors_spark_df.show(1,truncate=False)

+----------+--------+----------+----------+
|company_id|sector  |Unnamed: 2|Unnamed: 3|
+----------+--------+----------+----------+
|1660      |Hardware|NaN       |NaN       |
+----------+--------+----------+----------+
only showing top 1 row



In [12]:
company_headcounts_spark_df.show(1,truncate=False)

+----------+---------+----------+----------+----------+
|company_id|headcount|date      |Unnamed: 3|Unnamed: 4|
+----------+---------+----------+----------+----------+
|1660      |224704   |07-06-2023|NaN       |NaN       |
+----------+---------+----------+----------+----------+
only showing top 1 row



In [13]:
company_annual_revenues_spark_df.show(1,truncate=False)

+----------+----------+----------+----------+----------+
|company_id|amount_usd|date      |Unnamed: 3|Unnamed: 4|
+----------+----------+----------+----------+----------+
|1660      |7.983E9   |01-01-2000|NaN       |NaN       |
+----------+----------+----------+----------+----------+
only showing top 1 row



In [14]:
company_stock_tickers_spark_df.show(1,truncate=False)

+-----+----------+------------+--------------------------+----------+
|id   |company_id|stock_ticker|created_at                |updated_at|
+-----+----------+------------+--------------------------+----------+
|56464|1660      |AAPL        |2023-12-29 05:22:34.410075|NaN       |
+-----+----------+------------+--------------------------+----------+
only showing top 1 row



In [15]:
company_funding_rounds_spark_df.show(1,truncate=False)

+------+----------+----------+--------+--------+----------+--------------------------+----------+------------------+-------------------+
|id    |company_id|amount_usd|name    |investor|date      |created_at                |updated_at|investor_person_id|investor_company_id|
+------+----------+----------+--------+--------+----------+--------------------------+----------+------------------+-------------------+
|388060|1         |2.5E7     |Series C|Accel   |2008-05-19|2023-11-28 09:25:43.506321|NaN       |NaN               |7.0                |
+------+----------+----------+--------+--------+----------+--------------------------+----------+------------------+-------------------+
only showing top 1 row



In [16]:
people_df = people_spark_df
person_customer_df = person_customer_spark_df
person_educations_df = person_educations_spark_df
person_employments_df = person_employments_spark_df
person_social_urls_df = person_social_urls_spark_df
company_sectors_df = company_sectors_spark_df
company_headcounts_df = company_headcounts_spark_df
company_annual_revenues_df = company_annual_revenues_spark_df
company_stock_tickers_df = company_stock_tickers_spark_df
company_funding_rounds_df = company_funding_rounds_spark_df

In [17]:
person_customer_df.printSchema()

root
 |-- id: long (nullable = true)
 |-- person_id: long (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- created_at: string (nullable = true)
 |-- updated_at: string (nullable = true)



***Function to check the uniqueness of data***

In [18]:
def check_uniqueness(df, column_name):
    # Add a new column 'is_duplicate' that flags if column_name is duplicated
    df_duplicates_check = df.withColumn('is_duplicate', count(column_name).over(Window.partitionBy(column_name)) > 1)

    # If any 'is_duplicate' is True, then DataFrame is not unique based on column_name
    if df_duplicates_check.filter(col('is_duplicate')).count() > 0:
        print(f"DataFrame is not unique based on {column_name}")
    else:
        print(f"DataFrame is unique based on {column_name}")

In [19]:
check_uniqueness(people_df, 'id')

DataFrame is unique based on id


***Function to check the counts of data***

In [20]:
def count_records(df):
    """
    This function counts the number of records in a DataFrame.

    Parameters:
    df (DataFrame): The DataFrame for which to count the records.

    Returns:
    int: The number of records in the DataFrame.
    """
    # Use the count() function to count the number of rows in the DataFrame
    num_records = df.count()

    # Return the number of records
    return num_records

***People Table Transformation***

In [21]:
# Select specific columns from the 'people_df' DataFrame, rename the 'id' column to 'person_id',
# and concatenate 'address', 'city', 'region', 'postal_code', 'country' columns with a comma separator
people_selected_df = people_df.select('id', 'name')\
                              .withColumnRenamed('id', 'person_id')
                              # .withColumn('address', concat_ws(', ', 'address', 'city', 'region', 'postal_code', 'country'))

people_selected_df = people_selected_df.select('person_id', 'name')

people_selected_df.show()

+----------+-------------------+
| person_id|               name|
+----------+-------------------+
| 693540132|    Lika Razac-Ince|
| 118418234|     Kenny Sheridan|
|  64876372|        David Coyle|
|  67121188|     Gurpreet Singh|
|  57288072|  Anthony Salvaggio|
| 747062710|      Michael Huang|
| 741629676|  Iason Bakogiannis|
| 745971556|      thomas gurney|
| 993308913|Francesco Squatrito|
| 993985187|      Eddie McGrory|
| 994637657|          Annie Cai|
|1017713620|      Sarina Studer|
| 837503042|         Clara Têtu|
| 778561577|        Adam Smythe|
| 789056453|  Federico Maggiani|
| 787025923|        Lynda Zhang|
| 856482513|        Karl Dubost|
| 949256266|        Vivian Weng|
| 122883279|          Matt Aldo|
| 124612718|      Brinda Pullen|
+----------+-------------------+
only showing top 20 rows



In [22]:

count_records(people_selected_df)

999

***Transformation for the "person_customer" table***

In [23]:
count_records(person_customer_df)

# person_customer_df.show()

166

In [24]:
check_uniqueness(person_customer_df,'person_id')

DataFrame is not unique based on person_id


In [25]:
# Group by 'person_id' and count the number of occurrences of each 'person_id'
duplicate_rows = person_customer_df.groupBy('person_id').agg(count('*').alias('count'))

# Filter the rows where 'count' is greater than 1 (i.e., 'person_id' is duplicated)
duplicate_rows = duplicate_rows.filter(duplicate_rows['count'] > 1)

# Show the duplicate rows
duplicate_rows.show()
count_records(duplicate_rows)

+---------+-----+
|person_id|count|
+---------+-----+
|693540132|    2|
+---------+-----+



1

In [26]:
# Define a window partitioned by person_id and ordered by updated_at in descending order
window = Window.partitionBy("person_id").orderBy(desc("updated_at"))

# Add a row_number column to the DataFrame
person_customer_df = person_customer_df.withColumn("rn", row_number().over(window))

# Filter the DataFrame to keep only the rows with rn = 1 (i.e., the latest updated_at for each person_id)
person_customer_df = person_customer_df.filter(person_customer_df.rn == 1)

# Drop the rn column
person_customer_df = person_customer_df.drop("rn")

count_records(person_customer_df)

165

In [27]:
# Call the 'check_uniqueness' function on 'person_customer_df' DataFrame to check if 'person_id' is unique
check_uniqueness(person_customer_df,'person_id')

DataFrame is unique based on person_id


In [28]:
# Select 'person_id' and 'customer_id' columns from 'person_customer_df' DataFrame
people_selected_customer_df = person_customer_df.select('person_id', 'customer_id')

# Join 'people_selected_df' with 'people_selected_customer_df' on 'person_id'
people_person_customer_df = people_selected_df.join(people_selected_customer_df, on='person_id', how='left')

# people_person_customer_df.show()

In [29]:
count_records(people_person_customer_df)

999

***Transformation for "Person Education" table***

In [30]:
count_records(person_educations_df)
person_educations_df.show()

+-----+---------+--------------------+--------------------+--------------------+----------+----------+----------+
|   id|person_id|    institution_name|              degree|             subject|started_on|  ended_on|Unnamed: 7|
+-----+---------+--------------------+--------------------+--------------------+----------+----------+----------+
|12091| 64876372|University Of Win...|Bachelors;Bachelo...|             Geology|01-01-1980|01-01-1984|       NaN|
|12092| 64876372|University Of Win...|Master Of Science...|             Geology|01-01-1984|01-01-1986|       NaN|
|12093| 64876372| La Trobe University|Doctorates;Doctor...|       Earth Science|01-01-1988|01-01-1992|       NaN|
|12094| 67121188|Keller Graduate S...|                 NaN|  Project Management|01-01-2013|01-01-2015|       NaN|
|12095| 67121188|Sam Houston State...|Master Of Busines...|          Management|01-01-2010|01-01-2012|       NaN|
|12096| 67121188|University School...|Bachelors;Bachelo...|                 NaN|       N

In [31]:
person_educations_df.show(1)

+-----+---------+--------------------+--------------------+-------+----------+----------+----------+
|   id|person_id|    institution_name|              degree|subject|started_on|  ended_on|Unnamed: 7|
+-----+---------+--------------------+--------------------+-------+----------+----------+----------+
|12091| 64876372|University Of Win...|Bachelors;Bachelo...|Geology|01-01-1980|01-01-1984|       NaN|
+-----+---------+--------------------+--------------------+-------+----------+----------+----------+
only showing top 1 row



In [32]:
# Define a struct for each institute
institute_struct = F.struct(
    "institution_name", "degree", "subject", "started_on", "ended_on"
)

# Group by person_id and collect list of institute details as structs
grouped_educations_df = person_educations_df.groupBy("person_id").agg(
    F.collect_list(institute_struct).alias("education")
    # ("education - (institution_id, institution_name, degree, subject, started_on, ended_on)")
)

person_education_group_df = grouped_educations_df



# Show the result
person_education_group_df.show(truncate=False)


+---------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|person_id|education                                                                                                                                                                                                                                                                        |
+---------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|101255   |[{University of Utah, BS, CS, NaN, 01-01-1982}]                                                                                    

In [33]:
check_uniqueness(person_education_group_df, 'person_id')

DataFrame is unique based on person_id


In [34]:
count_records(person_education_group_df)

500

In [35]:
# Joining person_education_group_df and people_person_customer_df DataFrames on column named 'person_id'

people_person_customer_education_df = people_person_customer_df.join(person_education_group_df, on='person_id', how='left') 

# people_person_customer_education_df.show()

count_records(people_person_customer_education_df)
people_person_customer_education_df.show(5)

+---------+-----------------+--------------------+--------------------+
|person_id|             name|         customer_id|           education|
+---------+-----------------+--------------------+--------------------+
| 57288072|Anthony Salvaggio|7538cb11-1c23-4c7...|[{Monroe Communit...|
|693540132|  Lika Razac-Ince|7538cb11-1c23-4c7...|                NULL|
|118418234|   Kenny Sheridan|7538cb11-1c23-4c7...|                NULL|
| 64876372|      David Coyle|7538cb11-1c23-4c7...|[{University Of W...|
|747062710|    Michael Huang|7538cb11-1c23-4c7...|                NULL|
+---------+-----------------+--------------------+--------------------+
only showing top 5 rows



***Transformation for Company related information***

In [36]:
count_records(company_sectors_df)

5711

In [37]:
# Group the DataFrame 'company_sectors_df' by 'company_id'
grouped_company_sectors_df = company_sectors_df.groupBy('company_id').agg(
    F.collect_list('sector').alias('sectors')
)

# Check the DataFrame 'grouped_company_sectors_df'
# grouped_company_sectors_df.show()

count_records(grouped_company_sectors_df)

1000

In [38]:
count_records(company_annual_revenues_df)


18532

In [39]:
# Define a window partitioned by 'company_id' and ordered by 'date' in descending order
window = Window.partitionBy('company_id').orderBy(F.desc('date'))

# Add a new column 'rank' to the DataFrame 'company_annual_revenues_df'
# The 'rank' is calculated over the defined window
grouped_company_annual_revenues_df = company_annual_revenues_df.withColumn('rank', F.rank().over(window))

# Filter the DataFrame to keep only the rows where 'rank' is 1
# This gives us the latest annual revenue for each company
# Select only the 'company_id' and 'amount_usd' columns for the final DataFrame
latest_company_annual_revenues_df = grouped_company_annual_revenues_df.filter(F.col('rank') == 1).select('company_id', 'amount_usd')

# Check the DataFrame 'latest_company_annual_revenues_df'
# latest_company_annual_revenues_df.show()

count_records(latest_company_annual_revenues_df)

999

In [40]:
count_records(company_headcounts_df)


1000

In [41]:
# Define a window partitioned by 'headcount' and ordered by 'date' in descending order
window = Window.partitionBy('headcount').orderBy(F.desc('date'))

# Add a new column 'rank' to the DataFrame 'company_headcounts_df'
# The 'rank' is calculated over the defined window
grouped_company_headcounts_df = company_headcounts_df.withColumn('rank', F.rank().over(window))

# Filter the DataFrame to keep only the rows where 'rank' is 1
# This gives us the latest headcount for each company
# Select only the 'company_id' and 'headcount' columns for the final DataFrame
latest_company_headcounts_df = grouped_company_headcounts_df.filter(F.col('rank') == 1).select('company_id', 'headcount')

# Check the DataFrame 'latest_company_headcounts_df'
# latest_company_headcounts_df.show()

count_records(latest_company_headcounts_df)

847

In [42]:
# Select the 'company_id' and 'stock_ticker' columns from the DataFrame 'company_stock_tickers_df'
selected_company_stock_tickers_df = company_stock_tickers_df.select('company_id','stock_ticker')
count_records(selected_company_stock_tickers_df)
selected_company_stock_tickers_df.show()

+----------+------------+
|company_id|stock_ticker|
+----------+------------+
|      1660|        AAPL|
|      1694|        TSLA|
|        13|        TWTR|
|    203437|        AMZN|
|     22598|         WMT|
|      4053|        SPOT|
|      2289|        NFLX|
|    110721|        BRKB|
|      4360|        NVDA|
|    113128|        ABBV|
|      1370|         DBK|
|     95762|        COIN|
|    206880|        SNAP|
|      3868|        TMUS|
|      3644|       CMCSA|
|     26072|         MDT|
|    325547|         NIO|
|     86996|          ZM|
|     40637|        UBER|
|     13298|          MA|
+----------+------------+
only showing top 20 rows



In [43]:
# Define a Window specification: partition by 'company_id' and order by 'company_id', 'date' (in descending order), and 'updated_at' (in descending order)
windowSpec = Window.partitionBy(company_funding_rounds_df['company_id']).orderBy(company_funding_rounds_df['company_id'], company_funding_rounds_df['date'].desc(), company_funding_rounds_df['updated_at'].desc())

# Add a new column 'rn' to the DataFrame 'company_funding_rounds_df' which contains row numbers within each window partition
grouped_company_funding_rounds_df = company_funding_rounds_df.withColumn("rn", row_number().over(windowSpec))

# Filter the DataFrame to keep only the rows with 'rn' equal to 1 (i.e., the most recent funding round for each company), then drop the 'rn' column
filtered_company_funding_rounds_df = grouped_company_funding_rounds_df.filter(col("rn") == 1).drop("rn")

# Select the columns 'company_id', 'name', 'investor_company_id', and 'investor' from the filtered DataFrame
filtered_company_funding_rounds_df.select('company_id','name', 'investor_company_id','investor')

DataFrame[company_id: bigint, name: string, investor_company_id: double, investor: string]

***Joined the company information related transformed dataframes***

In [44]:

# Join the DataFrame 'grouped_company_sectors_df' with 'latest_company_annual_revenues_df', 'latest_company_headcounts_df', and 'selected_company_stock_tickers_df' on 'company_id'
# The join type is 'left', meaning only the rows with a match in both DataFrames will be kept
company_info_joined_df = grouped_company_sectors_df.join(
    latest_company_annual_revenues_df, 'company_id', 'left'
).join(
    latest_company_headcounts_df, 'company_id', 'left'
).join(
    selected_company_stock_tickers_df, 'company_id', 'left'
).join(
    filtered_company_funding_rounds_df, 'company_id', 'left'
)

# count_records(company_info_joined_df)

# Select the 'company_id', 'amount_usd', 'headcount', and 'stock_ticker' columns from the joined DataFrame
resultant_company_info_joined_df = company_info_joined_df.select(
    'company_id',
    company_info_joined_df['name'].alias('funding_name'), 'investor_company_id','investor',
    latest_company_annual_revenues_df['amount_usd'],
    latest_company_headcounts_df['headcount'],
    selected_company_stock_tickers_df['stock_ticker']
)

# Check the DataFrame 'resultant_company_info_joined_df'
# resultant_company_info_joined_df.show()

count_records(resultant_company_info_joined_df)


1000

***Transformation for the Person Employment Table***

In [45]:
count_records(person_employments_df)
# person_employments_df.show()

7898

In [46]:
person_employments_df.columns

['id',
 'person_id',
 'company_id',
 'company_name',
 'seniority_level',
 'title',
 'started_on',
 'ended_on',
 'Unnamed: 8']

In [47]:
resultant_company_info_joined_df.columns

['company_id',
 'funding_name',
 'investor_company_id',
 'investor',
 'amount_usd',
 'headcount',
 'stock_ticker']

In [48]:
# Join the DataFrames on 'company_id'
resultant_company_info_employment_joined_df = person_employments_df.join(resultant_company_info_joined_df, 'company_id', 'left')

count_records(resultant_company_info_employment_joined_df)

# Define a new struct that includes the additional columns
emplopyment_struct = F.struct(
    "company_id", "company_name", "seniority_level", "title", "started_on", "ended_on",
    resultant_company_info_joined_df['amount_usd'],
    resultant_company_info_joined_df['headcount'],
    resultant_company_info_joined_df['stock_ticker'],
    resultant_company_info_joined_df['funding_name'],
    resultant_company_info_joined_df['investor_company_id'],
    resultant_company_info_joined_df['investor']

)

# Group by 'person_id' and collect list of employment details as structs
grouped_person_employments_df = resultant_company_info_employment_joined_df.groupBy("person_id").agg(
    F.collect_list(emplopyment_struct).alias("employments")
)

grouped_person_employments_df.show(1,truncate=False)


count_records(person_employments_df)

+---------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

7898

In [49]:
# Join the DataFrame 'people_person_customer_education_df' with 'person_employments_df' on 'person_id'
# The resulting DataFrame contains information about a person's education and employment
people_person_customer_education_educations_df = people_person_customer_education_df.join(grouped_person_employments_df, on='person_id', how='left') 

# people_person_customer_education_educations_df.show()

count_records(people_person_customer_education_educations_df)
people_person_customer_education_educations_df.show()

+----------+-------------------+--------------------+--------------------+--------------------+
| person_id|               name|         customer_id|           education|         employments|
+----------+-------------------+--------------------+--------------------+--------------------+
| 949256266|        Vivian Weng|7538cb11-1c23-4c7...|[{University of C...|[{7962172.0, Frog...|
| 787025923|        Lynda Zhang|7538cb11-1c23-4c7...|                NULL|[{NaN, NaN, Manag...|
| 837503042|         Clara Têtu|7538cb11-1c23-4c7...|                NULL|[{400708.0, Clari...|
| 789056453|  Federico Maggiani|7538cb11-1c23-4c7...|                NULL|[{40487.0, MindSh...|
|  57288072|  Anthony Salvaggio|7538cb11-1c23-4c7...|[{Monroe Communit...|[{1660.0, Apple, ...|
| 693540132|    Lika Razac-Ince|7538cb11-1c23-4c7...|                NULL|[{NaN, NaN, NaN, ...|
|1017713620|      Sarina Studer|7538cb11-1c23-4c7...|                NULL|[{NaN, NaN, NaN, ...|
| 856482513|        Karl Dubost|7538cb11

***Transformation for the Person Social Urls Table***

In [50]:
# Group by person_id and collect list of URL details
grouped_urls_df = person_social_urls_df.groupBy("person_id").agg(
    F.collect_list(
        F.struct("url_type", "url")
    ).alias("urls")
)

# Show the result
# grouped_urls_df.show(truncate=False)

count_records(grouped_urls_df)

999

In [51]:
check_uniqueness(grouped_urls_df, 'person_id')

DataFrame is unique based on person_id


In [52]:
# Joining person_education_group_df and people_person_customer_df DataFrames on column named 'person_id'

people_person_customer_education_employments_social_urls_df = people_person_customer_education_educations_df.join(grouped_urls_df, on='person_id', how='left') 

count_records(people_person_customer_education_employments_social_urls_df)
people_person_customer_education_employments_social_urls_df.show()

+----------+-------------------+--------------------+--------------------+--------------------+--------------------+
| person_id|               name|         customer_id|           education|         employments|                urls|
+----------+-------------------+--------------------+--------------------+--------------------+--------------------+
| 949256266|        Vivian Weng|7538cb11-1c23-4c7...|[{University of C...|[{7962172.0, Frog...|[{linkedin, https...|
| 787025923|        Lynda Zhang|7538cb11-1c23-4c7...|                NULL|[{NaN, NaN, Manag...|[{linkedin, https...|
| 837503042|         Clara Têtu|7538cb11-1c23-4c7...|                NULL|[{400708.0, Clari...|[{linkedin, https...|
| 789056453|  Federico Maggiani|7538cb11-1c23-4c7...|                NULL|[{40487.0, MindSh...|[{linkedin, https...|
|  57288072|  Anthony Salvaggio|7538cb11-1c23-4c7...|[{Monroe Communit...|[{1660.0, Apple, ...|[{linkedin, https...|
| 693540132|    Lika Razac-Ince|7538cb11-1c23-4c7...|           

In [53]:
check_uniqueness(people_person_customer_education_employments_social_urls_df, 'person_id')

DataFrame is unique based on person_id


***Final Dataframe***

In [54]:

people_person_customer_education_employments_social_urls_df.printSchema()

root
 |-- person_id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- education: array (nullable = true)
 |    |-- element: struct (containsNull = false)
 |    |    |-- institution_name: string (nullable = true)
 |    |    |-- degree: string (nullable = true)
 |    |    |-- subject: string (nullable = true)
 |    |    |-- started_on: string (nullable = true)
 |    |    |-- ended_on: string (nullable = true)
 |-- employments: array (nullable = true)
 |    |-- element: struct (containsNull = false)
 |    |    |-- company_id: double (nullable = true)
 |    |    |-- company_name: string (nullable = true)
 |    |    |-- seniority_level: string (nullable = true)
 |    |    |-- title: string (nullable = true)
 |    |    |-- started_on: string (nullable = true)
 |    |    |-- ended_on: string (nullable = true)
 |    |    |-- amount_usd: double (nullable = true)
 |    |    |-- headcount: long (nullable = true)
 |    |    |-- stock_ticker

In [55]:
people_person_customer_education_employments_social_urls_df.show()

+----------+-------------------+--------------------+--------------------+--------------------+--------------------+
| person_id|               name|         customer_id|           education|         employments|                urls|
+----------+-------------------+--------------------+--------------------+--------------------+--------------------+
| 949256266|        Vivian Weng|7538cb11-1c23-4c7...|[{University of C...|[{7962172.0, Frog...|[{linkedin, https...|
| 787025923|        Lynda Zhang|7538cb11-1c23-4c7...|                NULL|[{NaN, NaN, Manag...|[{linkedin, https...|
| 837503042|         Clara Têtu|7538cb11-1c23-4c7...|                NULL|[{400708.0, Clari...|[{linkedin, https...|
| 789056453|  Federico Maggiani|7538cb11-1c23-4c7...|                NULL|[{40487.0, MindSh...|[{linkedin, https...|
|  57288072|  Anthony Salvaggio|7538cb11-1c23-4c7...|[{Monroe Communit...|[{1660.0, Apple, ...|[{linkedin, https...|
| 693540132|    Lika Razac-Ince|7538cb11-1c23-4c7...|           

***Final Dataframe can be exported to files such as CSV, JSON, Parquet***

In [56]:
# Use the exiting dataframe
final_df = people_person_customer_education_employments_social_urls_df


In [57]:
final_df.show()

+----------+-------------------+--------------------+--------------------+--------------------+--------------------+
| person_id|               name|         customer_id|           education|         employments|                urls|
+----------+-------------------+--------------------+--------------------+--------------------+--------------------+
| 949256266|        Vivian Weng|7538cb11-1c23-4c7...|[{University of C...|[{7962172.0, Frog...|[{linkedin, https...|
| 787025923|        Lynda Zhang|7538cb11-1c23-4c7...|                NULL|[{NaN, NaN, Manag...|[{linkedin, https...|
| 837503042|         Clara Têtu|7538cb11-1c23-4c7...|                NULL|[{400708.0, Clari...|[{linkedin, https...|
| 789056453|  Federico Maggiani|7538cb11-1c23-4c7...|                NULL|[{40487.0, MindSh...|[{linkedin, https...|
|  57288072|  Anthony Salvaggio|7538cb11-1c23-4c7...|[{Monroe Communit...|[{1660.0, Apple, ...|[{linkedin, https...|
| 693540132|    Lika Razac-Ince|7538cb11-1c23-4c7...|           

In [58]:
final_df.write.mode('overwrite').json('./work/final_df')

In [59]:
final_df.printSchema()


root
 |-- person_id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- education: array (nullable = true)
 |    |-- element: struct (containsNull = false)
 |    |    |-- institution_name: string (nullable = true)
 |    |    |-- degree: string (nullable = true)
 |    |    |-- subject: string (nullable = true)
 |    |    |-- started_on: string (nullable = true)
 |    |    |-- ended_on: string (nullable = true)
 |-- employments: array (nullable = true)
 |    |-- element: struct (containsNull = false)
 |    |    |-- company_id: double (nullable = true)
 |    |    |-- company_name: string (nullable = true)
 |    |    |-- seniority_level: string (nullable = true)
 |    |    |-- title: string (nullable = true)
 |    |    |-- started_on: string (nullable = true)
 |    |    |-- ended_on: string (nullable = true)
 |    |    |-- amount_usd: double (nullable = true)
 |    |    |-- headcount: long (nullable = true)
 |    |    |-- stock_ticker

In [60]:
final_df.write.mode("overwrite").parquet("final_output/df4.parquet")



In [61]:
df = spark.read.parquet("final_output/df4.parquet")

In [62]:
df.show()

+----------+--------------------+--------------------+--------------------+--------------------+--------------------+
| person_id|                name|         customer_id|           education|         employments|                urls|
+----------+--------------------+--------------------+--------------------+--------------------+--------------------+
| 949256266|         Vivian Weng|7538cb11-1c23-4c7...|[{University of C...|[{7962172.0, Frog...|[{linkedin, https...|
| 902184389|   Hendrik Bourgeois|7538cb11-1c23-4c7...|                NULL|[{1660.0, Apple, ...|[{linkedin, https...|
|  48171849|     Courtney Turner|7538cb11-1c23-4c7...|[{The University ...|[{1660.0, Apple, ...|[{linkedin, https...|
|  74707094|       Katie Richter|7538cb11-1c23-4c7...|[{Rhodes College,...|[{6517068.0, Acti...|[{linkedin, https...|
|1042153985| Victor Merced-Felix|                NULL|                NULL|[{263757.0, VXI G...|[{linkedin, https...|
| 787025923|         Lynda Zhang|7538cb11-1c23-4c7...|  

In [63]:
# Create an empty list to store the column expressions
column_exprs = []

# For each column in the DataFrame
for col_name in df.columns:
    # Create a new column expression that concatenates the column name and the column value
    column_exprs.append(F.concat(F.lit(col_name + ":"), df[col_name].cast("string")))

# Concatenate all column expressions into one single column 'single_text'
single_file_for_each_person_df = df.select(F.concat_ws(' ', *column_exprs).alias('candidate_details'))

single_file_for_each_person_df.show(1,truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [64]:
df.show()


+----------+--------------------+--------------------+--------------------+--------------------+--------------------+
| person_id|                name|         customer_id|           education|         employments|                urls|
+----------+--------------------+--------------------+--------------------+--------------------+--------------------+
| 949256266|         Vivian Weng|7538cb11-1c23-4c7...|[{University of C...|[{7962172.0, Frog...|[{linkedin, https...|
| 902184389|   Hendrik Bourgeois|7538cb11-1c23-4c7...|                NULL|[{1660.0, Apple, ...|[{linkedin, https...|
|  48171849|     Courtney Turner|7538cb11-1c23-4c7...|[{The University ...|[{1660.0, Apple, ...|[{linkedin, https...|
|  74707094|       Katie Richter|7538cb11-1c23-4c7...|[{Rhodes College,...|[{6517068.0, Acti...|[{linkedin, https...|
|1042153985| Victor Merced-Felix|                NULL|                NULL|[{263757.0, VXI G...|[{linkedin, https...|
| 787025923|         Lynda Zhang|7538cb11-1c23-4c7...|  

In [65]:
df = df.select('person_id', 'name', 'education', 'employments')
df.show(2,truncate=False)

+---------+-----------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [66]:
from pyspark.sql.functions import regexp_replace, concat_ws

# Define a function to clean text
def clean_text(col):
    # Remove symbols and characters except ':'
    cleaned_col = regexp_replace(col, r'[^\w\s:]', '')
    # Replace 'null' with single whitespace
    cleaned_col = regexp_replace(cleaned_col, r'\bnull\b', ' ')
    # Remove extra whitespaces
    cleaned_col = regexp_replace(cleaned_col, r'\s+', ' ')
    return cleaned_col

# Apply the clean_text function to each column
cleaned_columns = [clean_text(col).alias(col) for col in single_file_for_each_person_df.columns]

# Concatenate the cleaned columns into a new column containing plain text
plain_text_df = single_file_for_each_person_df.select(concat_ws(' ', *cleaned_columns).alias('plain_text'))

# Show the resulting DataFrame
plain_text_df.show(2,truncate=False)
plain_text_df.count()

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|plain_text           

999

In [67]:
# Select only the first row
first_row = plain_text_df.tail(1)

# Show the first row
print(first_row)

[Row(plain_text='person_id:56622026 name:Erin Nopp education:Colorado State University Global BachelorsBachelor Of Science Business AdministrationBusiness ManagementBusiness Administration And ManagementManagement NaN 01012020 Creighton UniversityS Heider College Of Business Master Of Business AdministrationMasters NaN 01012021 01012023 employments:5528930 Jan Marini Skin Research NaN Account Development Manager 01052021 01112021 93109880 Osmosis Beauty NaN Account Executive at Osmosis Beauty 01022018 01032020 11097430 Glo Skin Beauty NaN Sales Consultant 01102011 01022013 786260 Este Lauder NaN Regional Makeup Artist 01012005 01012007 6099765445E9 24702 EL 3522080 Dermalogica NaN Senior Business Consultant 01022013 01012018 13979010 Hammonds Candies Director Director of National Sales 01122007 01092011 1131280 AbbVie NaN Strategic Account Manager 01112021 NaN 10603963036E10 51174 ABBV 69330710 BioTherapeutic NaN Corporate Sales Consultant 01052020 01052021 urls:linkedin https:wwwlinke

In [68]:
all_rows_list = plain_text_df.collect()

# Convert all rows to dictionaries
all_rows_dicts = [row.asDict() for row in all_rows_list]

# Convert the dictionaries to a pandas DataFrame
all_rows_df = pd.DataFrame(all_rows_dicts)


# Save all rows to a single CSV file, overwriting if it already exists
all_rows_df.to_csv('input_people_data.csv', index=False, mode='w', header=True)


# Print the total number of rows
print("Total number of rows:", len(all_rows_df))
# Print the first row of the DataFrame
# Set the display options to show all columns and rows
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', None)

# Print the first row of the DataFrame
# Convert the first row to a string and print it
# Print the 'plain_text' column of the first row
print(all_rows_df.iloc[0]['plain_text'])

Total number of rows: 999
person_id:949256266 name:Vivian Weng customer_id:7538cb111c234c7687c75111a0f166dc education:University of California Berkeley Walter A Haas School of Business NaN MBA Finance 01012007 01012009 National Taiwan University NaN BA International Relations 01011998 01012002 Yale University NaN MA International Developmental Economics 01012002 01012003 employments:79621720 Frog Director Associate Strategy Director Innovation Strategy Group 01022012 01062016 158610 DBS Bank VP Vice President Innovation Group 01072016 01042019 16600 Apple NaN Product Design Producer International Product Design 01042019 NaN 31949031445E10 224704 AAPL Private Equity 60840 Interscope Records 590 Goldman Sachs NaN Investment Banking Division Summer Associate 01062008 01082008 8417546311E9 75859 GS 40070 McKinsey Company NaN Business Analyst 01012004 01012007 40070 McKinsey Company NaN Senior Associate Corporate Finance 01092009 01112011 urls:linkedin https:wwwlinkedincominvivianweng4a0b88