In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import count, col, concat_ws, row_number, desc
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.types import StructType, StructField, StringType, TimestampType
import pandas as pd
import os
import json

In [2]:
# Initialize SparkSession
spark = SparkSession.builder \
    .appName('aws_connection') \
    .getOrCreate()

In [3]:
# Define the base path
base_path = '/home/jovyan/work/data_files/'

# Read CSV files into DataFrames
people_df = pd.read_csv(base_path + 'people.csv')
person_customer_df = pd.read_csv(base_path + 'person_customers.csv')
person_educations_df = pd.read_csv(base_path + 'person_educations.csv')
person_employments_df = pd.read_csv(base_path + 'person_employments.csv')
person_social_urls_df = pd.read_csv(base_path + 'person_social_urls.csv')
company_sectors_df = pd.read_csv(base_path + 'company_sectors.csv')
company_headcounts_df = pd.read_csv(base_path + 'company_headcounts.csv')
company_annual_revenues_df = pd.read_csv(base_path + 'company_annual_revenues.csv')
company_stock_tickers_df = pd.read_csv(base_path + 'company_stock_tickers.csv')
company_funding_rounds_df = pd.read_csv(base_path + 'company_funding_rounds.csv')


In [4]:
company_funding_rounds_df.columns 

Index(['id', 'company_id', 'amount_usd', 'name', 'investor', 'date',
       'created_at', 'updated_at', 'investor_person_id',
       'investor_company_id'],
      dtype='object')

In [5]:
people_spark_df=spark.createDataFrame(people_df)
person_customer_spark_df=spark.createDataFrame(person_customer_df)
person_educations_spark_df=spark.createDataFrame(person_educations_df)
person_employments_spark_df=spark.createDataFrame(person_employments_df)
person_social_urls_spark_df=spark.createDataFrame(person_social_urls_df)
company_sectors_spark_df=spark.createDataFrame(company_sectors_df)
company_headcounts_spark_df=spark.createDataFrame(company_headcounts_df)
company_annual_revenues_spark_df=spark.createDataFrame(company_annual_revenues_df)
company_stock_tickers_spark_df=spark.createDataFrame(company_stock_tickers_df)
company_funding_rounds_spark_df=spark.createDataFrame(company_funding_rounds_df)

In [6]:
people_spark_df.show(1,truncate=False)

+---------+---------------+---------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------+----+------+-----------+-------+------------+---------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------+----------------------------------+----------+--------+----------+-------------------------+----------------------+
|id       |name           |linkedin_url                                       |photo_url                                                           

In [7]:
person_customer_spark_df.show(1,truncate=False)

+---+---------+------------------------------------+--------------------------+--------------------------+
|id |person_id|customer_id                         |created_at                |updated_at                |
+---+---------+------------------------------------+--------------------------+--------------------------+
|1  |693540132|14b76a2f-4545-4fb7-92e9-ecd3a5f49a4b|2023-12-18 11:12:06.004224|2023-12-18 11:12:06.004224|
+---+---------+------------------------------------+--------------------------+--------------------------+
only showing top 1 row



In [8]:
person_educations_spark_df.show(1,truncate=False)

+-----+---------+--------------+---------------------+-----------------------------+-------+----------+----------+--------------------------+----------+
|id   |person_id|institution_id|institution_name     |degree                       |subject|started_on|ended_on  |created_at                |updated_at|
+-----+---------+--------------+---------------------+-----------------------------+-------+----------+----------+--------------------------+----------+
|12091|64876372 |NaN           |University Of Windsor|Bachelors;Bachelor Of Science|Geology|1980-01-01|1984-01-01|2023-12-11 06:18:37.445885|NaN       |
+-----+---------+--------------+---------------------+-----------------------------+-------+----------+----------+--------------------------+----------+
only showing top 1 row



In [9]:
person_employments_spark_df.show(1,truncate=False)

+-----+---------+----------+------------+---------------+-------------------------+----------+--------+--------------------------+----------+
|id   |person_id|company_id|company_name|seniority_level|title                    |started_on|ended_on|created_at                |updated_at|
+-----+---------+----------+------------+---------------+-------------------------+----------+--------+--------------------------+----------+
|78981|693540132|NaN       |NaN         |NaN            |Professional Photographer|2018-04-01|NaN     |2023-12-11 06:18:49.396966|NaN       |
+-----+---------+----------+------------+---------------+-------------------------+----------+--------+--------------------------+----------+
only showing top 1 row



In [10]:
person_social_urls_spark_df.show(1,truncate=False)

+-----+---------+---------------------------------------------------+--------+-------------------------+----------+
|id   |person_id|url                                                |url_type|created_at               |updated_at|
+-----+---------+---------------------------------------------------+--------+-------------------------+----------+
|12791|693540132|https://www.linkedin.com/in/lika-razac-ince-2044385|linkedin|2023-12-11 06:18:57.46109|NaN       |
+-----+---------+---------------------------------------------------+--------+-------------------------+----------+
only showing top 1 row



In [11]:
company_sectors_spark_df.show(1,truncate=False)

+--------+----------+--------+-------------------------+----------+
|id      |company_id|sector  |created_at               |updated_at|
+--------+----------+--------+-------------------------+----------+
|13733483|1660      |Hardware|2023-12-29 05:22:34.30253|NaN       |
+--------+----------+--------+-------------------------+----------+
only showing top 1 row



In [12]:
company_headcounts_spark_df.show(1,truncate=False)

+-------+----------+---------+----------+--------------------------+----------+
|id     |company_id|headcount|date      |created_at                |updated_at|
+-------+----------+---------+----------+--------------------------+----------+
|3214741|1660      |224704   |2023-06-07|2023-12-29 05:22:34.148533|NaN       |
+-------+----------+---------+----------+--------------------------+----------+
only showing top 1 row



In [13]:
company_annual_revenues_spark_df.show(1,truncate=False)

+------+----------+----------+----------+--------------------------+----------+
|id    |company_id|amount_usd|date      |created_at                |updated_at|
+------+----------+----------+----------+--------------------------+----------+
|222385|1660      |7.983E9   |2000-01-01|2023-12-29 05:22:34.013999|NaN       |
+------+----------+----------+----------+--------------------------+----------+
only showing top 1 row



In [14]:
company_stock_tickers_spark_df.show(1,truncate=False)

+-----+----------+------------+--------------------------+----------+
|id   |company_id|stock_ticker|created_at                |updated_at|
+-----+----------+------------+--------------------------+----------+
|56464|1660      |AAPL        |2023-12-29 05:22:34.410075|NaN       |
+-----+----------+------------+--------------------------+----------+
only showing top 1 row



In [15]:
company_funding_rounds_spark_df.show(1,truncate=False)

+------+----------+----------+--------+--------+----------+--------------------------+----------+------------------+-------------------+
|id    |company_id|amount_usd|name    |investor|date      |created_at                |updated_at|investor_person_id|investor_company_id|
+------+----------+----------+--------+--------+----------+--------------------------+----------+------------------+-------------------+
|388060|1         |2.5E7     |Series C|Accel   |2008-05-19|2023-11-28 09:25:43.506321|NaN       |NaN               |7.0                |
+------+----------+----------+--------+--------+----------+--------------------------+----------+------------------+-------------------+
only showing top 1 row



In [16]:
people_df = people_spark_df
person_customer_df = person_customer_spark_df
person_educations_df = person_educations_spark_df
person_employments_df = person_employments_spark_df
person_social_urls_df = person_social_urls_spark_df
company_sectors_df = company_sectors_spark_df
company_headcounts_df = company_headcounts_spark_df
company_annual_revenues_df = company_annual_revenues_spark_df
company_stock_tickers_df = company_stock_tickers_spark_df
company_funding_rounds_df = company_funding_rounds_spark_df

In [17]:
person_customer_df

DataFrame[id: bigint, person_id: bigint, customer_id: string, created_at: string, updated_at: string]

***Function to check the uniqueness of data***

In [18]:
def check_uniqueness(df, column_name):
    # Add a new column 'is_duplicate' that flags if column_name is duplicated
    df_duplicates_check = df.withColumn('is_duplicate', count(column_name).over(Window.partitionBy(column_name)) > 1)

    # If any 'is_duplicate' is True, then DataFrame is not unique based on column_name
    if df_duplicates_check.filter(col('is_duplicate')).count() > 0:
        print(f"DataFrame is not unique based on {column_name}")
    else:
        print(f"DataFrame is unique based on {column_name}")

In [19]:
check_uniqueness(people_df, 'id')

DataFrame is unique based on id


***Function to check the counts of data***

In [20]:
def count_records(df):
    """
    This function counts the number of records in a DataFrame.

    Parameters:
    df (DataFrame): The DataFrame for which to count the records.

    Returns:
    int: The number of records in the DataFrame.
    """
    # Use the count() function to count the number of rows in the DataFrame
    num_records = df.count()

    # Return the number of records
    return num_records

***People Table Transformation***

In [21]:
# Select specific columns from the 'people_df' DataFrame, rename the 'id' column to 'person_id',
# and concatenate 'address', 'city', 'region', 'postal_code', 'country' columns with a comma separator
people_selected_df = people_df.select('id', 'name', 'address', 'city', 'region', 'postal_code', 'country', 'headline','description', 'created_at', 'updated_at')\
                              .withColumnRenamed('id', 'person_id')\
                              .withColumn('address', concat_ws(', ', 'address', 'city', 'region', 'postal_code', 'country'))

people_selected_df = people_selected_df.select('person_id', 'name', 'address','headline','description', 'created_at', 'updated_at')

# people_selected_df.show()

In [22]:
count_records(people_selected_df)

999

***Transformation for the "person_customer" table***

In [23]:
count_records(person_customer_df)

# person_customer_df.show()

166

In [24]:
check_uniqueness(person_customer_df,'person_id')

DataFrame is not unique based on person_id


In [25]:
# Group by 'person_id' and count the number of occurrences of each 'person_id'
duplicate_rows = person_customer_df.groupBy('person_id').agg(count('*').alias('count'))

# Filter the rows where 'count' is greater than 1 (i.e., 'person_id' is duplicated)
duplicate_rows = duplicate_rows.filter(duplicate_rows['count'] > 1)

# Show the duplicate rows
duplicate_rows.show()
count_records(duplicate_rows)

+---------+-----+
|person_id|count|
+---------+-----+
|693540132|    2|
+---------+-----+



1

In [26]:
# Define a window partitioned by person_id and ordered by updated_at in descending order
window = Window.partitionBy("person_id").orderBy(desc("updated_at"))

# Add a row_number column to the DataFrame
person_customer_df = person_customer_df.withColumn("rn", row_number().over(window))

# Filter the DataFrame to keep only the rows with rn = 1 (i.e., the latest updated_at for each person_id)
person_customer_df = person_customer_df.filter(person_customer_df.rn == 1)

# Drop the rn column
person_customer_df = person_customer_df.drop("rn")

count_records(person_customer_df)

165

In [27]:
# Call the 'check_uniqueness' function on 'person_customer_df' DataFrame to check if 'person_id' is unique
check_uniqueness(person_customer_df,'person_id')

DataFrame is unique based on person_id


In [28]:
# Select 'person_id' and 'customer_id' columns from 'person_customer_df' DataFrame
people_selected_customer_df = person_customer_df.select('person_id', 'customer_id')

# Join 'people_selected_df' with 'people_selected_customer_df' on 'person_id'
people_person_customer_df = people_selected_df.join(people_selected_customer_df, on='person_id', how='left')

# people_person_customer_df.show()

In [29]:
count_records(people_person_customer_df)

999

***Transformation for "Person Education" table***

In [30]:
count_records(person_educations_df)
# person_educations_df.show()

1209

In [31]:
# Define a struct for each institute
institute_struct = F.struct(
    "institution_id", "institution_name", "degree", "subject", "started_on", "ended_on"
)

# Group by person_id and collect list of institute details as structs
grouped_educations_df = person_educations_df.groupBy("person_id").agg(
    F.collect_list(institute_struct).alias("education")
    # ("education - (institution_id, institution_name, degree, subject, started_on, ended_on)")
)

person_education_group_df = grouped_educations_df



# Show the result
# person_education_group_df.show(truncate=False)


In [32]:
check_uniqueness(person_education_group_df, 'person_id')

DataFrame is unique based on person_id


In [33]:
count_records(person_education_group_df)

500

In [34]:
# Joining person_education_group_df and people_person_customer_df DataFrames on column named 'person_id'

people_person_customer_education_df = people_person_customer_df.join(person_education_group_df, on='person_id', how='left') 

# people_person_customer_education_df.show()

count_records(people_person_customer_education_df)

999

***Transformation for Company related information***

In [35]:
count_records(company_sectors_df)

5711

In [36]:
# Group the DataFrame 'company_sectors_df' by 'company_id'
grouped_company_sectors_df = company_sectors_df.groupBy('company_id').agg(
    F.collect_list('sector').alias('sectors')
)

# Check the DataFrame 'grouped_company_sectors_df'
# grouped_company_sectors_df.show()

count_records(grouped_company_sectors_df)

1000

In [37]:
count_records(company_annual_revenues_df)


18532

In [38]:
# Define a window partitioned by 'company_id' and ordered by 'date' in descending order
window = Window.partitionBy('company_id').orderBy(F.desc('date'))

# Add a new column 'rank' to the DataFrame 'company_annual_revenues_df'
# The 'rank' is calculated over the defined window
grouped_company_annual_revenues_df = company_annual_revenues_df.withColumn('rank', F.rank().over(window))

# Filter the DataFrame to keep only the rows where 'rank' is 1
# This gives us the latest annual revenue for each company
# Select only the 'company_id' and 'amount_usd' columns for the final DataFrame
latest_company_annual_revenues_df = grouped_company_annual_revenues_df.filter(F.col('rank') == 1).select('company_id', 'amount_usd')

# Check the DataFrame 'latest_company_annual_revenues_df'
# latest_company_annual_revenues_df.show()

count_records(latest_company_annual_revenues_df)

999

In [39]:
count_records(company_headcounts_df)


1000

In [40]:
# Define a window partitioned by 'headcount' and ordered by 'date' in descending order
window = Window.partitionBy('headcount').orderBy(F.desc('date'))

# Add a new column 'rank' to the DataFrame 'company_headcounts_df'
# The 'rank' is calculated over the defined window
grouped_company_headcounts_df = company_headcounts_df.withColumn('rank', F.rank().over(window))

# Filter the DataFrame to keep only the rows where 'rank' is 1
# This gives us the latest headcount for each company
# Select only the 'company_id' and 'headcount' columns for the final DataFrame
latest_company_headcounts_df = grouped_company_headcounts_df.filter(F.col('rank') == 1).select('company_id', 'headcount')

# Check the DataFrame 'latest_company_headcounts_df'
# latest_company_headcounts_df.show()

count_records(latest_company_headcounts_df)

847

In [41]:
# Select the 'company_id' and 'stock_ticker' columns from the DataFrame 'company_stock_tickers_df'
selected_company_stock_tickers_df = company_stock_tickers_df.select('company_id','stock_ticker')
count_records(selected_company_stock_tickers_df)

1000

In [42]:
# Define a Window specification: partition by 'company_id' and order by 'company_id', 'date' (in descending order), and 'updated_at' (in descending order)
windowSpec = Window.partitionBy(company_funding_rounds_df['company_id']).orderBy(company_funding_rounds_df['company_id'], company_funding_rounds_df['date'].desc(), company_funding_rounds_df['updated_at'].desc())

# Add a new column 'rn' to the DataFrame 'company_funding_rounds_df' which contains row numbers within each window partition
grouped_company_funding_rounds_df = company_funding_rounds_df.withColumn("rn", row_number().over(windowSpec))

# Filter the DataFrame to keep only the rows with 'rn' equal to 1 (i.e., the most recent funding round for each company), then drop the 'rn' column
filtered_company_funding_rounds_df = grouped_company_funding_rounds_df.filter(col("rn") == 1).drop("rn")

# Select the columns 'company_id', 'name', 'investor_company_id', and 'investor' from the filtered DataFrame
filtered_company_funding_rounds_df.select('company_id','name', 'investor_company_id','investor')

DataFrame[company_id: bigint, name: string, investor_company_id: double, investor: string]

***Joined the company information related transformed dataframes***

In [43]:

# Join the DataFrame 'grouped_company_sectors_df' with 'latest_company_annual_revenues_df', 'latest_company_headcounts_df', and 'selected_company_stock_tickers_df' on 'company_id'
# The join type is 'left', meaning only the rows with a match in both DataFrames will be kept
company_info_joined_df = grouped_company_sectors_df.join(
    latest_company_annual_revenues_df, 'company_id', 'left'
).join(
    latest_company_headcounts_df, 'company_id', 'left'
).join(
    selected_company_stock_tickers_df, 'company_id', 'left'
).join(
    filtered_company_funding_rounds_df, 'company_id', 'left'
)

# count_records(company_info_joined_df)

# Select the 'company_id', 'amount_usd', 'headcount', and 'stock_ticker' columns from the joined DataFrame
resultant_company_info_joined_df = company_info_joined_df.select(
    'company_id',
    company_info_joined_df['name'].alias('funding_name'), 'investor_company_id','investor',
    latest_company_annual_revenues_df['amount_usd'],
    latest_company_headcounts_df['headcount'],
    selected_company_stock_tickers_df['stock_ticker']
)

# Check the DataFrame 'resultant_company_info_joined_df'
# resultant_company_info_joined_df.show()

count_records(resultant_company_info_joined_df)


1000

***Transformation for the Person Employment Table***

In [44]:
count_records(person_employments_df)
# person_employments_df.show()

7898

In [45]:
person_employments_df.columns

['id',
 'person_id',
 'company_id',
 'company_name',
 'seniority_level',
 'title',
 'started_on',
 'ended_on',
 'created_at',
 'updated_at']

In [46]:
resultant_company_info_joined_df.columns

['company_id',
 'funding_name',
 'investor_company_id',
 'investor',
 'amount_usd',
 'headcount',
 'stock_ticker']

In [47]:
# Join the DataFrames on 'company_id'
resultant_company_info_employment_joined_df = person_employments_df.join(resultant_company_info_joined_df, 'company_id', 'left')

count_records(resultant_company_info_employment_joined_df)

# Define a new struct that includes the additional columns
emplopyment_struct = F.struct(
    "company_id", "company_name", "seniority_level", "title", "started_on", "ended_on",
    resultant_company_info_joined_df['amount_usd'],
    resultant_company_info_joined_df['headcount'],
    resultant_company_info_joined_df['stock_ticker'],
    resultant_company_info_joined_df['funding_name'],
    resultant_company_info_joined_df['investor_company_id'],
    resultant_company_info_joined_df['investor']

)

# Group by 'person_id' and collect list of employment details as structs
grouped_person_employments_df = resultant_company_info_employment_joined_df.groupBy("person_id").agg(
    F.collect_list(emplopyment_struct).alias("employments")
)

grouped_person_employments_df.show(1,truncate=False)


count_records(person_employments_df)

+---------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

7898

In [48]:
# Join the DataFrame 'people_person_customer_education_df' with 'person_employments_df' on 'person_id'
# The resulting DataFrame contains information about a person's education and employment
people_person_customer_education_educations_df = people_person_customer_education_df.join(grouped_person_employments_df, on='person_id', how='left') 

# people_person_customer_education_educations_df.show()

count_records(people_person_customer_education_educations_df)

999

***Transformation for the Person Social Urls Table***

In [49]:
# Group by person_id and collect list of URL details
grouped_urls_df = person_social_urls_df.groupBy("person_id").agg(
    F.collect_list(
        F.struct("url_type", "url")
    ).alias("urls")
)

# Show the result
# grouped_urls_df.show(truncate=False)

count_records(grouped_urls_df)

999

In [50]:
check_uniqueness(grouped_urls_df, 'person_id')

DataFrame is unique based on person_id


In [51]:
# Joining person_education_group_df and people_person_customer_df DataFrames on column named 'person_id'

people_person_customer_education_employments_social_urls_df = people_person_customer_education_educations_df.join(grouped_urls_df, on='person_id', how='left') 

count_records(people_person_customer_education_employments_social_urls_df)


999

In [52]:
check_uniqueness(people_person_customer_education_employments_social_urls_df, 'person_id')

DataFrame is unique based on person_id


***Final Dataframe***

In [53]:
# people_person_customer_education_employments_social_urls_df.printSchema()

In [54]:
people_person_customer_education_employments_social_urls_df.show()

+----------+-------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
| person_id|               name|             address|            headline|         description|          created_at|          updated_at|         customer_id|           education|         employments|                urls|
+----------+-------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
| 949256266|        Vivian Weng|NaN, NaN, NaN, Na...|Product Design, S...|                 NaN|2023-12-11 06:18:...|                 NaN|7538cb11-1c23-4c7...|[{NaN, University...|[{7962172.0, Frog...|[{linkedin, https...|
| 787025923|        Lynda Zhang|NaN, NaN, NaN, Na...|Retail Ownership ...|                 NaN|2023-12-11 06:18:

***Final Dataframe can be exported to files such as CSV, JSON, Parquet***

In [55]:
# Use the exiting dataframe
final_df = people_person_customer_education_employments_social_urls_df


In [56]:
final_df.show(1,truncate=False)

+---------+---------------+-----------------------+----------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------+----------------------+------------------------------------+---------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [57]:
final_df.write.mode('overwrite').json('./work/final_df')

In [58]:
final_df.printSchema()


root
 |-- person_id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- address: string (nullable = false)
 |-- headline: string (nullable = true)
 |-- description: string (nullable = true)
 |-- created_at: string (nullable = true)
 |-- updated_at: string (nullable = true)
 |-- customer_id: string (nullable = true)
 |-- education: array (nullable = true)
 |    |-- element: struct (containsNull = false)
 |    |    |-- institution_id: double (nullable = true)
 |    |    |-- institution_name: string (nullable = true)
 |    |    |-- degree: string (nullable = true)
 |    |    |-- subject: string (nullable = true)
 |    |    |-- started_on: string (nullable = true)
 |    |    |-- ended_on: string (nullable = true)
 |-- employments: array (nullable = true)
 |    |-- element: struct (containsNull = false)
 |    |    |-- company_id: double (nullable = true)
 |    |    |-- company_name: string (nullable = true)
 |    |    |-- seniority_level: string (nullable = true)
 |    |    |--

In [59]:
final_df.write.mode("overwrite").parquet("final_output/df4.parquet")


In [60]:
df = spark.read.parquet("final_output/df4.parquet")

In [61]:
df.show()

+----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
| person_id|                name|             address|            headline|         description|          created_at|          updated_at|         customer_id|           education|         employments|                urls|
+----------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
| 949256266|         Vivian Weng|NaN, NaN, NaN, Na...|Product Design, S...|                 NaN|2023-12-11 06:18:...|                 NaN|7538cb11-1c23-4c7...|[{NaN, University...|[{7962172.0, Frog...|[{linkedin, https...|
| 902184389|   Hendrik Bourgeois|NaN, NaN, NaN, Na...|Head of Governmen...|                 NaN|2023-12-11 0

In [62]:
# Create an empty list to store the column expressions
column_exprs = []

# For each column in the DataFrame
for col_name in df.columns:
    # Create a new column expression that concatenates the column name and the column value
    column_exprs.append(F.concat(F.lit(col_name + ":"), df[col_name].cast("string")))

# Concatenate all column expressions into one single column 'single_text'
single_file_for_each_person_df = df.select(F.concat_ws(' ', *column_exprs).alias('candidate_details'))

single_file_for_each_person_df.show(1,truncate=False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [63]:
df = df.select('person_id', 'name', 'address', 'headline', 'description', 'education', 'employments')
df.show(2,truncate=False)

+---------+-----------------+-----------------------------+-----------------------------------------------------+-----------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [64]:
from pyspark.sql.functions import regexp_replace, concat_ws

# Define a function to clean text
def clean_text(col):
    # Remove symbols and characters except ':'
    cleaned_col = regexp_replace(col, r'[^\w\s:]', '')
    # Replace 'null' with single whitespace
    cleaned_col = regexp_replace(cleaned_col, r'\bnull\b', ' ')
    # Remove extra whitespaces
    cleaned_col = regexp_replace(cleaned_col, r'\s+', ' ')
    return cleaned_col

# Apply the clean_text function to each column
cleaned_columns = [clean_text(col).alias(col) for col in single_file_for_each_person_df.columns]

# Concatenate the cleaned columns into a new column containing plain text
plain_text_df = single_file_for_each_person_df.select(concat_ws(' ', *cleaned_columns).alias('plain_text'))

# Show the resulting DataFrame
plain_text_df.show(2,truncate=False)
plain_text_df.count()

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

999

In [65]:
# Select only the first row
first_row = plain_text_df.tail(1)

# Show the first row
print(first_row)

[Row(plain_text='person_id:56622026 name:Erin Nopp address:NaN Omaha Nebraska NaN US headline:Strategic Account Manager at Allergan Aesthetics an AbbVie Company description:My passion is simplebusiness I love all aspects of it the way it grows when fed the right ingredients the way smart processes create positive change and the checkpoints which maintain and reward progress I live in the numbers the minutiae of day to day operations and the tasks which many business owners dreadnnI have found a special balance with aesthetic professionals and learned my genuine valueoriented approach pairs well with this powerful industry I love to be behindthescenes planning promotions analyzing inventory levels increasing margins and helping business owners succeed while focusing on what they do best Nothing fills me with more joy than seeing a business owner radiating power and accomplishment due to our collaboration created_at:20231211 06:18:0813904 updated_at:20231026 15:57:2235 education:NaN Colo

In [66]:
all_rows_list = plain_text_df.collect()

# Convert all rows to dictionaries
all_rows_dicts = [row.asDict() for row in all_rows_list]

# Convert the dictionaries to a pandas DataFrame
all_rows_df = pd.DataFrame(all_rows_dicts)


# Save all rows to a single CSV file, overwriting if it already exists
all_rows_df.to_csv('input_people_data.csv', index=False, mode='w', header=True)


# Print the total number of rows
print("Total number of rows:", len(all_rows_df))
# Print the first row of the DataFrame
# Set the display options to show all columns and rows
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', None)

# Print the first row of the DataFrame
# Convert the first row to a string and print it
# Print the 'plain_text' column of the first row
print(all_rows_df.iloc[0]['plain_text'])

Total number of rows: 999
person_id:949256266 name:Vivian Weng address:NaN NaN NaN NaN SINGAPORE headline:Product Design Strategy Managment description:NaN created_at:20231211 06:18:0813904 updated_at:NaN customer_id:7538cb111c234c7687c75111a0f166dc education:NaN University of California Berkeley Walter A Haas School of Business NaN MBA Finance 20070101 20090101 NaN National Taiwan University NaN BA International Relations 19980101 20020101 NaN Yale University NaN MA International Developmental Economics 20020101 20030101 employments:79621720 Frog Director Associate Strategy Director Innovation Strategy Group 20120201 20160601 158610 DBS Bank VP Vice President Innovation Group 20160701 20190401 16600 Apple NaN Product Design Producer International Product Design 20190401 NaN 31949031445E10 224704 AAPL Private Equity 60840 Interscope Records 590 Goldman Sachs NaN Investment Banking Division Summer Associate 20080601 20080801 8417546311E9 75859 GS 40070 McKinsey Company NaN Business Anal