In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import count, col, concat_ws, row_number, desc
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.types import StructType, StructField, StringType, TimestampType
import pandas as pd
import os
import json

In [2]:
# Initialize SparkSession
spark = SparkSession.builder \
    .appName('aws_connection') \
    .getOrCreate()

In [3]:
# Preparing Master Pyspark DataFrame, Will have all the Features, 
# and the respective prompts will have specific features 

# Define the base path
base_path = '/home/jovyan/work/data_files/'

# Read CSV files into DataFrames
people_df = pd.read_csv(base_path + 'people.csv')
person_educations_df = pd.read_csv(base_path + 'person_educations.csv')
person_employments_df = pd.read_csv(base_path + 'person_employments.csv')
company_sectors_df = pd.read_csv(base_path + 'company_sectors.csv')
company_headcounts_df = pd.read_csv(base_path + 'company_headcounts.csv')
company_annual_revenues_df = pd.read_csv(base_path + 'company_annual_revenues.csv')



In [4]:
people_spark_df=spark.createDataFrame(people_df)
person_educations_spark_df=spark.createDataFrame(person_educations_df)
person_employments_spark_df=spark.createDataFrame(person_employments_df)
company_sectors_spark_df=spark.createDataFrame(company_sectors_df)
company_headcounts_spark_df=spark.createDataFrame(company_headcounts_df)
company_annual_revenues_spark_df=spark.createDataFrame(company_annual_revenues_df)


In [5]:
people_df = people_spark_df
person_educations_df = person_educations_spark_df
person_employments_df = person_employments_spark_df
company_sectors_df = company_sectors_spark_df
company_headcounts_df = company_headcounts_spark_df
company_annual_revenues_df = company_annual_revenues_spark_df

***Function to check the uniqueness of data***

In [6]:
def check_uniqueness(df, column_name):
    # Add a new column 'is_duplicate' that flags if column_name is duplicated
    df_duplicates_check = df.withColumn('is_duplicate', count(column_name).over(Window.partitionBy(column_name)) > 1)

    # If any 'is_duplicate' is True, then DataFrame is not unique based on column_name
    if df_duplicates_check.filter(col('is_duplicate')).count() > 0:
        print(f"DataFrame is not unique based on {column_name}")
    else:
        print(f"DataFrame is unique based on {column_name}")

In [7]:
check_uniqueness(people_df, 'id')

DataFrame is unique based on id


***Function to check the counts of data***

In [8]:
def count_records(df):
    """
    This function counts the number of records in a DataFrame.

    Parameters:
    df (DataFrame): The DataFrame for which to count the records.

    Returns:
    int: The number of records in the DataFrame.
    """
    # Use the count() function to count the number of rows in the DataFrame
    num_records = df.count()

    # Return the number of records
    return num_records

***People Table Transformation***

In [49]:
# Select specific columns from the 'people_df' DataFrame, rename the 'id' column to 'person_id',
# and concatenate the 'address', 'city', 'region', 'postal_code', and 'country' columns with a comma separator
people_selected_df = people_df.select('id', 'name')\
                              .withColumnRenamed('id', 'person_id')
                              

people_selected_df = people_selected_df.select('person_id', 'name')

# people_selected_df.show()

In [50]:
count_records(people_selected_df)
#people_selected_df.show(1)

999

***Transformation for the "person_customer" table***

***Transformation for "Person Education" table***

In [51]:
count_records(person_educations_df)
# person_educations_df.show()

1209

In [52]:
# Define a struct for each institute
institute_struct = F.struct(
     "institution_name", "degree", "subject", "started_on", "ended_on"
)

# Group by person_id and collect list of institute details as structs
grouped_educations_df = person_educations_df.groupBy("person_id").agg(
    F.collect_list(institute_struct).alias("education")
    # ("education - (institution_id, institution_name, degree, subject, started_on, ended_on)")
)

person_education_group_df = grouped_educations_df



# Show the result
# person_education_group_df.show(truncate=False)


In [53]:
check_uniqueness(person_education_group_df, 'person_id')
#person_education_group_df.printSchema()

DataFrame is unique based on person_id


In [54]:
# Joining person_education_group_df and people_person_customer_df DataFrames on column named 'person_id'

people_person_customer_education_df = people_selected_df.join(person_education_group_df, on='person_id', how='left') 

# people_person_customer_education_df.show()

count_records(people_person_customer_education_df)

999

***Transformation for Company related information***

In [55]:
count_records(company_sectors_df)

5711

In [56]:
# Group the DataFrame 'company_sectors_df' by 'company_id'
grouped_company_sectors_df = company_sectors_df.groupBy('company_id').agg(
    F.collect_list('sector').alias('sectors')
)

# Check the DataFrame 'grouped_company_sectors_df'
# grouped_company_sectors_df.show()

count_records(grouped_company_sectors_df)
grouped_company_sectors_df.printSchema()

root
 |-- company_id: long (nullable = true)
 |-- sectors: array (nullable = false)
 |    |-- element: string (containsNull = false)



In [57]:
count_records(company_annual_revenues_df)


18532

In [58]:
# Define a window partitioned by 'company_id' and ordered by 'date' in descending order
window = Window.partitionBy('company_id').orderBy(F.desc('date'))

# Add a new column 'rank' to the DataFrame 'company_annual_revenues_df'
# The 'rank' is calculated over the defined window
grouped_company_annual_revenues_df = company_annual_revenues_df.withColumn('rank', F.rank().over(window))

# Filter the DataFrame to keep only the rows where 'rank' is 1
# This gives us the latest annual revenue for each company
# Select only the 'company_id' and 'amount_usd' columns for the final DataFrame
latest_company_annual_revenues_df = grouped_company_annual_revenues_df.filter(F.col('rank') == 1).select('company_id', 'amount_usd')

# Check the DataFrame 'latest_company_annual_revenues_df'
# latest_company_annual_revenues_df.show()

count_records(latest_company_annual_revenues_df)

999

In [59]:
count_records(company_headcounts_df)


1000

In [60]:
# Define a window partitioned by 'headcount' and ordered by 'date' in descending order
window = Window.partitionBy('headcount').orderBy(F.desc('date'))

# Add a new column 'rank' to the DataFrame 'company_headcounts_df'
# The 'rank' is calculated over the defined window
grouped_company_headcounts_df = company_headcounts_df.withColumn('rank', F.rank().over(window))

# Filter the DataFrame to keep only the rows where 'rank' is 1
# This gives us the latest headcount for each company
# Select only the 'company_id' and 'headcount' columns for the final DataFrame
latest_company_headcounts_df = grouped_company_headcounts_df.filter(F.col('rank') == 1).select('company_id', 'headcount')

# Check the DataFrame 'latest_company_headcounts_df'
# latest_company_headcounts_df.show()

count_records(latest_company_headcounts_df)

847

***Joined the company information related transformed dataframes***

In [61]:

# Join the DataFrame 'grouped_company_sectors_df' with 'latest_company_annual_revenues_df', 'latest_company_headcounts_df', and 'selected_company_stock_tickers_df' on 'company_id'
# The join type is 'left', meaning only the rows with a match in both DataFrames will be kept
company_info_joined_df = grouped_company_sectors_df.join(
    latest_company_annual_revenues_df, 'company_id', 'left'
).join(
    latest_company_headcounts_df, 'company_id', 'left'
)

# count_records(company_info_joined_df)

# Select the 'company_id', 'amount_usd', 'headcount', and 'stock_ticker' columns from the joined DataFrame
resultant_company_info_joined_df = company_info_joined_df.select(
    'company_id',
    latest_company_annual_revenues_df['amount_usd'],
    latest_company_headcounts_df['headcount'],
)

# Check the DataFrame 'resultant_company_info_joined_df'
# resultant_company_info_joined_df.show()

count_records(resultant_company_info_joined_df)


1000

***Transformation for the Person Employment Table***

In [62]:
person_employments_df.printSchema()

root
 |-- id: long (nullable = true)
 |-- person_id: long (nullable = true)
 |-- company_id: double (nullable = true)
 |-- company_name: string (nullable = true)
 |-- seniority_level: string (nullable = true)
 |-- title: string (nullable = true)
 |-- position_description: double (nullable = true)
 |-- started_on: string (nullable = true)
 |-- ended_on: string (nullable = true)
 |-- created_at: string (nullable = true)
 |-- updated_at: double (nullable = true)



In [63]:
# Join the DataFrames on 'company_id'
resultant_company_info_employment_joined_df = person_employments_df.join(resultant_company_info_joined_df, 'company_id', 'left')

count_records(resultant_company_info_employment_joined_df)

# Define a new struct that includes the additional columns
emplopyment_struct = F.struct(
    "company_id", "company_name", "seniority_level", "title","position_description", "started_on", "ended_on",
    resultant_company_info_joined_df['amount_usd'],
    resultant_company_info_joined_df['headcount']
)

# Group by 'person_id' and collect list of employment details as structs
grouped_person_employments_df = resultant_company_info_employment_joined_df.groupBy("person_id").agg(
    F.collect_list(emplopyment_struct).alias("employments")
)

grouped_person_employments_df.show(1,truncate=False)


count_records(person_employments_df)
grouped_person_employments_df.printSchema()

+---------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [64]:
# Join the DataFrame 'people_person_customer_education_df' with 'person_employments_df' on 'person_id'
# The resulting DataFrame contains information about a person's education and employment
people_person_customer_education_educations_df = people_person_customer_education_df.join(grouped_person_employments_df, on='person_id', how='left') 

# people_person_customer_education_educations_df.show()

count_records(people_person_customer_education_educations_df)

999

***Transformation for the Person Social Urls Table***

In [65]:
# Joining person_education_group_df and people_person_customer_df DataFrames on column named 'person_id'

people_person_customer_education_employments_social_urls_df = people_person_customer_education_educations_df


In [66]:
check_uniqueness(people_person_customer_education_employments_social_urls_df, 'person_id')

DataFrame is unique based on person_id


***Final Dataframe***

In [67]:
# people_person_customer_education_employments_social_urls_df.printSchema()

In [68]:
people_person_customer_education_employments_social_urls_df.show()

+----------+-------------------+--------------------+--------------------+
| person_id|               name|           education|         employments|
+----------+-------------------+--------------------+--------------------+
| 949256266|        Vivian Weng|[{University of C...|[{7962172.0, Frog...|
| 787025923|        Lynda Zhang|                NULL|[{NaN, NaN, Manag...|
| 837503042|         Clara Têtu|                NULL|[{400708.0, Clari...|
| 789056453|  Federico Maggiani|                NULL|[{40487.0, MindSh...|
|  57288072|  Anthony Salvaggio|[{Monroe Communit...|[{1660.0, Apple, ...|
| 693540132|    Lika Razac-Ince|                NULL|[{NaN, NaN, NaN, ...|
|1017713620|      Sarina Studer|                NULL|[{NaN, NaN, NaN, ...|
| 856482513|        Karl Dubost|                NULL|[{NaN, NaN, Manag...|
| 118418234|     Kenny Sheridan|                NULL|[{NaN, NaN, NaN, ...|
|  64876372|        David Coyle|[{University Of W...|[{4376.0, Blackbe...|
| 741629676|  Iason Bakog

***Final Dataframe can be exported to files such as CSV, JSON, Parquet***

In [69]:
# Use the exiting dataframe
final_df = people_person_customer_education_employments_social_urls_df


In [71]:
final_df.show(1,truncate=False)

+---------+---------------+---------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [72]:
final_df.write.mode('overwrite').json('./work/final_df')

In [73]:
final_df.printSchema()


root
 |-- person_id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- education: array (nullable = true)
 |    |-- element: struct (containsNull = false)
 |    |    |-- institution_name: string (nullable = true)
 |    |    |-- degree: string (nullable = true)
 |    |    |-- subject: string (nullable = true)
 |    |    |-- started_on: string (nullable = true)
 |    |    |-- ended_on: string (nullable = true)
 |-- employments: array (nullable = true)
 |    |-- element: struct (containsNull = false)
 |    |    |-- company_id: double (nullable = true)
 |    |    |-- company_name: string (nullable = true)
 |    |    |-- seniority_level: string (nullable = true)
 |    |    |-- title: string (nullable = true)
 |    |    |-- position_description: double (nullable = true)
 |    |    |-- started_on: string (nullable = true)
 |    |    |-- ended_on: string (nullable = true)
 |    |    |-- amount_usd: double (nullable = true)
 |    |    |-- headcount: long (nullable = true)



In [74]:
final_df.write.mode("overwrite").parquet("final_output/df4.parquet")


In [75]:
df = spark.read.parquet("final_output/df4.parquet")

In [76]:
df.show(1)

+---------+-----------+--------------------+--------------------+
|person_id|       name|           education|         employments|
+---------+-----------+--------------------+--------------------+
|949256266|Vivian Weng|[{University of C...|[{7962172.0, Frog...|
+---------+-----------+--------------------+--------------------+
only showing top 1 row



In [77]:
df.printSchema()

root
 |-- person_id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- education: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- institution_name: string (nullable = true)
 |    |    |-- degree: string (nullable = true)
 |    |    |-- subject: string (nullable = true)
 |    |    |-- started_on: string (nullable = true)
 |    |    |-- ended_on: string (nullable = true)
 |-- employments: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- company_id: double (nullable = true)
 |    |    |-- company_name: string (nullable = true)
 |    |    |-- seniority_level: string (nullable = true)
 |    |    |-- title: string (nullable = true)
 |    |    |-- position_description: double (nullable = true)
 |    |    |-- started_on: string (nullable = true)
 |    |    |-- ended_on: string (nullable = true)
 |    |    |-- amount_usd: double (nullable = true)
 |    |    |-- headcount: long (nullable = true)



In [78]:
#date 16052024 - Dropping features,
from pyspark.sql.functions import expr

# Transform the 'employments' array, removing 'funding_name' field
df1 = df.withColumn(
    "employments",
    expr("transform(employments, x -> named_struct('company_name', x.company_name, 'seniority_level', x.seniority_level, 'title', x.title, 'position_description', x.position_description, 'started_on', x.started_on, 'ended_on', x.ended_on, 'amount_usd', x.amount_usd, 'headcount', x.headcount))")
)


In [79]:
df1.printSchema()


root
 |-- person_id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- education: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- institution_name: string (nullable = true)
 |    |    |-- degree: string (nullable = true)
 |    |    |-- subject: string (nullable = true)
 |    |    |-- started_on: string (nullable = true)
 |    |    |-- ended_on: string (nullable = true)
 |-- employments: array (nullable = true)
 |    |-- element: struct (containsNull = false)
 |    |    |-- company_name: string (nullable = true)
 |    |    |-- seniority_level: string (nullable = true)
 |    |    |-- title: string (nullable = true)
 |    |    |-- position_description: double (nullable = true)
 |    |    |-- started_on: string (nullable = true)
 |    |    |-- ended_on: string (nullable = true)
 |    |    |-- amount_usd: double (nullable = true)
 |    |    |-- headcount: long (nullable = true)



In [80]:
df1.show(1)

+---------+-----------+--------------------+--------------------+
|person_id|       name|           education|         employments|
+---------+-----------+--------------------+--------------------+
|949256266|Vivian Weng|[{University of C...|[{Frog, Director,...|
+---------+-----------+--------------------+--------------------+
only showing top 1 row



In [81]:
#date 16052024 -Build specific features, prompt tunning
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, LongType, StringType, ArrayType

# Define the schema for the DataFrame
schema = StructType([
    StructField("person_id", LongType(), True),
    StructField("name", StringType(), True),
    StructField("description", StringType(), True),
    StructField("education", ArrayType(StructType([
        StructField("institution_id", StringType(), True),
        StructField("institution_name", StringType(), True),
        StructField("degree", StringType(), True),
        StructField("subject", StringType(), True),
        StructField("started_on", StringType(), True),
        StructField("ended_on", StringType(), True)
    ])), True),
    StructField("employments", ArrayType(StructType([
        StructField("company_name", StringType(), True),
        StructField("seniority_level", StringType(), True),
        StructField("title", StringType(), True),
        StructField("position_description", StringType(), True),
        StructField("started_on", StringType(), True),
        StructField("ended_on", StringType(), True),
        StructField("amount_usd", LongType(), True),
        StructField("headcount", LongType(), True)
        
    ])), True)
])


In [82]:
json_df = df1.toJSON()

# Count the number of objects
num_objects = json_df.count()
print("Number of objects:", num_objects)

# Print the first object
first_object = json_df.first()
print("First object:")
print(first_object)


Number of objects: 999
First object:
{"person_id":949256266,"name":"Vivian Weng","education":[{"institution_name":"University of California, Berkeley - Walter A. Haas School of Business","degree":"NaN","subject":"MBA, Finance","started_on":"2007-01-01","ended_on":"2009-01-01"},{"institution_name":"National Taiwan University","degree":"NaN","subject":"BA, International Relations","started_on":"1998-01-01","ended_on":"2002-01-01"},{"institution_name":"Yale University","degree":"NaN","subject":"MA, International Developmental Economics","started_on":"2002-01-01","ended_on":"2003-01-01"}],"employments":[{"company_name":"Frog","seniority_level":"Director","title":"Associate Strategy Director, Innovation Strategy Group","position_description":"NaN","started_on":"02/01/2012","ended_on":"06/01/2016"},{"company_name":"DBS Bank","seniority_level":"VP","title":"Vice President, Innovation Group","position_description":"NaN","started_on":"07/01/2016","ended_on":"04/01/2019"},{"company_name":"Apple"

In [83]:
def save_json_objects(json_df, file_path):
    """
    Save all JSON objects in a DataFrame to a JSON file.

    Args:
        json_df (DataFrame): DataFrame containing JSON objects.
        file_path (str): The path where the JSON file will be saved.
    """
    json_objects = json_df.collect()  # Collect all JSON objects
    with open(file_path, 'w') as file:
        for json_obj in json_objects:
            file.write(json_obj + '\n')  # Write each JSON object to file
    print(f"All JSON objects saved to file: {file_path}")

In [47]:
save_json_objects(json_df, "input_people_data.json")

All JSON objects saved to file: input_people_data.json


In [48]:
print("\n****************completed ******************")


****************completed ******************
