In [4]:
# Install Java and Spark on Google Colab
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-3.3.2/spark-3.3.2-bin-hadoop3.tgz
!tar xf spark-3.3.2-bin-hadoop3.tgz
!pip install -q findspark

import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.3.2-bin-hadoop3"

import findspark
findspark.init()


In [5]:
# Import PySpark and start a SparkSession
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").appName("BigDataMLJobsAnalysis").getOrCreate()

In [6]:
# Load the CSV data into a Spark DataFrame
file_path = "/content/1000_ml_jobs_us.csv"
df = spark.read.csv(file_path, header=True, inferSchema=True)

# Show schema and preview data
df.printSchema()
df.show(5)

root
 |-- _c0: string (nullable = true)
 |-- job_posted_date: string (nullable = true)
 |-- company_address_locality: string (nullable = true)
 |-- company_address_region: string (nullable = true)
 |-- company_name: string (nullable = true)
 |-- company_website: string (nullable = true)
 |-- company_description: string (nullable = true)
 |-- job_description_text: string (nullable = true)
 |-- seniority_level: string (nullable = true)
 |-- job_title: string (nullable = true)

+--------------------+-------------------+------------------------+----------------------+--------------------+--------------------+--------------------+--------------------+---------------+--------------------+
|                 _c0|    job_posted_date|company_address_locality|company_address_region|        company_name|     company_website| company_description|job_description_text|seniority_level|           job_title|
+--------------------+-------------------+------------------------+----------------------+------

In [7]:
# Basic data cleaning and exploration
# Count total rows
total_jobs = df.count()
print(f"Total ML Job Listings: {total_jobs}")

Total ML Job Listings: 3210


In [9]:
# Top 10 job titles by frequency
print("\nTop 10 Job Titles:")
df.groupBy("job_title").count().orderBy("count", ascending=False).show(10, truncate=False)


Top 10 Job Titles:
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----+
|job_title                                                                                                                                                                                                                                    |count|
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----+
|NULL                                                                                                                                                                                                                                         |258

In [10]:
#  Top 10 companies hiring the most
print("\nTop 10 Companies:")
df.groupBy("company_name").count().orderBy("count", ascending=False).show(10, truncate=False)


Top 10 Companies:
+------------------------------------------------------------------------+-----+
|company_name                                                            |count|
+------------------------------------------------------------------------+-----+
|NULL                                                                    |1295 |
| Europe                                                                 |101  |
|TikTok                                                                  |88   |
| Instagram and WhatsApp further empowered billions around the world. Now|39   |
|Meta                                                                    |39   |
|Machine Learning Engineer                                               |29   |
|Snap Inc.                                                               |18   |
|Adobe                                                                   |18   |
|Splunk                                                                  |18   |
|DoorDash

In [11]:
#  Top 5 states with most job listings (assuming 'Location' column has state info)
if "Location" in df.columns:
    from pyspark.sql.functions import split
    df_states = df.withColumn("State", split(col("Location"), ", ").getItem(1))
    print("\nTop 5 States by Job Listings:")
    df_states.groupBy("State").count().orderBy("count", ascending=False).show(5)

In [13]:
# Show more insights (Example: Jobs by company and job title)
print("\nJobs Count by Company and Job Title:")
df.groupBy("company_name", "job_title").count().orderBy("count", ascending=False).show(10, truncate=False)

# Stop the Spark session
spark.stop()


Jobs Count by Company and Job Title:
+------------------------------------------------------------------------+------------------------+-----+
|company_name                                                            |job_title               |count|
+------------------------------------------------------------------------+------------------------+-----+
|NULL                                                                    |NULL                    |1295 |
|TikTok                                                                  |NULL                    |88   |
| Europe                                                                 |NULL                    |62   |
| Instagram and WhatsApp further empowered billions around the world. Now|NULL                    |39   |
|Meta                                                                    |NULL                    |39   |
|Machine Learning Engineer                                               |NULL                    |29   |
|Snap In