In [None]:
from pyspark.sql import SparkSession

# Create the Spark session for this notebook with explicit driver and
# executor sizing (memory, cores per executor, number of executors).
spark = (
    SparkSession.builder
    .appName("zestaw4")
    .config("spark.driver.memory", "8g")
    .config("spark.executor.memory", "5g")
    .config("spark.executor.cores", "2")
    .config("spark.executor.instances", "2")
    .getOrCreate()
)

# SparkContext that backs the session.
sc = spark.sparkContext

# Last expression of the cell: renders the session summary as cell output.
spark

In [1]:
from pyspark.sql import SparkSession

# Create the Spark session for this notebook; only driver and executor
# memory are set here, everything else is left at its default.
spark = (
    SparkSession.builder
    .appName("zestaw4")
    .config("spark.driver.memory", "8g")
    .config("spark.executor.memory", "5g")
    .getOrCreate()
)

# SparkContext that backs the session.
sc = spark.sparkContext

# Last expression of the cell: renders the session summary as cell output.
spark

In [2]:
# Root directory of the input data.
# Alternative (cloud bucket): input_dir = "gs://pbd-24-jg-1/project1_data"
input_dir = "/home/jovyan/data"
# DO NOT CHANGE
# Paths of the source data sets, derived from input_dir.
datasource1_dir = f"{input_dir}/datasource11"
datasource4_dir = f"{input_dir}/datasource4"

# Names and paths of the results of the main task.
# Part 1 (Spark Core - RDD): output directory.
rdd_result_dir = "/tmp/output1"

# Part 2 (Spark SQL - DataFrame): output table name.
df_result_table = "output2"

# Part 3 (Pandas API on Spark): output JSON file.
ps_result_file = "/tmp/output3.json"

In [13]:
import pyspark.pandas as ps

# Load both tab-separated sources as pandas-on-Spark DataFrames and give
# their columns explicit names.

# datasource1 is read without a header row (header=None).
datasource1 = ps.read_csv(datasource1_dir, sep="\t", header=None)
datasource1.columns = [
    "tconst",
    "ordering",
    "nconst",
    "role",
    "job",
    "characters",
]

# datasource4 is read with its first row as the header (header=0); the
# column names taken from the file are then overwritten.
datasource4 = ps.read_csv(datasource4_dir, sep="\t", header=0)
datasource4.columns = [
    "nconst",
    "primaryName",
    "birthYear",
    "deathYear",
    "primaryProfession",
    "knownForTitles",
]

# Normalize roles: collapse "actor", "actress" and "self" into the single
# label "performer"; every other role value is kept unchanged.
# Series.replace is a built-in column operation, so no Python function has
# to be applied row by row and no sample run is needed to infer the result
# type (both of which an un-hinted .apply(lambda ...) requires).
normalized_roles = datasource1.assign(
    normalized_role=datasource1["role"].replace(
        ["actor", "actress", "self"], "performer"
    )
)

# Filter for full-cast movies
def filter_roles(roles) -> bool:
    """Tell whether a title's collection of roles counts as a full cast.

    A collection qualifies when it contains both "performer" and
    "director" and holds more than three entries in total.

    The explicit ``-> bool`` return annotation matters when this function
    is passed to pandas-on-Spark ``Series.apply``: with a return type hint
    the result type is taken from the annotation instead of being inferred
    by running the function on a sample of the data first.

    Args:
        roles: Sized container of role names supporting ``in`` and ``len``
            (here: the per-title result of the "collect_set" aggregation).

    Returns:
        True if the roles describe a full cast, False otherwise.
    """
    # bool(...) guarantees a plain Python bool, matching the annotation.
    return bool(
        "performer" in roles and "director" in roles and len(roles) > 3
    )

# One row per title, carrying the set of distinct normalized roles seen
# for that title.
roles_grouped = (
    normalized_roles
    .groupby("tconst")
    .agg({"normalized_role": "collect_set"})
    .reset_index()
    .rename(columns={"normalized_role": "roles"})
)

# Keep only the ids of the titles whose role set passes filter_roles.
is_full_cast = roles_grouped["roles"].apply(filter_roles)
full_cast_movies = roles_grouped[is_full_cast][["tconst"]]

# Restrict the original (non-normalized) role rows to full-cast titles.
full_cast_credits = full_cast_movies.merge(datasource1, on="tconst")
full_cast_roles = full_cast_credits[["tconst", "nconst", "role"]]

# Count rows per (person, role); the role column is exposed as
# "profession" and the count as "movies".
full_cast_roles_count = (
    full_cast_roles
    .groupby(["nconst", "role"])
    .agg({"tconst": "count"})
    .reset_index()
    .rename(columns={"role": "profession", "tconst": "movies"})
)

# Split the comma-separated primaryProfession string into a list; any
# non-string value becomes an empty list.
profession_lists = datasource4["primaryProfession"].apply(
    lambda value: [] if not isinstance(value, str) else value.split(",")
)

# One row per (person, profession).
people_by_profession = datasource4.assign(profession=profession_lists).explode("profession")

# Drop the "miscellaneous" profession.
actor_data = people_by_profession[people_by_profession["profession"] != "miscellaneous"]

# Number of people per profession.
profession_counts = (
    actor_data
    .groupby("profession")
    .agg({"nconst": "count"})
    .reset_index()
    .rename(columns={"nconst": "count"})
)

# The four professions with the highest counts.
top_professions = profession_counts.sort_values("count", ascending=False).head(4)

# Attach each person's name to the per-person role counts ...
person_names = datasource4[["nconst", "primaryName"]]
movies_per_person = full_cast_roles_count.merge(person_names, on="nconst", how="inner")

# ... and keep only the rows whose profession is one of the top professions.
allowed_professions = top_professions[["profession"]]
movies_per_person = movies_per_person.merge(allowed_professions, on="profession", how="inner")


def _three_most_credited(group):
    """Return the three rows of one profession group with the most movies."""
    return group.sort_values("movies", ascending=False).head(3)


# Top three people per profession.
movies_per_person_top3 = movies_per_person.groupby("profession").apply(_three_most_credited)

# Discard the index produced by the grouped apply and keep only the
# columns that make up the final result.
persons_dropped = movies_per_person_top3.reset_index(drop=True)
persons_to_json = persons_dropped[["profession", "primaryName", "movies"]]




In [14]:
# Display the final table: up to three people per profession with the
# columns profession, primaryName and movies.
persons_to_json

Unnamed: 0,profession,primaryName,movies
0,actor,Vivek Gopan,17
1,actor,Luis Eduardo Motoa,12
2,actor,Ronit Roy,11
3,actress,Gayathri Arun,17
4,actress,Smriti Malhotra-Irani,16
5,actress,Divyanka Tripathi,13
6,producer,Shobha Kapoor,70
7,producer,Ekta Kapoor,49
8,producer,Ernesto Alonso,28
9,writer,Pradeep Panicker,24
