In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, countDistinct, isnan, when
import pyspark.sql.types as t 
from pyspark.sql.functions import desc
from pyspark.sql import functions as F



In [2]:
def get_counts_and_distinct_values(df, col_name):
    count_df = df.groupBy(col_name).count()
    distinct_df = df.select(col_name).distinct().orderBy(F.col(col_name).desc())
    result_df = count_df.join(distinct_df, on=col_name, how="inner")
    return result_df


def filter_df_by_column_values(df, col_name, values, invert_filter=False):
    if invert_filter:
        condition = ~F.col(col_name).isin(values)
    else:
        condition = F.col(col_name).isin(values)
    
    return df.filter(condition)
    

In [3]:
spark = SparkSession.builder.getOrCreate()

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/03/19 22:09:55 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


### **title.basics.tsv**

|      Name      |    Type      |   Description   |
| -------------  | ------------ | ----------------|
| tconst         | string       | alphanumeric unique identifier of the title |
| titleType      | string       | the type/format of the title (e.g. movie, short, tvseries, tvepisode, video, etc) |
| primaryTitle   | string       | the more popular title / the title used by the filmmakers on promotional materials at the point of release |
| originalTitle  | string       | original title, in the original language |
| isAdult        | boolean      | 0: non-adult title; 1: adult title |
| startYear      | YYYY         | represents the release year of a title. In the case of TV Series, it is the series start year |
| endYear        | YYYY         | TV Series end year. ‘\N’ for all other title types |
| runtimeMinutes | integer      | primary runtime of the title, in minutes |
| genres         | string array | includes up to three genres associated with the title |

In [4]:
title_basics_schema = t.StructType([
    t.StructField('tconst', t.StringType(), True),
    t.StructField('titleType', t.StringType(), True),
    t.StructField('primaryTitle', t.StringType(), True),
    t.StructField('originalTitle', t.StringType(), True),
    t.StructField('isAdult', t.IntegerType(), True),
    t.StructField('startYear', t.DateType(), True),
    t.StructField('endYear', t.DateType(), True),
    t.StructField('runtimeMinutes', t.IntegerType(), True),
    t.StructField('genres', t.StringType(), True),
])

In [5]:
df = spark.read.csv(f'../data/title.basics.tsv', 
                    sep=r'\t', 
                    header=True,
                    schema=title_basics_schema,
                    inferSchema=True)
df.show()

+---------+---------+--------------------+--------------------+-------+----------+-------+--------------+--------------------+
|   tconst|titleType|        primaryTitle|       originalTitle|isAdult| startYear|endYear|runtimeMinutes|              genres|
+---------+---------+--------------------+--------------------+-------+----------+-------+--------------+--------------------+
|tt0000001|    short|          Carmencita|          Carmencita|      0|1894-01-01|   null|             1|   Documentary,Short|
|tt0000002|    short|Le clown et ses c...|Le clown et ses c...|      0|1892-01-01|   null|             5|     Animation,Short|
|tt0000003|    short|        Poor Pierrot|      Pauvre Pierrot|      0|1892-01-01|   null|             5|Animation,Comedy,...|
|tt0000004|    short|         Un bon bock|         Un bon bock|      0|1892-01-01|   null|            12|     Animation,Short|
|tt0000005|    short|    Blacksmith Scene|    Blacksmith Scene|      0|1893-01-01|   null|             1|      

                                                                                

In [6]:
row_count = df.count()
print(f"Total Rows: {row_count}")



Total Rows: 11530944


                                                                                

In [6]:
df = df.withColumn('genres', F.when(F.col('genres') == r'\N', None).otherwise(F.col('genres')))
# df = df.withColumn('genres', F.split('genres', ','))


In [7]:
df = df.replace("\\N", None)

nan_counts = df.select([
    (
        F.count(F.when((F.isnan(c) | F.col(c).isNull()), c)) 
        if t in ("float", "double") 
        else F.count(F.when(F.col(c).isNull(), c))
    ).alias(c)
    for c, t in df.dtypes if c in df.columns  
])

print("NaN Counts per Column:")
nan_counts.show()


NaN Counts per Column:




+------+---------+------------+-------------+-------+---------+--------+--------------+------+
|tconst|titleType|primaryTitle|originalTitle|isAdult|startYear| endYear|runtimeMinutes|genres|
+------+---------+------------+-------------+-------+---------+--------+--------------+------+
|     0|        0|           0|            0|      1|  1425470|11393338|       7852394|508808|
+------+---------+------------+-------------+-------+---------+--------+--------------+------+



                                                                                

In [8]:
unique_counts = df.select([countDistinct(c).alias(c) for c in df.columns])
print("Unique Values per Column:")
unique_counts.show()

Unique Values per Column:




+--------+---------+------------+-------------+-------+---------+-------+--------------+------+
|  tconst|titleType|primaryTitle|originalTitle|isAdult|startYear|endYear|runtimeMinutes|genres|
+--------+---------+------------+-------------+-------+---------+-------+--------------+------+
|11530944|       11|     5184830|      5209859|     44|      151|     95|           939|  2384|
+--------+---------+------------+-------------+-------+---------+-------+--------------+------+



                                                                                

## isAdult column

In [8]:
distinct_isAdult = get_counts_and_distinct_values(df, "isAdult")
distinct_isAdult.show()

                                                                                

+-------+--------+
|isAdult|   count|
+-------+--------+
|      1|  372479|
|   2019|       7|
|      0|11157630|
|   2020|       9|
|   1981|      18|
|   2017|      17|
|   2023|       7|
|   2022|       2|
|   2011|       1|
|   1959|       1|
|   1975|      18|
|   1977|      20|
|   2018|       9|
|   1974|      33|
|   2015|      28|
|   1978|     130|
|   2013|       8|
|   1988|       5|
|   1968|      24|
|   2014|       6|
+-------+--------+
only showing top 20 rows



In [9]:
isAdult_ordered = distinct_isAdult.select("isAdult").distinct().orderBy(F.col("isAdult").desc())

for value in isAdult_ordered.head(1):  
    value = value['isAdult']
    df.filter(F.col("isAdult") == value).show()


[Stage 17:>                                                         (0 + 3) / 3]

+----------+---------+--------------------+-------------+-------+---------+-------+--------------+------+
|    tconst|titleType|        primaryTitle|originalTitle|isAdult|startYear|endYear|runtimeMinutes|genres|
+----------+---------+--------------------+-------------+-------+---------+-------+--------------+------+
|tt32679650|tvEpisode|"It's pure TRASH!...|            0|   2024|     null|   null|          null|  null|
|tt33044617|tvEpisode|"Superobscuredom ...|            0|   2024|     null|   null|          null|  null|
|tt34458820|tvEpisode|"A Day In The Lif...|            1|   2024|     null|   null|          null|  null|
|tt34543329|tvEpisode|"Men-Oh-Pause\t"M...|            0|   2024|     null|   null|          null|  null|
+----------+---------+--------------------+-------------+-------+---------+-------+--------------+------+



                                                                                

In [9]:
values_to_filter = [0, 1]
df_filtered = filter_df_by_column_values(df, "isAdult", values_to_filter)

distinct_isAdult = get_counts_and_distinct_values(df_filtered, "isAdult")
distinct_isAdult.show()


                                                                                

+-------+--------+
|isAdult|   count|
+-------+--------+
|      1|  372479|
|      0|11157630|
+-------+--------+



## originalTitle and primaryTitle columns

In [10]:
values_to_filter = [0, 1]
df_filtered = filter_df_by_column_values(df_filtered, "originalTitle", values_to_filter, invert_filter=True)
df_filtered = filter_df_by_column_values(df_filtered, "primaryTitle", values_to_filter, invert_filter=True)

df_filtered.show()

+---------+---------+--------------------+--------------------+-------+----------+-------+--------------+--------------------+
|   tconst|titleType|        primaryTitle|       originalTitle|isAdult| startYear|endYear|runtimeMinutes|              genres|
+---------+---------+--------------------+--------------------+-------+----------+-------+--------------+--------------------+
|tt0000001|    short|          Carmencita|          Carmencita|      0|1894-01-01|   null|             1|   Documentary,Short|
|tt0000002|    short|Le clown et ses c...|Le clown et ses c...|      0|1892-01-01|   null|             5|     Animation,Short|
|tt0000003|    short|        Poor Pierrot|      Pauvre Pierrot|      0|1892-01-01|   null|             5|Animation,Comedy,...|
|tt0000004|    short|         Un bon bock|         Un bon bock|      0|1892-01-01|   null|            12|     Animation,Short|
|tt0000005|    short|    Blacksmith Scene|    Blacksmith Scene|      0|1893-01-01|   null|             1|      

## runtimeMinutes column

In [21]:
count_null_runtime = df.groupBy("titleType").agg(
    F.count(F.when(F.col("runtimeMinutes").isNull(), 1)).alias("null_runtime_count"),
    F.count("*").alias("total_count")
)

# Show the result
count_null_runtime.show()




+------------+------------------+-----------+
|   titleType|null_runtime_count|total_count|
+------------+------------------+-----------+
|    tvSeries|            173981|     278607|
|tvMiniSeries|             39629|      60474|
|     tvMovie|             47372|     150250|
|   tvEpisode|           6783335|    8869402|
|       movie|            262614|     709649|
|   tvSpecial|             26747|      51665|
|       video|             99366|     307759|
|   videoGame|             41873|      42319|
|     tvShort|              1321|      10591|
|       short|            376155|    1050227|
|     tvPilot|                 1|          1|
+------------+------------------+-----------+



                                                                                

## start and end year columns

In [22]:
from pyspark.sql import functions as F

min_max_years = df.select(
    F.min("startYear").alias("min_startYear"),
    F.max("startYear").alias("max_startYear"),
    F.min("endYear").alias("min_endYear"),
    F.max("endYear").alias("max_endYear")
)

min_max_years.show()




+-------------+-------------+-----------+-----------+
|min_startYear|max_startYear|min_endYear|max_endYear|
+-------------+-------------+-----------+-----------+
|   1874-01-01|   2031-01-01| 1928-01-01| 2030-01-01|
+-------------+-------------+-----------+-----------+



                                                                                

## genres column

In [11]:
df_filtered_genres_null = df.filter(F.col("genres").isNull())

# Show the result
df_filtered_genres_null.show()

+---------+---------+--------------------+--------------------+-------+----------+-------+--------------+------+
|   tconst|titleType|        primaryTitle|       originalTitle|isAdult| startYear|endYear|runtimeMinutes|genres|
+---------+---------+--------------------+--------------------+-------+----------+-------+--------------+------+
|tt0000502|    movie|            Bohemios|            Bohemios|      0|1905-01-01|   null|           100|  null|
|tt0000838|    movie|  A Cultura do Cacau|  A Cultura do Cacau|      0|1909-01-01|   null|          null|  null|
|tt0000842|    movie|De Garraf a Barce...|De Garraf a Barce...|      0|1909-01-01|   null|          null|  null|
|tt0000846|    movie|Un día en Xochimilco|Un día en Xochimilco|      0|1909-01-01|   null|          null|  null|
|tt0000850|    movie|    Los dos hermanos|    Los dos hermanos|      0|1909-01-01|   null|          null|  null|
|tt0000859|    movie|Fabricación del c...|Fabricación del c...|      0|1909-01-01|   null|      

In [13]:
unique_counts = df_filtered.select([countDistinct(c).alias(c) for c in df.columns])
print("Unique Values per Column:")
unique_counts.show()

Unique Values per Column:




+--------+---------+------------+-------------+-------+---------+-------+--------------+------+
|  tconst|titleType|primaryTitle|originalTitle|isAdult|startYear|endYear|runtimeMinutes|genres|
+--------+---------+------------+-------------+-------+---------+-------+--------------+------+
|11529855|       11|     5184136|      5209857|      2|      151|     95|           939|  2384|
+--------+---------+------------+-------------+-------+---------+-------+--------------+------+



                                                                                