In [36]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, countDistinct, isnan, when, split, size
from pyspark.sql.types import IntegerType, StringType
from pyspark.sql.functions import desc

In [2]:
spark = SparkSession.builder.getOrCreate()

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/03/19 19:01:34 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
df = spark.read.csv("../name.basics.tsv", sep="\t", header=True, inferSchema=True)
df.show()



+---------+-------------------+---------+---------+--------------------+--------------------+
|   nconst|        primaryName|birthYear|deathYear|   primaryProfession|      knownForTitles|
+---------+-------------------+---------+---------+--------------------+--------------------+
|nm0000001|       Fred Astaire|     1899|     1987|actor,miscellaneo...|tt0072308,tt00504...|
|nm0000002|      Lauren Bacall|     1924|     2014|actress,soundtrac...|tt0037382,tt00752...|
|nm0000003|    Brigitte Bardot|     1934|       \N|actress,music_dep...|tt0057345,tt00491...|
|nm0000004|       John Belushi|     1949|     1982|actor,writer,musi...|tt0072562,tt00779...|
|nm0000005|     Ingmar Bergman|     1918|     2007|writer,director,a...|tt0050986,tt00694...|
|nm0000006|     Ingrid Bergman|     1915|     1982|actress,producer,...|tt0034583,tt00381...|
|nm0000007|    Humphrey Bogart|     1899|     1957|actor,producer,mi...|tt0034583,tt00432...|
|nm0000008|      Marlon Brando|     1924|     2004|actor,dir

                                                                                

In [33]:
df.describe()

                                                                                

DataFrame[summary: string, nconst: string, primaryName: string, primaryProfession: string, knownForTitles: string]

In [25]:
df.count()

                                                                                

14267475

In [4]:
df = df.replace("\\N", None)
nan_counts = df.select([count(when(col(c).isNull() | isnan(col(c)), c)).alias(c) for c in df.columns])
print("NaN Counts per Column:")
nan_counts.show()

NaN Counts per Column:




+------+-----------+---------+---------+-----------------+--------------+
|nconst|primaryName|birthYear|deathYear|primaryProfession|knownForTitles|
+------+-----------+---------+---------+-----------------+--------------+
|     0|         63| 13626211| 14026006|          2795186|       1629078|
+------+-----------+---------+---------+-----------------+--------------+



                                                                                

In [26]:
missing_primary_name = df.filter(col("primaryName").isNull())
missing_primary_name.show(n=df.count(), truncate=False)

[Stage 29:>                                                       (0 + 11) / 11]

+----------+-----------+---------+---------+-----------------+--------------+
|nconst    |primaryName|birthYear|deathYear|primaryProfession|knownForTitles|
+----------+-----------+---------+---------+-----------------+--------------+
|nm10019610|null       |null     |null     |null             |null          |
|nm10040984|null       |null     |null     |null             |null          |
|nm10405838|null       |null     |null     |null             |null          |
|nm10730114|null       |null     |null     |null             |null          |
|nm10747123|null       |null     |null     |null             |null          |
|nm12177858|null       |null     |null     |null             |null          |
|nm12294468|null       |null     |null     |null             |null          |
|nm12364432|null       |null     |null     |null             |null          |
|nm12427523|null       |null     |null     |null             |null          |
|nm12428300|null       |null     |null     |null             |nu

                                                                                

In [29]:
df = df.dropna(subset=["primaryName"])
df = df.drop("birthYear", "deathYear")

In [30]:
df.count()

                                                                                

14267424

In [31]:
df.show()

+---------+-------------------+--------------------+--------------------+
|   nconst|        primaryName|   primaryProfession|      knownForTitles|
+---------+-------------------+--------------------+--------------------+
|nm0000001|       Fred Astaire|actor,miscellaneo...|tt0072308,tt00504...|
|nm0000002|      Lauren Bacall|actress,soundtrac...|tt0037382,tt00752...|
|nm0000003|    Brigitte Bardot|actress,music_dep...|tt0057345,tt00491...|
|nm0000004|       John Belushi|actor,writer,musi...|tt0072562,tt00779...|
|nm0000005|     Ingmar Bergman|writer,director,a...|tt0050986,tt00694...|
|nm0000006|     Ingrid Bergman|actress,producer,...|tt0034583,tt00381...|
|nm0000007|    Humphrey Bogart|actor,producer,mi...|tt0034583,tt00432...|
|nm0000008|      Marlon Brando|actor,director,wr...|tt0078788,tt00686...|
|nm0000009|     Richard Burton|actor,producer,di...|tt0061184,tt00878...|
|nm0000010|       James Cagney|actor,director,pr...|tt0029870,tt00318...|
|nm0000011|        Gary Cooper|actor,s

In [43]:
df_split_proffesion = df.withColumn("profession_count", size(split(col("primaryProfession"), ",")))
df_single_profession = df_split_proffesion.filter(col("profession_count") == 1).drop("profession_count")

In [40]:
df_split_proffesion.show()

+---------+-------------------+--------------------+--------------------+----------------+
|   nconst|        primaryName|   primaryProfession|      knownForTitles|profession_count|
+---------+-------------------+--------------------+--------------------+----------------+
|nm0000001|       Fred Astaire|actor,miscellaneo...|tt0072308,tt00504...|               3|
|nm0000002|      Lauren Bacall|actress,soundtrac...|tt0037382,tt00752...|               3|
|nm0000003|    Brigitte Bardot|actress,music_dep...|tt0057345,tt00491...|               3|
|nm0000004|       John Belushi|actor,writer,musi...|tt0072562,tt00779...|               3|
|nm0000005|     Ingmar Bergman|writer,director,a...|tt0050986,tt00694...|               3|
|nm0000006|     Ingrid Bergman|actress,producer,...|tt0034583,tt00381...|               3|
|nm0000007|    Humphrey Bogart|actor,producer,mi...|tt0034583,tt00432...|               3|
|nm0000008|      Marlon Brando|actor,director,wr...|tt0078788,tt00686...|               3|

In [44]:
df_single_profession.show()

+---------+-------------------+--------------------+--------------------+
|   nconst|        primaryName|   primaryProfession|      knownForTitles|
+---------+-------------------+--------------------+--------------------+
|nm0000089|       Richard Paul|    actor,soundtrack|tt0117318,tt00754...|
|nm0000109|     Yasmine Bleeth|actress,archive_f...|tt0131857,tt03378...|
|nm0000143|      Erika Eleniak|actress,archive_f...|tt0105690,tt00838...|
|nm0000253|       Robert Ellis|art_director,misc...|tt0010965,tt00140...|
|nm0000282|     Scott Bairstow|actor,archive_foo...|tt0119925,tt02830...|
|nm0000283|       Brenda Bakke|actress,archive_f...|tt0119488,tt01071...|
|nm0000314|    Charles Bronson|actor,archive_foo...|tt0071402,tt00641...|
|nm0000315|      Louise Brooks|actress,archive_f...|tt0018737,tt00173...|
|nm0000319|       Yancy Butler|    actress,producer|tt1650554,tt01070...|
|nm0000357|  Lolita Davidovich|actress,archive_f...|tt0329717,tt00969...|
|nm0000383|      Jennifer Ehle|actress

In [45]:
missing_primary_name = df.filter(col("knownForTitles").isNull())
missing_primary_name.show()

+---------+--------------------+--------------------+--------------+
|   nconst|         primaryName|   primaryProfession|knownForTitles|
+---------+--------------------+--------------------+--------------+
|nm0003936|Eivind Fredagsvik...|                null|          null|
|nm0006629| Christian Daugherty|camera_department...|          null|
|nm0007270|           James 52X|     archive_footage|          null|
|nm0007315|        Linus Aaberg|        art_director|          null|
|nm0007316|        Linus Aaberg|     special_effects|          null|
|nm0007356|        John Aalberg|sound_department,...|          null|
|nm0007364|      Joan Aalestrup|  make_up_department|          null|
|nm0007447|       Charles Aaron|               actor|          null|
|nm0007470|         Lindy Aaron|             actress|          null|
|nm0007474|     Nancy-Rae Aaron|             actress|          null|
|nm0007560|        Major Aaxton|              editor|          null|
|nm0007570|         Elena Ababy|  

In [46]:
df.filter(col("primaryProfession").isNull() & col("knownForTitles").isNull()).count()

                                                                                

1214003

In [47]:
df = df.dropna(subset=["primaryProfession", "knownForTitles"], how="all")

In [48]:
df.filter(col("primaryProfession").isNull() & col("knownForTitles").isNull()).count()

                                                                                

0

In [49]:
df.count()

                                                                                

13053421

In [50]:
df.show()

+---------+-------------------+--------------------+--------------------+
|   nconst|        primaryName|   primaryProfession|      knownForTitles|
+---------+-------------------+--------------------+--------------------+
|nm0000001|       Fred Astaire|actor,miscellaneo...|tt0072308,tt00504...|
|nm0000002|      Lauren Bacall|actress,soundtrac...|tt0037382,tt00752...|
|nm0000003|    Brigitte Bardot|actress,music_dep...|tt0057345,tt00491...|
|nm0000004|       John Belushi|actor,writer,musi...|tt0072562,tt00779...|
|nm0000005|     Ingmar Bergman|writer,director,a...|tt0050986,tt00694...|
|nm0000006|     Ingrid Bergman|actress,producer,...|tt0034583,tt00381...|
|nm0000007|    Humphrey Bogart|actor,producer,mi...|tt0034583,tt00432...|
|nm0000008|      Marlon Brando|actor,director,wr...|tt0078788,tt00686...|
|nm0000009|     Richard Burton|actor,producer,di...|tt0061184,tt00878...|
|nm0000010|       James Cagney|actor,director,pr...|tt0029870,tt00318...|
|nm0000011|        Gary Cooper|actor,s

In [None]:
import os
import shutil

output_path = "../output_folder"
output_file = "../name.basics_cleaned.tsv"

df.coalesce(1).write.option("header", True).option("sep", "\t").mode("overwrite").csv(output_path)

In [58]:
for file in os.listdir(output_path):
    print(file)
    if file.startswith("part-") and file.endswith(".csv"):
        shutil.move(os.path.join(output_path, file), output_file)
        break

shutil.rmtree(output_path)

.part-00000-cb169563-d1d4-44b6-942a-b5aed19b62bd-c000.csv.crc
._SUCCESS.crc
_SUCCESS
