In [19]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as f

In [11]:
# Obtain the Spark entry point: getOrCreate() hands back the session that is
# already active in this kernel, or builds a fresh one with default settings.
spark = (
    SparkSession
    .builder
    .getOrCreate()
)

In [38]:
# Load the raw CSV. `header` takes the column names from the first line of the
# file and `inferSchema` asks Spark to derive column types from the data
# instead of reading everything as strings.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("train.csv")
)

In [40]:
# Expose the DataFrame to Spark SQL under the view name "titanic" so that the
# spark.sql(...) cells can query it; an existing view of that name is replaced.
df.createOrReplaceTempView(name="titanic")

In [None]:
# Size of the raw frame. The tuple is ordered (column count, row count), i.e.
# the reverse of the (rows, columns) order used by pandas' `.shape`.
raw_n_cols = len(df.columns)
raw_n_rows = df.count()  # count() is an action and runs a Spark job
shape = (raw_n_cols, raw_n_rows)
print(shape)

(12, 891)


In [42]:
# Same row count as df.count(), obtained through the "titanic" temp view.
row_count_query = """
    SELECT COUNT(*) FROM titanic
    """
spark.sql(row_count_query).show()

+--------+
|count(1)|
+--------+
|     891|
+--------+



In [13]:
# Print the column names, inferred types and nullability of the raw frame.
df.printSchema()

root
 |-- PassengerId: integer (nullable = true)
 |-- Survived: integer (nullable = true)
 |-- Pclass: integer (nullable = true)
 |-- Name: string (nullable = true)
 |-- Sex: string (nullable = true)
 |-- Age: double (nullable = true)
 |-- SibSp: integer (nullable = true)
 |-- Parch: integer (nullable = true)
 |-- Ticket: string (nullable = true)
 |-- Fare: double (nullable = true)
 |-- Cabin: string (nullable = true)
 |-- Embarked: string (nullable = true)



In [16]:
# Keep only the rows that have no null in any column (default how="any").
# The raw frame `df` is left untouched; the result is a new DataFrame.
cleaned_df = df.dropna()

In [17]:
# Size of the frame after dropping incomplete rows, ordered
# (column count, row count) to match the `shape` tuple of the raw frame.
cleaned_n_cols = len(cleaned_df.columns)
cleaned_n_rows = cleaned_df.count()  # count() is an action and runs a Spark job
shape_of_cleaneddf = (cleaned_n_cols, cleaned_n_rows)
print(shape_of_cleaneddf)

(12, 183)


In [None]:
# Number of null values per column, computed with the DataFrame API.
# when(...) without otherwise() yields null for rows where the column is NOT
# null, and count() ignores nulls, so each aggregate counts exactly the rows
# where the column is null. The value handed to when() (the column name as a
# literal) is irrelevant; it only has to be non-null.
null_count_exprs = [
    f.count(f.when(f.col(column_name).isNull(), column_name)).alias(column_name)
    for column_name in df.columns
]
df.select(null_count_exprs).show()

+-----------+--------+------+----+---+---+-----+-----+------+----+-----+--------+
|PassengerId|Survived|Pclass|Name|Sex|Age|SibSp|Parch|Ticket|Fare|Cabin|Embarked|
+-----------+--------+------+----+---+---+-----+-----+------+----+-----+--------+
|          0|       0|     0|   0|  0|177|    0|    0|     0|   0|  687|       2|
+-----------+--------+------+----+---+---+-----+-----+------+----+-----+--------+



In [None]:
# Number of null values per column, computed with Spark SQL against the
# "titanic" temp view. Each SUM adds 1 for every row where the column is null.
# The column list is written out by hand, so it has to be kept in line with
# the schema of the view.
null_counts_query = """
    SELECT 
        SUM(CASE WHEN PassengerId IS NULL THEN 1 ELSE 0 END) AS PassengerId,
        SUM(CASE WHEN Survived IS NULL THEN 1 ELSE 0 END) AS Survived,
        SUM(CASE WHEN Pclass IS NULL THEN 1 ELSE 0 END) AS Pclass,
        SUM(CASE WHEN Name IS NULL THEN 1 ELSE 0 END) AS Name,
        SUM(CASE WHEN Sex IS NULL THEN 1 ELSE 0 END) AS Sex,
        SUM(CASE WHEN Age IS NULL THEN 1 ELSE 0 END) AS Age,
        SUM(CASE WHEN SibSp IS NULL THEN 1 ELSE 0 END) AS SibSp,
        SUM(CASE WHEN Parch IS NULL THEN 1 ELSE 0 END) AS Parch,
        SUM(CASE WHEN Ticket IS NULL THEN 1 ELSE 0 END) AS Ticket,
        SUM(CASE WHEN Fare IS NULL THEN 1 ELSE 0 END) AS Fare,
        SUM(CASE WHEN Cabin IS NULL THEN 1 ELSE 0 END) AS Cabin,
        SUM(CASE WHEN Embarked IS NULL THEN 1 ELSE 0 END) AS Embarked
    FROM titanic
    """
spark.sql(null_counts_query).show()

+-----------+--------+------+----+---+---+-----+-----+------+----+-----+--------+
|PassengerId|Survived|Pclass|Name|Sex|Age|SibSp|Parch|Ticket|Fare|Cabin|Embarked|
+-----------+--------+------+----+---+---+-----+-----+------+----+-----+--------+
|          0|       0|     0|   0|  0|177|    0|    0|     0|   0|  687|       2|
+-----------+--------+------+----+---+---+-----+-----+------+----+-----+--------+

