# Manipulating Data in DataFrames

In [5]:
%config Completer.use_jedi = False

import os
import sys
import shutil

BASE_DIR = os.path.realpath(os.path.join(os.getcwd(), "..", ".."))

if not BASE_DIR in sys.path:
    sys.path.append(BASE_DIR)
    
from utils import extract_zip

DATASETS_PATH = "datasets/"

In [16]:
import pyspark

from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

spark = SparkSession.builder.appName("Manipulate").getOrCreate()
spark

In [7]:
data_file = extract_zip(
    zip_file=os.path.join(DATASETS_PATH, "youtubevideos.csv.zip"),
    member="youtubevideos.csv"
)

data_file

'/tmp/aaaaa-islp6bsa/youtubevideos.csv'

In [8]:
videos = spark.read.csv(data_file, inferSchema=True, header=True)
videos.printSchema()

root
 |-- video_id: string (nullable = true)
 |-- trending_date: string (nullable = true)
 |-- title: string (nullable = true)
 |-- channel_title: string (nullable = true)
 |-- category_id: string (nullable = true)
 |-- publish_time: string (nullable = true)
 |-- tags: string (nullable = true)
 |-- views: string (nullable = true)
 |-- likes: string (nullable = true)
 |-- dislikes: string (nullable = true)
 |-- comment_count: string (nullable = true)
 |-- thumbnail_link: string (nullable = true)
 |-- comments_disabled: string (nullable = true)
 |-- ratings_disabled: string (nullable = true)
 |-- video_error_or_removed: string (nullable = true)
 |-- description: string (nullable = true)



In [11]:
videos.limit(5).toPandas()

Unnamed: 0,video_id,trending_date,title,channel_title,category_id,publish_time,tags,views,likes,dislikes,comment_count,thumbnail_link,comments_disabled,ratings_disabled,video_error_or_removed,description
0,2kyS6SvSYSE,17.14.11,WE WANT TO TALK ABOUT OUR MARRIAGE,CaseyNeistat,22,2017-11-13T17:13:01.000Z,SHANtell martin,748374,57527,2966,15954,https://i.ytimg.com/vi/2kyS6SvSYSE/default.jpg,False,False,False,SHANTELL'S CHANNEL - https://www.youtube.com/s...
1,1ZAPwfrtAFY,17.14.11,The Trump Presidency: Last Week Tonight with J...,LastWeekTonight,24,2017-11-13T07:30:00.000Z,"""last week tonight trump presidency""|""last wee...",2418783,97185,6146,12703,https://i.ytimg.com/vi/1ZAPwfrtAFY/default.jpg,False,False,False,"One year after the presidential election, John..."
2,5qpjK5DgCt4,17.14.11,"Racist Superman | Rudy Mancuso, King Bach & Le...",Rudy Mancuso,23,2017-11-12T19:05:24.000Z,"""racist superman""|""rudy""|""mancuso""|""king""|""bac...",3191434,146033,5339,8181,https://i.ytimg.com/vi/5qpjK5DgCt4/default.jpg,False,False,False,WATCH MY PREVIOUS VIDEO ▶ \n\nSUBSCRIBE ► http...
3,puqaWrEC7tY,17.14.11,Nickelback Lyrics: Real or Fake?,Good Mythical Morning,24,2017-11-13T11:00:04.000Z,"""rhett and link""|""gmm""|""good mythical morning""...",343168,10172,666,2146,https://i.ytimg.com/vi/puqaWrEC7tY/default.jpg,False,False,False,Today we find out if Link is a Nickelback amat...
4,d380meD0W0M,17.14.11,I Dare You: GOING BALD!?,nigahiga,24,2017-11-12T18:01:41.000Z,"""ryan""|""higa""|""higatv""|""nigahiga""|""i dare you""...",2095731,132235,1989,17518,https://i.ytimg.com/vi/d380meD0W0M/default.jpg,False,False,False,I know it's been a while since we did this sho...


__Available types:__  

* ByteType
* ShortType
* IntegerType
* LongType
* FloatType
* DoubleType
* DecimalType
* StringType
* BinaryType
* BooleanType
* TimestampType
* DateType
* ArrayType
* MapType
* StructType

In [13]:
videos.printSchema()

root
 |-- video_id: string (nullable = true)
 |-- trending_date: string (nullable = true)
 |-- title: string (nullable = true)
 |-- channel_title: string (nullable = true)
 |-- category_id: string (nullable = true)
 |-- publish_time: string (nullable = true)
 |-- tags: string (nullable = true)
 |-- views: string (nullable = true)
 |-- likes: string (nullable = true)
 |-- dislikes: string (nullable = true)
 |-- comment_count: string (nullable = true)
 |-- thumbnail_link: string (nullable = true)
 |-- comments_disabled: string (nullable = true)
 |-- ratings_disabled: string (nullable = true)
 |-- video_error_or_removed: string (nullable = true)
 |-- description: string (nullable = true)



In [59]:
df = (
    videos
    .withColumn("views", videos["views"].cast(IntegerType()))
    .withColumn("likes", videos["likes"].cast(IntegerType()))
    .withColumn("dislikes", videos["dislikes"].cast(IntegerType()))
    .withColumn("trending_date", to_date(videos["trending_date"], "yy.dd.mm"))
    .withColumn("publish_time", to_timestamp(col("publish_time")))
)

In [46]:
df.printSchema()

root
 |-- video_id: string (nullable = true)
 |-- trending_date: date (nullable = true)
 |-- title: string (nullable = true)
 |-- channel_title: string (nullable = true)
 |-- category_id: string (nullable = true)
 |-- publish_time: timestamp (nullable = true)
 |-- tags: string (nullable = true)
 |-- views: integer (nullable = true)
 |-- likes: integer (nullable = true)
 |-- dislikes: integer (nullable = true)
 |-- comment_count: string (nullable = true)
 |-- thumbnail_link: string (nullable = true)
 |-- comments_disabled: string (nullable = true)
 |-- ratings_disabled: string (nullable = true)
 |-- video_error_or_removed: string (nullable = true)
 |-- description: string (nullable = true)



In [47]:
df.limit(5).toPandas()

Unnamed: 0,video_id,trending_date,title,channel_title,category_id,publish_time,tags,views,likes,dislikes,comment_count,thumbnail_link,comments_disabled,ratings_disabled,video_error_or_removed,description
0,2kyS6SvSYSE,2017-01-14,WE WANT TO TALK ABOUT OUR MARRIAGE,CaseyNeistat,22,2017-11-13 15:13:01,SHANtell martin,748374,57527,2966,15954,https://i.ytimg.com/vi/2kyS6SvSYSE/default.jpg,False,False,False,SHANTELL'S CHANNEL - https://www.youtube.com/s...
1,1ZAPwfrtAFY,2017-01-14,The Trump Presidency: Last Week Tonight with J...,LastWeekTonight,24,2017-11-13 05:30:00,"""last week tonight trump presidency""|""last wee...",2418783,97185,6146,12703,https://i.ytimg.com/vi/1ZAPwfrtAFY/default.jpg,False,False,False,"One year after the presidential election, John..."
2,5qpjK5DgCt4,2017-01-14,"Racist Superman | Rudy Mancuso, King Bach & Le...",Rudy Mancuso,23,2017-11-12 17:05:24,"""racist superman""|""rudy""|""mancuso""|""king""|""bac...",3191434,146033,5339,8181,https://i.ytimg.com/vi/5qpjK5DgCt4/default.jpg,False,False,False,WATCH MY PREVIOUS VIDEO ▶ \n\nSUBSCRIBE ► http...
3,puqaWrEC7tY,2017-01-14,Nickelback Lyrics: Real or Fake?,Good Mythical Morning,24,2017-11-13 09:00:04,"""rhett and link""|""gmm""|""good mythical morning""...",343168,10172,666,2146,https://i.ytimg.com/vi/puqaWrEC7tY/default.jpg,False,False,False,Today we find out if Link is a Nickelback amat...
4,d380meD0W0M,2017-01-14,I Dare You: GOING BALD!?,nigahiga,24,2017-11-12 16:01:41,"""ryan""|""higa""|""higatv""|""nigahiga""|""i dare you""...",2095731,132235,1989,17518,https://i.ytimg.com/vi/d380meD0W0M/default.jpg,False,False,False,I know it's been a while since we did this sho...


__Translate Function__

In [51]:
(
    videos
    .select(
        "publish_time",
        translate(col("publish_time"), "TZ", " ").alias("translated")
    )
    .show(truncate=False)
)

+------------------------+-----------------------+
|publish_time            |translated             |
+------------------------+-----------------------+
|2017-11-13T17:13:01.000Z|2017-11-13 17:13:01.000|
|2017-11-13T07:30:00.000Z|2017-11-13 07:30:00.000|
|2017-11-12T19:05:24.000Z|2017-11-12 19:05:24.000|
|2017-11-13T11:00:04.000Z|2017-11-13 11:00:04.000|
|2017-11-12T18:01:41.000Z|2017-11-12 18:01:41.000|
|2017-11-13T19:07:23.000Z|2017-11-13 19:07:23.000|
|2017-11-12T05:37:17.000Z|2017-11-12 05:37:17.000|
|2017-11-12T21:50:37.000Z|2017-11-12 21:50:37.000|
|2017-11-13T14:00:23.000Z|2017-11-13 14:00:23.000|
|2017-11-13T13:45:16.000Z|2017-11-13 13:45:16.000|
|2017-11-13T02:05:26.000Z|2017-11-13 02:05:26.000|
|2017-11-13T03:00:00.000Z|2017-11-13 03:00:00.000|
|2017-11-13T17:00:00.000Z|2017-11-13 17:00:00.000|
|2017-11-12T14:00:00.000Z|2017-11-12 14:00:00.000|
|2017-11-12T18:30:01.000Z|2017-11-12 18:30:01.000|
|2017-11-13T20:09:58.000Z|2017-11-13 20:09:58.000|
|2017-11-12T17:00:05.000Z|2017-

__Trim Function (RTrim + LTrim)__

In [60]:
df.select("title").show(4, False)

+--------------------------------------------------------------+
|title                                                         |
+--------------------------------------------------------------+
|WE WANT TO TALK ABOUT OUR MARRIAGE                            |
|The Trump Presidency: Last Week Tonight with John Oliver (HBO)|
|Racist Superman | Rudy Mancuso, King Bach & Lele Pons         |
|Nickelback Lyrics: Real or Fake?                              |
+--------------------------------------------------------------+
only showing top 4 rows



In [61]:
df = df.withColumn("title", trim(df.title))

In [62]:
df.select("title").show(4, False)

+--------------------------------------------------------------+
|title                                                         |
+--------------------------------------------------------------+
|WE WANT TO TALK ABOUT OUR MARRIAGE                            |
|The Trump Presidency: Last Week Tonight with John Oliver (HBO)|
|Racist Superman | Rudy Mancuso, King Bach & Lele Pons         |
|Nickelback Lyrics: Real or Fake?                              |
+--------------------------------------------------------------+
only showing top 4 rows



__Lower Function__

In [64]:
df.select("title", lower(df.title).alias("lower")).show(5, truncate=False)

+--------------------------------------------------------------+--------------------------------------------------------------+
|title                                                         |lower                                                         |
+--------------------------------------------------------------+--------------------------------------------------------------+
|WE WANT TO TALK ABOUT OUR MARRIAGE                            |we want to talk about our marriage                            |
|The Trump Presidency: Last Week Tonight with John Oliver (HBO)|the trump presidency: last week tonight with john oliver (hbo)|
|Racist Superman | Rudy Mancuso, King Bach & Lele Pons         |racist superman | rudy mancuso, king bach & lele pons         |
|Nickelback Lyrics: Real or Fake?                              |nickelback lyrics: real or fake?                              |
|I Dare You: GOING BALD!?                                      |i dare you: going bald!?                

__case / when__

In [76]:
# Option 1: when / otherwise
(
    df
    .select(
        "likes", 
        "dislikes",
        (
            when(df.likes > df.dislikes, "Good")
            .when(df.likes < df.dislikes, "Bad")
            .otherwise("Undertemined")
            .alias("Favorability")
        )
    )
    .where(df.likes > 0)
    .orderBy(df.likes)
    .show(truncate=True)
)

+-----+--------+------------+
|likes|dislikes|Favorability|
+-----+--------+------------+
|    1|       0|        Good|
|    1|       0|        Good|
|    1|       3|         Bad|
|    1|       0|        Good|
|    1|       0|        Good|
|    1|       1|Undertemined|
|    1|       0|        Good|
|    1|       0|        Good|
|    2|       2|Undertemined|
|    2|       0|        Good|
|    2|       0|        Good|
|    2|       0|        Good|
|    2|       0|        Good|
|    2|       4|         Bad|
|    2|       2|Undertemined|
|    2|      14|         Bad|
|    2|       2|Undertemined|
|    2|       4|         Bad|
|    2|       2|Undertemined|
|    2|       2|Undertemined|
+-----+--------+------------+
only showing top 20 rows



In [81]:
# Option 2: with expr

(
    df
    .select(
        "likes",
        "dislikes",
        expr(
            "CASE"
            "  WHEN likes > dislikes THEN 'Good'"
            "  WHEN likes < dislikes THEN 'Bad'"
            "  ELSE                       'Undertemined' "
            "END AS Favorability"
        )
        # You could also use alias() function here
    )
    .where(df.likes > 0)
    .orderBy(df.likes)
    .show()
)

+-----+--------+------------+
|likes|dislikes|Favorability|
+-----+--------+------------+
|    1|       0|        Good|
|    1|       0|        Good|
|    1|       0|        Good|
|    1|       3|         Bad|
|    1|       0|        Good|
|    1|       0|        Good|
|    1|       1|Undertemined|
|    1|       0|        Good|
|    2|       0|        Good|
|    2|      14|         Bad|
|    2|       0|        Good|
|    2|       3|         Bad|
|    2|       0|        Good|
|    2|       4|         Bad|
|    2|       0|        Good|
|    2|       0|        Good|
|    2|       0|        Good|
|    2|       4|         Bad|
|    2|       0|        Good|
|    2|      16|         Bad|
+-----+--------+------------+
only showing top 20 rows



In [82]:
# Option 2: with selectExpr

(
    df
    .selectExpr(
        "likes",
        "dislikes",
        "CASE"
        "  WHEN likes > dislikes THEN 'Good'"
        "  WHEN likes < dislikes THEN 'Bad'"
        "  ELSE                       'Undertemined' "
        "END AS Favorability"
    )
    .where(df.likes > 0)
    .orderBy(df.likes)
    .show()
)

+-----+--------+------------+
|likes|dislikes|Favorability|
+-----+--------+------------+
|    1|       0|        Good|
|    1|       0|        Good|
|    1|       1|Undertemined|
|    1|       0|        Good|
|    1|       3|         Bad|
|    1|       0|        Good|
|    1|       0|        Good|
|    1|       0|        Good|
|    2|       0|        Good|
|    2|       0|        Good|
|    2|       1|        Good|
|    2|       0|        Good|
|    2|       1|        Good|
|    2|       0|        Good|
|    2|       0|        Good|
|    2|       0|        Good|
|    2|       0|        Good|
|    2|       1|        Good|
|    2|       0|        Good|
|    2|       0|        Good|
+-----+--------+------------+
only showing top 20 rows



__Concatenate__

In [84]:
df.select(
    concat_ws(
        " -|- ", df.title, df.channel_title
    ).alias("concatenated")
).show(5, truncate=False)

+----------------------------------------------------------------------------------+
|concatenated                                                                      |
+----------------------------------------------------------------------------------+
|WE WANT TO TALK ABOUT OUR MARRIAGE -|- CaseyNeistat                               |
|The Trump Presidency: Last Week Tonight with John Oliver (HBO) -|- LastWeekTonight|
|Racist Superman | Rudy Mancuso, King Bach & Lele Pons -|- Rudy Mancuso            |
|Nickelback Lyrics: Real or Fake? -|- Good Mythical Morning                        |
|I Dare You: GOING BALD!? -|- nigahiga                                             |
+----------------------------------------------------------------------------------+
only showing top 5 rows



In [87]:
df.select(
    "trending_date", 
    year("trending_date"), 
    month("trending_date")
).show(5)

+-------------+-------------------+--------------------+
|trending_date|year(trending_date)|month(trending_date)|
+-------------+-------------------+--------------------+
|   2017-01-14|               2017|                   1|
|   2017-01-14|               2017|                   1|
|   2017-01-14|               2017|                   1|
|   2017-01-14|               2017|                   1|
|   2017-01-14|               2017|                   1|
+-------------+-------------------+--------------------+
only showing top 5 rows



In [91]:
(
    df.select(
        "trending_date", 
        "publish_time",
        datediff(
            df.publish_time,
            df.trending_date
        ) / 365
    )
    .show(5, truncate=False)
)

+-------------+-------------------+---------------------------------------------+
|trending_date|publish_time       |(datediff(publish_time, trending_date) / 365)|
+-------------+-------------------+---------------------------------------------+
|2017-01-14   |2017-11-13 15:13:01|0.8301369863013699                           |
|2017-01-14   |2017-11-13 05:30:00|0.8301369863013699                           |
|2017-01-14   |2017-11-12 17:05:24|0.8273972602739726                           |
|2017-01-14   |2017-11-13 09:00:04|0.8301369863013699                           |
|2017-01-14   |2017-11-12 16:01:41|0.8273972602739726                           |
+-------------+-------------------+---------------------------------------------+
only showing top 5 rows



In [94]:
array = df.select(
    "title", 
    split(df.title, " ").alias("splited")
)

array.show(5, truncate=False)

+--------------------------------------------------------------+-------------------------------------------------------------------------+
|title                                                         |splited                                                                  |
+--------------------------------------------------------------+-------------------------------------------------------------------------+
|WE WANT TO TALK ABOUT OUR MARRIAGE                            |[WE, WANT, TO, TALK, ABOUT, OUR, MARRIAGE]                               |
|The Trump Presidency: Last Week Tonight with John Oliver (HBO)|[The, Trump, Presidency:, Last, Week, Tonight, with, John, Oliver, (HBO)]|
|Racist Superman | Rudy Mancuso, King Bach & Lele Pons         |[Racist, Superman, |, Rudy, Mancuso,, King, Bach, &, Lele, Pons]         |
|Nickelback Lyrics: Real or Fake?                              |[Nickelback, Lyrics:, Real, or, Fake?]                                   |
|I Dare You: GOING BALD!?  

In [96]:
array.select(
    "title", array_contains(array.splited, "MARRIAGE")
).show(5, truncate=False)

+--------------------------------------------------------------+---------------------------------+
|title                                                         |array_contains(splited, MARRIAGE)|
+--------------------------------------------------------------+---------------------------------+
|WE WANT TO TALK ABOUT OUR MARRIAGE                            |true                             |
|The Trump Presidency: Last Week Tonight with John Oliver (HBO)|false                            |
|Racist Superman | Rudy Mancuso, King Bach & Lele Pons         |false                            |
|Nickelback Lyrics: Real or Fake?                              |false                            |
|I Dare You: GOING BALD!?                                      |false                            |
+--------------------------------------------------------------+---------------------------------+
only showing top 5 rows



In [98]:
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

In [100]:
def square(x):
    return int(x ** 2)

In [103]:
square_udf = udf(lambda v: square(v), IntegerType())

In [105]:
(
    df.select(
        "dislikes", square_udf("dislikes").alias("square")
    )
    .where(col("dislikes").isNotNull())
    .show(5)
)

+--------+--------+
|dislikes|  square|
+--------+--------+
|    2966| 8797156|
|    6146|37773316|
|    5339|28504921|
|     666|  443556|
|    1989| 3956121|
+--------+--------+
only showing top 5 rows

