# TV-Shows data anaytics

In [0]:
from pyspark.sql.functions import explode, col, desc

In [0]:
sqlContext

Out[269]: <pyspark.sql.context.SQLContext at 0x7f1c6916de50>

In [0]:
file_path = "/FileStore/tables/tv_shows.json"
data = sqlContext.read.json(file_path)
data.take(3)


Out[270]: [Row(_links=Row(nextepisode=None, previousepisode=Row(href='https://api.tvmaze.com/episodes/1051658'), self=Row(href='https://api.tvmaze.com/shows/250')), averageRuntime=30, dvdCountry=None, ended='2017-02-02', externals=Row(imdb='tt3544772', thetvdb=278449, tvrage=37394), genres=['Comedy'], id=250, image=Row(medium='https://static.tvmaze.com/uploads/images/medium_portrait/1/4600.jpg', original='https://static.tvmaze.com/uploads/images/original_untouched/1/4600.jpg'), language='English', name='Kirby Buckets', network=Row(country=Row(code='US', name='United States', timezone='America/New_York'), id=25, name='Disney XD'), officialSite='http://disneyxd.disney.com/kirby-buckets', premiered='2014-10-20', rating=Row(average=None), runtime=30, schedule=Row(days=['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday'], time='07:00'), status='Ended', summary="<p>The single-camera series that mixes live-action and animation stars Jacob Bertrand as the title character. <b>Kirby Buckets<

In [0]:
data.dtypes

Out[271]: [('_links',
  'struct<nextepisode:struct<href:string>,previousepisode:struct<href:string>,self:struct<href:string>>'),
 ('averageRuntime', 'bigint'),
 ('dvdCountry', 'struct<code:string,name:string,timezone:string>'),
 ('ended', 'string'),
 ('externals', 'struct<imdb:string,thetvdb:bigint,tvrage:bigint>'),
 ('genres', 'array<string>'),
 ('id', 'bigint'),
 ('image', 'struct<medium:string,original:string>'),
 ('language', 'string'),
 ('name', 'string'),
 ('network',
  'struct<country:struct<code:string,name:string,timezone:string>,id:bigint,name:string>'),
 ('officialSite', 'string'),
 ('premiered', 'string'),
 ('rating', 'struct<average:double>'),
 ('runtime', 'bigint'),
 ('schedule', 'struct<days:array<string>,time:string>'),
 ('status', 'string'),
 ('summary', 'string'),
 ('type', 'string'),
 ('updated', 'bigint'),
 ('url', 'string'),
 ('webChannel',
  'struct<country:struct<code:string,name:string,timezone:string>,id:bigint,name:string>'),
 ('weight', 'bigint')]

In [0]:
data.printSchema

Out[272]: <bound method DataFrame.printSchema of DataFrame[_links: struct<nextepisode:struct<href:string>,previousepisode:struct<href:string>,self:struct<href:string>>, averageRuntime: bigint, dvdCountry: struct<code:string,name:string,timezone:string>, ended: string, externals: struct<imdb:string,thetvdb:bigint,tvrage:bigint>, genres: array<string>, id: bigint, image: struct<medium:string,original:string>, language: string, name: string, network: struct<country:struct<code:string,name:string,timezone:string>,id:bigint,name:string>, officialSite: string, premiered: string, rating: struct<average:double>, runtime: bigint, schedule: struct<days:array<string>,time:string>, status: string, summary: string, type: string, updated: bigint, url: string, webChannel: struct<country:struct<code:string,name:string,timezone:string>,id:bigint,name:string>, weight: bigint]>

In [0]:
display(data.select("name").limit(10))

name
Kirby Buckets
Downton Abbey
Girl Meets World
Hell's Kitchen
World Series of Poker
Anthony Bourdain: Parts Unknown
Comic Book Men
Key & Peele
Glue
Southern Justice


#### Count number of shows in each language

In [0]:
# Filter out rows where 'language' column is not null
filtered_data = data.filter(col('language').isNotNull())

# Group by 'language' column and count occurrences, then sort in descending order and limit to 7 rows
sorted_data = filtered_data.groupBy('language').count().orderBy(desc('count')).limit(7)

# Display the sorted and limited data
display(sorted_data)

language,count
English,32728
Japanese,3998
Russian,3986
Korean,2457
Dutch,1589
Chinese,1443
French,1288


In [0]:
data.createOrReplaceTempView("tv_shows")

In [0]:
genres_data = spark.sql("""
          SELECT DISTINCT genres
          FROM tv_shows
         """)

In [0]:
# Assuming `genres_data` is your DataFrame containing the column of arrays
distinct_genres = genres_data.select(explode("genres").alias("genre")).distinct()

# Show the distinct genres
distinct_genres.show()


+---------------+
|          genre|
+---------------+
|          Crime|
|        Romance|
|       Thriller|
|           Food|
|      Adventure|
|          Drama|
|         Travel|
|Science-Fiction|
|         Family|
|        Fantasy|
|          Legal|
|        History|
|        Mystery|
|   Supernatural|
|          Anime|
|      Espionage|
|         Horror|
|        Medical|
|         Comedy|
|       Children|
+---------------+
only showing top 20 rows

