In [1]:
import sqlite3
import numpy as np
import pandas as pd
import tensorflow as tf
import re

In [2]:
# Open the SQLite database file. `con` is reused by the later cells that load
# the `podcasts` and `categories` tables.
# NOTE(review): the connection is never closed in the visible cells.
con = sqlite3.connect("database.sqlite")
# Load every row of the `reviews` table into a pandas DataFrame.
reviews = pd.read_sql_query("SELECT * from reviews", con)

# Preview the first rows to confirm the query result landed in the DataFrame.
reviews.head()

Unnamed: 0,podcast_id,title,content,rating,author_id,created_at
0,c61aa81c9b929a66f0c1db6cbe5d8548,really interesting!,Thanks for providing these insights. Really e...,5,F7E5A318989779D,2018-04-24T12:05:16-07:00
1,c61aa81c9b929a66f0c1db6cbe5d8548,Must listen for anyone interested in the arts!!!,Super excited to see this podcast grow. So man...,5,F6BF5472689BD12,2018-05-09T18:14:32-07:00
2,ad4f2bf69c72b8db75978423c25f379e,nauseatingly left,"I'm a liberal myself, but its pretty obvious a...",1,1AB95B8E6E1309E,2019-06-11T14:53:39-07:00
3,ad4f2bf69c72b8db75978423c25f379e,Diverse stories,I find Tedx talks very inspirational but I oft...,5,11BB760AA5DEBD1,2018-05-31T13:08:09-07:00
4,ad4f2bf69c72b8db75978423c25f379e,👍👍👍👍,"I love this podcast, it is so good.",5,D86032C8E57D15A,2019-06-19T13:56:05-07:00


In [3]:
# (rows, columns) of the reviews table as loaded.
reviews.shape

(2004000, 6)

In [4]:
# Load the podcasts table. Its `title` column is renamed to `podcast_title`
# so it stays distinguishable from the review `title` column once the
# frames are merged on `podcast_id`.
podcasts = pd.read_sql_query("SELECT * from podcasts", con).rename(
    columns={'title': 'podcast_title'}
)

# Preview the first rows of the loaded (and renamed) frame.
podcasts.head()

Unnamed: 0,podcast_id,itunes_id,slug,itunes_url,podcast_title
0,a00018b54eb342567c94dacfb2a3e504,1313466221,scaling-global,https://podcasts.apple.com/us/podcast/scaling-...,Scaling Global
1,a00043d34e734b09246d17dc5d56f63c,158973461,cornerstone-baptist-church-of-orlando,https://podcasts.apple.com/us/podcast/cornerst...,Cornerstone Baptist Church of Orlando
2,a0004b1ef445af9dc84dad1e7821b1e3,139076942,mystery-dancing-in-the-dark,https://podcasts.apple.com/us/podcast/mystery-...,Mystery: Dancing in the Dark
3,a00071f9aaae9ac725c3a586701abf4d,1332508972,kts-money-matters,https://podcasts.apple.com/us/podcast/kts-mone...,KTs Money Matters
4,a000aa69852b276565c4f5eb9cdd999b,1342447811,speedway-soccer,https://podcasts.apple.com/us/podcast/speedway...,Speedway Soccer


In [5]:
# (rows, columns) of the podcasts table as loaded.
podcasts.shape

(109216, 5)

In [6]:
# Load every row of the `categories` table through the already-open connection.
# A podcast can appear on several rows, one per category (see the preview output).
categories = pd.read_sql_query("SELECT * from categories", con)

# Preview the first rows to confirm the query result landed in the DataFrame.
categories.head()

Unnamed: 0,podcast_id,category
0,c61aa81c9b929a66f0c1db6cbe5d8548,arts
1,c61aa81c9b929a66f0c1db6cbe5d8548,arts-performing-arts
2,c61aa81c9b929a66f0c1db6cbe5d8548,music
3,ad4f2bf69c72b8db75978423c25f379e,arts
4,ad4f2bf69c72b8db75978423c25f379e,arts-design


In [7]:
# (rows, columns) of the categories table as loaded, before any de-duplication.
categories.shape

(211243, 2)

In [8]:
def get_main_cat(line):
    """Return the main (first-word) category of a category slug.

    The main category is the first run of ASCII letters delimited by word
    boundaries, e.g. ``'arts-performing-arts'`` -> ``'arts'``.

    Parameters
    ----------
    line : str
        Category string such as ``'arts'`` or ``'arts-design'``.

    Returns
    -------
    str
        The first purely alphabetic word found in ``line``. When ``line``
        contains no such word (e.g. an empty string) it is returned unchanged.
    """
    # re.search stops at the first match instead of collecting every word.
    first_word = re.search(r'\b([a-zA-Z]+)\b', line)
    if first_word is None:
        # The previous `findall(...)[0]` raised IndexError here; keep the
        # value as-is so one odd row cannot abort a whole `.apply`.
        return line
    return first_word.group(1)

In [9]:
# Reduce every category slug to its main (first-word) category, then preview.
# The function is passed directly; no lambda wrapper is needed.
categories['category'] = categories['category'].apply(get_main_cat)
categories.head()

Unnamed: 0,podcast_id,category
0,c61aa81c9b929a66f0c1db6cbe5d8548,arts
1,c61aa81c9b929a66f0c1db6cbe5d8548,arts
2,c61aa81c9b929a66f0c1db6cbe5d8548,music
3,ad4f2bf69c72b8db75978423c25f379e,arts
4,ad4f2bf69c72b8db75978423c25f379e,arts


In [10]:
# Distinct values of the category column after the slugs were reduced to
# their first word.
categories['category'].unique()

array(['arts', 'music', 'education', 'society', 'leisure', 'technology',
       'fiction', 'true', 'health', 'history', 'comedy', 'tv', 'kids',
       'religion', 'business', 'news', 'spirituality', 'science',
       'christianity', 'government', 'sports', 'hinduism', 'judaism',
       'islam', 'buddhism'], dtype=object)

In [11]:
def replace_cat(line):
    """Fold selected main categories into broader labels.

    The whole word ``true`` becomes ``crime`` (presumably the first word of a
    ``true-crime`` slug; verify against the raw data), and each of
    ``christianity``, ``hinduism``, ``judaism``, ``islam``, ``buddhism`` and
    ``spirituality`` becomes ``religion``. Any other text is left untouched.

    Parameters
    ----------
    line : str
        Category string to normalise.

    Returns
    -------
    str
        ``line`` with the substitutions above applied, in that order.
    """
    # (pattern, replacement) pairs, applied sequentially.
    substitutions = (
        (r'\b(true)\b', 'crime'),
        (r'\b(christianity|hinduism|judaism|islam|buddhism|spirituality)\b', 'religion'),
    )
    normalised = line
    for pattern, replacement in substitutions:
        normalised = re.sub(pattern, replacement, normalised)
    return normalised

In [12]:
# Consolidate the main categories (true -> crime, faith-specific labels ->
# religion), then preview. The function is passed directly, without a lambda.
categories['category'] = categories['category'].apply(replace_cat)
categories.head()

Unnamed: 0,podcast_id,category
0,c61aa81c9b929a66f0c1db6cbe5d8548,arts
1,c61aa81c9b929a66f0c1db6cbe5d8548,arts
2,c61aa81c9b929a66f0c1db6cbe5d8548,music
3,ad4f2bf69c72b8db75978423c25f379e,arts
4,ad4f2bf69c72b8db75978423c25f379e,arts


In [13]:
# Keep a single category row per podcast: the first one in table order.
# Rebinding the name (rather than inplace=True) keeps the step explicit.
categories = categories.drop_duplicates(subset='podcast_id', keep="first")
categories.shape

(109216, 2)

In [14]:
# Attach podcast metadata, then the single main category, to every review.
# Both joins are inner joins on `podcast_id`, so reviews without a matching
# podcast or category row are dropped.
df = (
    reviews
    .merge(podcasts, on=['podcast_id'], how='inner')
    .merge(categories, on=['podcast_id'], how='inner')
)
df.shape

(1979519, 11)

In [15]:
# Display the merged frame; the rendering below is truncated to the first
# and last rows.
df

Unnamed: 0,podcast_id,title,content,rating,author_id,created_at,itunes_id,slug,itunes_url,podcast_title,category
0,c61aa81c9b929a66f0c1db6cbe5d8548,really interesting!,Thanks for providing these insights. Really e...,5,F7E5A318989779D,2018-04-24T12:05:16-07:00,1373261997,backstage-at-tilles-center,https://podcasts.apple.com/us/podcast/backstag...,Backstage at Tilles Center,arts
1,c61aa81c9b929a66f0c1db6cbe5d8548,Must listen for anyone interested in the arts!!!,Super excited to see this podcast grow. So man...,5,F6BF5472689BD12,2018-05-09T18:14:32-07:00,1373261997,backstage-at-tilles-center,https://podcasts.apple.com/us/podcast/backstag...,Backstage at Tilles Center,arts
2,ad4f2bf69c72b8db75978423c25f379e,nauseatingly left,"I'm a liberal myself, but its pretty obvious a...",1,1AB95B8E6E1309E,2019-06-11T14:53:39-07:00,160904630,ted-talks-daily,https://podcasts.apple.com/us/podcast/ted-talk...,TED Talks Daily,arts
3,ad4f2bf69c72b8db75978423c25f379e,Diverse stories,I find Tedx talks very inspirational but I oft...,5,11BB760AA5DEBD1,2018-05-31T13:08:09-07:00,160904630,ted-talks-daily,https://podcasts.apple.com/us/podcast/ted-talk...,TED Talks Daily,arts
4,ad4f2bf69c72b8db75978423c25f379e,👍👍👍👍,"I love this podcast, it is so good.",5,D86032C8E57D15A,2019-06-19T13:56:05-07:00,160904630,ted-talks-daily,https://podcasts.apple.com/us/podcast/ted-talk...,TED Talks Daily,arts
...,...,...,...,...,...,...,...,...,...,...,...
1979514,deeee82605b10fd53225121291fb533a,"Great Coffee, People, and Podcast","On top of incredible coffee, these guys are pu...",5,6068D1F74D4CC1F,2021-09-02T14:13:40-07:00,1572279346,the-backwoods-grind-podcast,https://podcasts.apple.com/us/podcast/the-back...,The Backwoods Grind Podcast,sports
1979515,deeee82605b10fd53225121291fb533a,The first of many,Great job guys! Hope you see great success.,5,0179E9398E85CA1,2021-06-15T20:30:23-07:00,1572279346,the-backwoods-grind-podcast,https://podcasts.apple.com/us/podcast/the-back...,The Backwoods Grind Podcast,sports
1979516,ee05f560fe1defe37b673f67c020b1bb,love the podcast,just wish it was a little longer like 60 mins ...,5,8585EA0C5A446EC,2021-01-13T21:25:21-07:00,1477693877,the-curbside-podcast,https://podcasts.apple.com/us/podcast/the-curb...,The Curbside Podcast,leisure
1979517,fb98841a677ab948b1bc90900ba8d0d8,Just Great!,This is probably a weekly highlight for me! Lo...,5,CF99C44A93FC389,2020-09-24T07:15:57-07:00,1529801099,tlg-roundtable,https://podcasts.apple.com/us/podcast/tlg-roun...,TLG Roundtable,sports


In [16]:
# Distinct categories present in the merged frame.
df['category'].unique()

array(['arts', 'society', 'fiction', 'history', 'comedy', 'music',
       'leisure', 'education', 'kids', 'religion', 'business', 'news',
       'tv', 'health', 'sports', 'science', 'technology', 'government',
       'crime'], dtype=object)

In [18]:
# Persist the full merged frame to CSV in the working directory.
# NOTE(review): to_csv writes the row index as an extra leading, unnamed
# column by default; readers of this file need index_col=0 (or will see an
# "Unnamed: 0" column). Confirm downstream usage before switching to index=False.
df.to_csv('podcast_final.csv')

In [23]:
# Draw 200,000 merged reviews at random (without replacement) and renumber
# the rows from 0. random_state pins the draw so that Restart & Run All
# reproduces the same sample and the same podcast_sample.csv.
df_sample = df.sample(200000, random_state=42).reset_index(drop = True)

In [24]:
# Display the sampled frame; the rendering below is truncated to the first
# and last rows.
df_sample

Unnamed: 0,podcast_id,title,content,rating,author_id,created_at,itunes_id,slug,itunes_url,podcast_title,category
0,eb8be8dd261e2c7ad359fc1100e00d36,Wow,I’m just speechless...Listing to this podcast ...,5,354021BBAD70826,2021-05-26T08:36:35-07:00,1492139540,effective-compassion,https://podcasts.apple.com/us/podcast/effectiv...,Effective Compassion,news
1,ff4c45ee931ee137e635bf25cf812603,Erin A Titus,Total Irish American from my da. Love it,5,EEBAB0E98722505,2022-05-01T19:30:27-07:00,363368392,irish-history-podcast,https://podcasts.apple.com/us/podcast/irish-hi...,Irish History Podcast,history
2,e87cebeb33762a0fd09345bf468f6494,5 stars not for youtubes,5 stARS FOR my Boi EaThilaN kLienybOI\nRead yo...,5,2EC77A5CC218BC8,2019-07-31T13:17:16-07:00,1186098620,h3-podcast,https://podcasts.apple.com/us/podcast/h3-podca...,H3 Podcast,comedy
3,e0b112db2630c309e68c76259d6f3dd8,Wonderfully funny!,Definitely a great podcast to listen to!,5,B871176FE7993B1,2020-08-05T18:45:46-07:00,1522764294,disc-only,https://podcasts.apple.com/us/podcast/disc-onl...,Disc Only,comedy
4,ac6835b75edb8475163934ae40ea9f50,Holy F***ing S*** !,This podcast is funny. I like listening to it ...,5,C9A897B4EDAA221,2020-10-28T14:20:20-07:00,1504297576,up-close-and-personal,https://podcasts.apple.com/us/podcast/up-close...,Up Close and Personal,comedy
...,...,...,...,...,...,...,...,...,...,...,...
199995,e7be0b6e726ac3f4cb1545bdf48830e8,Create your own luck,The interview is excellent and really helps yo...,5,328368602D3E5A4,2019-12-17T18:24:48-07:00,1487739752,she-finds-joy,https://podcasts.apple.com/us/podcast/she-find...,She Finds Joy,business
199996,e18101f446800acac315bc8c9cc088fd,Like hanging out with a friend!,Listening to this podcast is like catching up ...,5,C622C2BB8366912,2022-06-22T09:51:25-07:00,1586672755,beck-and-call,https://podcasts.apple.com/us/podcast/beck-and...,Beck and Call,society
199997,eff0d55df3b0d1ec61b6fb87faac2398,Keep them coming!,They do their research and keep listeners enga...,5,92890FC763BB734,2019-02-21T07:25:15-07:00,1451109634,throughline,https://podcasts.apple.com/us/podcast/throughl...,Throughline,history
199998,d283aab5fd78b2608a87094381eb7b40,Erwin little,Great interview. Down to earth conversation. N...,5,7F421CDDAFDA7D9,2019-04-29T17:16:10-07:00,1460623879,tmi-with-teresa,https://podcasts.apple.com/us/podcast/tmi-with...,TMI with Teresa,society


In [25]:
# Persist the sampled frame to CSV in the working directory.
# NOTE(review): as with the full export, the row index is written as an
# extra leading, unnamed column by default.
df_sample.to_csv('podcast_sample.csv')