In [105]:
import re
import pandas as pd
from requests import get
import nltk
from bs4 import BeautifulSoup 
import sys

In [106]:
tos_url = "http://chakoteya.net/StarTrek/episodes.htm"

response = get(tos_url)

if response.status_code >= 400:
    print("No good")
    sys.exit()

html = response.text

soup = BeautifulSoup(html)
soup

rows = list(soup.select("table tr"))

# Pop removes the first line so we can access the rest of the rows and season names
rows.pop(0).h1.text.split(" -")[0]

'The Star Trek Transcripts'

# Dataframe Format

- first pass: one show per row. `show, season, episode_name, production, airdate, transcript`
- second pass: break out each transcript into its own dataframe of dialogue, 

In [107]:
# show_name = response.url.split("/")[3]
episodes = []

for row in rows:
    output = {}

    # if "Season" in str(row) or "Animated" in str(row):    
    if ("Season" in str(row) and row.find("h2") != None) or "Animated" in str(row):

        output["season"] = row.h2.text.strip()
        continue
    
    # This skips the header and also skips the unnecessary nested tbody included in this HTML
    if "Episode Name" in str(row):
        continue
    
    output["episode_name"] = row.select("td")[0].a.text.strip()
    output["link"] = row.select("td")[0].a["href"]
    output["airdate"] = row.select("td")[2].text


    episodes.append(output)

In [108]:
df = pd.DataFrame(episodes)
df.episode_name = df.episode_name.str.replace(r"\s", " ", regex=True)

In [109]:
df["show_name"] = "Star Trek"
df["series_name"] = "The Original Series"

In [112]:
df.to_csv("original_series.csv", index=False)

In [113]:
df

Unnamed: 0,episode_name,link,airdate,show_name,series_name
0,The Cage,1.htm,unaired,Star Trek,The Original Series
1,The Man Trap,6.htm,"8\r\nSep, 1966",Star Trek,The Original Series
2,Charlie X,8.htm,"15\r\nSep, 1966",Star Trek,The Original Series
3,Where No Man Has Gone Before,2.htm,"22\r\nSep, 1966",Star Trek,The Original Series
4,The Naked Time,7.htm,"29\r\nSep, 1966",Star Trek,The Original Series
...,...,...,...,...,...
97,Bem,TAS018.htm,"14 Sep, 1974",Star Trek,The Original Series
98,The Practical Joker,TAS021.htm,"21 Sep, 1974",Star Trek,The Original Series
99,Albatross,TAS019.htm,"28 Sep, 1974",Star Trek,The Original Series
100,How Sharper Than A Serpent's Tooth,TAS022.htm,"5 Oct, 1974",Star Trek,The Original Series


## Next Generation Transcripts

In [114]:
url = "http://chakoteya.net/NextGen/episodes.htm"

response = get(url)

if response.status_code >= 400:
    print("No good")
    sys.exit()

html = response.text

soup = BeautifulSoup(html)
soup

rows = list(soup.select("table tr"))

# Pop removes the first line so we can access the rest of the rows and season names
rows.pop(0).h1.text.split(" -")[0]

'The Next Generation Transcripts'

In [115]:
episodes = []

for row in rows:
    output = {}
    
    # This skips the header and also skips the unnecessary nested tbody included in this HTML
    if "Episode Name" in str(row):
        continue
        
    if "Season" in str(row) and row.find("h2") != None:
        output["season"] = row.h2.text.strip()
        continue
    
    output["episode_name"] = row.select("td")[0].a.text.strip()
    output["link"] = row.select("td")[0].a["href"]
    output["airdate"] = row.select("td")[2].text

    episodes.append(output)

In [116]:
tng = pd.DataFrame(episodes)
tng["series_name"] = "The Next Generation"
tng["show_name"] = "Star Trek"

In [117]:
tng.to_csv("tng.csv", index=None)

In [118]:
tng

Unnamed: 0,episode_name,link,airdate,series_name,show_name
0,Encounter at Farpoint,101.htm,"28\r\nSept, 1987",The Next Generation,Star Trek
1,The Naked Now,103.htm,"5\r\nOct, 1987",The Next Generation,Star Trek
2,Code of Honour,104.htm,"12\r\nOct, 1987",The Next Generation,Star Trek
3,The Last Outpost,107.htm,"19\r\nOct, 1987",The Next Generation,Star Trek
4,Where No One Has Gone Before,106.htm,26\r\nOct 1987,The Next Generation,Star Trek
...,...,...,...,...,...
171,Firstborn,273.htm,"25 Apr, 1994",The Next Generation,Star Trek
172,Bloodlines,274.htm,"2 May, 1994",The Next Generation,Star Trek
173,Emergence,275.htm,"9 May, 1994",The Next Generation,Star Trek
174,Preemptive Strike,276.htm,"16 May, 1994",The Next Generation,Star Trek


## Deep Space 9

In [119]:
url = "http://chakoteya.net/DS9/episodes.htm"

response = get(url)

if response.status_code >= 400:
    print("No good")
    sys.exit()

html = response.text

soup = BeautifulSoup(html)
soup

rows = list(soup.select("table tr"))

# Pop removes the first line so we can access the rest of the rows and season names
rows.pop(0).h1.text.split(" -")[0]

'The Deep Space Nine Transcripts'

In [126]:
episodes = []

for row in rows:
    output = {}
    
    # This skips the header and also skips the unnecessary nested tbody included in this HTML
    if "Episode Name" in str(row):
        continue
        
    if "Season" in str(row) and row.find("h2") != None:
        output["season"] = row.h2.text.strip()
        continue
    
    if len(row.select("td")) > 1:
        output["episode_name"] = row.select("td")[0].a.text.strip()
        output["link"] = row.select("td")[0].a["href"]
        output["airdate"] = row.select("td")[2].text
        episodes.append(output)


In [127]:
rows[33].select("td")[2]

<td align="center" bgcolor="#eeeeee" valign="top" width="108"> <font face="Arial" size="2">22 Nov, 1993</font></td>

In [128]:
ds9 = pd.DataFrame(episodes)

In [129]:
ds9

Unnamed: 0,episode_name,link,airdate
0,Emissary,401.htm,"3 Jan, 1993"
1,Past Prologue,404.htm,"11 Jan, 1993"
2,A Man Alone,403.htm,"17 Jan, 1993"
3,Babel,405.htm,"25 Jan, 1993"
4,Captive Pursuit,406.htm,"1 Feb, 1993"
...,...,...,...
168,When It Rains...,571.htm,"3 May, 1999"
169,Tacking Into The Wind,572.htm,"10 May, 1999"
170,Extreme Measures,573.htm,"17 May, 1999"
171,The Dogs Of War,574.htm,"24 May, 1999"


In [131]:
ds9.to_csv("ds9.csv", index=None)

## Voyager

In [156]:
url = "http://chakoteya.net/Voyager/episode_listing.htm"

response = get(url)

if response.status_code >= 400:
    print("No good")
    sys.exit()

html = response.text

soup = BeautifulSoup(html)
soup

rows = list(soup.select("table tr"))

# Pop removes the first line so we can access the rest of the rows and season names
rows[0]

<tr>
<td colspan="3" width="100%">
<h1 align="center">The Voyager Transcripts - Episode Listings</h1>
<h2>
<center> Season One</center>
</h2>
<div align="center">
<table bgcolor="#d5d5d5" border="1" cellspacing="4" width="607">
<tbody>
<tr>
<td align="center" bgcolor="#006b9f" valign="top">
<font color="#d5d5d5" face="Arial" size="2"><b>Episode
Name</b></font></td>
<td align="center" bgcolor="#006b9f" valign="top">
<font color="#d5d5d5" face="Arial" size="2"><b>Production</b></font></td>
<td align="center" bgcolor="#006b9f" valign="top">
<font color="#d5d5d5" face="Arial" size="2"><b>Airdate</b></font></td>
</tr>
<tr>
<td align="center" bgcolor="#eeeeee" valign="middle">
<p align="center"> <font face="Arial" size="2"><a href="101.htm">Caretaker</a></font></p>
</td>
<td align="center" bgcolor="#eeeeee" valign="middle"> <font face="Arial" size="2">101
&amp; 102</font></td>
<td align="center" bgcolor="#eeeeee" valign="middle">
<p align="center"><font face="Arial" size="2">16 Jan 1995</fon

In [135]:
episodes = []

# This structure is a bit different since 
for row in rows[0]:
    output = {}
    
    # This skips the header and also skips the unnecessary nested tbody included in this HTML
    if "Episode Name" in str(row):
        continue
        
    if "Season" in str(row) and row.find("h2") != None:
        output["season"] = row.h2.text.strip()
        continue
    
    print(row)
    if len(row.select("td")) > 1:
        output["episode_name"] = row.select("td")[0].a.text.strip()
        output["link"] = row.select("td")[0].a["href"]
        output["airdate"] = row.select("td")[2].text
        episodes.append(output)


<tr>
<td align="center" bgcolor="#006b9f" valign="top">
<font color="#d5d5d5" face="Arial" size="2"><b>Episode
Name</b></font></td>
<td align="center" bgcolor="#006b9f" valign="top">
<font color="#d5d5d5" face="Arial" size="2"><b>Production</b></font></td>
<td align="center" bgcolor="#006b9f" valign="top">
<font color="#d5d5d5" face="Arial" size="2"><b>Airdate</b></font></td>
</tr>


AttributeError: 'NoneType' object has no attribute 'text'