In [78]:
import pandas as pd
from datetime import datetime
import re

In [93]:
def open_file(filename):
    """Read a script file and return its contents as a list of lines.

    The file is first read with the platform's default text encoding. If the
    bytes cannot be decoded that way, the file is re-read as Latin-1, which
    maps every possible byte to a character and therefore cannot fail to
    decode.

    Args:
        filename: Path of the text file to read.

    Returns:
        list[str]: The file's text split on newline characters. A file that
        ends with a newline yields a trailing empty string.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
    """
    try:
        # `with` guarantees the handle is closed even if read() raises.
        with open(filename, "r") as f:
            text = f.read()
    except UnicodeDecodeError:
        # Only a decoding failure justifies the Latin-1 retry; other errors
        # (missing file, permissions) should surface to the caller unchanged.
        with open(filename, "r", encoding='latin-1') as f:
            text = f.read()
    return text.split('\n')


def clean_name(name: str) -> str:
    """Reduce a raw character cue to a bare character name.

    Known delivery annotations (voice-over, off-screen, intercom, ...) are
    deleted wherever they occur, any text enclosed in parentheses or square
    brackets is removed, and surrounding whitespace is trimmed.

    Args:
        name: The cue text as it appears in the script.

    Returns:
        The cleaned name with no leading or trailing whitespace.
    """
    # Order matters: "'S COM VOICE" must be removed before the shorter "'S"
    # and "VOICE" fragments it contains.
    # NOTE: these are plain substring removals, so a name that itself contains
    # one of the fragments (e.g. "'S" inside a surname) is altered too.
    char_add_ons = ['V.O.', "'S COM VOICE",
                    'Cont\'d', 'O.S.', "'S", "VOICE", "INTERCOM"]
    for txt in char_add_ons:
        name = name.replace(txt, '')
    # Strip *after* removing bracketed remarks; stripping first leaves the
    # whitespace that preceded a trailing "(...)" attached to the name.
    return re.sub(r"[\(\[].*?[\)\]]", "", name).strip()


def get_title(lines):
    """Return the episode title located two lines below the series banner.

    The first line containing either series banner is located; if no line
    contains one, position 0 is used instead. The line two positions further
    down is converted to a string, trimmed, and has its double quotes removed.

    Args:
        lines: The script as a list of lines.

    Returns:
        str: The title text without surrounding whitespace or double quotes.
    """
    banners = ("STAR TREK: THE NEXT GENERATION", 'STAR TREK: DEEP SPACE NINE')
    banner_at = next(
        (pos for pos, text in enumerate(lines)
         if any(banner in text for banner in banners)),
        0,
    )
    title_line = str(lines[banner_at + 2])
    return title_line.strip().replace('"', '')


def get_date(lines):
    """Parse the draft date found two lines below the "FINAL DRAFT" marker.

    The first line containing "FINAL DRAFT" is located; if none does, position
    0 is used. The line two positions further down is trimmed and parsed with
    the format '%B %d, %Y' (e.g. "March 16, 1995").

    Args:
        lines: The script as a list of lines.

    Returns:
        datetime on success. If the expected line does not exist or cannot be
        parsed, the string sentinel "AHHH" is returned (kept for compatibility
        with existing callers); an unparseable line is printed first so it can
        be inspected.
    """
    # Use a single index variable with a defined default, so a script without
    # the marker no longer triggers an unbound-variable error.
    draft_ind = 0
    for i, line in enumerate(lines):
        if "FINAL DRAFT" in line:
            draft_ind = i
            break

    try:
        raw_date = str(lines[draft_ind + 2])
    except IndexError:
        # Script is too short to contain a date line at the expected offset.
        return "AHHH"

    try:
        return datetime.strptime(raw_date.strip(), '%B %d, %Y')
    except ValueError:
        # Show the offending line to help diagnose unexpected date formats.
        print(raw_date)
        return "AHHH"



In [80]:
def get_quotes(lines):
    """Extract (character, quote, scene) rows from a script.

    A line containing five consecutive tab characters is treated as a
    character cue, unless it contains "FADE OUT". The lines that follow, up to
    the next empty line or the end of the script, are concatenated into the
    quote: tabs become spaces and any text enclosed in parentheses or square
    brackets on a single line is removed. Each row is tagged with the most
    recent earlier line containing 'INT.' or 'EXT.'.

    Args:
        lines: The script as a list of lines.

    Returns:
        pd.DataFrame with columns 'character', 'quote' and 'scene', one row
        per cue in script order. A cue that appears before any scene heading
        gets an empty string as its scene.
    """
    cue_marker = '\t\t\t\t\t'
    total = len(lines)
    data = []
    # Latest scene heading seen strictly before the line being examined.
    current_scene = ''

    for i, line in enumerate(lines):
        if cue_marker in line:
            name = line.replace(cue_marker, '')
            if "FADE OUT" not in name:
                quote = ""
                j = i + 1
                # Bounds check: the final speech may not be followed by a
                # blank line, so stop at the end of the script as well.
                while j < total and lines[j] != '':
                    spoken = lines[j].replace('\t', ' ')
                    quote += re.sub(r"[\(\[].*?[\)\]]", "", spoken)
                    j += 1
                data.append([name, quote, current_scene])

        # Updated after the cue check so a line never counts as its own scene.
        if 'INT.' in line or 'EXT.' in line:
            current_scene = line

    return pd.DataFrame(data, columns=['character', 'quote', 'scene'])

def clean_location(loc:str):
    """Extract the bare location name from a scene-heading line.

    Everything up to and including an 'INT.' / 'EXT.' marker is dropped, along
    with the whitespace that follows it. The remainder is then cut at the
    first '(' if there is one, otherwise at the first '-', and trailing
    whitespace before the cut is removed.

    Args:
        loc: A scene-heading line, e.g. "57 INT. KIRA'S QUARTERS - LATER".

    Returns:
        str: The location text. Input without any marker or delimiter is
        returned unchanged.

    Note:
        When no '(' is present the text is cut at the first hyphen, so a
        location name that itself contains a hyphen is shortened.
    """
    output = loc
    for v in ('INT.', 'EXT.'):
        if v in output:
            # Skip the marker by its real length and trim whitespace, rather
            # than assuming exactly one separator character follows it.
            output = output[output.index(v) + len(v):].lstrip()

    # '(' takes precedence over '-', and only one cut is applied.
    for delimiter in ('(', '-'):
        if delimiter in output:
            # Cut at the delimiter and trim; blindly dropping one extra
            # character would eat a real letter when no space precedes it.
            return output[:output.index(delimiter)].rstrip()
    return output

def get_view(loc:str):
    """Report which scene-heading marker occurs in ``loc``.

    Args:
        loc: A scene-heading line.

    Returns:
        'EXT.' if that marker occurs anywhere in ``loc``; otherwise 'INT.' if
        that marker occurs; otherwise None.
    """
    # 'EXT.' is tested first because it wins when both markers are present.
    for marker in ('EXT.', 'INT.'):
        if marker in loc:
            return marker
    return None


def create_df(filepath: str) -> pd.DataFrame:
    """Build the per-line dialogue table for one script file.

    The file is read with open_file and parsed with get_quotes. The resulting
    frame is then extended with:

    * 'location' - clean_location applied to each 'scene' value
    * 'view'     - get_view applied to each 'scene' value
    * 'character' (replaced) - clean_name applied to each raw cue
    * 'episode'  - the value of get_title, repeated on every row
    * 'date'     - the value of get_date, repeated on every row

    Args:
        filepath: Path of the script text file.

    Returns:
        pd.DataFrame with columns character, quote, scene, location, view,
        episode, date.
    """
    script_lines = open_file(filepath)
    quotes = get_quotes(script_lines)

    locations = quotes['scene'].apply(clean_location)
    views = quotes['scene'].apply(get_view)
    names = quotes['character'].apply(clean_name)
    title = get_title(script_lines)
    draft_date = get_date(script_lines)

    return quotes.assign(
        location=locations,
        view=views,
        character=names,
        episode=title,
        date=draft_date,
    )


In [94]:
# Build the dialogue table for a single script file and display it as the
# cell's output.
x=create_df('scripts_ds9/470.txt')
x

Unnamed: 0,character,quote,scene,location,view,episode,date
0,SISKO,"Got you this time, Chief.",1 INT. QUARK'S,QUARK'S,INT.,Shakaar,1995-03-16
1,O'BRIEN,"Sorry, Commander.",1 INT. QUARK'S,QUARK'S,INT.,Shakaar,1995-03-16
2,SISKO,"Don't apologize, that just makes it worse.",1 INT. QUARK'S,QUARK'S,INT.,Shakaar,1995-03-16
3,O'BRIEN,I guess I'm in the zone today.,1 INT. QUARK'S,QUARK'S,INT.,Shakaar,1995-03-16
4,SISKO,The zone?,1 INT. QUARK'S,QUARK'S,INT.,Shakaar,1995-03-16
...,...,...,...,...,...,...,...
388,KIRA,Oooh. You know... I used to enjoy sleep...,57 INT. KIRA'S QUARTERS,KIRA'S QUARTERS,INT.,Shakaar,1995-03-16
389,SISKO,"It's good to have you back, Major.",57 INT. KIRA'S QUARTERS,KIRA'S QUARTERS,INT.,Shakaar,1995-03-16
390,KIRA,"It's good to be back, sir.",57 INT. KIRA'S QUARTERS,KIRA'S QUARTERS,INT.,Shakaar,1995-03-16
391,SISKO,I'll see you in Ops.,57 INT. KIRA'S QUARTERS,KIRA'S QUARTERS,INT.,Shakaar,1995-03-16


In [82]:
# Spot-check get_date on another script; the bare call displays the result.
lines = open_file('scripts_ds9/408.txt')
get_date(lines)

datetime.datetime(1992, 11, 17, 0, 0)

In [85]:
from os import listdir
from os.path import isfile, join
# Directory holding the script text files (relative to the notebook's
# working directory).
mypath = 'scripts_ds9/'
# Keep only regular files, skipping sub-directories. listdir makes no
# ordering guarantee, so the names come back in arbitrary order.
onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]
onlyfiles

['504.txt',
 '510.txt',
 '538.txt',
 '470.txt',
 '464.txt',
 '458.txt',
 '459.txt',
 '465.txt',
 '471.txt',
 '539.txt',
 '511.txt',
 '505.txt',
 '513.txt',
 '507.txt',
 '498.txt',
 '467.txt',
 '472.txt',
 '466.txt',
 '499.txt',
 '506.txt',
 '512.txt',
 '516.txt',
 '502.txt',
 '489.txt',
 '462.txt',
 '476.txt',
 '477.txt',
 '463.txt',
 '488.txt',
 '503.txt',
 '517.txt',
 '529.txt',
 '501.txt',
 '515.txt',
 '449.txt',
 '475.txt',
 '461.txt',
 '460.txt',
 '474.txt',
 '448.txt',
 '514.txt',
 '500.txt',
 '528.txt',
 '567.txt',
 '573.txt',
 '413.txt',
 '407.txt',
 '406.txt',
 '412.txt',
 '572.txt',
 '566.txt',
 '570.txt',
 '564.txt',
 '558.txt',
 '404.txt',
 '410.txt',
 '438.txt',
 '439.txt',
 '411.txt',
 '405.txt',
 '559.txt',
 '565.txt',
 '571.txt',
 '549.txt',
 '575.txt',
 '561.txt',
 '429.txt',
 '415.txt',
 '414.txt',
 '428.txt',
 '560.txt',
 '574.txt',
 '548.txt',
 '562.txt',
 '416.txt',
 '402.txt',
 '403.txt',
 '417.txt',
 '563.txt',
 '546.txt',
 '552.txt',
 '432.txt',
 '426.txt',
 '42