# Part 1: Data Gathering
- Data source comes from: https://fangj.github.io/friends
1. Build Common Functions
2. Build the DataFrame
3. Do a basic cleaning of the DataFrame
4. Preprocessing the data for NLP 

In [156]:
from bs4 import BeautifulSoup as BS
import requests
import pandas as pd
import re
import numpy as np
import random
import string

## 1. Commonly Used Functions
***Note:*** 
- Season 10 had a slightly different html format where it used a strong tag instead of bold. Therefore, I had to make a new function to pick that up since I had to make sure to cover the final season!

In [93]:
def build_df(url):
    '''
    Takes the url and parses through the text to extract the episode, season, speaker, dialogue and writers. 
    This outputs a built dataframe
    '''
    #Use BS to scrape the website
    response  = requests.get(url)
    soup = BS(response.content, 'html.parser')
    ps = soup.find_all('p')
    
    #Parse through the souped object and build the dataframe
    writers = soup.find(text=re.compile('Written'))
    speaker = []
    dialogue = []
    d = pd.DataFrame(columns=['episode', 'season','speaker', 'dialogue', 'writers'])
    for i in ps:
        if i.find('b'):
            speaker.append(i.find('b').text.strip(':'))
            dialogue.append(i.find('b').next_sibling)
    d['speaker'] = speaker
    d['dialogue'] = dialogue
    d['season'] = re.findall('season/(.*).html', url)[0]
    d['episode'] = soup.find('title').text
    d['writers'] = writers.split(':')[1]
    return d

In [94]:
def build_df_s10(url):
    '''
    Function is only used for season 10 due to the difference in the html layout. 
    Takes the url and parses through the text to extract the episode, season, speaker, dialogue and writers. 
    This outputs a built dataframe
    '''
    #Use BS to scrape the website
    response  = requests.get(url)
    soup = BS(response.content, 'html.parser')
    ps = soup.find_all('p')
    
    #Parse through the souped object and build the dataframe
    writers = soup.find(text=re.compile('Written'))
    speaker = []
    dialogue = []
    d = pd.DataFrame(columns=['episode', 'season','speaker', 'dialogue', 'writers'])
    for i in ps:
        if i.find('strong'):
            speaker.append(i.find('strong').text.strip(':'))
            dialogue.append(i.find('strong').next_sibling)
    d['speaker'] = speaker
    d['dialogue'] = dialogue
    d['season'] = re.findall('season/(.*).html', url)[0]
    d['episode'] = soup.find('title').text
    d['writers'] = writers.split(':')[1]
    return d

In [144]:
def clean_df(soup_df):
    '''
    - Takes a dataframe and builds a few extra columns: Before speaker, After speaker, season number, 
    and episode number. 
    - It also converts the dialogue column to a string and drops rows with Null values. 
    '''
    soup_df.dropna(how='any', inplace=True)
    soup_df['before_speaker'] = soup_df['speaker'].shift(periods=1)
    soup_df['after_speaker'] = soup_df['speaker'].shift(periods=-1)
    soup_df['dialogue'] = soup_df['dialogue'].astype(str)
    soup_df['season_number'] = soup_df['season'].apply(lambda x: int(x[:2]))
    soup_df['episode_number'] = soup_df['season'].apply(lambda x: int(x[-2:]))
    soup_df = soup_df[soup_df['season_number']!=2]
    soup_df['sentence_length'] = soup_df['dialogue'].apply(lambda x: len(x.split(' ')))
    cols = ['episode', 'season_number', 'episode_number', 'speaker', 'dialogue', 'writers', 'before_speaker',
       'after_speaker', 'sentence_length']
    soup_df = soup_df[cols]
    return soup_df

## 2. Build the Dataframe from every episode
- It is clear from the scraping that not every website was able to work for whatever reason.  This is okay since a I was able to get over 25,000 lines of dialogue which is plenty

In [96]:
seasons = ['01', '02', '03', '04', '05', '06', '07', '08','09', '10']
episodes = ['01', '02', '03', '04', '05', '06', '07', '08','09', '10', '11', '12', '13', '14', '15', 
            '16', '17', '18', '19', '20', '21', '22', '23', '24']
a = random.sample(episodes, 8)

In [115]:
all_data = []
missed_episodes = []
for s in seasons: 
    for e in episodes:
        if s != '10':
            try: 
                all_data.append(build_df(f'https://fangj.github.io/friends/season/{s}{e}.html'))
                print(f'https://fangj.github.io/friends/season/{s}{e}.html')
            except: 
                print("BAD", f'https://fangj.github.io/friends/season/{s}{e}.html')
                missed_episodes.append(f'https://fangj.github.io/friends/season/{s}{e}.html')
                pass
        else: 
            try: 
                all_data.append(build_df_s10(f'https://fangj.github.io/friends/season/{s}{e}.html'))
                print(f'https://fangj.github.io/friends/season/{s}{e}.html')
            except: 
                print("BAD", f'https://fangj.github.io/friends/season/{s}{e}.html')
                missed_episodes.append(f'https://fangj.github.io/friends/season/{s}{e}.html')
                pass

https://fangj.github.io/friends/season/0101.html
https://fangj.github.io/friends/season/0102.html
https://fangj.github.io/friends/season/0103.html
https://fangj.github.io/friends/season/0104.html
https://fangj.github.io/friends/season/0105.html
https://fangj.github.io/friends/season/0106.html
https://fangj.github.io/friends/season/0107.html
https://fangj.github.io/friends/season/0108.html
https://fangj.github.io/friends/season/0109.html
https://fangj.github.io/friends/season/0110.html
https://fangj.github.io/friends/season/0111.html
https://fangj.github.io/friends/season/0112.html
https://fangj.github.io/friends/season/0113.html
https://fangj.github.io/friends/season/0114.html
https://fangj.github.io/friends/season/0115.html
https://fangj.github.io/friends/season/0116.html
https://fangj.github.io/friends/season/0117.html
https://fangj.github.io/friends/season/0118.html
https://fangj.github.io/friends/season/0119.html
https://fangj.github.io/friends/season/0120.html
https://fangj.github

https://fangj.github.io/friends/season/0721.html
BAD https://fangj.github.io/friends/season/0722.html
https://fangj.github.io/friends/season/0723.html
BAD https://fangj.github.io/friends/season/0724.html
https://fangj.github.io/friends/season/0801.html
https://fangj.github.io/friends/season/0802.html
https://fangj.github.io/friends/season/0803.html
https://fangj.github.io/friends/season/0804.html
https://fangj.github.io/friends/season/0805.html
https://fangj.github.io/friends/season/0806.html
https://fangj.github.io/friends/season/0807.html
https://fangj.github.io/friends/season/0808.html
https://fangj.github.io/friends/season/0809.html
BAD https://fangj.github.io/friends/season/0810.html
https://fangj.github.io/friends/season/0811.html
https://fangj.github.io/friends/season/0812.html
https://fangj.github.io/friends/season/0813.html
https://fangj.github.io/friends/season/0814.html
https://fangj.github.io/friends/season/0815.html
https://fangj.github.io/friends/season/0816.html
BAD http

In [118]:
all_data_df = pd.concat(all_data)

## 3. Cleaning the DataFrame

In [145]:
clean_data = clean_df(all_data_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  soup_df['sentence_length'] = soup_df['dialogue'].apply(lambda x: len(x.split(' ')))


In [146]:
clean_data.groupby('speaker').count().sort_values(by='writers', ascending = False).head(10)

Unnamed: 0_level_0,episode,season_number,episode_number,dialogue,writers,before_speaker,after_speaker,sentence_length
speaker,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Rachel,5766,5766,5766,5766,5766,5766,5766,5766
Ross,5651,5651,5651,5651,5651,5651,5651,5651
Chandler,5329,5329,5329,5329,5329,5329,5329,5329
Monica,5271,5271,5271,5271,5271,5271,5271,5271
Joey,5057,5057,5057,5057,5057,5057,5056,5057
Phoebe,4496,4496,4496,4496,4496,4495,4496,4496
All,204,204,204,204,204,204,204,204
Janice,168,168,168,168,168,168,168,168
Frank,124,124,124,124,124,124,124,124
Richard,123,123,123,123,123,123,123,123


In [147]:
mains = ['Ross', 'Rachel', 'Chandler', 'Joey', 'Phoebe', 'Monica']

In [148]:
clean_data = clean_data[clean_data['speaker'].isin(mains)]

## 4. Preprocessing

#### A. Remove words within parenthesis which indicate stage directions

In [150]:
parens = lambda x: re.sub("[\(\[].*?[\)\]]", "", x)
clean_data['dial_clean'] = clean_data['dialogue'].map(parens)


#### B. Remove any rows with a sentence length where word count is 1. Wanted to get rid of 1 word responses

In [154]:
clean_data = clean_data[clean_data['sentence_length']> 1]

#### C. Keep only the alphanumeric characters and also make everything lowercase

In [157]:
alphanumeric = lambda x: re.sub('\w*\d\w*', ' ', x) # Removes any non-alpha-numeric thing
punc_lower = lambda x: re.sub('[%s]' % re.escape(string.punctuation), ' ', x.lower()) #Lowercases everything and removes punctuation

clean_data['dial_clean'] = clean_data['dial_clean'].map(alphanumeric).map(punc_lower)
clean_data.reset_index(inplace=True)

In [159]:
clean_data.head(5)

Unnamed: 0,index,episode,season_number,episode_number,speaker,dialogue,writers,before_speaker,after_speaker,sentence_length,dial_clean
0,3,The One Where Monica Gets a New Roomate (The P...,1,1,Phoebe,"Wait, does he eat chalk?",Marta Kauffman & David Crane,,Phoebe,6,wait does he eat chalk
1,4,The One Where Monica Gets a New Roomate (The P...,1,1,Phoebe,"Just, 'cause, I don't want her to go through\...",Marta Kauffman & David Crane,Phoebe,Monica,16,just cause i don t want her to go through\...
2,5,The One Where Monica Gets a New Roomate (The P...,1,1,Monica,"Okay, everybody relax. This is not even a\nda...",Marta Kauffman & David Crane,Phoebe,Chandler,21,okay everybody relax this is not even a\nda...
3,6,The One Where Monica Gets a New Roomate (The P...,1,1,Chandler,Sounds like a date to me.,Marta Kauffman & David Crane,Monica,Chandler,7,sounds like a date to me
4,7,The One Where Monica Gets a New Roomate (The P...,1,1,Chandler,"Alright, so I'm back in high school, I'm\nsta...",Marta Kauffman & David Crane,Chandler,Chandler,22,alright so i m back in high school i m\nsta...


In [161]:
clean_data.to_pickle('initial_test_dataframe.pkl')