## Preprocess the data

In [1]:
from nltk.tokenize import WordPunctTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from collections import Counter
import re
from pattern.en import pluralize
import pandas as pd
import json
import nltk 
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/sara/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/sara/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /home/sara/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
pattern_biography = r'(?s)(?:Biography|Biography )\=\=(.*?)\=\='
pattern_cast = r'(?s)Casting and character development\=\=(.*?)\=\='
patter_character_information_coworker = r'(?s)Character information \=\=(.*?)Coworker Relations\=\='
pattern_season = r'(?s)(?:Season [0-9]|Season [0-9] | Season [0-9]\: )\=\=(.*?)(?:\[\[Ca|\=\=)'
pattern_history = r'(?s)(?:Character history|History|Character)\=\='
pattern_character_information = r'(?s)(?:Background |Overview|Character information|Character Information |Character profile|Character overview)\=\=(.*?)\=\='
pattern_text_only = r'(?s)\]\](.*?)\=\=(?:Trivia)'
pattern_text_only_2 = r'(?s)\}\}(.*?)(?:\[\[Category)'
pattern_text_only_3 = r'(?s)(.*?)(?:\[\[Category.*?\]\])'

pattern1 = r"(?s)\[\[(.*?)(?:\|.*?)?\]\]"                 # links
pattern2 = r"(?s)\{\{(?:cite)(?:.*?)\}\}"               # Cite patterns

pattern3 = r"(?s)\=\=\=(.*?)\=\=\="                              # === some text ===
pattern4 = r"(?s)\{\{(?:Main)\|(.*?)\}\}"                            # Main patterns
pattern5 = r"(?s)(?:The Office \(US TV series|The Office US|The Office \(UK\)|The Office \(US TV series\)|The Office \(U.S. TV series\)|The Office)" # remove the show name                   # Main patterns
pattern6 = r"(?s)(?:TV)" # remove the word tv               
pattern7 = r"(?s)\<.*?\>(?:.*?)\<\/.*?\>" # remove ref             
pattern8 = r"(?s)(?:url.*?\}\}|\|)" # remove urls in https form             
pattern9 = r"(?s)(?:\[\[The Office \(UK\)\|.*?)\]\]" # remove more the office           
pattern10 = r"(?s)(?:\[https.*?)\]" # remove more urls           


def process_text(name):
#     print(f'reading {name}.txt')
    res = []                        # initialize the resulting list 
    
    with open(f'data/characters/{name}.txt', encoding="utf8") as file: # 1. open the file
        content = file.read()               # 2. read its content
        match = ' '.join(re.findall(pattern_biography, content))
        
        if len(match) < 1:
             match += ' '.join(re.findall(pattern_cast, content))
        if len(match) < 1:
            match += ' '.join(re.findall(patter_character_information_coworker, content))
        if len(match) < 1:
             match += ' '.join(re.findall(pattern_history, content))
        if len(match) < 1:
             match += ' '.join(re.findall(pattern_character_information, content))
        if len(match) < 1:
             match += ' '.join(re.findall(pattern_text_only, content))
        if len(match) < 1:
             match += ' '.join(re.findall(pattern_text_only_2, content))
        if len(match) < 1:
             match += ' '.join(re.findall(pattern_text_only_3, content))
                
        match += ' '.join(re.findall(pattern_season, content))   
    if len(match) < 1:
        print(f"OOOOHHH NOOOO, Coult not find any matches for {name}. -------------------------")
    
    updated_match = match
    
    updated_match = re.sub(pattern1, '', updated_match)
    updated_match = re.sub(pattern2, '', updated_match)
    updated_match = re.sub(pattern3, '', updated_match) 
    updated_match = re.sub(pattern4, '', updated_match)
    updated_match = re.sub(pattern5, '', updated_match)
    updated_match = re.sub(pattern6, '', updated_match)
    updated_match = re.sub(pattern7, '', updated_match)
    updated_match = re.sub(pattern8, '', updated_match)
    updated_match = re.sub(pattern9, '', updated_match)
    updated_match = re.sub(pattern10, '', updated_match)

    pattern1_match = re.findall(pattern1, updated_match)
    pattern3_match = re.findall(pattern3, updated_match) 
    pattern4_match = re.findall(pattern4, updated_match) 

    processed_text = updated_match + ' ' + \
                ' '.join(pattern1_match) + ' ' + \
                ' '.join(pattern3_match) + ' ' + \
                ' '.join(pattern4_match) 
    
    # Remove HTML Tags
    CLEANR = re.compile('<.*?>') 
    processed_text = re.sub(CLEANR, ' ', processed_text) 
    
    return processed_text 


def get_tokens(processed_text, character_name=None, character_names=None):
    # 3. Tokenize content
    tk = WordPunctTokenizer()
    tokens = tk.tokenize(processed_text) 
    
    # 4. Remove non-alpha numeric tokens
    tokens = list(filter(lambda t: t.isalpha(), tokens))
 

    stop_words = set(stopwords.words('english')) 
    stop_words.update(['tell', 'say', 'ask', 'asks', 'says', 'later', 'dunder', 'mifflin', 'name', 'eventually', \
                       'office', 'episode', 'episodethe', 'become', 'becomes', 'became', 'take', 'show', 'seen', 'see',\
                       'episodes', 'character', 'nbc', 'season', 'wiki', 'deleted', 'portrayed', 'television', 'fictional', 'appears', \
                       'appearance', 'get', 'seasons', 'one', 'series', 'make', 'yes', 'history', 'first', 'titled', 'scene','wikipedia', 'played'])
    stop_words.update(["oh", "yeah", "ok", "dont", "hey", "okay", "know", "right", "well"])

    lemmatizer = WordNetLemmatizer()  # used to lemantize the words

    filtered_tokens = []
    for t in tokens: 
        t_lower = t.lower() # convert text to lowercase
        if t_lower in stop_words: continue
        if character_name and t_lower == character_name.lower(): continue
        t_lower_lemantized = lemmatizer.lemmatize(t_lower) # lemantize the lowercase word
        if t_lower_lemantized in stop_words: continue
        if character_names and t_lower in character_names: continue
        filtered_tokens.append(t_lower_lemantized) # add it to the final list of tokens
    
    return filtered_tokens

In [13]:
df = pd.read_csv('data/characters.csv')

character_names = df.Name
lower_pluralized_character_names = []

for name in character_names:
    n = name.lower()
    for n_ in n.split(' '):
        lower_pluralized_character_names.append(n_) # separate names and surnames
    lower_pluralized_character_names.append(pluralize(n.split(' ')[0]))

main_characters = ['Andy Bernard', 'Angela Martin', 'Creed Bratton', 'Darryl Philbin', 'Dwight Schrute', \
                  'Jim Halpert', 'Kelly Kapoor', 'Kevin Malone', 'Meredith Palmer', 'Michael Scott', 'Pam Beesly', \
                  'Phyllis Vance', 'Ryan Howard', 'Stanley Hudson', 'Oscar Martinez', 'Toby Flenderson']

main_characters_df = df.loc[df['Name'].isin(main_characters)]
main_characters_tokens = {}
for index, row in main_characters_df.iterrows():                       # number of files belonging to this race
        character_name = row['Name']
        processed_text = process_text(character_name) 
        tokens = get_tokens(processed_text, character_name.split(' ')[0], lower_pluralized_character_names)
        main_characters_tokens[character_name] = tokens

with open('data/main_characters_tokens.csv', "w+") as file:
    file.write(json.dumps(main_characters_tokens))  

In [24]:
with open('data/top_five_communities.csv', 'r') as file:
    top_five_communities = json.loads(file.read())
    
top_five_communities

[[6,
  ['Alan',
   'Kelly Kapoor',
   'Ryan Howard',
   'Karen Filippelli',
   'Alice',
   'Pete Miller',
   'Andy Bernard',
   'Erin Hannon',
   'Unnamed Cousin',
   'Walter Bernard Jr.',
   'Ellen Bernard',
   'Robert California',
   'Deangelo Vickers',
   'Nellie Bertram',
   'Gabe Lewis',
   'Darryl Philbin',
   'Jo Bennett',
   'Jamie',
   "Jessica (Andy's Girlfriend)",
   'Carla Fern',
   'Ashley',
   'Belsnickel',
   'Nate Nickerson',
   'Bert California',
   'Brandon',
   'Val Johnson',
   'Broccoli Rob',
   'Carla',
   'Casey Dean',
   'Clark Green',
   'Colin',
   'Julius Erving',
   "Dan (Karen's husband)",
   'Jada Philbin',
   'Gwyneth Philbin',
   'Drake Howard',
   'Reed',
   "Erin's Mother",
   "Erin's Father",
   'Irene',
   'Frank',
   'Glenn (Florida)',
   'Glenn (Warehouse Worker)',
   'Philip',
   'Hidetoshi Hasagawa',
   'Justine Philbin',
   'Jordan Garfield',
   'Ravi',
   'Lonny Collins',
   'Matt',
   'Merv Bronte',
   'Pam (other)',
   'Reggie',
   'Susan Cal

In [27]:
community_dict = {}
with open('data/top_five_communities.csv') as file:
    top_five_communities = json.loads(file.read())

for community_index, community in top_five_communities:
    community_dict[community_index] = []
    for document in community:
        processed_text = process_text(document)
        tokens = get_tokens(processed_text, df.Name)
        
        community_dict[community_index].append(tokens)

with open('data/community_dict.csv', "w+") as file:
    file.write(json.dumps(community_dict)) 

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().