In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import time
import random
import pickle
import unidecode
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
from nltk.corpus import stopwords
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tag import pos_tag
from nltk.corpus import wordnet
from nltk.tokenize import MWETokenizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from nltk.chunk import ne_chunk
from textblob import TextBlob
from itertools import combinations
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF
from sklearn.metrics.pairwise import cosine_similarity
from wordcloud import WordCloud, STOPWORDS 
import matplotlib.pyplot as plt 
import re
import string
from word2number import w2n
import spacy
nlp = spacy.load('en_core_web_sm')
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()
import nltk

# Scraping Game Scripts from Ace Attorney Wiki

In [None]:
#Function to obtain script data from Ace Attorney Wiki, also cleans up text slightly
def get_script(url):
    """Download one transcript page and return its paragraph texts.

    Parameters
    ----------
    url : str
        Address of a transcript page.

    Returns
    -------
    list of str
        Text of every <p> element inside the page's
        ``div.mw-parser-output`` container, with newlines replaced by
        spaces and leading/trailing spaces stripped.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    ValueError
        If the page has no ``div.mw-parser-output`` container.
    """
    # A timeout keeps one stalled connection from hanging the whole scrape.
    response = requests.get(url, timeout=30)
    # Fail loudly rather than silently parsing an error page into the corpus.
    response.raise_for_status()
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed on the machine.
    page = BeautifulSoup(response.text, 'html.parser')
    content = page.find('div', class_='mw-parser-output')
    if content is None:
        raise ValueError("no 'mw-parser-output' container found at %s" % url)
    return [paragraph.text.replace('\n', ' ').strip(' ')
            for paragraph in content.find_all('p')]

In [None]:
#Get scripts for all the cases across the first five games, input is a list of urls for the transcripts of each game
def get_cases(games):
    """Scrape every transcript URL in ``games`` and return the scripts in order.

    Each element of the returned list is whatever ``get_script`` returns for
    the URL at the same position. A random pause of 0.5 to 2.5 seconds follows
    every request.
    """
    def _fetch_then_pause(link):
        # Download first, then wait before the next request is issued.
        transcript = get_script(link)
        time.sleep(.5 + 2 * random.random())
        return transcript

    return [_fetch_then_pause(link) for link in games]

In [None]:
#Transcript pages on Ace Attorney Wiki; the order matters because later cells slice the scraped corpus by position
_WIKI_ROOT = 'https://aceattorney.fandom.com/wiki/'

_TRANSCRIPT_PAGES = (
    # first group: positions 0-11
    'The_First_Turnabout_-_Transcript',
    'Turnabout_Sisters_-_Transcript_-_Part_1',
    'Turnabout_Sisters_-_Transcript_-_Part_2',
    'Turnabout_Samurai_-_Transcript_-_Part_1',
    'Turnabout_Samurai_-_Transcript_-_Part_2',
    'Turnabout_Samurai_-_Transcript_-_Part_3',
    'Turnabout_Goodbyes_-_Transcript_-_Part_1',
    'Turnabout_Goodbyes_-_Transcript_-_Part_2',
    'Turnabout_Goodbyes_-_Transcript_-_Part_3',
    'Rise_from_the_Ashes_-_Transcript_-_Part_1',
    'Rise_from_the_Ashes_-_Transcript_-_Part_2',
    'Rise_from_the_Ashes_-_Transcript_-_Part_3',

    # second group: positions 12-18
    'The_Lost_Turnabout_-_Transcript',
    'Reunion,_and_Turnabout_-_Transcript_-_Part_1',
    'Reunion,_and_Turnabout_-_Transcript_-_Part_2',
    'Turnabout_Big_Top_-_Transcript_-_Part_1',
    'Turnabout_Big_Top_-_Transcript_-_Part_2',
    'Farewell,_My_Turnabout_-_Transcript_-_Part_1',
    'Farewell,_My_Turnabout_-_Transcript_-_Part_2',

    # third group: positions 19-26
    'Turnabout_Memories_-_Transcript',
    'The_Stolen_Turnabout_-_Transcript_-_Part_1',
    'The_Stolen_Turnabout_-_Transcript_-_Part_2',
    'Recipe_for_Turnabout_-_Transcript_-_Part_1',
    'Recipe_for_Turnabout_-_Transcript_-_Part_2',
    'Turnabout_Beginnings_-_Transcript',
    'Bridge_to_the_Turnabout_-_Transcript_-_Part_1',
    'Bridge_to_the_Turnabout_-_Transcript_-_Part_2',

    # fourth group: positions 27-33
    'Turnabout_Trump_-_Transcript',
    'Turnabout_Corner_-_Transcript_-_Part_1',
    'Turnabout_Corner_-_Transcript_-_Part_2',
    'Turnabout_Serenade_-_Transcript_-_Part_1',
    'Turnabout_Serenade_-_Transcript_-_Part_2',
    'Turnabout_Succession_-_Transcript_-_Part_1',
    'Turnabout_Succession_-_Transcript_-_Part_2',

    # fifth group: positions 34-41
    'Turnabout_Countdown_-_Transcript',
    'The_Monstrous_Turnabout_-_Transcript_-_Part_1',
    'The_Monstrous_Turnabout_-_Transcript_-_Part_2',
    'Turnabout_Academy_-_Transcript_-_Part_1',
    'Turnabout_Academy_-_Transcript_-_Part_2',
    'The_Cosmic_Turnabout_-_Transcript',
    'Turnabout_for_Tomorrow_-_Transcript',
    'Turnabout_Reclaimed_-_Transcript',
)

# Full URLs, in the same order as the page titles above.
case_links = [_WIKI_ROOT + page for page in _TRANSCRIPT_PAGES]

In [None]:
#Obtain corpus separated by case transcript links and pickle it
# Scrape BEFORE opening the file: opening with 'wb' truncates it immediately,
# so a scrape that fails half-way would otherwise wipe a previously saved corpus.
scraped_corpus = get_cases(case_links)
with open('corpusscrape.pickle', 'wb') as to_write:
    pickle.dump(scraped_corpus, to_write)

In [None]:
#Reload the scraped corpus written by the previous cell (one list of paragraphs per transcript page)
with open('corpusscrape.pickle', mode='rb') as pickled_corpus:
    corpus = pickle.load(pickled_corpus)

# Text Preprocessing

In [None]:
#Corpus is broken up by transcript page, so this function puts the elements into one list
def combine_parts(list,x,y):
    """Flatten the sub-lists at positions ``x`` .. ``y - 1`` into one new list.

    Parameters
    ----------
    list : sequence of lists
        The per-page corpus. The name shadows the builtin inside this
        function; it is kept so existing callers keep working.
    x : int
        Index of the first sub-list to include.
    y : int
        Index one past the last sub-list to include.

    Returns
    -------
    list
        Elements of the selected sub-lists, in order. The inputs are not
        modified.

    Raises
    ------
    IndexError
        If an index in ``range(x, y)`` is outside ``list``.
    """
    new_list = []
    for i in range(x,y):
        # extend() appends in place; `new_list = new_list + list[i]` recopied
        # everything accumulated so far on every pass.
        new_list.extend(list[i])
    return new_list

In [None]:
# Per-game corpora (to track character interactions across games) plus one corpus spanning all pages.
# Each (start, stop) pair selects a run of transcript pages in `corpus`.
aa1, aa2, aa3, aa4, aa5 = (
    combine_parts(corpus, start, stop)
    for start, stop in [(0, 12), (12, 19), (19, 27), (27, 34), (34, 42)]
)
aa1_5 = combine_parts(corpus, 0, 42)

In [None]:
#For dialogue choices, the transcript marks which line comes next with "Leads to" / "Leads back to" entries; they are redundant, so this function removes them
def remove_leads(corp):
    """Return a new list holding every line of ``corp`` except those whose
    first five characters are 'Leads'. Order is preserved and ``corp`` itself
    is left untouched.
    """
    return [entry for entry in corp if entry[:5] != 'Leads']

In [None]:
#Drop the redundant 'Leads ...' dialogue boxes from every corpus
aa1_5, aa1, aa2, aa3, aa4, aa5 = (
    remove_leads(game_corpus)
    for game_corpus in (aa1_5, aa1, aa2, aa3, aa4, aa5)
)

In [None]:
#Function to change pos_tag to format for wordnetlemmatizer
def get_wordnet_pos(word):
    """Tag a single word with nltk and translate the tag for WordNet.

    Only the first letter of the tag (upper-cased) is inspected:
    J -> adjective, N -> noun, V -> verb, R -> adverb. Any other tag falls
    back to noun.
    """
    _, tag = nltk.pos_tag([word])[0]
    initial = tag[0].upper()

    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    # "N" and every unrecognised tag are both treated as nouns.
    return wordnet.NOUN

In [None]:
# Function to preprocess text
def txt_preprocess(case):
    """Normalise every document in ``case`` and return them as a new list.

    Steps applied to each document, in order:

    1. transliterate non-ASCII characters to ASCII with ``unidecode``;
    2. delete every character listed in ``string.punctuation``;
    3. lower-case the text;
    4. replace every token that contains a digit with a single space.

    Whitespace is not collapsed or stripped.

    Parameters
    ----------
    case : iterable of str
        Documents to clean.

    Returns
    -------
    list of str
        Cleaned documents, in the same order as the input.
    """
    punctuation_re = re.compile('[%s]' % re.escape(string.punctuation))
    # Raw string: '\w' and '\d' are not valid escapes in a normal string literal.
    digit_token_re = re.compile(r'\w*\d\w*')

    new_list = []
    for doc in case:
        # Transliterate first: typographic quotes, dashes and ellipses turn
        # into ASCII punctuation here, so the next step can strip them too.
        doc = unidecode.unidecode(doc)
        # remove punctuation
        doc = punctuation_re.sub('', doc)
        # lowercase letters
        doc = doc.lower()
        # remove numbers (any token containing a digit)
        doc = digit_token_re.sub(' ', doc)
        new_list.append(doc)
    return new_list

In [None]:
#Run the text preprocessing over each game's script and over the combined corpus
aa1, aa2, aa3, aa4, aa5, aa1_5 = (
    txt_preprocess(game_corpus)
    for game_corpus in (aa1, aa2, aa3, aa4, aa5, aa1_5)
)

# LDA on Full Corpus

In [None]:
#Bag-of-words counts for the entire corpus aa1_5
cv = CountVectorizer(
    stop_words='english',
    token_pattern=r'\b[a-zA-Z]{3,}\b',  # alphabetic tokens of 3+ letters
    analyzer='word',
    max_df=0.5,
    min_df=10,
    max_features=3000,
)
dtm_tf = cv.fit_transform(aa1_5)

# Dense view of the counts: one row per document, one column per vocabulary word.
Char = pd.DataFrame(dtm_tf.toarray(), columns=cv.get_feature_names())
Char

In [None]:
#TFIDF Doc-Term Matrix
# Reuse the CountVectorizer settings, except `dtype`: TfidfVectorizer produces
# floats, so the integer dtype copied from `cv` does not apply.
tfidf_params = {name: value for name, value in cv.get_params().items() if name != 'dtype'}
tfidf_vectorizer = TfidfVectorizer(**tfidf_params)
# Fit on the raw documents. Iterating the `Char` DataFrame yields its column
# labels (the vocabulary words), not the documents, so fitting on `Char`
# vectorised the wrong thing.
dtm_tfidf = tfidf_vectorizer.fit_transform(aa1_5)
dtm_tfidf

In [None]:
# Two 10-topic LDA models with the same seed: one on raw term counts, one on TF-IDF weights.
# fit() returns the estimator itself, so construction and fitting are chained.
lda_tf = LatentDirichletAllocation(n_components=10, random_state=0).fit(dtm_tf)
lda_tfidf = LatentDirichletAllocation(n_components=10, random_state=0).fit(dtm_tfidf)
lda_tfidf

In [None]:
#Create webpage to record analyzing topics
# Pass the vectorizer that actually produced dtm_tfidf, so the vocabulary shown
# in the visualisation is guaranteed to line up with the matrix columns.
full = pyLDAvis.sklearn.prepare(lda_tfidf, dtm_tfidf, tfidf_vectorizer)
pyLDAvis.save_html(full, 'full.html')

# LSA on Character Interactions

In [None]:
#Finds the sorted, de-duplicated indexes of lines where the two characters appear within 2 lines of each other (talking to each other or in the same group discussion)
def interlist(char1,char2):
    """Return the line indexes where two characters appear close together.

    For every index in ``char1``, the indexes 1 and 2 positions before and
    after it are looked up in ``char2``. Each match contributes both the
    ``char2`` index and the ``char1`` index. An index present in both lists
    does not count as a match with itself.

    Parameters
    ----------
    char1, char2 : iterable of int
        Line indexes associated with each character.

    Returns
    -------
    list of int
        Sorted, de-duplicated indexes taking part in an interaction.
    """
    # Set lookups are O(1); testing membership against the list rescanned it
    # four times for every index in char1.
    partner_lines = set(char2)
    interaction = set()
    for ind in char1:
        for offset in (-2, -1, 1, 2):
            neighbour = ind + offset
            if neighbour in partner_lines:
                interaction.add(neighbour)
                interaction.add(ind)
    return sorted(interaction)

In [None]:
#Row indexes of the documents whose counts for each character's name token(s) are non-zero
nick = Char.loc[Char['phoenix'] > 0].index.tolist()
maya = Char.loc[Char['maya'] > 0].index.tolist()
apollo = Char.loc[Char['apollo'] > 0].index.tolist()
miles = Char.loc[Char['edgeworth'] > 0].index.tolist()
# 'von' and 'karma' must both occur in the same document
fran = Char.loc[(Char['von'] > 0) & (Char['karma'] > 0)].index.tolist()
godot = Char.loc[Char['godot'] > 0].index.tolist()
klav = Char.loc[Char['klavier'] > 0].index.tolist()
black = Char.loc[Char['blackquill'] > 0].index.tolist()

In [None]:
#Documents (from aa1_5) taking part in each character pairing, selected through interlist()
int1, int2, int3, int4, int5 = (
    [aa1_5[i] for i in interlist(first, second)]
    for first, second in [
        (miles, nick),
        (fran, nick),
        (godot, nick),
        (klav, apollo),
        (black, nick),
    ]
)

In [None]:
#Doc-term count matrix for one interaction set (currently int5)
vectorizer = CountVectorizer(
    stop_words='english',
    max_df=0.5,
    min_df=10,
    max_features=3000,
)
doc_word = vectorizer.fit_transform(int5)
doc_word.shape

In [None]:
# First ten rows of the interaction doc-term matrix, labelled by document text and vocabulary word.
dtm_lsa = pd.DataFrame(
    data=doc_word.toarray(),
    index=int5,
    columns=vectorizer.get_feature_names(),
).head(10)

In [None]:
#LSA modeling across chosen number of topics, here 5
# TruncatedSVD's default solver is randomized; fix the seed (as the LDA models
# do) so re-running the notebook reproduces the same components.
lsa = TruncatedSVD(n_components=5, random_state=0)
doc_topic = lsa.fit_transform(doc_word)
lsa.explained_variance_ratio_

In [None]:
#Component-by-word loadings from the fitted LSA model, rounded to 3 decimals
component_labels = ["component_{}".format(number) for number in range(1, 6)]
topic_word = pd.DataFrame(
    lsa.components_.round(3),
    index=component_labels,
    columns=vectorizer.get_feature_names(),
)
topic_word

In [None]:
#Produce top words from each topic
def display_topics(model, feature_names, no_top_words, topic_names=None):
    """Print, for every row of ``model.components_``, a heading followed by
    the ``no_top_words`` highest-weighted feature names, strongest first.

    The heading uses ``topic_names[ix]`` when ``topic_names`` is given and
    that entry is truthy; otherwise it falls back to the topic number.
    """
    for ix, weights in enumerate(model.components_):
        label = topic_names[ix] if topic_names else None
        if label:
            print("\nTopic: '", label, "'")
        else:
            print("\nTopic ", ix)
        # argsort is ascending, so walk backwards from the end of the ranking.
        strongest = weights.argsort()[:-no_top_words - 1:-1]
        top_words = [feature_names[position] for position in strongest]
        print(", ".join(top_words))

In [None]:
# Print the five strongest words of every LSA component.
display_topics(
    model=lsa,
    feature_names=vectorizer.get_feature_names(),
    no_top_words=5,
)

# Word Clouds for each Doc-Term Matrix

In [None]:
# Word Cloud Generator
# Build the cloud from the preprocessed documents. `Char` is a doc-term matrix
# whose columns are vocabulary words, so it has no column 0 and `df[0]` raised
# KeyError; a DataFrame built from the list of documents does have column 0.
# Swap `aa1_5` for another document list (e.g. int1 .. int5) to draw its cloud.
df = pd.DataFrame(aa1_5)

# NOTE(review): this rebinds `stopwords`, hiding the `nltk.corpus.stopwords`
# import made at the top of the notebook for any cell that runs afterwards.
stopwords = set(STOPWORDS)

# Lower-case every whitespace-separated token and follow each document with a
# single space, joined once instead of growing a string inside the loop.
comment_words = "".join(
    " ".join(token.lower() for token in str(val).split()) + " "
    for val in df[0]
)

wordcloud = WordCloud(width = 800, height = 800,
                background_color ='white',
                stopwords = stopwords,
                min_font_size = 10).generate(comment_words)

# Show the rendered cloud without axes or padding.
plt.figure(figsize = (8, 8), facecolor = None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad = 0)

plt.show()