# Introduction

Hello,

I was a big fan of the Game of Thrones TV series back in the day, and I really enjoyed watching this show (I still do from time to time, actually), but the finale... it haunts me... even almost two years later...
Of course I'm joking, but the series finale was still disappointing. Some might say that it was kind of predictable: the show ran out of the books it was originally adapting for the screen, and it had already been drifting away from its original glory for a couple of seasons.

In my opinion this meme sums it up perfectly:

![meme](https://i.pinimg.com/originals/36/25/93/362593f8636a712baac21c0120cdc87a.png)

Interesting and intelligent dialogue was one of the main reasons why people enjoyed this show so much, but in the last seasons the writing changed for the worse.

That's why I decided to analyze Game of Thrones dialogues using [this incredible dataset](https://www.kaggle.com/albenft/game-of-thrones-script-all-seasons) I found on Kaggle.

# Table of contents

- Meeting data and doing a bit of preprocessing;

- Visualizations:

    - Number of episodes per season;

    - Number of sentences said per season;

    - Number of different talking characters per season;

    - Top 10 most talkative characters per season;

    - Top 10 most used phrases per season;

    - Top 20 most used words per season;

- Conclusion and WordCloud.

# Meeting data and doing a bit of preprocessing

In [None]:
import numpy as np
import pandas as pd

# Location of the script CSV inside the Kaggle input directory.
DATASET_PATH = '/kaggle/input/game-of-thrones-script-all-seasons/Game_of_Thrones_Script.csv'

# Load the whole script into a DataFrame and preview its first rows.
data = pd.read_csv(DATASET_PATH)
data.head()

In [None]:
# Preview the last five rows of the script.
data.tail(n=5)

In [None]:
# Dimensions of the loaded table as a (rows, columns) tuple.
data.shape

In [None]:
# Print a per-column summary (dtype and non-null count) to spot missing values.
data.info()

In [None]:
# Show the rows that have no speaker name.
data[data['Name'].isna()]

In [None]:
# Label rows without a speaker as 'Unknown'.
# A single .loc call on the frame itself is used on purpose: the chained form
# `data.Name.loc[...] = ...` assigns through an intermediate Series, which
# pandas warns about and which does not modify `data` under Copy-on-Write.
data.loc[data.Name.isna(), 'Name'] = 'Unknown'

In [None]:
# Check the rows that now carry the 'Unknown' speaker label.
data[data['Name'] == 'Unknown']

In [None]:
# Number of distinct speaker names in the whole script.
data['Name'].nunique()

In [None]:
# The 25 speakers with the most lines, most frequent first.
data['Name'].value_counts().iloc[:25]

In [None]:
# List every distinct spelling of a name containing 'myrcella'.
is_myrcella = data['Name'].str.contains('myrcella')
data.loc[is_myrcella, 'Name'].unique()

I found out that there were two Myrcellas: 'myrcella baratheon' and just 'myrcella'. I decided to merge them both into 'myrcella lannister', because Joffrey and Tommen both have the Lannister surname in this dataset, so it seems it is no longer a secret :) For the same reason, 'king joffrey' is renamed to 'joffrey lannister'.

In [None]:
# Merge the spelling variants of two characters into one canonical name each.
# Assign through a single .loc on the frame: the chained `data.Name.loc[...] = ...`
# form writes to an intermediate Series and is a no-op under Copy-on-Write.
# na=False keeps the mask strictly boolean even if a name is missing.
data.loc[data.Name.str.contains('myrcella', na=False), 'Name'] = 'myrcella lannister'
data.loc[data.Name == 'king joffrey', 'Name'] = 'joffrey lannister'

In [None]:
# Remove, in place, the rows whose Sentence is exactly 'EPISODE' or 'CREDITS'.
is_marker = data['Sentence'].isin(['EPISODE', 'CREDITS'])
data.drop(index=data.index[is_marker], inplace=True)

In [None]:
# Title-case every speaker name (e.g. 'jon snow' -> 'Jon Snow').
# The vectorised string accessor replaces a per-row Python lambda and leaves
# missing values untouched instead of raising on them.
data['Name'] = data['Name'].str.title()

# Visualizations

In [None]:
# Register pandas' formatters and converters with matplotlib before plotting.
pd.plotting.register_matplotlib_converters()
import matplotlib.pyplot as plt
# IPython magic (not plain Python): render figures inline in the notebook.
%matplotlib inline
import seaborn as sns

# Use seaborn's 'whitegrid' theme for every chart drawn below.
sns.set_style('whitegrid')

In [None]:
# Number of distinct episode labels within each season.
episodes_per_season = (
    data.groupby('Season')['Episode']
    .nunique()
    .rename('episodes_per_season')
)
episodes_per_season

In [None]:
# Bar chart with one bar per season showing its episode count.
fig, plot = plt.subplots(figsize=(16, 6))
sns.barplot(
    x=episodes_per_season.index,
    y=episodes_per_season.values,
    palette='winter_r',
    ax=plot,
)
plot.set_title('Episodes per season')
plot.set_xlabel('Seasons')
plot.set_ylabel('Episodes')

In [None]:
# Number of non-missing sentences spoken in each season.
phrases_per_season = (
    data.groupby('Season')['Sentence']
    .count()
    .rename('phrases_per_season')
)
phrases_per_season

In [None]:
# Bar chart with one bar per season showing how many sentences were spoken.
fig, plot = plt.subplots(figsize=(16, 6))
sns.barplot(
    x=phrases_per_season.index,
    y=phrases_per_season.values,
    palette='winter_r',
    ax=plot,
)
plot.set_title('Phrases used per season')
plot.set_xlabel('Seasons')
plot.set_ylabel('Phrases used')

In [None]:
# Number of distinct speaker names appearing in each season.
characters_per_season = (
    data.groupby('Season')['Name']
    .nunique()
    .rename('characters_per_season')
)
characters_per_season

In [None]:
# Bar chart with one bar per season showing how many distinct characters speak.
fig, plot = plt.subplots(figsize=(16, 6))
sns.barplot(
    x=characters_per_season.index,
    y=characters_per_season.values,
    palette='winter_r',
    ax=plot,
)
plot.set_title('Talking characters per season')
plot.set_xlabel('Seasons')
plot.set_ylabel('Talking characters')

In [None]:
# Ten most frequent speakers within each season, as a Series indexed by
# (season, speaker) whose values are line counts.
top_speakers = data.groupby('Season').Name.apply(lambda x: x.value_counts().head(10))

# Name both index levels and the values explicitly before flattening.
# Relying on the auto-generated 'level_1' column breaks on pandas >= 2.0, where
# value_counts() names its index after the Series: the inner level is then
# called 'Name', collides with the Series name, and reset_index() raises.
most_talkative_chars = (
    top_speakers
    .rename_axis(['Season', 'Name'])
    .rename('Count')
    .reset_index()
)
most_talkative_chars.sample(5)

In [None]:
# 4x2 grid of horizontal bar charts: the top speakers of each of the 8 seasons.
fig, axs = plt.subplots(4, 2, figsize=(16, 10))

fig.suptitle('Top 10 most talkative characters per season', fontsize=16)

# axs.flat walks the grid row by row, so seasons 1..8 are laid out
# left-to-right, top-to-bottom (odd seasons left, even seasons right).
for season_number, ax in enumerate(axs.flat, start=1):
    season_label = 'Season ' + str(season_number)
    # Select the season's rows once so x and y come from the same subset,
    # instead of pairing a filtered y with the full-length Count column and
    # depending on implicit index alignment inside seaborn.
    season_rows = most_talkative_chars.loc[most_talkative_chars.Season == season_label]
    sns.barplot(y=season_rows.Name, x=season_rows.Count, ax=ax, palette='winter_r')
    ax.set_title(season_label)
    ax.set_ylabel('')
    ax.set_xlabel('Phrases spoken')
fig.tight_layout()

If you look closely at Season 8, you can see that Man and Soldier are in the characters list. Let's see what these observations look like!

In [None]:
# Five random Season 8 lines attributed to 'Soldier'.
season_8_soldier = (data['Season'] == 'Season 8') & (data['Name'] == 'Soldier')
data[season_8_soldier].sample(5)

In [None]:
# Five random Season 8 lines attributed to 'Man'.
season_8_man = (data['Season'] == 'Season 8') & (data['Name'] == 'Man')
data[season_8_man].sample(5)

In [None]:
# Ten most repeated sentences per season, after lower-casing and stripping
# punctuation so that e.g. "No." and "no!" count as the same phrase.
top_phrases = data.groupby('Season').Sentence.apply(
    lambda x: x.str.lower().replace(r'[^\w\s]+', '', regex=True).value_counts().head(10)
)

# Name the index levels and the values explicitly before flattening.
# Relying on the auto-generated 'level_1' column breaks on pandas >= 2.0, where
# the inner level is already called 'Sentence' and reset_index() raises on the
# clash with the Series name.
most_used_phrases = (
    top_phrases
    .rename_axis(['Season', 'Phrase'])
    .rename('Count')
    .reset_index()
)
most_used_phrases.sample(5)

In [None]:
# 4x2 grid of horizontal bar charts: the most repeated phrases of each season.
fig, axs = plt.subplots(4, 2, figsize=(16, 10))

fig.suptitle('Top 10 most used phrases per season', fontsize=16)

# axs.flat walks the grid row by row, so seasons 1..8 are laid out
# left-to-right, top-to-bottom (odd seasons left, even seasons right).
for season_number, ax in enumerate(axs.flat, start=1):
    season_label = 'Season ' + str(season_number)
    # Select the season's rows once so x and y come from the same subset,
    # instead of pairing a filtered y with the full-length Count column and
    # depending on implicit index alignment inside seaborn.
    season_rows = most_used_phrases.loc[most_used_phrases.Season == season_label]
    sns.barplot(y=season_rows.Phrase, x=season_rows.Count, ax=ax, palette='winter_r')
    ax.set_title(season_label)
    ax.set_ylabel('')
    ax.set_xlabel('Times spoken')
fig.tight_layout()

I don't know why, but I found it funny that in every single season the most used phrase is 'no', except for the 8th season that has 'aye' instead :)

In [None]:
# Tokenise every sentence: lower-case, strip punctuation, split on whitespace.
# Indexing by Season first keeps each token list tagged with its season.
tokens = (
    data.set_index('Season').Sentence
    .str.lower()
    .replace(r'[^\w\s]+', '', regex=True)
    .str.split()
)

# One row per word, indexed by season. explode() flattens the lists in linear
# time; summing the lists instead re-copies the growing list for every
# sentence. Sentences left with no tokens explode to NaN and are dropped.
all_words = tokens.explode().dropna()

# Twenty most frequent words within each season.
top_words = all_words.groupby(level='Season').apply(lambda x: x.value_counts().head(20))

# Name the index levels and the values explicitly before flattening.
# Relying on the auto-generated 'level_1' column breaks on pandas >= 2.0, where
# the inner level is already called 'Sentence' and reset_index() raises.
most_used_words = (
    top_words
    .rename_axis(['Season', 'Word'])
    .rename('Count')
    .reset_index()
)
most_used_words.sample(5)

In [None]:
# 4x2 grid of horizontal bar charts: the most frequent words of each season.
fig, axs = plt.subplots(4, 2, figsize=(16, 16))

fig.suptitle('Top 20 most used words per season', fontsize=16)

# axs.flat walks the grid row by row, so seasons 1..8 are laid out
# left-to-right, top-to-bottom (odd seasons left, even seasons right).
for season_number, ax in enumerate(axs.flat, start=1):
    season_label = 'Season ' + str(season_number)
    # Select the season's rows once so x and y come from the same subset,
    # instead of pairing a filtered y with the full-length Count column and
    # depending on implicit index alignment inside seaborn.
    season_rows = most_used_words.loc[most_used_words.Season == season_label]
    sns.barplot(y=season_rows.Word, x=season_rows.Count, ax=ax, palette='winter_r')
    ax.set_title(season_label)
    ax.set_ylabel('')
    ax.set_xlabel('Times spoken')
fig.tight_layout()

# Conclusion and WordCloud

As we can see, there are trends going on for the last two seasons:

- Fewer episodes;

- Fewer talking characters;

- Fewer sentences said;

And for the last visualization let's make a WordCloud just for the sake of aesthetics:

In [None]:
import wordcloud

# Join every sentence of the script into one space-separated string.
script_text = ' '.join(data.Sentence)

# Build the word cloud from the full script text.
cloud_settings = {'max_words': 2000, 'width': 1600, 'height': 900}
cloud = wordcloud.WordCloud(**cloud_settings).generate(script_text)

# Draw the cloud without axes and save the figure as a PNG.
plt.figure(figsize=(20, 12))
plt.imshow(cloud)
plt.axis("off")
plt.savefig("WordCloudGoT.png")