In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere beneath /kaggle/input.
input_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_path in input_paths:
    print(input_path)

In [None]:
# Load the Avatar dataset CSV from the Kaggle input directory.
data = pd.read_csv(filepath_or_buffer='../input/avatar-the-last-air-bender/avatar.csv')

In [None]:
# Preview the first five rows of the raw table.
data.head(n=5)

In [None]:
# Discard the columns at these positions, then give the spoken-text column a
# shorter name.
# NOTE(review): the selection is positional, so it depends on the CSV's column
# order — confirm against the file before changing the dataset version.
unused_positions = [0, 1, 3, 5, 7, 9, 10, 11]
data = (
    data
    .drop(columns=data.columns[unused_positions])
    .rename(columns={"character_words": "dialogue"})
)

In [None]:
# Preview the first five rows after trimming and renaming the columns.
data.head(5)

In [None]:
# Number of missing values in each remaining column.
data.isna().sum()

In [None]:
# Drop every row that has a missing value in any column; `data` itself is left
# untouched. Only defaults are needed here: pandas rejects a call that passes
# both `how` and `thresh` explicitly (TypeError), which the original
# spelled-out argument list did.
df = data.dropna()

In [None]:
# Confirm that no missing values are left after dropping incomplete rows.
df.isna().sum()

In [None]:
# Keep only the 15 characters with the most dialogue lines.
# `remove` lists every character ranked 16th or lower by number of lines.
remove = df.character.value_counts().iloc[15:].index.tolist()
# One vectorised membership test replaces a full-frame filter per removed name.
# .copy() makes df an independent frame, so the columns added to it in later
# cells are not written into a slice of another frame.
df = df[~df['character'].isin(remove)].copy()

In [None]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from textblob import TextBlob # for sentiment analysis
from collections import Counter 
import seaborn as sns
import matplotlib.pyplot as plt
import string

In [None]:
# Shared look for the figures below: seaborn dark grid, 14pt text, a 9x5 default
# canvas and a figure background whose hex colour has a zero alpha channel.
sns.set_style('darkgrid')
plt.rcParams.update({
    'font.size': 14,
    'figure.figsize': (9, 5),
    'figure.facecolor': '#00000000',
})

In [None]:
# Count plot of dialogue lines for the ten characters with the most lines,
# ordered from most to fewest.
top_speakers = df.character.value_counts().iloc[:10].index

plt.figure(figsize=(15, 6))
plt.xticks(rotation=90)
plt.title('Characters with most number of dialogues')
sns.countplot(data=df, x='character', order=top_speakers)


In [None]:
# Words per line: split each dialogue on whitespace and store the token count.
dialogue_tokens = df['dialogue'].str.split()
df['word_count'] = dialogue_tokens.str.len()

In [None]:
# Bar plot of word_count per character (seaborn aggregates the rows of each
# character) for the ten characters with the most lines.
top_speakers = df.character.value_counts().iloc[:10].index

plt.figure(figsize=(15, 6))
plt.xticks(rotation=90)
plt.title('Words spoken per line')
sns.barplot(data=df, x='character', y='word_count', order=top_speakers)


In [None]:
# Total words spoken by each character, as a two-column frame.
words_by_character = df.groupby(['character'])['word_count']
total_sum = words_by_character.sum().reset_index()

In [None]:
# Order characters from the largest total word count to the smallest.
total_sum = total_sum.sort_values('word_count', ascending=False)

In [None]:
# Bar chart of total words per character, in the order total_sum is sorted.
plt.figure(figsize=(15, 6))
plt.xticks(rotation=90)
plt.title('Total Character Word Counts')
sns.barplot(data=total_sum, x='character', y='word_count')

In [None]:
# English stop words as a set for fast membership tests.
stop_words = set(stopwords.words('english'))


def clean(dialogue):
    """Normalise one line of dialogue into a space-separated string of kept words.

    Each token from ``word_tokenize`` is lower-cased and stripped of ASCII
    punctuation; it is kept only if what remains is purely alphabetic and is
    not in ``stop_words``.

    NOTE(review): punctuation is stripped before the stop-word test, so a token
    that only matches a stop word with its apostrophe intact will presumably
    slip through — confirm whether that is acceptable.
    """
    strip_punct = str.maketrans('', '', string.punctuation)
    kept = []
    for token in word_tokenize(dialogue):
        word = token.lower().translate(strip_punct)
        if word.isalpha() and word not in stop_words:
            kept.append(word)
    return " ".join(kept)


df['clean_dialogue'] = df['dialogue'].apply(clean)

In [None]:
# Number of words left in each line after cleaning.
clean_tokens = df['clean_dialogue'].str.split()
df['clean_word_count'] = clean_tokens.str.len()

In [None]:
# Two horizontal bar layers on the same axes: raw word counts in the pastel
# palette with cleaned word counts drawn over them in the muted palette.
f, ax = plt.subplots(figsize=(6, 7))

layers = [
    ("pastel", "word_count", "Total Word Count"),
    ("muted", "clean_word_count", "Clean Word Count"),
]
for palette_name, column, layer_label in layers:
    # set_color_codes decides what the shorthand colour "b" means for this layer.
    sns.set_color_codes(palette_name)
    sns.barplot(x=column, y="character", data=df, label=layer_label, color="b", ax=ax)

ax.legend(bbox_to_anchor=(2, 1))
ax.set(xlim=(0, 24), ylabel="", xlabel="Total Words vs Clean Words")
sns.despine(left=True, bottom=True)

In [None]:
# Score every cleaned line with TextBlob and bucket the score into a label.
def _sentiment_label(polarity):
    """Return 'positive' for scores above 0, 'negative' below 0, else 'neutral'."""
    if polarity > 0:
        return 'positive'
    if polarity < 0:
        return 'negative'
    return 'neutral'


# Polarity is rounded to two decimals before it is labelled.
df['polarity'] = df['clean_dialogue'].map(
    lambda text: round(TextBlob(text).sentiment.polarity, 2)
)
df['sentiment'] = df['polarity'].map(_sentiment_label)

In [None]:
# Pie chart of how many lines fall into each sentiment label, with percentages.
df['sentiment'].value_counts().plot.pie(autopct='%1.1f%%')

In [None]:
# Lines per sentiment label for the ten characters with the most lines.
top_speakers = df.character.value_counts().iloc[:10].index
sns.countplot(data=df, x='character', hue='sentiment', order=top_speakers)
plt.title('Character Sentiment')

In [None]:
# Tally how often each item occurs across a collection of token sequences.
def counter(dialogue):
    """Count every element of every inner iterable in ``dialogue``.

    Args:
        dialogue: iterable of iterables (for example a Series of token lists).

    Returns:
        collections.Counter mapping each element to its number of occurrences.
    """
    return Counter(token for tokens in dialogue for token in tokens)

In [None]:
import sys

# Silence warnings for the rest of the session unless the interpreter was
# started with explicit -W options.
if not sys.warnoptions:
    import warnings
    warnings.simplefilter("ignore")

# Zuko's lines as an independent frame. Without .copy() the assignment below
# would write into a slice of df (a SettingWithCopyWarning that the blanket
# filter above would otherwise just hide).
zuko = df.groupby('character').get_group('Zuko').copy()
# Turn each cleaned line back into a list of tokens so counter() tallies words.
zuko['clean_dialogue'] = zuko['clean_dialogue'].apply(word_tokenize)
text_cnt = counter(zuko['clean_dialogue'])
# (word, count) pairs, most frequent first.
j = text_cnt.most_common()

In [None]:
# Table of Zuko's words ordered by descending count, keeping positions 1-19.
# NOTE(review): the slice starts at position 1, so the single most frequent
# word is left out of the chart — confirm this is intended.
j = (
    pd.DataFrame(j, columns=['Words', 'Counts'])
    .sort_values(by='Counts', ascending=False)
    .iloc[1:20]
)

# One palette colour per plotted word.
colors = sns.color_palette("dark:salmon_r", len(j))

plt.figure(figsize=(10, 5))
sns.barplot(data=j, x='Counts', y='Words', palette=colors)
plt.title("Zuko's Most Common Words")

In [None]:
def _tokenized_lines(frame, name):
    """Return ``name``'s rows of ``frame`` with ``clean_dialogue`` split into tokens.

    Args:
        frame: DataFrame with 'character' and 'clean_dialogue' columns.
        name: value of the 'character' column to select.

    Returns:
        An independent copy of the matching rows whose 'clean_dialogue' column
        holds the ``word_tokenize`` output for each line. Copying first means
        the assignment never writes into a slice of ``frame``.

    Raises:
        KeyError: if ``name`` does not occur in the 'character' column.
    """
    lines = frame.groupby('character').get_group(name).copy()
    lines['clean_dialogue'] = lines['clean_dialogue'].apply(word_tokenize)
    return lines


# The same steps for each main character: tokenized lines, then a word tally.
aang = _tokenized_lines(df, 'Aang')
aang_cnt = counter(aang['clean_dialogue'])

katara = _tokenized_lines(df, 'Katara')
katara_cnt = counter(katara['clean_dialogue'])

sokka = _tokenized_lines(df, 'Sokka')
sokka_cnt = counter(sokka['clean_dialogue'])

iroh = _tokenized_lines(df, 'Iroh')
iroh_cnt = counter(iroh['clean_dialogue'])

toph = _tokenized_lines(df, 'Toph')
toph_cnt = counter(toph['clean_dialogue'])

# (word, count) pairs per character, most frequent first.
a = aang_cnt.most_common()
k = katara_cnt.most_common()
s = sokka_cnt.most_common()
t = toph_cnt.most_common()
i = iroh_cnt.most_common()

In [None]:
# Table of Iroh's words ordered by descending count, keeping positions 1-19.
# NOTE(review): the slice starts at position 1, so the single most frequent
# word is left out of the chart — confirm this is intended.
i = (
    pd.DataFrame(i, columns=['Words', 'Counts'])
    .sort_values(by='Counts', ascending=False)
    .iloc[1:20]
)

# One palette colour per plotted word.
colors = sns.color_palette("dark:salmon_r", len(i))

plt.figure(figsize=(10, 5))
sns.barplot(data=i, x='Counts', y='Words', palette=colors)
plt.title("Iroh's Most Common Words")

In [None]:
# Table of Aang's words ordered by descending count, keeping positions 1-19.
# NOTE(review): the slice starts at position 1, so the single most frequent
# word is left out of the chart — confirm this is intended.
a = (
    pd.DataFrame(a, columns=['Words', 'Counts'])
    .sort_values(by='Counts', ascending=False)
    .iloc[1:20]
)

# One palette colour per plotted word.
colors = sns.color_palette("YlOrBr_r", len(a))

plt.figure(figsize=(10, 5))
sns.barplot(data=a, x='Counts', y='Words', palette=colors)
plt.title("Aang's Most Common Words")

In [None]:
# Table of Toph's words ordered by descending count, keeping positions 1-19.
# NOTE(review): the slice starts at position 1, so the single most frequent
# word is left out of the chart — confirm this is intended.
t = (
    pd.DataFrame(t, columns=['Words', 'Counts'])
    .sort_values(by='Counts', ascending=False)
    .iloc[1:20]
)

# One palette colour per plotted word.
colors = sns.dark_palette("seagreen", len(t))

plt.figure(figsize=(10, 5))
sns.barplot(data=t, x='Counts', y='Words', palette=colors)
plt.title("Toph's Most Common Words")

In [None]:
# Table of Sokka's words ordered by descending count, keeping positions 1-19.
# NOTE(review): the slice starts at position 1, so the single most frequent
# word is left out of the chart — confirm this is intended.
s = (
    pd.DataFrame(s, columns=['Words', 'Counts'])
    .sort_values(by='Counts', ascending=False)
    .iloc[1:20]
)

# One palette colour per plotted word.
colors = sns.color_palette("winter", len(s))

plt.figure(figsize=(10, 5))
sns.barplot(data=s, x='Counts', y='Words', palette=colors)
plt.title("Sokka's Most Common Words")

In [None]:
# Table of Katara's words ordered by descending count, keeping positions 1-19.
# NOTE(review): the slice starts at position 1, so the single most frequent
# word is left out of the chart — confirm this is intended.
k = (
    pd.DataFrame(k, columns=['Words', 'Counts'])
    .sort_values(by='Counts', ascending=False)
    .iloc[1:20]
)

# One palette colour per plotted word.
colors = sns.color_palette("winter", len(k))

plt.figure(figsize=(10, 5))
sns.barplot(data=k, x='Counts', y='Words', palette=colors)
plt.title("Katara's Most Common Words")

In [None]:
# Empty result table: one row per (character, book) pair is added further down.
df2 = pd.DataFrame(data=None, columns=["Character", "Book", "words"])

In [None]:
# Reload the CSV for the per-book analysis and discard the columns at these
# positions.
# NOTE(review): the selection is positional, so it depends on the CSV's column
# order — confirm against the file.
data1 = pd.read_csv(filepath_or_buffer='../input/avatar-the-last-air-bender/avatar.csv')
unused_positions = [0, 1, 2, 4, 5, 7, 9, 10, 11]
data1 = data1.drop(columns=data1.columns[unused_positions])

In [None]:
# Give the spoken-text column the same short name used for the first frame.
data1 = data1.rename({"character_words": "dialogue"}, axis="columns")

In [None]:
# Drop every row that has a missing value in any column. Only defaults are
# needed here: pandas rejects a call that passes both `how` and `thresh`
# explicitly (TypeError), which the original spelled-out argument list did.
data1 = data1.dropna()

In [None]:
# Confirm that no missing values are left in the reloaded frame.
data1.isna().sum()

In [None]:
# Show one randomly chosen row as a spot check.
data1.sample(n=1)

In [None]:
# Keep only the 5 characters with the most dialogue lines.
# `removelst` lists every character ranked 6th or lower by number of lines.
removelst = data1.character.value_counts().iloc[5:].index.tolist()
# One vectorised membership test replaces a full-frame filter per removed name.
# .copy() makes data1 an independent frame, so the column added to it in the
# next cell is not written into a slice of another frame.
data1 = data1[~data1['character'].isin(removelst)].copy()

In [None]:
# Words per line: split each dialogue on whitespace and store the token count.
line_tokens = data1['dialogue'].str.split()
data1['word_count'] = line_tokens.str.len()

In [None]:
# Total words spoken by Aang, Katara and Sokka in each book.
#
# Inclusive row-label ranges of data1's index that are attributed to each book.
# NOTE(review): the boundaries are hard-coded; presumably they mark where each
# book starts in avatar.csv — confirm against the file.
book_ranges = {"Water": (1, 2243), "Earth": (2244, 4057), "Fire": (4058, 6155)}

# Collect plain records and build the frame once: DataFrame.append no longer
# exists in pandas 2.x, and growing a frame row by row re-copies it each time.
records = []
for character in ["Aang", "Katara", "Sokka"]:
    character_lines = data1[data1['character'] == character]
    for book, (first_label, last_label) in book_ranges.items():
        # .loc slices by index label and includes both endpoints.
        # Summing word_count gives words spoken; the previous .count() returned
        # the number of lines, which did not match the "words" column or the
        # "Character Word Counts" chart built from it.
        words = character_lines.loc[first_label:last_label, "word_count"].sum()
        records.append({'Character': character, 'Book': book, "words": words})

# Rebuilding df2 from scratch keeps this cell safe to re-run.
df2 = pd.DataFrame(records, columns=["Character", "Book", "words"])
    

In [None]:
# Grouped bars of the `words` value per character, one bar per book, with
# characters ordered by their number of lines in data1.
top_three = data1.character.value_counts().iloc[:3].index
# Keep the Axes that barplot returns: the global `ax` still refers to an
# earlier figure, so calling ax.legend(...) never touched this chart.
book_ax = sns.barplot(x='Character', y='words', hue='Book', data=df2, order=top_three)
book_ax.legend(bbox_to_anchor=(2, 1))
plt.title('Character Word Counts')