In [None]:
!pip install NRCLex

In [None]:
import pandas as pd
import numpy as np
from IPython.display import display
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from nrclex import NRCLex
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')

In [None]:
def process_df(file_path):
    """Load a movie-script CSV and return it with one row per sentence.

    The file is read with ``pd.read_csv`` and must contain a ``dialog``
    column. Every dialog is cast to ``str``, lower-cased and split with
    ``sent_tokenize``; the resulting lists are unpacked so that each sentence
    lands on its own row of a new ``dialog_processed`` column, while the
    other columns are repeated next to it.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file; handed straight to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        Frame with a fresh ``0..n-1`` index. The original columns come first
        (in the order produced by ``columns.difference``), followed by
        ``dialog_processed``. A dialog that tokenizes to an empty list is
        kept as a single row whose ``dialog_processed`` is ``''``.
    """

    def explode(df, lst_cols, fill_value='', preserve_index=False):
        """Unpack the list-valued columns ``lst_cols`` into one row per item.

        Row lengths are measured on the first column of ``lst_cols``. Rows
        whose list is empty are re-attached afterwards and their missing
        cells are filled with ``fill_value`` (note that ``fillna`` is applied
        to the whole frame in that case, not only to ``lst_cols``).
        """
        idx_cols = df.columns.difference(lst_cols)
        lens = df[lst_cols[0]].str.len()
        idx = np.repeat(df.index.values, lens)

        res = (pd.DataFrame({
                    col: np.repeat(df[col].values, lens)
                    for col in idx_cols},
                    index=idx)
                 .assign(**{col: np.concatenate(df.loc[lens > 0, col].values)
                            for col in lst_cols}))

        if (lens == 0).any():
            # At least one list is empty: those rows were dropped by the
            # repeat above, so add them back. DataFrame.append no longer
            # exists in pandas >= 2.0, hence pd.concat.
            res = (pd.concat([res, df.loc[lens == 0, idx_cols]], sort=False)
                     .fillna(fill_value))

        # The index holds duplicates (one per sentence of a dialog); a stable
        # sort keeps the sentences of each dialog in their original order.
        res = res.sort_index(kind='mergesort')

        if not preserve_index:
            res = res.reset_index(drop=True)
        return res

    df = pd.read_csv(file_path)
    df['dialog_processed'] = df['dialog'].apply(lambda s: sent_tokenize(str(s).lower()))
    return explode(df, ['dialog_processed'])

def show_vocabulary_scores(movie_df,n=6):
    """Plot normalised NRC emotion scores for the ``n`` most talkative characters.

    Characters are ranked by their number of ``dialog_processed`` rows. For
    each of the top ``n`` one horizontal bar chart is drawn on a two-row grid
    of shared-axis subplots; every bar is an emotion's raw NRCLex count
    divided by the character's total count.

    Parameters
    ----------
    movie_df : pandas.DataFrame
        Frame with ``character``, ``movie`` and ``dialog_processed`` columns,
        where ``dialog_processed`` holds strings.
    n : int, default 6
        Number of characters to plot; must be at least 1.

    Returns
    -------
    The result of ``fig.show()``.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n}")

    # Sentences per character, most talkative first.
    dialog_count = (movie_df.groupby('character')['dialog_processed']
                            .count()
                            .sort_values(ascending=False))
    top_counts = dialog_count.iloc[:n]

    # Round up so that an odd n (or n == 1) still gets enough axes.
    n_cols = (n + 1) // 2
    fig, axs = plt.subplots(figsize=(25, 9), nrows=2, ncols=n_cols, sharex=True, sharey=True)
    if len(movie_df['movie'].unique()) > 1:
        fig.suptitle(f"Harry Potter movies\n{n} most speaking characters vocabulary  score")
    else:
        # Positional access: works even when the frame's index has no label 0.
        fig.suptitle(f"{movie_df['movie'].iloc[0]}\n{n} most speaking characters vocabulary  score")

    for ax, (ch, sentence_count) in zip(axs.flatten(), top_counts.items()):
        ch_dialog = movie_df.loc[movie_df['character'] == ch, 'dialog_processed']
        text_object = NRCLex(' '.join(ch_dialog))

        # Naming the columns up front keeps the frame usable when the
        # character's text matched no emotion word at all.
        sentiment_scores = pd.DataFrame(
            list(text_object.raw_emotion_scores.items()),
            columns=['sentiment', 'score'],
        ).sort_values('sentiment')

        total_score = sentiment_scores['score'].sum()
        if total_score > 0:
            sentiment_scores['score'] = sentiment_scores['score'] / total_score
            sns.barplot(x='score', y='sentiment', data=sentiment_scores, palette='Spectral', ax=ax)
        ax.set_title(f"{ch} ({int(sentence_count)} sentences)")
    return fig.show()
    

In [None]:
# Folder holding the per-movie script files hp1.csv ... hp8.csv.
main_path = "../input/harry-potter-movies-dataset/datasets"
# One sentence-level frame per file, ordered by file number (index 0 -> hp1.csv).
movies_df = list(map(process_df, (f"{main_path}/hp{i}.csv" for i in range(1,9))))

## Harry Potter and the Philosopher's Stone

In [None]:
# Emotion-score panels for the frame loaded from hp1.csv.
show_vocabulary_scores(movie_df=movies_df[0])

## Harry Potter and the Chamber of Secrets

In [None]:
# Emotion-score panels for the frame loaded from hp2.csv.
show_vocabulary_scores(movie_df=movies_df[1])

## Harry Potter and the Prisoner of Azkaban

In [None]:
# Emotion-score panels for the frame loaded from hp3.csv.
show_vocabulary_scores(movie_df=movies_df[2])

## Harry Potter and the Goblet of Fire

In [None]:
# Emotion-score panels for the frame loaded from hp4.csv.
show_vocabulary_scores(movie_df=movies_df[3])

## Harry Potter and the Order of the Phoenix

In [None]:
# Emotion-score panels for the frame loaded from hp5.csv.
show_vocabulary_scores(movie_df=movies_df[4])

## Harry Potter and the Half-Blood Prince

In [None]:
# Emotion-score panels for the frame loaded from hp6.csv.
show_vocabulary_scores(movie_df=movies_df[5])

## Harry Potter and the Deathly Hallows Part 1

In [None]:
# Emotion-score panels for the frame loaded from hp7.csv.
show_vocabulary_scores(movie_df=movies_df[6])

## Harry Potter and the Deathly Hallows Part 2

In [None]:
# Emotion-score panels for the frame loaded from hp8.csv.
show_vocabulary_scores(movie_df=movies_df[7])

## All movies

In [None]:
# Stack the per-movie frames into one frame with a fresh 0..n-1 index,
# then plot the ten most talkative characters across all of them.
all_movies_df = pd.concat(objs=movies_df, ignore_index=True)
show_vocabulary_scores(movie_df=all_movies_df, n=10)