# Let's begin our adventure to the **Misty Mountains** of Middle-earth!

> **Far over the misty mountains cold**
<br>
> **To dungeons deep and caverns old**
<br>
> **We must away ere break of day**
<br>
> **To find our long-forgotten gold!**

![LOTR](https://wallpapercave.com/wp/PQVpQko.jpg)

In [None]:
# Install the chart-studio package; `chart_studio.plotly` is imported in the next cell.
!pip install chart-studio

In [None]:
# Data processing libraries
import numpy as np 
import pandas as pd 

# Visualization libraries
import datetime
import matplotlib
import matplotlib.pyplot as plt 
import seaborn as sns
%matplotlib inline
sns.set()

# Plotly visualization libraries
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import chart_studio.plotly as py
from plotly.graph_objs import *
from IPython.display import Image
pd.set_option('display.max_rows', None)

import plotly.graph_objs as go
from plotly import tools
from plotly.offline import iplot, init_notebook_mode
init_notebook_mode()

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Lazily walk the Kaggle input directory and print the full path of each file.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### Read LOTR data into dataframes

In [None]:
# Character table.
char_df = pd.read_csv(
    filepath_or_buffer='../input/lord-of-the-rings-data/lotr_characters.csv',
)
# Script (dialogue) table.
script_df = pd.read_csv(
    filepath_or_buffer='../input/lord-of-the-rings-data/lotr_scripts.csv',
)

### Get the metadata about the datasets

In [None]:
# Preview the first five rows of the character table.
char_df.head(n=5)

In [None]:
# Preview the first five rows of the script table.
script_df.head(n=5)

### Group the characters from the scripts by movies
To make the per-movie counts easier to compare, also compute their natural logarithm.

In [None]:
# Number of script rows per movie, smallest first.
records = script_df.groupby(['movie']).size().sort_values()

# Start from the raw counts (kept as column 0) and add labelled columns.
grouped_df = pd.DataFrame(records)
grouped_df['Count'] = records.values
grouped_df['Movies'] = grouped_df.index
# Natural log of the counts, used as the bar height in the chart.
grouped_df['Log Count'] = np.log(grouped_df['Count'])

grouped_df.head(n=5)

**Bar Chart**

In [None]:
# One bar per movie: height is the log count, colour is driven by the raw count.
movie_bar = go.Bar(
    name='LOTR Movies',
    x=grouped_df['Movies'],
    y=grouped_df['Log Count'],
    marker_color=grouped_df['Count'],
    text=['Bar Chart'],
)

fig = go.Figure(data=movie_bar)
fig.update_layout(
    title_text='Movies distribution in the LOTR Trilogy',
    showlegend=True,
    height=800,
)
fig.show()

### **Visualize the distribution of characters on a Pie Chart**

In [None]:
# Look at the first five character rows again before grouping them.
char_df.head(n=5)

### Group by gender

In [None]:
# Keep only the columns needed for the gender breakdown.
gender_columns = ['gender', 'name', 'spouse']
gender_df = char_df[gender_columns]
gender_df.head(n=5)

In [None]:
# One row for every distinct (gender, name) pair, with its frequency in 'count'.
gen_df = gender_df.groupby('gender')['name'].value_counts().reset_index(name='count')

# Replace the per-name frequency with the size of the row's gender group.
# The sizes are looked up by gender value. Assigning the result of
# transform('size') directly would align on gender_df's row labels, which are
# unrelated to gen_df's fresh index from reset_index, and would scramble counts.
gender_sizes = gender_df.groupby('gender')['name'].size()
gen_df['count'] = gen_df['gender'].map(gender_sizes)
gen_df.head()

### Count the characters present across all the genders

In [None]:
# test_df is another name for gender_df (no copy is made).
test_df = gender_df
# Count non-null names per gender, keeping genders in order of first appearance.
gender_groups = test_df.groupby(by=['gender'], sort=False, as_index=False)
df = gender_groups['name'].count()
df.head(n=5)

In [None]:
# Pie slices are genders; slice size is the name count per gender.
fig = px.pie(data_frame=df, names='gender', values='name')
fig.show()

### Visualize the character composition in LOTR

In [None]:
# Count non-null names per race, keeping races in order of first appearance.
race_groups = char_df.groupby(by=['race'], sort=False, as_index=False)
tdf = race_groups['name'].count()
tdf.head(n=5)

In [None]:
# Pie slices are races; slice size is the name count per race.
fig = px.pie(data_frame=tdf, names='race', values='name')
fig.show()

In [None]:
# First five rows of the character table.
char_df.head(n=5)

### Analyze the scripts for the trilogy

In [None]:
# First five rows of the script table.
script_df.head(n=5)

### Count the number of occurrences of each character in the dialogues across the trilogy

In [None]:
# One row per (character, movie) pair; 'count' is the number of script rows
# that character has in that movie.
sdf = script_df.groupby('char')['movie'].value_counts().reset_index(name='count')

# 'count' is deliberately NOT overwritten with
# script_df.groupby('char')['movie'].transform('size'). That Series is labelled
# by script_df's rows, so assigning it to sdf (which has a fresh index from
# reset_index) aligns on unrelated labels and scrambles the counts.
# Summing the per-movie counts over a character gives that character's total
# across all movies.

sdf.head()

In [None]:
# Pie slices are characters; slice size is the sum of 'count' per character.
fig = px.pie(data_frame=sdf, names='char', values='count')
fig.show()

### Generate Wordcloud for every character

In [None]:
def generate_wordcloud(dffile, user):
    """Draw a word cloud built from the ``Text_Lemma`` values of ``dffile``.

    Every value of ``dffile.Text_Lemma`` is converted to ``str``, split on
    whitespace and lower-cased. The combined text is passed to ``WordCloud``
    and the result is drawn on a new 8x8 matplotlib figure, which is also
    written as PNG into an in-memory buffer.

    Parameters
    ----------
    dffile
        Object whose ``Text_Lemma`` attribute is iterated (presumably a
        pandas DataFrame with that column -- confirm against callers).
    user
        Not used by this function; kept so existing calls keep working.

    Returns
    -------
    None
        The PNG buffer is local to the function and is not returned.
    """
    # ``io`` is not imported by the notebook's import cell, so bring it into
    # scope here; otherwise ``io.BytesIO()`` below raises NameError.
    import io

    # NOTE(review): ``WordCloud`` and ``STOPWORDS`` are not imported in the
    # cells visible in this notebook; presumably
    # ``from wordcloud import WordCloud, STOPWORDS`` is required -- confirm.
    stopwords = set(STOPWORDS)

    # Normalise each value to lower-case, single-space-separated tokens and
    # concatenate them; every value is followed by one space.
    comment_words = "".join(
        " ".join(token.lower() for token in str(val).split()) + " "
        for val in dffile.Text_Lemma
    )

    # generate wordcloud over comment_words
    # NOTE(review): ``font_path`` looks like it should point at a font file;
    # the bare name 'Arial' may not resolve on every machine -- confirm.
    wordcloud = WordCloud(width = 800, height = 800,
        background_color ='white',
        prefer_horizontal=1, font_path='Arial',
        stopwords = stopwords,
        min_font_size = 10).generate(comment_words)

    # plot the WordCloud image
    plt.figure(figsize = (8, 8), facecolor = None)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.tight_layout(pad = 0)

    # Serialise the current figure to PNG in memory and rewind the buffer.
    img_data = io.BytesIO()
    plt.savefig(img_data, format='png')
    img_data.seek(0)

In [None]:
def generate_mask_wordcloud(dffile, obj, user):
    """Draw an image-masked word cloud built from the ``text`` values of ``dffile``.

    Every value of ``dffile.text`` is converted to ``str``, split on
    whitespace and lower-cased. The combined text is rendered by ``WordCloud``
    inside a mask loaded from ``obj['Body']``, recoloured with colours taken
    from the same mask, and drawn on a new 7x7 matplotlib figure. The figure
    is also written as PNG into an in-memory buffer.

    Parameters
    ----------
    dffile
        Object whose ``text`` attribute is iterated (presumably a pandas
        DataFrame with that column -- confirm against callers).
    obj
        Mapping whose ``'Body'`` entry is handed to ``Image.open``.
    user
        Not used by this function; kept so existing calls keep working.

    Returns
    -------
    None
        The PNG buffer is local to the function and is not returned.
    """
    # ``io`` is not imported by the notebook's import cell, so bring it into
    # scope here; otherwise ``io.BytesIO()`` below raises NameError.
    import io

    # NOTE(review): ``WordCloud``, ``STOPWORDS`` and ``ImageColorGenerator``
    # are not imported in the cells visible in this notebook; presumably they
    # come from the ``wordcloud`` package -- confirm.
    stopwords = set(STOPWORDS)

    # Normalise each value to lower-case, single-space-separated tokens and
    # concatenate them; every value is followed by one space.
    comment_words = "".join(
        " ".join(token.lower() for token in str(val).split()) + " "
        for val in dffile.text
    )

    # generate mask from the uploaded image
    # NOTE(review): the notebook binds ``Image`` to ``IPython.display.Image``;
    # ``Image.open`` looks like it expects ``PIL.Image`` instead -- confirm.
    mask = np.array(Image.open(obj['Body']))

    # generate wordcloud over comment_words enclosed in mask
    wordcloud_usa = WordCloud(stopwords=stopwords, background_color="white", mode="RGBA", max_words=1000, mask=mask).generate(comment_words)

    # create coloring from image
    image_colors = ImageColorGenerator(mask)
    plt.figure(figsize=[7,7])
    plt.imshow(wordcloud_usa.recolor(color_func=image_colors), interpolation="bilinear")
    plt.axis("off")

    # Serialise the current figure to PNG in memory and rewind the buffer.
    img_data = io.BytesIO()
    plt.savefig(img_data, format='png')
    img_data.seek(0)