# Famous Movie Catchphrases

## Table of Contents:
* [Data cleaning](#1)
* [Movies with multiple catchphrases](#2)
* [Movie series](#3)
* [Wordclouds](#4)

In [None]:
# packages

# standard
import numpy as np
import pandas as pd
import time

# plots
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns

# NLP
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

In [None]:
# Load the catchphrase dataset from the Kaggle input directory and preview the
# first rows (the trailing expression is rendered as the cell output).
df = pd.read_csv(
    '../input/150-famous-movie-catchphrases-with-context/Catchphrase.csv'
)
df.head()

<a id='1'></a>
# Data cleaning

### A few rows have to be fixed first:

In [None]:
# Inspect the row at position 23 before correcting it; as the last expression
# of the cell it is displayed as the cell output.
df.iloc[23]

In [None]:
# Corrected values for the row labelled 23, written column by column.
# NOTE(review): .loc is label-based while the inspection cell uses positional
# .iloc; they address the same row only with the default RangeIndex - confirm.
row_23_fix = {
    'Movie Name': 'RAMBO III',
    'Context': 'Your worst nightmare.',
}
for col_name, fixed_value in row_23_fix.items():
    df.loc[23, col_name] = fixed_value

In [None]:
# Inspect the row at position 25 before correcting it; as the last expression
# of the cell it is displayed as the cell output.
df.iloc[25]

In [None]:
# Corrected values for the row labelled 25, written column by column.
# The context is set to an empty string rather than left as is.
row_25_fix = {
    'Catchphrase': "Surely, you can't be serious. I am serious, and don't call me Shirley.",
    'Movie Name': 'AIRPLANE!',
    'Context': '',
}
for col_name, fixed_value in row_25_fix.items():
    df.loc[25, col_name] = fixed_value

In [None]:
# Inspect the row at position 26 before correcting it; as the last expression
# of the cell it is displayed as the cell output.
df.iloc[26]

In [None]:
# Corrected values for the row labelled 26, written column by column.
# The context is set to an empty string rather than left as is.
row_26_fix = {
    'Catchphrase': "Dammit, man, I'm a DOCTOR, not",
    'Movie Name': 'STAR TREK',
    'Context': '',
}
for col_name, fixed_value in row_26_fix.items():
    df.loc[26, col_name] = fixed_value

In [None]:
# Inspect the row at position 126 before correcting it; as the last expression
# of the cell it is displayed as the cell output.
df.iloc[126]

In [None]:
# Corrected values for the row labelled 126, written column by column.
# The context is set to an empty string rather than left as is.
row_126_fix = {
    'Catchphrase': "That's gonna leave a mark.",
    'Movie Name': 'TOMMY BOY',
    'Context': '',
}
for col_name, fixed_value in row_126_fix.items():
    df.loc[126, col_name] = fixed_value

In [None]:
# Remove redundant leading whitespace from the movie titles.
# The vectorised .str accessor gives the same result as a per-element lambda
# for string values, but leaves missing titles as NaN instead of raising
# AttributeError on them.
df['Movie Name'] = df['Movie Name'].str.lstrip()

In [None]:
# Distinct movie titles (taken from the value_counts() index), sorted
# alphabetically and printed as a plain list.
movie_list = sorted(df['Movie Name'].value_counts().index)
print(movie_list)

<a id='2'></a>
# Movies with multiple catchphrases

In [None]:
# Count catchphrases per movie and keep only the movies that occur more than once.
# The column is named explicitly at construction time: pandas >= 2.0 names the
# value_counts() result 'count' instead of reusing the source column name, so
# renaming 'Movie Name' -> 'Frequency' afterwards is a no-op there and the
# subsequent `multi.Frequency` lookup raises AttributeError.
multi = pd.DataFrame({'Frequency': df['Movie Name'].value_counts()})
multi = multi[multi['Frequency'] > 1]
display(multi)

In [None]:
# Bar chart of the per-movie catchphrase counts held in `multi`.
# pandas returns the Axes it drew on, so the grid is toggled on that Axes
# directly instead of going through the pyplot "current axes" lookup.
bar_axes = multi.plot(kind='bar')
bar_axes.grid()
plt.show()

In [None]:
# All rows whose title is exactly 'THE WIZARD OF OZ'.
is_wizard_of_oz = df['Movie Name'] == 'THE WIZARD OF OZ'
df[is_wizard_of_oz]

<a id='3'></a>
# Movie Series

In [None]:
# Rows whose movie title starts with the series name
# (str.match treats `title` as a regex anchored at the start of the string).
title = 'AUSTIN POWERS'
in_series = df['Movie Name'].str.match(title)
df[in_series]

In [None]:
# Rows whose movie title starts with the series name
# (str.match treats `title` as a regex anchored at the start of the string).
title = 'POLTERGEIST'
in_series = df['Movie Name'].str.match(title)
df[in_series]

In [None]:
# Rows whose movie title starts with the series name
# (str.match treats `title` as a regex anchored at the start of the string).
title = 'ROCKY'
in_series = df['Movie Name'].str.match(title)
df[in_series]

In [None]:
# Rows whose movie title starts with the series name
# (str.match treats `title` as a regex anchored at the start of the string).
title = 'STAR WARS'
in_series = df['Movie Name'].str.match(title)
df[in_series]

In [None]:
# Rows whose movie title starts with the series name
# (str.match treats `title` as a regex anchored at the start of the string).
title = 'STAR TREK'
in_series = df['Movie Name'].str.match(title)
df[in_series]

In [None]:
# Rows whose movie title starts with the series name
# (str.match treats `title` as a regex anchored at the start of the string).
title = 'THE GODFATHER'
in_series = df['Movie Name'].str.match(title)
df[in_series]

In [None]:
# Rows whose movie title starts with the series name
# (str.match treats `title` as a regex anchored at the start of the string).
title = "WAYNE'S WORLD"
in_series = df['Movie Name'].str.match(title)
df[in_series]

<a id='4'></a>
# Wordclouds

In [None]:
# Words to exclude from the word clouds. set(...) builds a new set from the
# STOPWORDS collection imported from wordcloud, so later changes to
# `stopwords` would not touch the imported object.
stopwords = set(STOPWORDS)

### Catchphrases

In [None]:
# Render a word cloud built from all catchphrases joined into one string.
text = " ".join(df['Catchphrase'])
# WordCloud places words using a random number generator; a fixed
# random_state makes the figure identical on every Restart & Run All.
wordcloud = WordCloud(stopwords=stopwords, max_font_size=50, max_words=250,
                      width=600, height=400,
                      background_color='white',
                      random_state=42).generate(text)
plt.figure(figsize=(12,8))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')  # the cloud is an image; axis ticks carry no information
plt.show()

### Context

In [None]:
# Render a word cloud built from all context strings joined into one string.
text = " ".join(df['Context'])
# WordCloud places words using a random number generator; a fixed
# random_state makes the figure identical on every Restart & Run All.
wordcloud = WordCloud(stopwords=stopwords, max_font_size=50, max_words=250,
                      width=600, height=400,
                      background_color='white',
                      random_state=42).generate(text)
plt.figure(figsize=(12,8))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')  # the cloud is an image; axis ticks carry no information
plt.show()