In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Read the movie metadata CSV into the DataFrame that the later cells work on.
data = pd.read_csv(filepath_or_buffer='../input/movie_metadata.csv')

In [None]:
# Preview the first five rows of the dataset.
data.iloc[:5]

In [None]:
# (number of rows, number of columns)
len(data.index), len(data.columns)

In [None]:
# How many rows fall under each value of the `color` column.
data['color'].value_counts()

In [None]:
# Distribution of `imdb_score` for each value of `color`.
sns.boxplot(data=data, x='color', y='imdb_score')

In [None]:
# Distribution of `duration` for each value of `color`.
sns.boxplot(data=data, x='color', y='duration')

In [None]:
# Two stacked panels sharing one x-axis (`title_year`): violins of the year
# split by `color` on top, a histogram of all non-missing years below.
f, axes = plt.subplots(2, 1, sharex=True)
sns.violinplot(x='title_year', y='color', data=data, ax=axes[0])
# Drop missing years with dropna() rather than negating the isnull() mask with
# unary minus: `-mask` only works on boolean data through a pandas special
# case (NumPy itself rejects it), whereas dropna() states the intent directly.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; consider
# sns.histplot once the environment's seaborn version is confirmed.
sns.distplot(data.title_year.dropna(), kde=False, ax=axes[1])

Here are some of the columns I'm interested in comparing to one another.

In [None]:
# Select the columns to compare pairwise and discard any row that is
# missing a value in at least one of them.
pair_data = data[
    ['duration', 'gross', 'budget', 'title_year', 'imdb_score', 'aspect_ratio']
].dropna()

In [None]:
# Budget distribution, keeping only budgets below mean + 2 standard deviations.
sns.distplot(data['budget'].loc[lambda s: s < s.mean() + s.std() * 2])

In [None]:
# Grid of pairwise plots over the columns of `pair_data`.
g = sns.PairGrid(pair_data)
# Scatter plot for every pair of distinct columns (off-diagonal cells).
g.map_offdiag(plt.scatter)
# Histogram of each single column on the diagonal.
g.map_diag(plt.hist)

In [None]:
# Median budget, followed by the gap between mean and median (mean - median).
print(data['budget'].median())
print(data['budget'].mean() - data['budget'].median())

As you can see, movie budgets are terribly skewed to one side: the difference between the mean and the median is almost as big as the median itself. To get a more readable plot, we will keep only the movies whose budget is less than three standard deviations above the mean.

In [None]:
# Budget distribution, keeping only budgets below mean + 3 standard deviations.
sns.distplot(data['budget'].loc[lambda s: s < s.mean() + s.std() * 3])

Still heavily skewed, but a better representation.

Have you ever seen really long director names and wondered whether or not that magically made them a better director?

In [None]:
# Length of each director's name. The vectorised .str.len() leaves missing
# names as NaN; the previous len(str(x)) converted NaN to the text 'nan' and
# recorded a bogus length of 3 for every row without a director name.
data['director_name_length'] = data.director_name.str.len()
data.director_name_length.head()

In [None]:
# Distribution of director name lengths.
sns.distplot(data['director_name_length'])

In [None]:
# `imdb_score` against director name length, with a LOWESS (local regression) fit.
sns.regplot(data=data, x='director_name_length', y='imdb_score', lowess=True)

Without a local regression it would almost appear to be so, but for the most part it just comes down to how the data is distributed. The more movies there are at a given name length, the further their scores stretch into the lower ratings.

Now, what about R-rated movies? You're sure you saw plenty win Oscars when you were a kid, but your mother wouldn't let you see them.

In [None]:
# One horizontal violin of `imdb_score` per `content_rating` value.
sns.violinplot(data=data, x='imdb_score', y='content_rating')

Well, R does pretty well, but the real money seems to be in TV-MA.

What about the score of R-rated movies over time?

In [None]:
# Restrict to rows whose content rating is 'R', then plot `imdb_score`
# against `title_year` with a LOWESS (local regression) fit.
r_data = data.loc[data['content_rating'] == 'R']
sns.regplot(data=r_data, x='title_year', y='imdb_score', lowess=True)

R-rated movies increased in number over time, which brought their overall score down.