# Getting started

In this notebook, I analyze some metrics in the IMDb Indonesian movies dataset. I hope to learn more about data visualization using Python. Any comments and suggestions are very welcome!

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib import rcParams
rcParams['font.family'] = 'sans-serif'
rcParams['font.sans-serif'] = ['DejaVu Sans']
rcParams['font.size'] = 12

import seaborn as sns
sns.set_style("dark")

import numpy as np
import pandas as pd


# Importing Data

In [None]:
# Read the raw dataset, report its dimensions, then preview the first rows.
df = pd.read_csv('/kaggle/input/indonesian_movies.csv')
nRow, nCol = len(df.index), len(df.columns)
print(f'{nRow} rows & {nCol} columns')
df.head()

In [None]:
# Column dtypes and non-null counts, to spot columns that need cleaning.
df.info()

In [None]:
# Summary statistics for every column; include="all" also covers the
# non-numeric columns (count / unique / top / freq).
df.describe(include="all")

Let's see how many genres there are:

In [None]:
genres = df['genre']
print(genres.nunique(), 'unique genres:')
print(genres.unique())

# Tally of rows that have a genre (True) versus rows where it is missing (False)
genres.notna().value_counts()

And how many types of rating there are:

In [None]:
ratings = df['rating']
print(ratings.nunique(), 'unique ratings')
print(ratings.unique())

# Tally of rows that have a rating (True) versus rows where it is missing (False)
ratings.notna().value_counts()

# Cleaning Data

In [None]:
# Keep only rows that have both a genre and a director, then renumber the
# index so it is contiguous again.
df = (
    df
    .dropna(subset=['genre', 'directors'])
    .reset_index(drop=True)
)

Simplifying rating categories:
* 'SU' (All Ages)
* '13+' = 'R' = 'PG-13' = 'TV-14'
* '17+' = 'TV-MA' = 'D' = '21+'
* 'Unrated' = 'Not Rated'

We'll also fill null values with "Unrated"

In [None]:
# Simplified label -> raw labels that are folded into it.
rating_groups = {
    "Unrated": ["Not Rated"],
    "13+": ["R", "PG-13", "TV-14"],
    "17+": ["TV-MA", "D", "21+"],
}
# Invert the grouping into the raw -> simplified lookup used by replace().
rating_aliases = {
    raw_label: simplified
    for simplified, raw_labels in rating_groups.items()
    for raw_label in raw_labels
}

# Missing ratings count as "Unrated"; every other label is mapped via the lookup.
df['rating'] = df['rating'].fillna("Unrated").replace(rating_aliases)

In [None]:
# Turn vote counts such as "1,234" into integers.
# Casting to str first keeps the cell re-runnable: on a second run the column
# already holds ints, and the original per-element `x.replace` would raise
# AttributeError on them.
df["votes"] = (
    df["votes"]
    .astype(str)
    .str.replace(",", "", regex=False)
    .astype(int)
)

In [None]:
# Inspect the raw runtime values before converting them to integers.
df["runtime"].value_counts()

In [None]:
# Runtime (in minutes) assumed for movies whose runtime is missing.
DEFAULT_RUNTIME_MIN = "90"

# Fill gaps, strip the " min" suffix and convert to integer minutes.
# Casting to str first keeps the cell re-runnable: on a second run the column
# already holds ints, and the original per-element `x.replace` would raise
# AttributeError on them.
df["runtime"] = (
    df["runtime"]
    .fillna(DEFAULT_RUNTIME_MIN)
    .astype(str)
    .str.replace(" min", "", regex=False)
    .astype(int)
)

# Exploring our data

## Movies by year

In [None]:
f, ax = plt.subplots(figsize=(8, 4))
# Draw on the axes created above explicitly rather than relying on the
# implicit "current axes".
sns.countplot(x="year", data=df, ax=ax)

# One bar per year makes the x axis crowded, so only label the decades.
for label in ax.get_xticklabels():
    # `np.int` was removed in NumPy 1.24; the builtin int is the replacement.
    # Going through float() also accepts labels rendered like "1990.0".
    year = int(float(label.get_text()))
    label.set_visible(year % 10 == 0)

## Top oldest movies

In [None]:
# Earliest release years first; show the top five rows.
df.sort_values(by="year").head(5)

## Movies by language

In [None]:
# Name the counts Series before converting it to a frame. Renaming the
# "languages" column afterwards is a silent no-op on pandas >= 2.0, where
# value_counts() names its result "count" instead of the source column.
df["languages"].value_counts().rename("total movies").to_frame()

## Movies by TV ratings

In [None]:
# Bar chart of how many movies fall under each (simplified) rating label.
fig, ax = plt.subplots(dpi = 75)
valueCounts = df["rating"].value_counts()

valueCounts.plot.bar(ax=ax, color='crimson', rot=0)  # rot=0 keeps the x labels horizontal
ax.set_ylabel('counts')
plt.show()

## Total movies by genre

In [None]:
# Name the counts Series before converting it to a frame. Renaming the
# "genre" column afterwards is a silent no-op on pandas >= 2.0, where
# value_counts() names its result "count" instead of the source column.
df["genre"].value_counts().rename("total movies").to_frame()

## Top rated movies in each genre

In [None]:
# For each genre, rank its movies by user rating and report the best one.
for genre in df["genre"].unique():
    ranked = df.loc[df["genre"] == genre].sort_values("users_rating", ascending=False)
    best_title = ranked["title"].iloc[0]
    print("The #1 movie in the genre", genre, "is =>", best_title)

## Movie description word cloud

In [None]:
import random
from wordcloud import WordCloud

# Join every non-null description into a single string.
# str(series) must not be used here: it returns pandas' truncated preview
# (a handful of rows plus index numbers and the "Name: ..., dtype: ..." footer),
# so the cloud would be built from only a sliver of the data.
text = " ".join(df["description"].dropna().astype(str))

plt.subplots(figsize=(9,6))
wordcloud = WordCloud(background_color="white").generate(text)
plt.imshow(wordcloud, interpolation="bilinear")
plt.title("Film description wordcloud")
plt.axis("off")
plt.show()

## Top directors by the number of movies directed

In [None]:
# Horizontal bar chart of the 15 directors with the most movies.
# Keep the Axes returned by the plot call: chaining .invert_yaxis() onto it
# (as before) bound `ax` to None, because invert_yaxis() returns nothing.
ax = df["directors"].value_counts().head(15).plot.barh(figsize=(6,6), color='crimson')
ax.invert_yaxis()  # most prolific director at the top
ax.set_ylabel("")
ax.set_xlabel("Movies directed")

## Top directors by average rating of movies

In [None]:
# Mean user rating per director, then the ten highest averages.
top_directors = df.groupby("directors")[["users_rating"]].mean()
(
    top_directors
    .rename(columns={"users_rating": "average_rating"})
    .sort_values("average_rating", ascending=False)
    .head(10)
)

## Top actors by the number of movies played

In [None]:
from itertools import chain
import re

# Rows without actor data are NaN (a float); calling .title() on them would
# raise AttributeError, so drop them first. The former `.apply(np.array)` step
# is omitted because the code below consumes the values as plain strings.
actors = df["actors"].dropna()

# Each value is treated as a bracketed, comma-separated list of quoted names
# (presumably like "['A', 'B']" -- verify against the CSV): strip the outer
# bracket characters, title-case, and split into individual names.
actorlist = pd.Series(
    list(chain.from_iterable(x.title().split(', ') for x in actors.str[1:-1])),
    dtype=object,  # keeps the .str accessor usable even if the list is empty
)
# Keep only the fragments that carry a quote, then remove the quotes.
actorlist = actorlist[actorlist.str.contains("'")]
actorlist = actorlist.str.strip("'")

ax = actorlist.value_counts().head(10).plot.barh()
ax.invert_yaxis()  # most frequent actor at the top
ax.set_ylabel("")
ax.set_xlabel("Movies played")

## Most common actor name

In [None]:
# Strip the list punctuation (brackets, quotes, commas) from every non-null
# actors string and join all rows into one text.
# str(series) must not be used here: it returns pandas' truncated preview, so
# the cloud would be built from only a few rows plus index/dtype noise.
# Dropping NaN up front also replaces the old "nan" substring removal, which
# would have eaten those letters inside real names as well.
actor_names = df["actors"].dropna().str.replace(r"[\[\]',]", "", regex=True)
text = " ".join(actor_names)

plt.subplots(figsize=(9,6))
wordcloud = WordCloud(background_color="white", width=900,height=600).generate(text)
plt.imshow(wordcloud)
plt.title("Actors cloud")
plt.axis("off")
plt.show()

## Misc

In [None]:
# Pairwise correlation of the numeric columns only. On pandas >= 2.0 a bare
# df.corr() raises on the text columns; selecting the numeric dtypes first
# works on old and new pandas alike.
df.select_dtypes(include="number").corr()

In [None]:
# Heatmap of the correlations between numeric columns. The text columns are
# excluded explicitly because a bare df.corr() raises on them in pandas >= 2.0.
sns.heatmap(df.select_dtypes(include="number").corr())

Almost all variables have a very low correlation with each other. It seems that only runtime and users' rating have some (weak) correlation.