#### Import Required Libraries

In [None]:
import os

import numpy as np
import pandas as pd
import plotly
import plotly_express as px
%matplotlib inline

#### Import dataset

In [None]:
# Location of the CSV. The original absolute path is kept as the default so
# the author's setup keeps working; set the MOVIES_DATASET environment
# variable to point at the file on another machine instead of editing this cell.
dataset = os.environ.get(
    "MOVIES_DATASET",
    "/home/rohkoder29/Documents/year2022/python/data_science/other_datasets/Highest Holywood Grossing Movies.csv",
)
# index_col=False: do not use any CSV column as the DataFrame index.
df = pd.read_csv(dataset, index_col=False)

#### Gathering Info about the dataset

In [None]:
# Column names, dtypes and non-null counts of the freshly loaded frame.
df.info()

In [None]:
# Peek at the first row.
df.head(1)

#### Clean / Accommodate the dataset

- first, let's drop the "Unnamed: 0" and "Movie Info" columns

In [None]:
# Rebind instead of mutating in place, and ignore columns that are already
# gone, so re-running this cell does not raise KeyError.
df = df.drop(columns=["Unnamed: 0", "Movie Info"], errors="ignore")

In [None]:
# First row again, to confirm the two columns were dropped.
df.head(1)

- convert the sales columns to float

In [None]:
# Cast the three sales columns to 64-bit floats in a single assignment.
sales_columns = ["Domestic Sales (in $)", "International Sales (in $)", "World Sales (in $)"]
df[sales_columns] = df[sales_columns].astype(np.float64)

- now, let's cast the columns "Release Date" and "Movie Runtime" to a datetime data type

In [None]:
# Current dtypes, as a reference point before the datetime casts.
df.info()

- but first, let's create a function to extract the hour and minute from the runtime

In [None]:
# The source data has no seconds component. Unless the caller supplies one, a
# random value in [0, 60) is injected, so results differ between runs; pass
# `seconds=0` for a reproducible conversion.
from secrets import randbelow


def extract_hour_minute(runtime: str, seconds=None) -> str:
    """Convert a runtime such as "2 hr 18 min" into an "HH:MM:SS" string.

    Each number is paired with the unit that follows it, so "2 hr" (no
    minutes) and "45 min" (no hours) are handled; a missing component is 0.

    Parameters
    ----------
    runtime : str
        Whitespace-separated "<number> <unit>" pairs. A unit starting with
        "h" is read as hours, one starting with "m" as minutes.
    seconds : int, optional
        Seconds value to use. If omitted, a random value in [0, 60) is drawn.

    Returns
    -------
    str
        Zero-padded "HH:MM:SS".

    Raises
    ------
    ValueError
        If `runtime` contains no recognizable hour/minute pair, or a value
        in front of a unit is not an integer.
    """
    tokens = runtime.split()
    hours = minutes = None
    # Pair every value with the unit right after it instead of relying on
    # fixed positions, which misread inputs that lack one of the components.
    for value, unit in zip(tokens[::2], tokens[1::2]):
        unit = unit.lower()
        if unit.startswith("h"):
            hours = int(value)
        elif unit.startswith("m"):
            minutes = int(value)
    if hours is None and minutes is None:
        raise ValueError(f"unrecognized runtime format: {runtime!r}")
    if seconds is None:
        seconds = randbelow(60)
    return f"{hours or 0:02d}:{minutes or 0:02d}:{seconds:02d}"

- let's apply the change

In [None]:
# Normalize every runtime to "HH:MM:SS"; the function is handed to apply directly.
df["Movie Runtime"] = df["Movie Runtime"].apply(extract_hour_minute)

In [None]:
# Parse the "HH:MM:SS" strings, then keep only the time-of-day part.
runtime_parsed = pd.to_datetime(df["Movie Runtime"], format="%H:%M:%S")
df["Movie Runtime"] = runtime_parsed.dt.time

In [None]:
# First row, to inspect the converted "Movie Runtime" value.
df.head(1)

In [None]:
# DataFrame.astype returns a new frame, so the original expression threw the
# conversion away. Parse the column and assign it back; missing dates become NaT.
df["Release Date"] = pd.to_datetime(df["Release Date"])
df["Release Date"].head()

In [None]:
import locale
locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')
nums = [554215, 489349, 820389, 239042, 539002, 225345]
numbers = []
for num in nums:
    number = locale.currency(num, grouping=True)
    numbers.append(number)
print(numbers, type(numbers[0]))

In [None]:
# Scratch check: %X is the locale's time representation, so the layout that
# parses here depends on the active locale.
pd.to_datetime(["02:18:29"], format="%X")

In [None]:
# Re-check the column dtypes after the conversion cells.
df.info()

In [None]:
# First rows of the frame (head() default row count).
df.head()

- we also need to deal with missing values within the dataset

In [None]:
# Number of missing values per column.
df.isna().sum()

As you can see, we've got quite a few missing values in Release Date and License (Rating).

In [None]:
# Rows where the release date, the license, or both are missing.
missing_date_or_license = df["Release Date"].isna() | df["License"].isna()
df.loc[missing_date_or_license]

[Release Date] Since every movie title includes its year of release, we are going to extract it and store it in a dedicated column.

In [None]:
import re


def extract_year(title: str) -> int:
    """Return the release year embedded in a movie title.

    A four-digit number in parentheses, e.g. "(2009)", is preferred. If the
    title has none, any run of four digits is considered. When several
    candidates exist the last one is used, so "2012 (2009)" yields 2009
    rather than the concatenation 20122009.

    Parameters
    ----------
    title : str
        Movie title expected to contain a four-digit year.

    Returns
    -------
    int
        The extracted year.

    Raises
    ------
    ValueError
        If the title contains no four-digit number.
    """
    # Raw strings keep "\d" a regex escape instead of an invalid string escape.
    candidates = re.findall(r"\((\d{4})\)", title) or re.findall(r"\d{4}", title)
    if not candidates:
        raise ValueError(f"no four-digit year found in title: {title!r}")
    return int(candidates[-1])

In [None]:
# Derive the release year from each title; the function is handed to apply directly.
df["Release Year"] = df["Title"].apply(extract_year)

In [None]:
# First row, now including the new "Release Year" column.
df.head(1)

[License]

just to be sure, let's verify if domestic sales + international sales = world sales

In [None]:
# Rows whose domestic + international sales differ from the world figure;
# .count() then reports the non-null cell count per column for those rows.
summed_sales = df["Domestic Sales (in $)"] + df["International Sales (in $)"]
sales_mismatch = summed_sales != df["World Sales (in $)"]
df.loc[sales_mismatch].count()

As I thought, some values here are really odd. But for now (and for the sake of my analysis) it's not crucial.

#### Basic Statistics of the dataset

In [None]:
# Descriptive statistics using pandas' default column selection.
df.describe()

#### Exploratory Analysis

****