# Acquiring Your Data

## Reading Data From CSV Files

In [None]:
!python -m pip install pandas

In [None]:
import pandas as pd

# Load the raw CSV first, then let pandas switch each column to the best
# nullable extension dtype it can infer.
james_bond_data = pd.read_csv("james_bond_data.csv")
james_bond_data = james_bond_data.convert_dtypes()

In [None]:
# Preview the first five rows of the loaded frame.
james_bond_data.head()

## Reading Data From Other Sources

### Reading JSON

In [None]:
import pandas as pd

# Same two steps as for any other source: read the JSON file, then convert
# the columns to pandas' nullable extension dtypes.
james_bond_data = pd.read_json("james_bond_data.json")
james_bond_data = james_bond_data.convert_dtypes()

### Reading Excel

In [None]:
! python -m pip install openpyxl

In [None]:
import pandas as pd

# Read the workbook (this cell's section installs openpyxl beforehand), then
# convert the columns to pandas' nullable extension dtypes.
james_bond_data = pd.read_excel("james_bond_data.xlsx")
james_bond_data = james_bond_data.convert_dtypes()

### Reading Parquet

In [None]:
!python -m pip install pyarrow

In [None]:
import pandas as pd

# Read the Parquet file, then convert the columns to pandas' nullable
# extension dtypes.
james_bond_data = pd.read_parquet("james_bond_data.parquet")
james_bond_data = james_bond_data.convert_dtypes()

# A bare name as the last expression makes the notebook render the frame.
james_bond_data

### Scraping HTML

In [None]:
!python -m pip install lxml

In [None]:
import pandas as pd

# read_html() returns a list holding one DataFrame per table parsed from
# the page.
james_bond_data_html = pd.read_html(
    "https://en.wikipedia.org/wiki/List_of_James_Bond_novels_and_short_stories"
)
# Keep the second table (index 1).
# NOTE(review): the position depends on the live page layout - confirm it
# still points at the intended table.
james_bond_tables = james_bond_data_html[1].convert_dtypes()

# Cleansing Your Data With Python

## Creating Meaningful Column Names

In [None]:
# Mapping from the raw column headers to the snake_case names used by the
# rest of the notebook. Headers not listed here are left unchanged by rename().
new_column_names = {
    "Release": "release_date",
    "Movie": "movie_title",
    "Bond": "bond_actor",
    "Bond_Car_MFG": "car_manufacturer",
    "US_Gross": "income_usa",
    "World_Gross": "income_world",
    # The raw header suggests this column is expressed in thousands of dollars.
    "Budget ($ 000s)": "movie_budget",
    "Film_Length": "film_length",
    "Avg_User_IMDB": "imdb",
    "Avg_User_Rtn_Tom": "rotten_tomatoes",
    "Martinis": "martinis_consumed",
    "Kills_Bond": "bond_kills",
}

# rename() returns a new frame; james_bond_data keeps its original headers.
data = james_bond_data.rename(columns=new_column_names)

In [None]:
# Confirm the frame now carries the new column names.
data.columns

## Dealing With Missing Data

In [None]:
# Print each column's dtype and non-null count; a count lower than the row
# total reveals missing values.
data.info()

In [None]:
# Show only the rows that have a missing value in at least one column.
data.loc[data.isna().any(axis="columns")]

In [None]:
# combine_first() keeps every existing value and only fills nulls, so this
# supplies imdb and rotten_tomatoes for row label 10 where they are missing.
data = james_bond_data.rename(columns=new_column_names).combine_first(
    pd.DataFrame({"imdb": {10: 7.1}, "rotten_tomatoes": {10: 6.8}})
)

In [None]:
# Repeat the missing-value check; no rows should come back if the fill above
# covered every gap.
data.loc[data.isna().any(axis="columns")]

## Handling Financial Columns

In [None]:
# Look at the first rows of the columns that this section converts.
data[["income_usa", "income_world", "movie_budget", "film_length"]].head()

In [None]:
# Rebuilds `data` from the raw james_bond_data frame instead of modifying the
# previous `data`. Inside the lambda, `data` is the intermediate frame that
# assign() passes in, not the outer variable.
data = (
    james_bond_data.rename(columns=new_column_names)
    # combine_first() only fills nulls: supply the ratings for row label 10.
    .combine_first(
        pd.DataFrame({"imdb": {10: 7.1}, "rotten_tomatoes": {10: 6.8}})
    )
    .assign(
        # Remove "$" and "," characters, then cast to a nullable float.
        income_usa=lambda data: (
            data["income_usa"]
            .replace("[$,]", "", regex=True)
            .astype("Float64")
        ),
    )
)

In [None]:
# Rebuilds `data` from the raw james_bond_data frame instead of modifying the
# previous `data`. Inside each lambda, `data` is the intermediate frame that
# assign() passes in, not the outer variable.
data = (
    james_bond_data.rename(columns=new_column_names)
    # combine_first() only fills nulls: supply the ratings for row label 10.
    .combine_first(
        pd.DataFrame({"imdb": {10: 7.1}, "rotten_tomatoes": {10: 6.8}})
    )
    .assign(
        # All three money columns get the same treatment: remove "$" and ","
        # characters, then cast to a nullable float.
        income_usa=lambda data: (
            data["income_usa"]
            .replace("[$,]", "", regex=True)
            .astype("Float64")
        ),
        income_world=lambda data: (
            data["income_world"]
            .replace("[$,]", "", regex=True)
            .astype("Float64")
        ),
        movie_budget=lambda data: (
            data["movie_budget"]
            .replace("[$,]", "", regex=True)
            .astype("Float64")
        ),
    )
)

## Correcting Invalid Data Types

In [None]:
# Rebuilds `data` from the raw james_bond_data frame instead of modifying the
# previous `data`. Inside each lambda, `data` is the intermediate frame that
# assign() passes in, not the outer variable.
data = (
    james_bond_data.rename(columns=new_column_names)
    # combine_first() only fills nulls: supply the ratings for row label 10.
    .combine_first(
        pd.DataFrame({"imdb": {10: 7.1}, "rotten_tomatoes": {10: 6.8}})
    )
    .assign(
        # Money columns: remove "$" and "," characters, then cast to a
        # nullable float.
        income_usa=lambda data: (
            data["income_usa"]
            .replace("[$,]", "", regex=True)
            .astype("Float64")
        ),
        income_world=lambda data: (
            data["income_world"]
            .replace("[$,]", "", regex=True)
            .astype("Float64")
        ),
        movie_budget=lambda data: (
            data["movie_budget"]
            .replace("[$,]", "", regex=True)
            .astype("Float64")
        ),
        # Drop the trailing "mins" text, then cast to a nullable integer.
        film_length=lambda data: (
            data["film_length"].str.removesuffix("mins").astype("Int64")
        ),
    )
)

In [None]:
# Check the dtypes and non-null counts of the four converted columns.
data[["income_usa", "income_world", "movie_budget", "film_length"]].info()

In [None]:
# Look at the first rows of the four converted columns.
data[["income_usa", "income_world", "movie_budget", "film_length"]].head()

In [None]:
# Check the current dtype of release_date before converting it.
data[["release_date"]].info()

In [None]:
# Look at how the release_date values are written.
data[["release_date"]].head()

In [None]:
# Rebuilds `data` from the raw james_bond_data frame instead of modifying the
# previous `data`. Inside each lambda, `data` is the intermediate frame that
# assign() passes in, not the outer variable.
data = (
    james_bond_data.rename(columns=new_column_names)
    # combine_first() only fills nulls: supply the ratings for row label 10.
    .combine_first(
        pd.DataFrame({"imdb": {10: 7.1}, "rotten_tomatoes": {10: 6.8}})
    )
    .assign(
        # Money columns: remove "$" and "," characters, then cast to a
        # nullable float.
        income_usa=lambda data: (
            data["income_usa"]
            .replace("[$,]", "", regex=True)
            .astype("Float64")
        ),
        income_world=lambda data: (
            data["income_world"]
            .replace("[$,]", "", regex=True)
            .astype("Float64")
        ),
        movie_budget=lambda data: (
            data["movie_budget"]
            .replace("[$,]", "", regex=True)
            .astype("Float64")
        ),
        # Drop the trailing "mins" text, then cast to a nullable integer.
        film_length=lambda data: (
            data["film_length"].str.removesuffix("mins").astype("Int64")
        ),
        # Parse "<full month name>, <4-digit year>" strings into datetimes.
        release_date=lambda data: pd.to_datetime(
            data["release_date"], format="%B, %Y"
        ),
        # Must come after release_date: assign() evaluates its keyword
        # arguments in order, so .dt sees the converted datetime column.
        release_year=lambda data: data["release_date"].dt.year.astype("Int64"),
    )
)

In [None]:
# Look at the first rows of the converted date and the derived year.
data[["release_date", "release_year"]].head()

In [None]:
# Check the dtypes of the converted date and the derived year.
data[["release_date", "release_year"]].info()

In [None]:
# Review the dtype and non-null count of every column after the conversions.
data.info()

## Fixing Inconsistencies in Data

In [None]:
# Compare the magnitudes of the three money columns side by side.
data[["income_usa", "income_world", "movie_budget"]].head()

In [None]:
# Rebuilds `data` from the raw james_bond_data frame instead of modifying the
# previous `data`. Inside each lambda, `data` is the intermediate frame that
# assign() passes in, not the outer variable.
data = (
    james_bond_data.rename(columns=new_column_names)
    # combine_first() only fills nulls: supply the ratings for row label 10.
    .combine_first(
        pd.DataFrame({"imdb": {10: 7.1}, "rotten_tomatoes": {10: 6.8}})
    )
    .assign(
        # Money columns: remove "$" and "," characters, then cast to a
        # nullable float.
        income_usa=lambda data: (
            data["income_usa"]
            .replace("[$,]", "", regex=True)
            .astype("Float64")
        ),
        income_world=lambda data: (
            data["income_world"]
            .replace("[$,]", "", regex=True)
            .astype("Float64")
        ),
        movie_budget=lambda data: (
            data["movie_budget"]
            .replace("[$,]", "", regex=True)
            .astype("Float64")
            # The raw header is "Budget ($ 000s)"; multiplying by 1000
            # presumably brings it to the same unit as the income columns.
            * 1000
        ),
        # Drop the trailing "mins" text, then cast to a nullable integer.
        film_length=lambda data: (
            data["film_length"].str.removesuffix("mins").astype("Int64")
        ),
        # Parse "<full month name>, <4-digit year>" strings into datetimes.
        release_date=lambda data: pd.to_datetime(
            data["release_date"], format="%B, %Y"
        ),
        # Must come after release_date: assign() evaluates its keyword
        # arguments in order, so .dt sees the converted datetime column.
        release_year=lambda data: data["release_date"].dt.year.astype("Int64"),
    )
)

In [None]:
# Re-check the three money columns after scaling movie_budget.
data[["income_usa", "income_world", "movie_budget"]].head()

## Correcting Spelling Errors

In [None]:
# Count rows per actor name; variant spellings show up as separate entries.
data["bond_actor"].value_counts()

In [None]:
# Rebuilds `data` from the raw james_bond_data frame instead of modifying the
# previous `data`. Inside each lambda, `data` is the intermediate frame that
# assign() passes in, not the outer variable.
data = (
    james_bond_data.rename(columns=new_column_names)
    # combine_first() only fills nulls: supply the ratings for row label 10.
    .combine_first(
        pd.DataFrame({"imdb": {10: 7.1}, "rotten_tomatoes": {10: 6.8}})
    )
    .assign(
        # Money columns: remove "$" and "," characters, then cast to a
        # nullable float.
        income_usa=lambda data: (
            data["income_usa"]
            .replace("[$,]", "", regex=True)
            .astype("Float64")
        ),
        income_world=lambda data: (
            data["income_world"]
            .replace("[$,]", "", regex=True)
            .astype("Float64")
        ),
        movie_budget=lambda data: (
            data["movie_budget"]
            .replace("[$,]", "", regex=True)
            .astype("Float64")
            # The raw header is "Budget ($ 000s)"; multiplying by 1000
            # presumably brings it to the same unit as the income columns.
            * 1000
        ),
        # Drop the trailing "mins" text, then cast to a nullable integer.
        film_length=lambda data: (
            data["film_length"].str.removesuffix("mins").astype("Int64")
        ),
        # Parse "<full month name>, <4-digit year>" strings into datetimes.
        release_date=lambda data: pd.to_datetime(
            data["release_date"], format="%B, %Y"
        ),
        # Must come after release_date: assign() evaluates its keyword
        # arguments in order, so .dt sees the converted datetime column.
        release_year=lambda data: data["release_date"].dt.year.astype("Int64"),
        # Substring replacements that normalize two variant spellings of
        # actor names ("Shawn" -> "Sean", "MOORE" -> "Moore").
        bond_actor=lambda data: (
            data["bond_actor"]
            .str.replace("Shawn", "Sean")
            .str.replace("MOORE", "Moore")
        ),
    )
)

In [None]:
# Re-count the actor names after the spelling corrections.
data["bond_actor"].value_counts()

In [None]:
# Count rows per car manufacturer; variant spellings show up as separate
# entries.
data["car_manufacturer"].value_counts()

In [None]:
# Rebuilds `data` from the raw james_bond_data frame instead of modifying the
# previous `data`. Inside each lambda, `data` is the intermediate frame that
# assign() passes in, not the outer variable.
data = (
    james_bond_data.rename(columns=new_column_names)
    # combine_first() only fills nulls: supply the ratings for row label 10.
    .combine_first(
        pd.DataFrame({"imdb": {10: 7.1}, "rotten_tomatoes": {10: 6.8}})
    )
    .assign(
        # Money columns: remove "$" and "," characters, then cast to a
        # nullable float.
        income_usa=lambda data: (
            data["income_usa"]
            .replace("[$,]", "", regex=True)
            .astype("Float64")
        ),
        income_world=lambda data: (
            data["income_world"]
            .replace("[$,]", "", regex=True)
            .astype("Float64")
        ),
        movie_budget=lambda data: (
            data["movie_budget"]
            .replace("[$,]", "", regex=True)
            .astype("Float64")
            # The raw header is "Budget ($ 000s)"; multiplying by 1000
            # presumably brings it to the same unit as the income columns.
            * 1000
        ),
        # Drop the trailing "mins" text, then cast to a nullable integer.
        film_length=lambda data: (
            data["film_length"].str.removesuffix("mins").astype("Int64")
        ),
        # Parse "<full month name>, <4-digit year>" strings into datetimes.
        release_date=lambda data: pd.to_datetime(
            data["release_date"], format="%B, %Y"
        ),
        # Must come after release_date: assign() evaluates its keyword
        # arguments in order, so .dt sees the converted datetime column.
        release_year=lambda data: data["release_date"].dt.year.astype("Int64"),
        # Substring replacements that normalize two variant spellings of
        # actor names ("Shawn" -> "Sean", "MOORE" -> "Moore").
        bond_actor=lambda data: (
            data["bond_actor"]
            .str.replace("Shawn", "Sean")
            .str.replace("MOORE", "Moore")
        ),
        # Correct the "Astin" misspelling wherever it occurs in the name.
        car_manufacturer=lambda data: data["car_manufacturer"].str.replace(
            "Astin", "Aston"
        ),
    )
)

In [None]:
# Re-count the manufacturers after the spelling correction.
data["car_manufacturer"].value_counts()

## Checking For Invalid Outliers

In [None]:
# Summary statistics for both columns; implausible min or max values point
# to suspect entries.
data[["film_length", "martinis_consumed"]].describe()

In [None]:
# Rebuilds `data` from the raw james_bond_data frame instead of modifying the
# previous `data`. Inside each lambda, `data` is the intermediate frame that
# assign() passes in, not the outer variable.
data = (
    james_bond_data.rename(columns=new_column_names)
    # combine_first() only fills nulls: supply the ratings for row label 10.
    .combine_first(
        pd.DataFrame({"imdb": {10: 7.1}, "rotten_tomatoes": {10: 6.8}})
    )
    .assign(
        # Money columns: remove "$" and "," characters, then cast to a
        # nullable float.
        income_usa=lambda data: (
            data["income_usa"]
            .replace("[$,]", "", regex=True)
            .astype("Float64")
        ),
        income_world=lambda data: (
            data["income_world"]
            .replace("[$,]", "", regex=True)
            .astype("Float64")
        ),
        movie_budget=lambda data: (
            data["movie_budget"]
            .replace("[$,]", "", regex=True)
            .astype("Float64")
            # The raw header is "Budget ($ 000s)"; multiplying by 1000
            # presumably brings it to the same unit as the income columns.
            * 1000
        ),
        # Drop the trailing "mins" text, cast to a nullable integer, then
        # swap the value 1200 for 120 (presumably a data-entry error).
        film_length=lambda data: (
            data["film_length"]
            .str.removesuffix("mins")
            .astype("Int64")
            .replace(1200, 120)
        ),
        # Parse "<full month name>, <4-digit year>" strings into datetimes.
        release_date=lambda data: pd.to_datetime(
            data["release_date"], format="%B, %Y"
        ),
        # Must come after release_date: assign() evaluates its keyword
        # arguments in order, so .dt sees the converted datetime column.
        release_year=lambda data: data["release_date"].dt.year.astype("Int64"),
        # Substring replacements that normalize two variant spellings of
        # actor names ("Shawn" -> "Sean", "MOORE" -> "Moore").
        bond_actor=lambda data: (
            data["bond_actor"]
            .str.replace("Shawn", "Sean")
            .str.replace("MOORE", "Moore")
        ),
        # Correct the "Astin" misspelling wherever it occurs in the name.
        car_manufacturer=lambda data: data["car_manufacturer"].str.replace(
            "Astin", "Aston"
        ),
        # Swap the value -6 for 6 (presumably a sign error in the source).
        martinis_consumed=lambda data: data["martinis_consumed"].replace(
            -6, 6
        ),
    )
)

In [None]:
# Re-run the summary statistics after replacing the suspect values.
data[["film_length", "martinis_consumed"]].describe()

## Removing Duplicate Data

In [None]:
# List every row that exactly duplicates another one; keep=False flags all
# copies rather than only the later occurrences.
data.loc[data.duplicated(keep=False)]

In [None]:
# Rebuilds `data` from the raw james_bond_data frame instead of modifying the
# previous `data`. Inside each lambda, `data` is the intermediate frame that
# assign() passes in, not the outer variable.
data = (
    james_bond_data.rename(columns=new_column_names)
    # combine_first() only fills nulls: supply the ratings for row label 10.
    .combine_first(
        pd.DataFrame({"imdb": {10: 7.1}, "rotten_tomatoes": {10: 6.8}})
    )
    .assign(
        # Money columns: remove "$" and "," characters, then cast to a
        # nullable float.
        income_usa=lambda data: (
            data["income_usa"]
            .replace("[$,]", "", regex=True)
            .astype("Float64")
        ),
        income_world=lambda data: (
            data["income_world"]
            .replace("[$,]", "", regex=True)
            .astype("Float64")
        ),
        movie_budget=lambda data: (
            data["movie_budget"]
            .replace("[$,]", "", regex=True)
            .astype("Float64")
            # The raw header is "Budget ($ 000s)"; multiplying by 1000
            # presumably brings it to the same unit as the income columns.
            * 1000
        ),
        # Drop the trailing "mins" text, cast to a nullable integer, then
        # swap the value 1200 for 120 (presumably a data-entry error).
        film_length=lambda data: (
            data["film_length"]
            .str.removesuffix("mins")
            .astype("Int64")
            .replace(1200, 120)
        ),
        # Parse "<full month name>, <4-digit year>" strings into datetimes.
        release_date=lambda data: pd.to_datetime(
            data["release_date"], format="%B, %Y"
        ),
        # Must come after release_date: assign() evaluates its keyword
        # arguments in order, so .dt sees the converted datetime column.
        release_year=lambda data: data["release_date"].dt.year.astype("Int64"),
        # Substring replacements that normalize two variant spellings of
        # actor names ("Shawn" -> "Sean", "MOORE" -> "Moore").
        bond_actor=lambda data: (
            data["bond_actor"]
            .str.replace("Shawn", "Sean")
            .str.replace("MOORE", "Moore")
        ),
        # Correct the "Astin" misspelling wherever it occurs in the name.
        car_manufacturer=lambda data: data["car_manufacturer"].str.replace(
            "Astin", "Aston"
        ),
        # Swap the value -6 for 6 (presumably a sign error in the source).
        martinis_consumed=lambda data: data["martinis_consumed"].replace(
            -6, 6
        ),
    )
    # Runs after the corrections above, so rows that only differed by a fixed
    # value are also treated as duplicates; ignore_index renumbers rows 0..n-1.
    .drop_duplicates(ignore_index=True)
)

In [None]:
# Repeat the duplicate check; no rows should come back after
# drop_duplicates().
data.loc[data.duplicated(keep=False)]

In [None]:
# The five most frequent titles; a count above 1 would mean a title still
# appears on more than one row.
data["movie_title"].value_counts().head()

In [None]:
# Number of rows per actor in the de-duplicated frame.
data["bond_actor"].value_counts()

## Storing Your Cleansed Data

In [None]:
# Write the cleansed frame to disk; index=False leaves the row labels out of
# the file.
data.to_csv("james_bond_data_cleansed.csv", index=False)

# Performing Data Analysis Using Python

## Performing a Regression Analysis

In [None]:
!python -m pip install matplotlib scikit-learn

In [None]:
import matplotlib.pyplot as plt

# Plot the two rating columns against each other to see whether they move
# together.
fig, ax = plt.subplots()
ax.scatter(data["imdb"], data["rotten_tomatoes"])
ax.set_title("Scatter Plot of Ratings")
# The site name is spelled "IMDb", as on the axis labels of the other plots
# in this notebook.
ax.set_xlabel("Average IMDb Rating")
ax.set_ylabel("Average Rotten Tomatoes Rating")
# fig.show()

In [None]:
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

# scikit-learn wants a 2-D feature matrix, hence the list of column names
# (which yields a one-column DataFrame); the target is a plain Series.
x = data[["imdb"]]
y = data["rotten_tomatoes"]

# fit() returns the estimator itself, so construction and fitting chain.
model = LinearRegression().fit(x, y)

# Fitted values for the regression line, plus the two captions for the plot.
y_pred = model.predict(x)
best_fit = f"y = {model.coef_[0]:.4f}x{model.intercept_:+.4f}"
r_squared = f"R-Squared: {model.score(x, y):.2f}"

fig, ax = plt.subplots()
ax.scatter(x, y)
ax.plot(x, y_pred, color="red")
# Caption positions are given in data coordinates.
ax.text(7.25, 5.5, r_squared, fontsize=10)
ax.text(7.25, 7, best_fit, fontsize=10)
ax.set_title("Scatter Plot of Ratings")
ax.set_xlabel("Average IMDb Rating")
ax.set_ylabel("Average Rotten Tomatoes Rating")
# fig.show()

## Investigating a Statistical Distribution

In [None]:
# Bucket the running times into seven equal-width bins, then order the
# buckets by interval instead of by frequency so the bars read left to right.
length = data["film_length"].value_counts(bins=7).sort_index()

fig, ax = plt.subplots()
length.plot(
    kind="bar",
    ax=ax,
    title="Film Length Distribution",
    xlabel="Time Range (mins)",
    ylabel="Count",
)

In [None]:
# Minimum, maximum, mean and standard deviation of the film lengths.
data["film_length"].agg(["min", "max", "mean", "std"])

## Finding No Relationship

In [None]:
# Plot Bond's kill count against the IMDb rating of each film.
fig, ax = plt.subplots()
ax.scatter(data["imdb"], data["bond_kills"])
ax.set_title("Scatter Plot of Kills vs Ratings")
ax.set_xlabel("Average IMDb Rating")
ax.set_ylabel("Kills by Bond")
# NOTE(review): presumably left commented out because the notebook renders
# the figure itself - confirm before running this as a plain script.
# fig.show()