# Acquiring Your Data

## Reading Data From CSV Files

In [None]:
! python -m pip install pandas

In [None]:
import pandas as pd

# Load the CSV file, then let convert_dtypes() switch the columns to
# pandas' NA-aware dtypes.
james_bond_data = (
    pd.read_csv("james_bond_data.csv")
    .convert_dtypes()
)
james_bond_data.head(n=5)

## Reading Data From Other Sources

### Reading JSON

In [None]:
import pandas as pd

# Same load-and-convert pattern as for the CSV file, this time from JSON.
james_bond_data_json = (
    pd.read_json("james_bond_data.json")
    .convert_dtypes()
)
james_bond_data_json.head(n=5)

### Reading Excel

In [None]:
! python -m pip install openpyxl

In [None]:
import openpyxl
import pandas as pd

# openpyxl is not referenced by name below; pandas.read_excel relies on it
# as the engine for .xlsx workbooks.
james_bond_data_excel = (
    pd.read_excel("james_bond_data.xlsx")
    .convert_dtypes()
)
james_bond_data_excel.head(n=5)

In [None]:
! python.exe -m pip install --upgrade pip

### Reading Parquet

In [None]:
!python -m pip install pyarrow

In [None]:
# read_parquet needs a Parquet engine (such as pyarrow) to be installed.
james_bond_data_parquet = (
    pd.read_parquet("james_bond_data.parquet")
    .convert_dtypes()
)
james_bond_data_parquet.head(n=5)

### Scraping HTML

In [None]:
!python -m pip install lxml

In [None]:
# read_html returns a list with one DataFrame per table it finds on the page;
# index 1 selects the second entry of that list.
# NOTE(review): the position of the wanted table depends on the live page
# layout - confirm that index 1 is still the right table.
james_bond_data_html = pd.read_html(
    "https://en.wikipedia.org/wiki/List_of_James_Bond_novels_and_short_stories"
)[1].convert_dtypes()
james_bond_data_html.head(n=5)

# Cleansing Your Data With Python

## Dealing With Missing Data

In [None]:
# Print a summary of the frame: each column's name, non-null count and dtype.
# A non-null count below the row count reveals missing values.
james_bond_data.info()

In [None]:
# Keep only the rows in which at least one column holds a missing value.
james_bond_data.loc[james_bond_data.isna().any(axis=1)]

In [None]:
# The one-row helper frame supplies the two ratings for row label 10.
# combine_first() only takes a value from it where james_bond_data has a
# missing value, so existing data is never overwritten.
data = james_bond_data.combine_first(
    pd.DataFrame(
        {"Avg_User_IMDB": [7.1], "Avg_User_Rtn_Tom": [6.8]},
        index=[10],
    )
)

data

## Correcting Invalid Data Types

In [None]:
# Preview the four columns whose contents are inspected before type conversion.
data.loc[
    :, ["US_Gross", "World_Gross", "Budget ($ 000s)", "Film_Length"]
].head(n=5)

In [None]:
# Rebuild `data` from the raw frame: fill in the two ratings at row label 10,
# then remove "$" and "," from the money columns and cast them to float.
data = (
    james_bond_data
    .combine_first(
        pd.DataFrame(
            {"Avg_User_IMDB": [7.1], "Avg_User_Rtn_Tom": [6.8]},
            index=[10],
        )
    )
    .assign(
        US_Gross=lambda df: (
            df["US_Gross"].replace("[$,]", "", regex=True).astype(float)
        ),
        World_Gross=lambda df: (
            df["World_Gross"].replace("[$,]", "", regex=True).astype(float)
        ),
        # The cleaned figures go into a new "Budget" column; the source column
        # "Budget ($ 000s)" stays in the frame unchanged.
        Budget=lambda df: (
            df["Budget ($ 000s)"].replace("[$,]", "", regex=True).astype(float)
        ),
    )
)

In [None]:
# info() prints its report itself, so it runs first; head() is then the
# cell's last expression and gets rendered as well. In the opposite order the
# head() result is computed and silently thrown away.
data[["US_Gross", "World_Gross", "Budget"]].info()
data[["US_Gross", "World_Gross", "Budget"]].head()

In [None]:
# Rebuild `data` from the raw frame, now also turning Film_Length into an
# integer column.
data = james_bond_data.combine_first(
    pd.DataFrame({"Avg_User_IMDB": {10: 7.1}, "Avg_User_Rtn_Tom": {10: 6.8}})
).assign(
    US_Gross=lambda df: (
        df["US_Gross"].replace("[$,]", "", regex=True).astype(float)
    ),
    World_Gross=lambda df: (
        df["World_Gross"].replace("[$,]", "", regex=True).astype(float)
    ),
    Budget=lambda df: (
        df["Budget ($ 000s)"].replace("[$,]", "", regex=True).astype(float)
    ),
    # rstrip("mins") removes any trailing run of the characters m, i, n, s
    # (a character set, not the literal suffix "mins"); digits are not in
    # that set, so the numeric part is left intact.
    Film_Length=lambda df: (
        df["Film_Length"].str.rstrip("mins").astype(int)
    ),
)

# info() prints its report itself, so it runs first; head() is then the
# cell's last expression and gets rendered as well. In the opposite order the
# head() result is computed and silently thrown away.
data[["Film_Length"]].info()
data[["Film_Length"]].head()

In [None]:
# Rebuild `data` from the raw frame, now also parsing Release into a datetime
# and deriving Release_Year from it.
data = (
    james_bond_data
    .combine_first(
        pd.DataFrame(
            {"Avg_User_IMDB": [7.1], "Avg_User_Rtn_Tom": [6.8]},
            index=[10],
        )
    )
    .assign(
        US_Gross=lambda df: (
            df["US_Gross"].replace("[$,]", "", regex=True).astype(float)
        ),
        World_Gross=lambda df: (
            df["World_Gross"].replace("[$,]", "", regex=True).astype(float)
        ),
        Budget=lambda df: (
            df["Budget ($ 000s)"].replace("[$,]", "", regex=True).astype(float)
        ),
        Film_Length=lambda df: (
            df["Film_Length"].str.rstrip("mins").astype(int)
        ),
        # Release is converted before Release_Year so that the .dt accessor
        # below operates on the datetime version of the column.
        Release=lambda df: pd.to_datetime(df["Release"], format="%B, %Y"),
        Release_Year=lambda df: df["Release"].dt.year,
    )
)

data[["Release"]].info()
data[["Release_Year"]].head()

## Fixing Inconsistencies in Data

In [None]:
# Rebuild `data` from the raw frame, now scaling Budget by 1000.
data = (
    james_bond_data
    .combine_first(
        pd.DataFrame(
            {"Avg_User_IMDB": [7.1], "Avg_User_Rtn_Tom": [6.8]},
            index=[10],
        )
    )
    .assign(
        US_Gross=lambda df: (
            df["US_Gross"].replace("[$,]", "", regex=True).astype(float)
        ),
        World_Gross=lambda df: (
            df["World_Gross"].replace("[$,]", "", regex=True).astype(float)
        ),
        # The source column name marks its figures as thousands ("$ 000s"),
        # hence the multiplication after the cast.
        Budget=lambda df: (
            df["Budget ($ 000s)"].replace("[$,]", "", regex=True).astype(float)
            * 1000
        ),
        Film_Length=lambda df: (
            df["Film_Length"].str.rstrip("mins").astype(int)
        ),
        Release=lambda df: pd.to_datetime(df["Release"], format="%B, %Y"),
        Release_Year=lambda df: df["Release"].dt.year,
    )
)

data[["US_Gross", "World_Gross", "Budget"]].head()

## Removing Duplicate Data

In [None]:
# Count how often each title occurs; value_counts() sorts by frequency, so
# any title listed more than once appears at the top of the preview.
data["Movie"].value_counts().head()

In [None]:
# Show every row belonging to the two titles under suspicion of duplication.
duplicate_movies = [
    "The Man with the Golden Gun",
    "The Living Daylights",
]
data.loc[data["Movie"].isin(duplicate_movies)]

In [None]:
# Rebuild `data` from the raw frame and finish by dropping rows that are
# exact duplicates of an earlier row.
data = (
    james_bond_data.combine_first(
        pd.DataFrame(
            {"Avg_User_IMDB": {10: 7.1}, "Avg_User_Rtn_Tom": {10: 6.8}}
        )
    )
    .assign(
        US_Gross=lambda df: (
            df["US_Gross"].replace("[$,]", "", regex=True).astype(float)
        ),
        World_Gross=lambda df: (
            df["World_Gross"].replace("[$,]", "", regex=True).astype(float)
        ),
        # The source column name marks its figures as thousands ("$ 000s").
        # Keep the x1000 scaling from the "Fixing Inconsistencies in Data"
        # step; without it this rebuild would silently undo that fix.
        Budget=lambda df: (
            df["Budget ($ 000s)"]
            .replace("[$,]", "", regex=True)
            .astype(float)
            * 1000
        ),
        # rstrip("mins") strips a set of trailing characters, not a suffix.
        Film_Length=lambda df: (
            df["Film_Length"].str.rstrip("mins").astype(int)
        ),
        # Release is converted first so Release_Year can use the .dt accessor.
        Release=lambda df: pd.to_datetime(df["Release"], format="%B, %Y"),
        Release_Year=lambda df: df["Release"].dt.year,
    )
    # ignore_index=True renumbers the surviving rows 0..n-1.
    .drop_duplicates(ignore_index=True)
)

# Re-run the duplicate check on the cleaned frame.
duplicate_movies = ["The Man with the Golden Gun", "The Living Daylights"]
data[data["Movie"].isin(duplicate_movies)]

## Correcting Spelling Errors

In [None]:
# List every distinct value in the Bond column with its frequency, which makes
# variant spellings of the same name visible.
data["Bond"].value_counts()

In [None]:
# Rebuild `data` from the raw frame, now also normalising the spelling of the
# actor names in the Bond column.
data = (
    james_bond_data.combine_first(
        pd.DataFrame(
            {"Avg_User_IMDB": {10: 7.1}, "Avg_User_Rtn_Tom": {10: 6.8}}
        )
    )
    .assign(
        US_Gross=lambda df: (
            df["US_Gross"].replace("[$,]", "", regex=True).astype(float)
        ),
        World_Gross=lambda df: (
            df["World_Gross"].replace("[$,]", "", regex=True).astype(float)
        ),
        # The source column name marks its figures as thousands ("$ 000s").
        # Keep the x1000 scaling from the "Fixing Inconsistencies in Data"
        # step; without it this rebuild would silently undo that fix.
        Budget=lambda df: (
            df["Budget ($ 000s)"]
            .replace("[$,]", "", regex=True)
            .astype(float)
            * 1000
        ),
        # rstrip("mins") strips a set of trailing characters, not a suffix.
        Film_Length=lambda df: (
            df["Film_Length"].str.rstrip("mins").astype(int)
        ),
        # Release is converted first so Release_Year can use the .dt accessor.
        Release=lambda df: pd.to_datetime(df["Release"], format="%B, %Y"),
        Release_Year=lambda df: df["Release"].dt.year,
        # Substring replacements: "Shawn" -> "Sean" and "MOORE" -> "Moore".
        Bond=lambda df: (
            df["Bond"]
            .str.replace("Shawn", "Sean")
            .str.replace("MOORE", "Moore")
        ),
    )
    # ignore_index=True renumbers the surviving rows 0..n-1.
    .drop_duplicates(ignore_index=True)
)

data["Bond"].value_counts()

In [None]:
# Count manufacturers on the de-duplicated `data` frame rather than on the raw
# frame, so these numbers are directly comparable with the value_counts()
# check that follows the spelling fix.
data["Bond_Car_MFG"].value_counts()

In [None]:
# Rebuild `data` from the raw frame, now also correcting the manufacturer
# spelling in Bond_Car_MFG.
data = (
    james_bond_data.combine_first(
        pd.DataFrame(
            {"Avg_User_IMDB": {10: 7.1}, "Avg_User_Rtn_Tom": {10: 6.8}}
        )
    )
    .assign(
        US_Gross=lambda df: (
            df["US_Gross"].replace("[$,]", "", regex=True).astype(float)
        ),
        World_Gross=lambda df: (
            df["World_Gross"].replace("[$,]", "", regex=True).astype(float)
        ),
        # The source column name marks its figures as thousands ("$ 000s").
        # Keep the x1000 scaling from the "Fixing Inconsistencies in Data"
        # step; without it this rebuild would silently undo that fix.
        Budget=lambda df: (
            df["Budget ($ 000s)"]
            .replace("[$,]", "", regex=True)
            .astype(float)
            * 1000
        ),
        # rstrip("mins") strips a set of trailing characters, not a suffix.
        Film_Length=lambda df: (
            df["Film_Length"].str.rstrip("mins").astype(int)
        ),
        # Release is converted first so Release_Year can use the .dt accessor.
        Release=lambda df: pd.to_datetime(df["Release"], format="%B, %Y"),
        Release_Year=lambda df: df["Release"].dt.year,
        # Substring replacements: "Shawn" -> "Sean" and "MOORE" -> "Moore".
        Bond=lambda df: (
            df["Bond"]
            .str.replace("Shawn", "Sean")
            .str.replace("MOORE", "Moore")
        ),
        # Substring replacement: "Astin" -> "Aston".
        Bond_Car_MFG=lambda df: df["Bond_Car_MFG"].str.replace(
            "Astin", "Aston"
        ),
    )
    # ignore_index=True renumbers the surviving rows 0..n-1.
    .drop_duplicates(ignore_index=True)
)

data["Bond_Car_MFG"].value_counts()

## Checking For Invalid Outliers

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the two
# numeric columns; implausible min/max values point to outliers.
data[["Film_Length", "Martinis"]].describe()

In [None]:
# Final cleansing pipeline, rebuilt from the raw frame. This is the version of
# `data` that gets written to disk, so every earlier fix has to be present.
data = (
    james_bond_data.combine_first(
        pd.DataFrame(
            {"Avg_User_IMDB": {10: 7.1}, "Avg_User_Rtn_Tom": {10: 6.8}}
        )
    )
    .assign(
        US_Gross=lambda df: (
            df["US_Gross"].replace("[$,]", "", regex=True).astype(float)
        ),
        World_Gross=lambda df: (
            df["World_Gross"].replace("[$,]", "", regex=True).astype(float)
        ),
        # The source column name marks its figures as thousands ("$ 000s").
        # Keep the x1000 scaling from the "Fixing Inconsistencies in Data"
        # step; without it this rebuild would silently undo that fix.
        Budget=lambda df: (
            df["Budget ($ 000s)"]
            .replace("[$,]", "", regex=True)
            .astype(float)
            * 1000
        ),
        # rstrip("mins") strips a set of trailing characters, not a suffix.
        # The value 1200 is then replaced by 120.
        Film_Length=lambda df: (
            df["Film_Length"]
            .str.rstrip("mins")
            .astype(int)
            .replace(1200, 120)
        ),
        # Release is converted first so Release_Year can use the .dt accessor.
        Release=lambda df: pd.to_datetime(df["Release"], format="%B, %Y"),
        Release_Year=lambda df: df["Release"].dt.year,
        # Substring replacements: "Shawn" -> "Sean" and "MOORE" -> "Moore".
        Bond=lambda df: (
            df["Bond"]
            .str.replace("Shawn", "Sean")
            .str.replace("MOORE", "Moore")
        ),
        # Substring replacement: "Astin" -> "Aston".
        Bond_Car_MFG=lambda df: df["Bond_Car_MFG"].str.replace(
            "Astin", "Aston"
        ),
        # The value -6 is replaced by 6.
        Martinis=lambda df: df["Martinis"].replace(-6, 6),
    )
    # ignore_index=True renumbers the surviving rows 0..n-1.
    .drop_duplicates(ignore_index=True)
)

data[["Film_Length", "Martinis"]].describe()

## Storing Your Cleansed Data

In [None]:
# Write the cleansed frame to a new CSV file; index=False keeps the row index
# out of the file.
data.to_csv("james_bond_data_cleansed.csv", index=False)

# Using Python for Data Analysis

## Performing a Regression Analysis

In [None]:
!python -m pip install matplotlib scikit-learn

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Start the analysis from the file written by the cleansing stage, so this
# section can run without the cleansing cells having been executed.
data = pd.read_csv("james_bond_data_cleansed.csv").convert_dtypes()

# One point per film: IMDB rating on x, Rotten Tomatoes rating on y.
fig, ax = plt.subplots()
ax.scatter(data["Avg_User_IMDB"], data["Avg_User_Rtn_Tom"])
ax.set_title("Scatter Plot of Ratings")
ax.set_xlabel("Average IMDB Rating")
# The trailing semicolon stops the Text(...) repr of the last call from being
# echoed above the figure.
ax.set_ylabel("Average Rotten Tomatoes Rating");

In [None]:
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt

# x is selected with a list of column names so it stays a two-dimensional
# DataFrame (one feature column); y is a one-dimensional Series.
x = data.loc[:, ["Avg_User_IMDB"]]
y = data.loc[:, "Avg_User_Rtn_Tom"]

model = LinearRegression()
model.fit(x, y)

# score() is evaluated on the same data the model was fitted on.
r_squared = f"R-Squared: {model.score(x, y):.2f}"
# The "+" in the format spec forces a sign on the intercept, so the label
# reads "y = ax+b" or "y = ax-b".
best_fit = f"y = {model.coef_[0]:.4f}x{model.intercept_:+.4f}"
y_pred = model.predict(x)

fig, ax = plt.subplots()
ax.scatter(x, y)
ax.plot(x, y_pred, color="red")
# The annotation positions are hard-coded (x=7.25, y=5.5 and y=7).
ax.text(7.25, 5.5, r_squared, fontsize=10)
ax.text(7.25, 7, best_fit, fontsize=10)
ax.set_title("Scatter Plot of Ratings")
ax.set_xlabel("Average IMDB Rating")
# The trailing semicolon stops the Text(...) repr of the last call from being
# echoed above the figure.
ax.set_ylabel("Average Rotten Tomatoes Rating");

## Investigating a Statistical Distribution

In [None]:
# Bucket the film lengths into 7 equal-width bins and count the films in each;
# sort_index() puts the bins in ascending order instead of by frequency.
length = data["Film_Length"].value_counts(bins=7).sort_index()
# The trailing semicolon stops the Axes repr returned by plot.bar() from being
# echoed above the chart.
length.plot.bar(
    title="Film Length Distribution",
    xlabel="Time Range (mins)",
    ylabel="Count",
);

In [None]:
# Compute the mean, maximum, minimum and standard deviation of the film
# lengths in a single call.
data["Film_Length"].agg(["mean", "max", "min", "std"])

## Finding No Relationship

In [None]:
# One point per film: IMDB rating on x, value of the Kills_Bond column on y.
fig, ax = plt.subplots()
ax.scatter(data["Avg_User_IMDB"], data["Kills_Bond"])
ax.set_title("Scatter Plot of Kills vs Ratings")
ax.set_xlabel("Average IMDB Rating")
# The trailing semicolon stops the Text(...) repr of the last call from being
# echoed above the figure.
ax.set_ylabel("Kills by Bond");