## Reading Data From CSV Files

In [None]:
! python -m pip install pandas

In [None]:
import pandas as pd

# Load the CSV file into a DataFrame and preview its first rows.
james_bond_df = pd.read_csv("james_bond_data.csv")
james_bond_df.head()

## Reading Data From Other Sources

### Reading Excel

In [None]:
! python -m pip install openpyxl

In [None]:
import openpyxl
import pandas as pd

# Load the Excel file into a DataFrame and preview its first rows.
james_bond_df_excel = pd.read_excel("james_bond_data.xlsx")
james_bond_df_excel.head()

### Reading JSON

In [None]:
# Load the JSON file into a DataFrame and preview its first rows.
james_bond_df_json = pd.read_json("james_bond_data.json")
james_bond_df_json.head()

### Scraping HTML

In [None]:
!python -m pip install lxml

In [None]:
# read_html returns a list of DataFrames (one per table it parses), so the
# result is indexed to pick a table before previewing it.
wikipedia_url = "https://en.wikipedia.org/wiki/List_of_James_Bond_novels_and_short_stories"
james_bond_df_html = pd.read_html(wikipedia_url)
james_bond_df_html[1].head()

### Reading Parquet

In [None]:
!python -m pip install pyarrow

In [None]:
# Load the Parquet file into a DataFrame and preview its first rows.
james_bond_df_parquet = pd.read_parquet("james_bond_data.parquet")
james_bond_df_parquet.head()

## Dealing With Missing Data and Invalid Data Types

In [None]:
# Print a per-column summary of the DataFrame to spot missing values and
# columns whose data type is not the one expected.
james_bond_df.info()

In [None]:
# Boolean mask that is True for every row holding at least one missing value.
has_missing_value = james_bond_df.isna().any(axis=1)
james_bond_df[has_missing_value]

In [None]:
# Set both user-rating columns on the row with index label 10.
ratings_for_row_10 = {"Avg_User_IMDB": 7.1, "Avg_User_Rtn_Tom": 6.8}
for rating_column, rating_value in ratings_for_row_10.items():
    james_bond_df.at[10, rating_column] = rating_value

In [None]:
# Preview the columns inspected before their data types are converted.
columns_to_check = ["US_Gross", "World_Gross", "Budget ($ 000s)", "Film_Length"]
james_bond_df[columns_to_check].head()

In [None]:
# Remove every "$" and "," from the monetary columns, then convert them to
# floats. The same cleanup applies to all three columns.
for money_column in ("US_Gross", "World_Gross", "Budget ($ 000s)"):
    without_symbols = james_bond_df[money_column].replace("[$,]", "", regex=True)
    james_bond_df[money_column] = without_symbols.astype(float)

In [None]:
# Note: rstrip("mins") removes any trailing run of the characters "m", "i",
# "n" and "s" (a character set, not the literal suffix "mins"). The remaining
# text is then converted to integers.
film_length_text = james_bond_df["Film_Length"].str.rstrip("mins")
james_bond_df["Film_Length"] = film_length_text.astype(int)

In [None]:
# Parse the "Month, Year" strings into datetimes once, then store both the
# parsed dates and the year extracted from them.
release_dates = pd.to_datetime(james_bond_df["Release"], format="%B, %Y")
james_bond_df["Release"] = release_dates
james_bond_df["Release_Year"] = release_dates.dt.year

## Dealing With Inconsistencies in Data

In [None]:
# Scale the budget figures by 1000, then give the column a name that no
# longer mentions thousands.
james_bond_df["Budget ($ 000s)"] = james_bond_df["Budget ($ 000s)"].mul(1000)
james_bond_df = james_bond_df.rename(columns={"Budget ($ 000s)": "Budget"})

## Removing Duplicate Data

In [None]:
# Count how often each movie title occurs; value_counts() sorts by frequency,
# so any title appearing more than once shows up in the first rows.
james_bond_df["Movie"].value_counts().head()

In [None]:

# Display every row whose title is one of the listed movies.
duplicate_movies = ["The Man with the Golden Gun", "The Living Daylights"]
is_listed_title = james_bond_df["Movie"].isin(duplicate_movies)
james_bond_df[is_listed_title]

In [None]:
# Drop fully duplicated rows and renumber the index from 0.
james_bond_df = james_bond_df.drop_duplicates(ignore_index=True)

## Removing Typos

In [None]:
# List each distinct value in the "Bond" column with its frequency, which makes
# inconsistent spellings visible.
james_bond_df["Bond"].value_counts()

In [None]:
# Correct the two misspellings in the "Bond" column in a single chained pass.
james_bond_df["Bond"] = (
    james_bond_df["Bond"]
    .str.replace("Shawn", "Sean")
    .str.replace("MOORE", "Moore")
)

In [None]:
# Show the frequency of each distinct value in the "Bond" column again to
# check the result of the replacements.
james_bond_df["Bond"].value_counts()

In [None]:
# List each distinct value in the "Bond_Car_MFG" column with its frequency,
# which makes inconsistent spellings visible.
james_bond_df["Bond_Car_MFG"].value_counts()

In [None]:
# Replace every occurrence of the misspelling "Astin" with "Aston".
james_bond_df["Bond_Car_MFG"] = james_bond_df["Bond_Car_MFG"].str.replace("Astin", "Aston")

## Checking for Invalid Outliers

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the two
# columns; implausible min/max values point to invalid outliers.
james_bond_df[["Film_Length", "Martinis"]].describe()

In [None]:
# Assign the corrected Series back to the DataFrame. Calling
# replace(..., inplace=True) on james_bond_df[column] is chained assignment: it
# acts on an intermediate Series, raises a FutureWarning on pandas 2.2, and
# leaves james_bond_df unchanged when Copy-on-Write is enabled.
james_bond_df["Film_Length"] = james_bond_df["Film_Length"].replace(1200, 120)
james_bond_df["Martinis"] = james_bond_df["Martinis"].replace(-6, 6)

## Storing Your Cleansed Data

In [None]:
# Write the DataFrame to a new CSV file; index=False leaves the row index out
# of the file.
james_bond_df.to_csv("james_bond_data_cleansed.csv", index=False)

## Using Python for Data Analysis

In [None]:
!python -m pip install matplotlib scikit-learn numpy

## Performing a Regression Analysis

In [None]:
import matplotlib.pyplot as plt

# Reshape both rating columns into (n, 1) NumPy arrays. Series.to_numpy() is
# the documented way to obtain an ndarray, whereas reshape() is not part of
# the documented ExtensionArray interface returned by Series.array.
x = james_bond_df["Avg_User_IMDB"].to_numpy().reshape(-1, 1)
y = james_bond_df["Avg_User_Rtn_Tom"].to_numpy().reshape(-1, 1)

plt.title("Scatter Plot of Ratings.")
plt.xlabel("Average IMDb Rating")
plt.ylabel("Average Rotten Tomatoes Rating")
plt.scatter(x, y)

In [None]:
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt

# Reshape both rating columns into (n, 1) NumPy arrays. Series.to_numpy() is
# the documented way to obtain an ndarray, whereas reshape() is not part of
# the documented ExtensionArray interface returned by Series.array.
x = james_bond_df["Avg_User_IMDB"].to_numpy().reshape(-1, 1)
y = james_bond_df["Avg_User_Rtn_Tom"].to_numpy().reshape(-1, 1)

model = LinearRegression()
model.fit(x, y)

r_squared = f"R-Squared: {round(model.score(x, y), 2)}"

# y is 2-D, so the fitted slope is read from coef_[0][0] and the intercept
# from intercept_[0].
slope = round(model.coef_[0][0], 4)
intercept = round(model.intercept_[0], 4)
# The "+" format flag always emits the intercept's sign. Without it a positive
# intercept would be glued to the "x" (for example "y=1.2x3.4").
best_fit_equation = f"y={slope}x{intercept:+}"
y_pred = model.predict(x)

plt.title("Scatter Plot of Ratings.")
plt.xlabel("Average IMDb Rating")
plt.ylabel("Average Rotten Tomatoes Rating")
plt.scatter(x, y)
# Annotate the plot with the fit quality and the fitted equation, then draw
# the regression line over the scatter points.
plt.text(7.25, 5.5, r_squared, fontsize=10)
plt.text(7.25, 7, best_fit_equation, fontsize=10)
plt.plot(x, y_pred, color="red")

## Investigating a Statistical Distribution

In [None]:
# Count the films falling into each of 7 bins; sort=False keeps the bins in
# interval order instead of ordering them by count.
film_length_groups = james_bond_df["Film_Length"].value_counts(sort=False, bins=7)

# Draw the counts as a bar chart and label its axes.
film_length_axes = film_length_groups.plot.bar(title="Film Length Distribution")
film_length_axes.set(xlabel="Time Range (mins)", ylabel="Count")

In [None]:
# Compute the mean, maximum, minimum and standard deviation of the film
# lengths in one call.
james_bond_df["Film_Length"].agg(["mean", "max", "min", "std"])

## Finding No Relationship

In [None]:
# Reshape both columns into (n, 1) NumPy arrays. Series.to_numpy() is the
# documented way to obtain an ndarray, whereas reshape() is not part of the
# documented ExtensionArray interface returned by Series.array.
x = james_bond_df["Avg_User_IMDB"].to_numpy().reshape(-1, 1)
y = james_bond_df["Kills_Bond"].to_numpy().reshape(-1, 1)

plt.title("Scatter Plot of Kills vs Ratings.")
plt.xlabel("Average IMDb Rating")
plt.ylabel("Kills By Bond")
plt.scatter(x, y)