In [None]:
# Exploratory Data Analysis in Python
# Chapter 1 - Getting to know a Dataset
# Functions for initial exploration

# NOTE(review): `unemployment` is not created in the visible cells — presumably
# it is loaded elsewhere; confirm so the notebook survives Restart & Run All.

# Print the first five rows of unemployment
print(unemployment.head())

# Summarize non-missing values and data types in the unemployment DataFrame.
# DataFrame.info() writes its report directly to stdout and returns None, so it
# is called bare: wrapping it in print() appends a stray "None" to the output.
unemployment.info()

# Print summary statistics for numerical columns in unemployment
print(unemployment.describe())

# Counting categorical values
# Count the values associated with each continent in unemployment
print(unemployment.value_counts('continent'))

# Global unemployment in 2021
# Visualization libraries used by the plotting code below
import matplotlib.pyplot as plt
import seaborn as sns

# Histogram of the 2021 column on an explicitly created Axes;
# binwidth=1 gives every bin a width of one unit (a full percent per bin)
fig, ax = plt.subplots()
sns.histplot(data=unemployment, x='2021', binwidth=1, ax=ax)
plt.show()

# Data Validation
# Detecting data types

# Cast the 2019 column to float64 (the dtype the built-in `float` maps to)
rates_2019 = unemployment["2019"]
unemployment["2019"] = rates_2019.astype("float64")

# List every column's dtype to confirm the conversion took effect
print(unemployment.dtypes)

# Validating continents

# Boolean Series: True for every row whose continent is not Oceania.
# isin() takes a list, so more continents can be excluded by extending it.
not_oceania = ~unemployment["continent"].isin(["Oceania"])

# Print unemployment without records related to countries in Oceania
print(unemployment[not_oceania])

# Validating range

# Report the smallest and largest values found in the 2021 column
rates_2021 = unemployment['2021']
print(rates_2021.min(), rates_2021.max())

# Boxplot of the 2021 values (x-axis) with one box per continent (y-axis)
fig, ax = plt.subplots()
sns.boxplot(data=unemployment, x='2021', y='continent', ax=ax)
plt.show()

# Summaries with .groupby() and .agg()

# Print yearly mean and standard deviation grouped by continent.
# Only numeric columns are aggregated: 'mean' and 'std' are undefined for text
# columns, and pandas >= 2.0 raises TypeError rather than silently dropping them.
# NOTE(review): assumes the yearly rate columns are exactly the numeric columns
# of `unemployment` — confirm against the loaded data.
numeric_cols = list(unemployment.select_dtypes(include="number").columns)
print(unemployment.groupby('continent')[numeric_cols].agg(['mean', 'std']))

# Named aggregations

# Output column name -> (source column, aggregation function)
rate_2021_aggregations = {
    # Mean of the 2021 column within each continent
    "mean_rate_2021": ('2021', 'mean'),
    # Standard deviation of the 2021 column within each continent
    "std_rate_2021": ('2021', 'std'),
}
by_continent = unemployment.groupby("continent")
continent_summary = by_continent.agg(**rate_2021_aggregations)
print(continent_summary)

# Visualizing categorical summaries

# Bar plot with one bar per continent showing its average 2021 value,
# drawn on an explicitly created Axes
fig, ax = plt.subplots()
sns.barplot(data=unemployment, x='continent', y='2021', ax=ax)
plt.show()


In [None]:
# Exploratory Data Analysis in Python
# Chapter 2 - Data Cleaning and Imputation

# Dealing with missing data
# NOTE(review): `planes` is not created in the visible cells — presumably it is
# loaded elsewhere; confirm so the notebook survives Restart & Run All.

# Count the number of missing values in each column (computed once, reused below)
missing_counts = planes.isna().sum()
print(missing_counts)

# Find the five percent threshold: 5% of the current number of rows
threshold = len(planes) * 0.05
print(threshold)

# Create a filter: columns whose missing-value count is at or below the
# threshold. Despite the name, these columns are not removed — rows that are
# missing a value in any of them are dropped in the next step.
cols_to_drop = planes.columns[missing_counts <= threshold]
print(cols_to_drop)

# Drop rows with missing values in the columns below the threshold.
# Reassignment is used instead of inplace=True: same resulting frame, but the
# statement stays chainable and does not mutate the object behind the name.
planes = planes.dropna(subset=cols_to_drop)

# Re-count to see which columns still contain missing values
print(planes.isna().sum())

# Strategies for remaining missing data
# Check the values of the Additional_Info column
print(planes["Additional_Info"].value_counts())

# Create a box plot of Price by Airline
sns.boxplot(data=planes, x='Airline', y='Price')
plt.show()

# Imputing missing plane prices
# Calculate median plane ticket prices by Airline
airline_prices = planes.groupby("Airline")["Price"].median()
print(airline_prices)

# Convert to a dictionary mapping each Airline value to its median Price
prices_dict = airline_prices.to_dict()

# Map the dictionary to missing values of Price by Airline: each row's Airline
# is looked up in prices_dict, and fillna() uses that median only where Price
# is missing; existing prices are left untouched.
planes["Price"] = planes["Price"].fillna(planes["Airline"].map(prices_dict))

# Check for missing values
print(planes.isna().sum())


In [None]:
# Exploratory Data Analysis in Python
# Chapter 3 - Relationships in Data
# Visualizing relationships over time
# NOTE(review): `divorce` is not created in the visible cells, and the .dt
# accessor requires marriage_date to be datetime-like — confirm it is parsed
# as a date when the data is loaded.

# Define the marriage_year column from the year component of marriage_date
divorce["marriage_year"] = divorce["marriage_date"].dt.year

# Create a line plot showing the average number of kids by year
sns.lineplot(data=divorce, x='marriage_year', y='num_kids')
plt.show()

# Visualizing variable relationships
# Scatter plot of num_kids against marriage_duration on an explicit Axes
fig, ax = plt.subplots()
sns.scatterplot(data=divorce, x='marriage_duration', y='num_kids', ax=ax)
plt.show()

# Visualizing multiple variable relationships
# Columns to compare pairwise in the grid of plots
pair_columns = ['income_woman', 'marriage_duration']

# Pairplot restricted to income_woman and marriage_duration
sns.pairplot(data=divorce, vars=pair_columns)
plt.show()

# Categorical data in scatter plots


In [None]:
# Exploratory Data Analysis in Python
# Chapter 4 - Turning Exploratory Analysis into Action
