# Detecting Missing Values

In [None]:
import numpy as np
import pandas as pd

df = pd.read_csv('weather_m4.csv')
df.info()

In [None]:
df[['MIN_TEMP_GROUND', 'VIEW_RANGE', 'CLOUD', 'WEATHER_CODE']].head(15)

In [None]:
df.isnull()

In [None]:
# Show all columns with null values
df.isnull().any()

In [None]:
df.isnull().any(axis=1)

In [None]:
# Show all rows with null values
df[df.isnull().any(axis=1)]

In [None]:
# Are there any rows with only null values?
df.isnull().all(axis=1).any()

In [None]:
# Are there any columns with no null values at all?
df.notnull().all()

In [None]:
# Hmm.. seems like this column only has a value every 6th row.. let's check this
df['MIN_TEMP_GROUND']

In [None]:
# Create a series containing indices for every 6th row
every_6th_row = pd.Series(range(5, len(df), 6))

In [None]:
# Are all these rows NOT null?
df['MIN_TEMP_GROUND'][every_6th_row].notnull().all()

In [None]:
# Are all other rows null?
# Q: Can you rewrite this line to use df.loc?
df['MIN_TEMP_GROUND'].drop(every_6th_row).isnull().all()

# Handling Missing Values

In [None]:
df.info()

In [None]:
# df.drop can be used to remove columns and/or rows
df.drop(columns='WEATHER_CODE', inplace=True)

In [None]:
df.info()

In [None]:
# We can fill in missing data based on the data that is present.
# Back-fill: every missing MIN_TEMP_GROUND takes the next valid value below it.
# The result is assigned back to the column on purpose: calling
# fillna(..., inplace=True) on the selected column df['MIN_TEMP_GROUND'] is
# chained assignment, which does not update df under pandas Copy-on-Write,
# and fillna(method=...) is deprecated since pandas 2.1.
df['MIN_TEMP_GROUND'] = df['MIN_TEMP_GROUND'].bfill()

In [None]:
# Now that we have no more nulls in MIN_TEMP_GROUND
# what are the dates where missing values occur?
df.loc[df.isnull().any(axis=1), 'YYYYMMDD'].drop_duplicates()

In [None]:
# Shortest solution: Just drop everything
nulls_dropped = df.dropna()
nulls_dropped.info()

In [None]:
# But note that our index is now discontinuous
nulls_dropped[5300:5310]

In [None]:
# Another idea: just drop rows that have less than 7 columns filled
# This leaves us with only two rows that contain null values
drop_thresh = df.dropna(thresh=7)
drop_thresh[drop_thresh.isnull().any(axis=1)]

In [None]:
# Or let's just look at the missing data again..
rows_to_fill = df.isnull().any(axis=1)
df[rows_to_fill]

In [None]:
df.mean()

In [None]:
# We might fill all null values with the mean of the corresponding column
nulls_filled = df.fillna(df.mean())

In [None]:
# Let's look at the result
nulls_filled[rows_to_fill]

In [None]:
# Or you could fill the null values with the mode
df.fillna(df.mode().iloc[0], inplace=True)

# Removing Outliers

In [None]:
athletes = pd.read_csv('athletes.csv')
athletes.info()

In [None]:
%matplotlib inline
athletes.plot.scatter(x='height', y='weight')

In [None]:
heights = athletes['height']
heights.plot.box()

In [None]:
q1 = heights.quantile(.25)
q3 = heights.quantile(.75)
iqr = q3 - q1 
pmin = q1 - 1.5 * iqr
pmax = q3 + 1.5 * iqr
nwh = heights.where(heights.between(pmin, pmax))

In [None]:
compare = pd.DataFrame({'before':heights, 'after':nwh})
compare.plot.box()
compare.describe()

In [None]:
heights.where(heights.between(pmin, pmax), inplace=True)

In [None]:
athletes.plot.scatter(x='height', y='weight')

# Removing Duplicates

In [None]:
athletes.duplicated().any()

In [None]:
athletes[athletes.duplicated()]

In [None]:
athletes.drop_duplicates(inplace=True)

In [None]:
athletes['nationality'].drop_duplicates().sort_values()

In [None]:
athletes['nationality'].value_counts()

In [None]:
athletes['sex'].value_counts()

# Converting Types

In [None]:
athletes.info()

In [None]:
athletes[['gold', 'silver', 'bronze']].head()

In [None]:
athletes[athletes['gold'] == 'O']

In [None]:
athletes.loc[7521, ['gold', 'silver', 'bronze']] = 0

In [None]:
athletes[['gold', 'silver', 'bronze']] = athletes[['gold', 'silver', 'bronze']].astype(int)

In [None]:
athletes[['gold', 'silver', 'bronze']].sum()

In [None]:
athletes.info()

# Fixing Indexes

In [None]:
athletes.head()

In [None]:
athletes.set_index('id', drop=True, inplace=True)
athletes.head()

In [None]:
athletes.rename(
    columns={"nationality": "country", "sport": "discipline"}, 
    inplace=True)
athletes.head()

In [None]:
df = pd.read_csv('weather_m4.csv')

In [None]:
df.dropna(inplace=True)
df.info()

In [None]:
df.head()

In [None]:
df.reset_index(drop=True)