## 03 - My Cookbook

### Data Cleaning

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

### Checking and handling missing values in the data

In [2]:
# Read the landslide records from the CSV file into a DataFrame and preview the first rows.
csv_path = "./data/landslides.csv"
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,id,date,time,country_name,state/province,population,landslide_type,trigger,fatalities
0,34,3/2/07,Night,United States,Virginia,16000,Landslide,Rain,
1,42,3/22/07,,United States,Ohio,17288,Landslide,Rain,
2,56,4/6/07,,United States,Pennsylvania,15930,Landslide,Rain,
3,59,4/14/07,,Canada,Quebec,42786,Riverbank collapse,Rain,
4,61,4/15/07,,United States,Kentucky,6903,Landslide,Downpour,0.0


In [None]:
# Summarise the frame: column names, non-null counts and dtypes.
df.info()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

### Handling missing values in the data

- Drop rows where you have missing values in the date column.
- Fill missing values with specific values
- Fill missing values automatically (e.g. with a computed value such as the column mean)

In [None]:
# Count the missing (NaN or None) values in each column of the DataFrame:
# isna() marks every missing cell as True, and sum() adds those up per column.

df.isna().sum()

In [None]:
# Keep only the rows that have a date: df['date'].notna() is True where 'date' is present and False where
# it is missing (NaN), so indexing with that mask drops every row without a date.

# .copy() makes the result an independent DataFrame, so assigning columns to it in later cells does not
# trigger pandas' SettingWithCopyWarning.

df = df[df['date'].notna()].copy()
df.head()

In [None]:
# Re-count the missing values per column after removing the rows without a date.
df.isna().sum()

In [None]:
# Replace missing times with the same "Not Known!" label that format_time() assigns to unrecognised
# values, so the placeholder is spelled one way throughout the notebook.
df['time'] = df['time'].fillna("Not Known!")

In [None]:
# Preview the first rows instead of rendering the whole frame.
df.head()

In [None]:
# Frequency of each distinct value in the 'time' column, most common first.
df['time'].value_counts()

In [None]:
# Number of missing values in the 'date' column.
df['date'].isna().sum()

In [None]:
# Number of missing values in the 'fatalities' column.
df['fatalities'].isna().sum()

In [None]:
# Line plot of 'fatalities' against the row index; the trailing ';' hides the Axes repr.
df['fatalities'].plot();

In [None]:
# Mean of the 'fatalities' column (pandas skips NaN values when averaging by default).
# NOTE(review): `mean` is not used by any later cell visible here and 'fatalities' is never filled;
# presumably a fillna(mean) step was intended - confirm.
mean = df['fatalities'].mean()
mean

In [None]:
# Count the missing values in 'fatalities' again.
df['fatalities'].isna().sum()

### Parse dates 
- Convert our date column to datetime
- Extract the month from the date
- Plot a histogram to show the number of landslides in each month

In [None]:
# Check the column dtypes before parsing the dates.
df.info()

In [None]:
# Parse the month/day/two-digit-year strings in 'date' into datetimes,
# then store them in a new column called date_parsed.

parsed_dates = pd.to_datetime(df['date'], format="%m/%d/%y")
df['date_parsed'] = parsed_dates

In [None]:
# Show the parsed dates together with the resulting dtype.
df['date_parsed']

In [None]:
# First few parsed dates.
df['date_parsed'].head()

In [None]:
# Check the dtypes again now that the date_parsed column has been added.
df.info()

In [None]:
# Get the month number from the date_parsed column (.dt.month, not the day of the month)
month_of_landslides = df['date_parsed'].dt.month

month_of_landslides.head()


In [None]:
# Show the extracted month numbers.
month_of_landslides

In [None]:
# Before we plot, drop any missing values (NaN) from the extracted months.
month_of_landslides = month_of_landslides.dropna()


In [None]:
# sns.histplot() draws a histogram. It replaces sns.distplot(), which is deprecated in current seaborn
# releases; unlike distplot, histplot does not draw a kernel density estimate (KDE) unless kde=True is
# passed, so no kde=False argument is needed.

# month_of_landslides: the Series of month numbers extracted from date_parsed.

# bins=12: split the data into 12 intervals, one for each month of the year.

# Plot the number of landslides per month, with labels so the figure can be read on its own.
ax = sns.histplot(month_of_landslides, bins=12)
ax.set(xlabel="Month", ylabel="Number of landslides", title="Landslides per month");


### Correcting the data format

In [None]:
# Preview the first rows instead of rendering the whole frame.
df.head()

In [None]:
# The ten most frequent raw values in the 'time' column.
df['time'].value_counts().head(10)

In [None]:
def format_time(x):
    """Map a clock time such as "14:30" to a coarse part-of-day label.

    First draft of the formatter; the fuller definition in the next cell replaces it.

    A value containing ':' is bucketed by the integer before the first ':':
    below 12 -> "Morning", 12 to 17 -> "Afternoon", 18 and above -> "Night".
    Any value without ':' is returned unchanged.
    """
    if ':' not in x.lower():
        return x

    hour = int(x.split(':')[0])
    if hour < 12:
        return "Morning"
    return "Afternoon" if hour < 18 else "Night"

In [None]:
def format_time(x):
    """Collapse a free-form time-of-day value into one coarse label.

    Parameters
    ----------
    x : object
        Raw value from the 'time' column: a clock time such as "14:30", a
        description such as "Early morning", or a missing value (NaN/None).

    Returns
    -------
    str
        One of "Morning", "Afternoon", "Evening", "Night" or "Not Known!".

    Notes
    -----
    * Clock times are bucketed by the integer before the first ':':
      below 12 -> "Morning", 12 to 17 -> "Afternoon", 18 and above -> "Night".
      "Evening" is only ever produced from the keyword, never from a clock time.
    * NOTE(review): 12-hour times such as "3:00 PM" are bucketed by the bare hour
      (here "Morning"); confirm whether the data contains AM/PM suffixes.
    """
    # Missing values (NaN/None) and other non-strings have no .lower();
    # report them as unknown instead of raising AttributeError.
    if not isinstance(x, str):
        return "Not Known!"

    text = x.lower()

    if ':' in text:
        try:
            hour = int(text.split(':')[0])
        except ValueError:
            # The text before ':' is not a bare hour (e.g. "Night, around 22:00");
            # fall through to the keyword matching below instead of crashing.
            hour = None
        if hour is not None:
            if hour < 12:
                return "Morning"
            if hour < 18:
                return "Afternoon"
            return "Night"

    if 'evening' in text:
        return "Evening"
    if 'morning' in text or 'dawn' in text:
        return "Morning"
    if 'afternoon' in text:
        return "Afternoon"
    if 'night' in text:
        return "Night"
    return "Not Known!"

In [None]:
# Normalise every entry of the 'time' column to a coarse part-of-day label.
df['time'] = df['time'].map(format_time)

In [None]:
# Frequency of each label in 'time' after applying format_time.
df['time'].value_counts()

In [None]:
# Preview the first rows of the cleaned frame instead of rendering the whole frame.
df.head()