In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import datetime

# Fix NumPy's global random state up front so any later sampling is repeatable.
np.random.seed(0)

# Read the three disaster datasets (earthquakes, landslides, volcanic eruptions)
# from CSV files under ../input.
earthquakes = pd.read_csv("../input/earthquake-database/database.csv")
landslides = pd.read_csv("../input/landslide-events/catalog.csv")
volcanos = pd.read_csv("../input/volcanic-eruptions/database.csv")

In [None]:
# Inspect the dtype pandas assigned to the raw 'Date' column of the earthquakes frame.
# dtypes are reported with NumPy's one-letter codes (see the NumPy dtype documentation
# for the full list); 'O' is the code for "object", i.e. the column has not been
# parsed into datetimes.
earthquakes.dtypes['Date']

In [None]:
# Peek at the first raw 'date' entry to see how the dates are written.
landslides['date'].iloc[:1]

In [None]:
# Parsing a date means taking a string and identifying its component parts.
# pandas is told the layout through strftime directives, which name each component
# and the punctuation separating them. For example, 17-1-2007 is described by
# "%d-%m-%Y" (capital %Y means a four-digit year).
# The landslide dates are parsed as month/day/two-digit-year, and the result is
# stored in a new column, 'date_parsed'.
landslide_date_format = "%m/%d/%y"
landslides['date_parsed'] = pd.to_datetime(landslides['date'], format=landslide_date_format)

In [None]:
# Display the leading rows of the parsed column to confirm the conversion worked.
landslides['date_parsed'].head(n=5)

In [None]:
# Convert the earthquakes 'Date' column to datetimes.
# The column contains more than one date format, so no single strftime pattern is
# supplied; pandas is asked to infer the format instead. Inference is not always
# correct and is very slow, so prefer an explicit `format=` whenever one fits.
# NOTE(review): pandas 2.0 deprecated `infer_datetime_format` (strict inference is now
# the default and genuinely mixed formats need format="mixed" or an explicit cleanup
# step) -- confirm against the pandas version this notebook runs on.
earthquake_dates_raw = earthquakes['Date']
earthquakes['date_parsed'] = pd.to_datetime(earthquake_dates_raw, infer_datetime_format=True)

# Show the first parsed value as a quick check.
earthquakes['date_parsed'].iloc[:1]


In [None]:
# With the dates stored as real datetimes, the .dt accessor exposes their components.
# Extract the day of the month for every landslide from the 'date_parsed' column.
landslide_dates = landslides['date_parsed']
day_of_month_landslides = landslide_dates.dt.day

In [None]:
# Extract the day of the month for every earthquake from the 'date_parsed' column.
earthquake_dates = earthquakes['date_parsed']
day_of_month_earthquakes = earthquake_dates.dt.day

# Plot the day of the month to check the date parsing
___

One of the biggest dangers in parsing dates is mixing up the months and days. The to_datetime() function does have very helpful error messages, but it doesn't hurt to double-check that the days of the month we've extracted make sense. 

To do this, let's plot a histogram of the days of the month. We expect it to have values between 1 and 31 and, since there's no reason to suppose the earthquakes are more common on some days of the month than others, a relatively even distribution. (With a dip on the 31st because not all months have 31 days.) Let's see if that's the case:

In [None]:
# Sanity-check the parsing: if days and months had been swapped, the extracted
# "days" would not spread over 1-31. A histogram makes that easy to see.

# Drop missing values so only successfully parsed days are plotted.
day_of_month_earthquakes = day_of_month_earthquakes.dropna()

# Plot counts in 31 bins (one per possible day of the month).
# `sns.distplot` is deprecated and scheduled for removal from seaborn; `sns.histplot`
# is its replacement for a plain histogram (the old kde=False case).
ax = sns.histplot(day_of_month_earthquakes, bins=31)
ax.set(
    title="Earthquakes by day of month",
    xlabel="Day of month",
    ylabel="Number of earthquakes",
);