In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Load the raw earthquake catalogue and preview the first rows.
csv_path = "../input/database.csv"
data = pd.read_csv(csv_path)
data.head()

# Convert categorical data to numeric

In [None]:
# Encode 'Magnitude Type' as integer codes. pd.factorize numbers the distinct
# values from 0 and uses -1 for missing values, so after the +1 shift the
# codes start at 1 and missing values become 0.
magnitude_type_codes = pd.factorize(data['Magnitude Type'])[0]
data['Magnitude Type'] = magnitude_type_codes + 1

# Only take earthquakes into account: keep rows whose 'Type' is 'Earthquake'.
is_earthquake = data['Type'] == 'Earthquake'
data = data[is_earthquake]

# Ignore irrelevant data

In [None]:
# Keep only the columns used downstream. The explicit .copy() makes `data` an
# independent frame, so adding a column to it later does not trigger pandas'
# SettingWithCopyWarning (a list-based column selection is otherwise tracked
# as a potential copy of the parent frame).
data = data[['Date', 'Time', 'Latitude', 'Longitude','Depth', 'Magnitude']].copy()
data.head()

# Convert timestamps to a simpler numeric format

In [None]:
import calendar
import datetime
import time  # no longer used in this cell; kept in case other cells rely on it

# Convert each 'Date' + 'Time' pair into seconds since the Unix epoch.
#
# calendar.timegm interprets the parsed fields as UTC. The previous
# time.mktime call interpreted them in the machine's local timezone, so the
# resulting feature values depended on where the notebook was run (and mktime
# can raise OverflowError for pre-1970 dates on some platforms).
# NOTE(review): assumes the catalogue's Date/Time columns are recorded in UTC
# - confirm against the dataset description.
#
# Rows whose date/time text does not match the expected format get the string
# sentinel 'ValueError' so they can be filtered out afterwards.
timestamp = []
for d, t in zip(data['Date'], data['Time']):
    try:
        ts = datetime.datetime.strptime(d+' '+t, '%m/%d/%Y %H:%M:%S')
    except ValueError:
        timestamp.append('ValueError')
    else:
        # float() keeps the element type the same as what time.mktime returned.
        timestamp.append(float(calendar.timegm(ts.timetuple())))

In [None]:
# Attach the converted timestamps to the frame. `.values` is used so the
# assignment is positional rather than aligned on the (filtered) index.
timeStamp = pd.Series(timestamp)
data['Timestamp'] = timeStamp.values

# The raw 'Date' and 'Time' strings are now redundant.
final_data = data.drop(['Date', 'Time'], axis=1)

# Entries that could not be parsed were recorded as the string 'ValueError'.
# Coercing to numeric turns those into NaN and leaves 'Timestamp' as a proper
# float column (comparing against the sentinel string would leave it as
# object dtype); the NaN rows are then dropped.
final_data = final_data.assign(
    Timestamp=pd.to_numeric(final_data['Timestamp'], errors='coerce')
)
final_data = final_data.dropna(subset=['Timestamp'])
final_data.head()

# Draw a map based on the longitude and latitude of all known occurrences.

In [None]:
from mpl_toolkits.basemap import Basemap

# World map using Basemap's 'mill' projection, spanning latitudes -80..80 and
# longitudes -180..180.
m = Basemap(projection='mill',llcrnrlat=-80,urcrnrlat=80, llcrnrlon=-180,urcrnrlon=180,lat_ts=20,resolution='c')

# Coordinates are taken from `data`, not `final_data`, so rows whose timestamp
# could not be parsed are still plotted.
longitudes = data["Longitude"].tolist()
latitudes = data["Latitude"].tolist()
# Calling the Basemap instance converts lon/lat into map x/y coordinates.
x,y = m(longitudes,latitudes)
# The figure is created before any drawing call so the plot below lands on it.
fig = plt.figure(figsize=(12,10))
plt.title("All affected areas")
# One small blue dot per event, then the map decorations.
m.plot(x, y, "o", markersize = 2, color = 'blue')
m.drawcoastlines()
m.fillcontinents(color='coral',lake_color='aqua')
m.drawmapboundary()
m.drawcountries()
plt.show()

# Split the data into X->features and y->targets

In [None]:
# Inputs: when and where an event happened.
features = ['Timestamp', 'Latitude', 'Longitude']
# Outputs to predict (two regression targets).
targets = ['Magnitude', 'Depth']
X, y = final_data[features], final_data[targets]

# Split the data into train and test portions

In [None]:
# `sklearn.cross_validation` was removed in scikit-learn 0.20; the function
# lives in `sklearn.model_selection` since 0.18. Fall back to the old module
# only for very old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create a model, and predict on the test data

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn import metrics

# Grow the forest in steps of 4 trees and keep the model with the best score
# on the test split. The search stops when the mean squared error changes by
# no more than `epsilon` between consecutive rounds, or after `max_rounds`
# rounds, whichever comes first.
# NOTE(review): the model is selected on the same split it is later scored on,
# so the reported score is optimistic; a separate validation split would be
# cleaner.

best_model = None
# Start at -inf rather than 0: R2 can be zero or negative, and a 0 start would
# leave `best_model` as None in that case.
best_score = float('-inf')

epsilon = 0.01
# Hard cap so the search always terminates, even if the error never settles
# within `epsilon` (the unbounded version could run indefinitely).
max_rounds = 50
old_error = float('inf')
for i in range(1, max_rounds + 1):
    estimators = i*4
    model = RandomForestRegressor(n_estimators = estimators, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)

    r2 = r2_score(y_test, y_pred)
    score = model.score(X_test, y_test)
    print("n_estimators={}:".format(estimators))
    print("\tMean Squared Error:", mse, end = ", ")
    print("\tR2 score:", r2, end = ", ")
    print("\tModel score:", score)

    # Record the candidate before the convergence check so the model fitted in
    # the final round is also eligible to be the best one.
    if best_score < score:
        best_score = score
        best_model = model

    if abs(old_error - mse) <= epsilon:
        print(old_error)
        print(mse)
        print(old_error - mse)
        print("No improvement")
        break
    old_error = mse
else:
    print("Stopped after {} rounds without converging".format(max_rounds))

# Model score

In [None]:
# Re-evaluate the selected model on the held-out split and report its metrics.
y_pred = best_model.predict(X_test)
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)
# max_err = max_error(y_pred, y_test)
score = best_model.score(X_test, y_test)
for label, value in (
    ("Mean Squared Error:", mse),
    ("R2 score:", r2),
    ("Model score:", score),
):
    print(label, value)

In [None]:
# # Create PNG files for each decision tree. Since each tree's depth is roughly 20, they're hard to visualize and harder to understand.
# # Lowering the max depth hurts the model score, as expected.
# # Running this cell takes a long time (over an hour).

# from graphviz import render
# from IPython.display import Image
# from sklearn.tree import export_graphviz
# from subprocess import call
# for i, tree in enumerate(model.estimators_):
#     print(i)
#     export_graphviz(tree, out_file='{}.dot'.format(i), 
#                 feature_names = features,
#                 class_names = targets,
#                 rounded = True, proportion = False, 
#                 precision = 2, filled = True)

#     call(['dot', '-Tpng', '{}.dot'.format(i), '-o', '{}.png'.format(i), '-Gdpi=600'])
