### Problem description

We're provided with a dataset containing measurements of the mercury in a lake over some period of time. Some of the data is missing. We extract the missing data, and use random forest regression to predict those values.

### Extracting the missing values

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

In [2]:
ds= pd.read_csv('../Mercury_measurement.txt', sep = '\t', header = None)
ds.columns = ['Date','Measure']

In [3]:
ds = ds.values

In [4]:
miss_dates = []
for row in ds:
    if 'M' in str(row[1]):
        miss_dates.append(row[0])

In [5]:
for row in ds:
    a = datetime.strptime(row[0],'%m/%d/%Y %H:%M:%S')
    row[0] = datetime.timestamp(a)

In [6]:
miss_vals = []
for row in ds:
    if 'M' in str(row[1]):
        miss_vals.append(row[0])
        row[1] = np.nan
    else:
        row[1] = float(row[1])

In [7]:
miss_vals = np.array(miss_vals)
miss_vals = miss_vals.reshape(-1,1)

In [8]:
ds = pd.DataFrame(ds)
ds = ds.dropna().values

### Implementing Random Forest

In [9]:
X = ds[:,0:1]
y = ds[:,1:2]

In [10]:
from sklearn.ensemble import RandomForestRegressor

regressor = RandomForestRegressor()
regressor.fit(X,y)

  regressor.fit(X,y)


RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [11]:
predict = regressor.predict(miss_vals)

In [12]:
data = {'Dates' : miss_dates, 'Missing Values' : predict}
ds_miss = pd.DataFrame(data)
ds_miss

Unnamed: 0,Dates,Missing Values
0,3/13/2012 16:00:00,32.5063
1,3/21/2012 16:00:00,32.1718
2,3/26/2012 16:00:00,32.6194
3,5/24/2012 16:00:00,29.4478
4,6/1/2012 16:00:00,29.44514
5,6/4/2012 16:00:00,28.9814
6,7/20/2012 16:00:00,30.5919
7,7/23/2012 16:00:00,29.5407
8,7/26/2012 16:00:00,29.4992
9,9/10/2012 16:00:00,30.9843


We now evaluate our model by comparing our predicted values with the real values of the missing data. 

In [13]:
real_vals = pd.read_csv('..\Mercury_measurement_missing.txt', header = None).values

In [14]:
real_vals = real_vals.reshape(1,-1)

In [15]:
rmse = np.sqrt(np.mean((real_vals - predict)**2))
rmse

0.3753999973998419