# Random Forests for LANL Earthquake Prediction

In [1]:
import csv
import glob
import os
from itertools import groupby

import numpy as np
import pandas as pd
from keras.callbacks import ModelCheckpoint
from keras.layers import Dense, Activation, Flatten
from keras.models import Sequential
from sklearn import preprocessing
from sklearn import utils
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


### Converting Data to Attributes ###

In [None]:
# Per-group summary statistics of the signal in column 0 of train.csv, plus
# the label of each group. A group is a run of consecutive rows whose second
# column, scaled by 1000 and truncated to an int, is the same value.
#
# Plain lists are used while reading: np.append copies the whole array on
# every call, which makes row-by-row accumulation quadratic.
s_deviation = []
averages = []
minimums = []
maximums = []
variances = []

times = []


def _time_label(row):
    """Return the row's second column scaled by 1000 and truncated to an int."""
    return int(float(row[1]) * 1000)


with open("train.csv", newline="") as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    next(csv_reader, None)  # the first line is the header row

    # groupby yields one run per stretch of consecutive rows sharing a label,
    # so every row (including the row on which the label changes, and the
    # final run of the file) lands in exactly one group, and each label is
    # recorded together with its own statistics. Labels and features can
    # therefore never get out of step, and no trailing truncation is needed.
    for label, rows in groupby(csv_reader, key=_time_label):
        signal = np.array([int(row[0]) for row in rows], dtype=float)

        times.append(label)
        s_deviation.append(signal.std())
        averages.append(signal.mean())
        minimums.append(signal.min())
        maximums.append(signal.max())
        variances.append(signal.var())

# Stored as floats so that later scaling of the labels does not truncate.
times = np.array(times, dtype=float)

print("Done Finding Attributes!")

dataset = pd.DataFrame({'s_deviation' : s_deviation, 'averages' : averages, 'minimums' : minimums, 'maximums' : maximums, 'variances' : variances, 'time' : times})

# The per-feature lists now live inside the DataFrame; drop the duplicates.
del s_deviation,averages,minimums,maximums,variances

### Making Training & Test Data ###

In [4]:
# Separate the 'time' label column from the feature columns.
X = dataset.drop('time', axis=1)
y = dataset['time']

# Split the dataset into train and test data: a quarter of the rows is held
# out for evaluation, and the fixed seed keeps the split reproducible.
train_X, test_X, train_y, test_y = train_test_split(
    X,
    y,
    random_state=0,
    test_size=0.25,
)

# The full frame and its unsplit views are no longer needed.
del dataset, X, y

print("Done Making Training Data!")

### Training and Testing RF Model ### 

In [6]:
# Hyper-parameters of the forest (all of them are accepted by both the
# classifier and the regressor variants of scikit-learn's random forest).
parameters = {'bootstrap': True,
              'min_samples_leaf': 3,
              'n_estimators': 50,
              'min_samples_split': 10,
              'max_features': 'sqrt',
              'max_depth': 6,
              'max_leaf_nodes': None}

# The target is a numeric time value and the model is scored with mean
# squared error, so this is a regression problem. A classifier would treat
# every distinct label value as an unrelated class and could only ever
# predict values it has seen during training.
RF_model = RandomForestRegressor(**parameters)
RF_model.fit(train_X, train_y)

print("Done Training! Testing Now...")

# Labels were multiplied by 1000 when the features were built; scale both the
# ground truth and the predictions back before scoring. Whole-array division
# replaces the element-by-element loop and always yields float results.
test_y = np.asarray(test_y, dtype=float) / 1000.0
RF_predictions = np.asarray(RF_model.predict(test_X), dtype=float) / 1000.0

score = mean_squared_error(test_y, RF_predictions)

print("I Have An MSE of" + str(score) + "!" + "\nPredicting Data Now...")



NameError: name 'mean_squared_error' is not defined

### Predicting From Raw Data  ###

In [None]:
# Number of consecutive samples summarized into one feature row.
WINDOW_SIZE = 4096


def window_features(signal, window=WINDOW_SIZE):
    """Summarize consecutive, non-overlapping windows of a 1-D signal.

    The signal is cut into back-to-back windows of ``window`` samples; a
    trailing remainder shorter than ``window`` is ignored. Every sample of a
    full window is used (no sample is skipped between windows).

    Args:
        signal: Sequence of numeric samples.
        window: Number of samples per window.

    Returns:
        A DataFrame with one row per full window and the columns
        's_deviation', 'averages', 'minimums', 'maximums', 'variances'
        (in that order). The frame is empty when the signal is shorter
        than one window.
    """
    samples = np.asarray(signal, dtype=float)
    n_windows = len(samples) // window
    windows = samples[:n_windows * window].reshape(n_windows, window)
    return pd.DataFrame({'s_deviation' : windows.std(axis=1),
                         'averages' : windows.mean(axis=1),
                         'minimums' : windows.min(axis=1),
                         'maximums' : windows.max(axis=1),
                         'variances' : windows.var(axis=1)})


# Output lines are collected in a list and joined once at the end; repeated
# string concatenation in a loop is quadratic.
output_lines = ["seg_id,time_to_failure\n"]

file_names = sorted(glob.glob("./test/*.csv"))
for file_name in file_names:
    with open(file_name, newline="") as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        next(csv_reader, None)  # the first line is the header row
        signal = [int(row[0]) for row in csv_reader if row]

    test_X = window_features(signal)
    if test_X.empty:
        raise ValueError(
            "{} has fewer than {} samples; cannot build a feature row".format(
                file_name, WINDOW_SIZE))

    RF_predictions = RF_model.predict(test_X)

    # The segment id is the file name without directory and extension.
    seg_id = os.path.splitext(os.path.basename(file_name))[0]
    # Report the prediction for the last full window of the segment, scaled
    # by 1/1000 to undo the factor applied to the training labels.
    output_lines.append(seg_id + "," + str(RF_predictions[-1] / 1000.0) + "\n")

output = "".join(output_lines)

# The context manager closes the file even if the write fails.
with open("test_results_RF.csv", "w") as output_file:
    output_file.write(output)

print("Predicting Done!")