In [None]:
from __future__ import print_function

from pyspark.mllib.tree import RandomForest, RandomForestModel
from pyspark.mllib.linalg import SparseVector, Vectors
from pyspark.mllib.regression import LabeledPoint
from pyspark.sql import SQLContext
from pyspark.ml.feature import StringIndexer,VectorAssembler
from pyspark.ml import Pipeline
from pyspark.ml.linalg import DenseVector
import time
import sys
from operator import add
import pyspark
import os
import numpy as np

In [None]:
def convert(lines,data_group,predict):
    """Parse one text record into a flat tuple of label, features and split tag.

    Square brackets and single quotes are removed from the record, which is
    then split on commas. Fields are not whitespace-stripped, so string
    fields keep any leading/trailing spaces they had in the record.

    Positions in the cleaned record (column names as noted by the author):
        0  Month               kept as string (categorical)
        1  Day_of_month        int
        2  Day_of_week         kept as string (categorical)
        3  Op_unique_carrier   kept as string (categorical)
        4  Tail_num            kept as string (categorical)
        5  Op_carrier_fl_num   kept as string (categorical)
        6  Origin              kept as string (categorical)
        7  Dest                kept as string (categorical)
        8  Crs_dep_time        int
        9  Crs_arr_time        int
        10 Crs_elapsed_time    float
        11 Distance            float
        12 dep_delay           float, label when predict == 'departure'
        13 arr_delay           float, label for any other value of predict

    Args:
        lines: A single record as a string.
        data_group: Tag appended as the last tuple element (the callers in
            this file pass 'train' or 'test').
        predict: 'departure' selects field 12 as the label; anything else
            selects field 13.

    Returns:
        Tuple ``(label, f0, f1, ..., f11, data_group)``.
    """
    cleaned = lines
    for unwanted in ("[", "]", "'"):
        cleaned = cleaned.replace(unwanted, "")
    fields = cleaned.split(',')

    # Pick the label column first so a bad/missing label fails before the
    # feature conversions below are attempted.
    target_idx = 12 if predict == 'departure' else 13
    target = float(fields[target_idx])

    return (
        target,
        fields[0],
        int(fields[1]),
        fields[2],
        fields[3],
        fields[4],
        fields[5],
        fields[6],
        fields[7],
        int(fields[8]),
        int(fields[9]),
        float(fields[10]),
        float(fields[11]),
        data_group,
    )
  

def get_train_test(file_path,library,predict):
    """Load the train and test files and turn them into model-ready data sets.

    Both files are parsed with ``convert`` and merged into one DataFrame so
    that every categorical column is indexed with a single mapping shared by
    train and test. The merged frame is then split back apart using the
    ``data_group`` tag that ``convert`` attached to each row.

    Relies on the module-level ``sc`` (SparkContext) and ``sqlContext``
    (SQLContext) existing before this function is called.

    Args:
        file_path: Sequence of two local paths; index 0 is the training file
            and index 1 the test file. ``file://`` is prepended to each.
        library: ``'mllib'`` returns RDDs of ``LabeledPoint``; any other
            value returns the ``label``/``features`` DataFrames.
        predict: Forwarded to ``convert`` to choose the label column.

    Returns:
        A ``(train, test)`` pair: two RDDs of ``LabeledPoint`` when
        ``library == 'mllib'``, otherwise two DataFrames with the columns
        ``label`` and ``features``.
    """
    # Read train and test data and convert them, tagging each row with its split
    raw_train = sc.textFile("file://"+file_path[0])
    train_conv = raw_train.map(lambda x : convert(x,'train',predict))

    raw_test = sc.textFile("file://"+file_path[1])
    test_conv = raw_test.map(lambda x : convert(x,'test',predict))

    # Merge train and test before indexing the categorical columns so both
    # splits share the same string -> index mapping
    raw_all = sc.union([train_conv, test_conv])
    df = sqlContext.createDataFrame(raw_all, schema = ['label', '0','1','2','3','4','5','6','7','8','9','10','11','data_group'])

    # Column names are the field positions produced by convert()
    numerical_features = ['1','8','9','10','11']
    categorical_features = ['0', '2', '3', '4', '5', '6', '7']

    # One StringIndexer per categorical column, writing to "<name>Indexer"
    stages = [
        StringIndexer(inputCol=name, outputCol=name + "Indexer")
        for name in categorical_features
    ]

    # Assemble indexed categorical columns followed by the numerical columns
    # into a single "features" vector
    assembler_inputs = [name + "Indexer" for name in categorical_features] + numerical_features
    stages.append(VectorAssembler(inputCols=assembler_inputs, outputCol="features"))

    # Fit the pipeline on the merged data and apply it
    pipeline = Pipeline(stages=stages)
    pipeline_model = pipeline.fit(df)
    df_model = pipeline_model.transform(df)

    # Split back into train and test by the data_group tag
    df_train = df_model.filter("data_group=='train'").select('label','features')
    df_test = df_model.filter("data_group=='test'").select('label','features')

    # mllib works on RDDs of LabeledPoint rather than DataFrames
    if library == 'mllib':
        data_LP_train = df_train.rdd.map(lambda x: LabeledPoint(x[0], x[1].toArray()))
        data_LP_test = df_test.rdd.map(lambda x: LabeledPoint(x[0], x[1].toArray()))
        return data_LP_train,data_LP_test

    # Any other library: hand back the DataFrames. (This branch previously
    # returned an undefined name and raised NameError.)
    return df_train,df_test

def mllib_random_forest(max_depth,num_trees,train_set):
    """Train an mllib random forest regressor on ``train_set``.

    Args:
        max_depth: Maximum depth passed to the trainer as ``maxDepth``.
        num_trees: Number of trees passed to the trainer as ``numTrees``.
        train_set: Training data handed straight to
            ``RandomForest.trainRegressor``.

    Returns:
        The model object returned by ``RandomForest.trainRegressor``.
    """
    # No feature is declared categorical to the trainer (empty map); the
    # subset strategy and impurity are fixed to 'auto' and 'variance'.
    rf_options = dict(
        categoricalFeaturesInfo={},
        numTrees=num_trees,
        featureSubsetStrategy="auto",
        impurity='variance',
        maxDepth=max_depth,
    )
    return RandomForest.trainRegressor(train_set, **rf_options)
 

In [None]:

if __name__ == "__main__":
    # Driver: validate the configuration, build the train/test sets once,
    # then train and evaluate the random forest several times and report the
    # average wall-clock time and RMSE.

    # Imported once here instead of on every loop iteration
    from pyspark.mllib.evaluation import RegressionMetrics

    # default on what to predict, can be set to 'arrival'
    predict = 'departure'
    library = 'mllib'

    # Random forest and benchmark settings
    max_depth = 5
    num_trees = 1000
    num_runs = 3

    #training and test input data path, index 0 = train, index 1 = test
    input_path = ["/mnt/wiktorskit-jungwonseo-ns0000k/home/notebook/group03/Predict-Delay/Dataset/cleaned_train_whole.txt",
                  "/mnt/wiktorskit-jungwonseo-ns0000k/home/notebook/group03/Predict-Delay/Dataset/cleaned_test_2000.txt"]
    output_path = "/mnt/wiktorskit-jungwonseo-ns0000k/home/notebook/group03/Renny-temp/predictions01"

    # Both inputs must be existing files under a "/mnt/" path and the output
    # path must be non-empty
    inputs_ok = all(os.path.isfile(path) and "/mnt/" in path for path in input_path[:2])
    if not inputs_ok or len(output_path)==0:
        print("Please check your input path again")
        sys.exit(-1)

    # The loop below only knows how to train/evaluate with mllib; fail early
    # with a clear message instead of a NameError on model_rf/time_start.
    if library != 'mllib':
        print("Unsupported library '%s': only 'mllib' is implemented" % library)
        sys.exit(-1)

    sc = pyspark.SparkContext.getOrCreate()
    sqlContext = SQLContext(sc)

    print('converting train and test data ..')
    train,test = get_train_test(input_path,library,predict)
    print('converting data done')

    runtime = []
    rmse = []

    for run in range(num_runs):
        print('Building random forest model ..')
        time_start=time.time()
        model_rf = mllib_random_forest(max_depth,num_trees,train)

        print('Run prediction to test data')
        predictions = model_rf.predict(test.map(lambda x: x.features))
        labelsAndPredictions = test.map(lambda lp: lp.label).zip(predictions)

        # Computing the MSE forces the (lazy) predictions to be evaluated, so
        # the timer below covers training plus prediction.
        testMSE = labelsAndPredictions.map(lambda lp: (lp[0] - lp[1]) * (lp[0] - lp[1])).sum() /\
            float(test.count())

        time_end=time.time()
        time_rf=(time_end - time_start)
        print("Build Random Forest and do predictions takes %d s" %(time_rf))

        runtime.append(time_rf)

        # testMSE**0.5 is the root of the MSE, so label it as RMSE
        print('Test Root Mean Squared Error = ' + str(testMSE**0.5))

        # RegressionMetrics is documented as taking (prediction, observation)
        # pairs; the pairs here are (label, prediction), which gives the same
        # MSE/RMSE because the squared error is symmetric.
        metrics = RegressionMetrics(labelsAndPredictions)

        # Squared Error
        print("MSE = %s" % metrics.meanSquaredError)
        rmse.append(metrics.rootMeanSquaredError)
        print("RMSE = %s" % metrics.rootMeanSquaredError)

    print ('Average running time =',np.mean(runtime))
    print ('Average RMSE =', np.mean(rmse))




In [None]:
# Stop the SparkContext (`sc`) that the driver cell obtained via getOrCreate().
sc.stop()