In [1]:
import time
import os
import pprint
import logging
from pyspark import SparkContext, SQLContext

from code.common import *

In [2]:
!pkill -9 java

In [2]:
from pyspark import SparkContext, SQLContext
#sc = SparkContext.getOrCreate()
#sqlContext = SQLContext(sc)
ss = SparkSession.builder\
     .config('spark.executor.memory',       '4G')\
     .config('spark.driver.memory',        '40G')\
     .config('spark.driver.maxResultSize', '10G')\
     .getOrCreate()
sc = ss.sparkContext
sqlContext = SQLContext(sc)

In [3]:
logging.getLogger("py4j").setLevel(logging.ERROR)

In [10]:
for file in os.listdir('data'):
    if 'parquet.normed.masked-060000.encode.picked-000987.packed' in file:
        print(file)

train.parquet.normed.masked-060000.encode.picked-000987.packed-001000.oversampled
tests.parquet.normed.masked-060000.encode.picked-000987.packed-001000
train.parquet.normed.masked-060000.encode.picked-000987.packed-001000
valid.parquet.normed.masked-060000.encode.picked-000987.packed-001000


## Oversample Data

In [None]:
train_file = 'data/train.parquet.normed.masked-060000.encode.picked-000987.packed-001000.oversampled'
dev_file = 'data/valid.parquet.normed.masked-060000.encode.picked-000987.packed-001000'
test_file = 'data/tests.parquet.normed.masked-060000.encode.picked-000987.packed-001000'

In [7]:
train = sqlContext.read.parquet(train_file)

#### Count the distribution across the labels

In [12]:
label_counts_df = train.groupby('label').count().toPandas()
label_counts_df

Unnamed: 0,label,count
0,1,9394612
1,0,27274974


In [13]:
negative_count = label_counts_df.at[0, 'count']
positive_count = label_counts_df.at[1, 'count']

In [15]:
total_count = positive_count + negative_count
print(f'Total number of train examples is {total_count}')
print (f'Percentage of negative examples is {negative_count*100/total_count}')
print (f'Percentage of postive examples is {positive_count*100/total_count}')

Total number of train examples is 36669586
Percentage of negative examples is 25.619629302605162
Percentage of postive examples is 74.38037069739484


In [16]:
extra_positives = train.filter(train['label']==1).sample(withReplacement=True, fraction=1.9)

In [17]:
extra_positives.show(5)

+-----+--------------------+------------------+
|label|            features|            weight|
+-----+--------------------+------------------+
|    1|(1000,[0,1,2,4,5,...|0.7438037069739484|
|    1|(1000,[0,1,2,4,5,...|0.7438037069739484|
|    1|(1000,[0,1,2,4,5,...|0.7438037069739484|
|    1|(1000,[0,1,2,4,5,...|0.7438037069739484|
|    1|(1000,[0,1,2,4,5,...|0.7438037069739484|
+-----+--------------------+------------------+
only showing top 5 rows



##### Add the extra positive examples and shuffle

In [18]:
oversampled_train = train.union(extra_positives)
oversampled_train = oversampled_train.sample(withReplacement = False, fraction = 1.0)

In [19]:
oversampled_train.write.parquet('data/train.parquet.normed.masked-060000.encode.picked-000987.packed.oversampled')

#### Tree Based Models

In [4]:
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, DecisionTreeClassifier, GBTClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [5]:
train_file = 'data/train.parquet.normed.masked-060000.encode.picked-000987.packed-001000.oversampled'
dev_file = 'data/valid.parquet.normed.masked-060000.encode.picked-000987.packed-001000'
test_file = 'data/tests.parquet.normed.masked-060000.encode.picked-000987.packed-001000'

In [5]:
train = sqlContext.read.parquet(train_file)

In [None]:
estimator = DecisionTreeClassifier(labelCol="label", featuresCol = "features")
    
print('Starting training')
start_train = time()
model = estimator.fit(train)
end_train = time()
print('Finished training')

#### Decison Tree Classifier

In [14]:
def train_estimator(estimator, train_file, dev_file, test_file, labelCol="label", featuresCol="features", *args, **kwargs):
    
    train = sqlContext.read.parquet(train_file)
    
    algorithm_name = estimator.__name__
    
    estimator = estimator(labelCol=labelCol, featuresCol = featuresCol, *args, **kwargs)
    
    print('Starting training')
    start_train = time()
    model = estimator.fit(train)
    end_train = time()
    print('Finished training')
    
    dev = sqlContext.read.parquet(dev_file)
    
    print('Making predictions')
    train_predictions = model.transform(train)
    dev_predictions = model.transform(dev)
    test_predictions = model.transform(test)
    end_predictions = time()
    print('Compeleted making predictions')
    
    evaluator = BinaryClassificationEvaluator()

    # Calculate the AUC for train and dev
    auc_train = evaluator.evaluate(train_predictions)
    auc_dev = evaluator.evaluate(test_predictions)
    auc_test = evaluator.evaluate(test_predictions)

    print(f'{algorithm_name} - AUC on train is: {auc_train * 100:.2f}')
    print(f'{algorithm_name} - AUC on dev is: {auc_dev * 100:.2f}')
    print(f'{algorithm_name} - AUC on dev is: {auc_test * 100:.2f}')
    print(f'Time to train is {(end_train-start_train):.2f} seconds')
    print(f'Time to predict is {(end_predictions-end_train):.2f} seconds')

In [15]:
train_estimator(estimator=DecisionTreeClassifier, train_file=train_file, dev_file=dev_file)

Starting training
Finished training
Making predictions
Compeleted making predictions
DecisionTreeClassifier - AUC on train is: 57.15
DecisionTreeClassifier - AUC on dev is: 57.14
Time to train is 4.57 minutes
Time to predict is 0.00 minutes


#### Random Forests

In [16]:
train_estimator(estimator=RandomForestClassifier, train_file=train_file, dev_file=dev_file, numTrees=30)

Starting training
Finished training
Making predictions
Compeleted making predictions
RandomForestClassifier - AUC on train is: 70.57
RandomForestClassifier - AUC on dev is: 70.56
Time to train is 6.35 minutes
Time to predict is 0.00 minutes


#### Gradient Boosted Trees

In [None]:
train_estimator(estimator=GBTClassifier, train_file=train_file, dev_file=dev_file, maxIter=10)

Starting training


#### Runnning this with Test Set

In [10]:
def train_estimator(estimator, train_file, dev_file, test_file, labelCol="label", featuresCol="features", *args, **kwargs):
    
    train = sqlContext.read.parquet(train_file)
    
    algorithm_name = estimator.__name__
    
    estimator = estimator(labelCol=labelCol, featuresCol = featuresCol, *args, **kwargs)
    
    print('Starting training')
    start_train = time()
    model = estimator.fit(train)
    end_train = time()
    print('Finished training')
    
    dev = sqlContext.read.parquet(dev_file)
    
    test = sqlContext.read.parquet(test_file)
    
    print('Making predictions')
    train_predictions = model.transform(train)
    dev_predictions = model.transform(dev)
    test_predictions = model.transform(test)
    end_predictions = time()
    print('Compeleted making predictions')
    
    evaluator = BinaryClassificationEvaluator()

    # Calculate the AUC for train and dev
    auc_train = evaluator.evaluate(train_predictions)
    auc_dev = evaluator.evaluate(dev_predictions)
    auc_test = evaluator.evaluate(test_predictions)

    print(f'{algorithm_name} - AUC on train is: {auc_train * 100:.2f}')
    print(f'{algorithm_name} - AUC on dev is: {auc_dev * 100:.2f}')
    print(f'{algorithm_name} - AUC on test is: {auc_test * 100:.2f}')
    print(f'Time to train is {(end_train-start_train):.2f} seconds')
    print(f'Time to predict is {(end_predictions-end_train):.2f} seconds')

#### Decision Trees

In [7]:
train_estimator(estimator=DecisionTreeClassifier, train_file=train_file, dev_file=dev_file, test_file=test_file)

Starting training
Finished training
Making predictions
Compeleted making predictions
DecisionTreeClassifier - AUC on train is: 53.97
DecisionTreeClassifier - AUC on dev is: 53.97
DecisionTreeClassifier - AUC on dev is: 53.95
Time to train is 270.91 seconds
Time to predict is 0.56 seconds


#### Random Forests

In [None]:
train_estimator(estimator=RandomForestClassifier, train_file=train_file, dev_file=dev_file, test_file=test_file, numTrees=30)

Starting training


#### Gradient Boosted Trees

In [None]:
train_estimator(estimator=GBTClassifier, train_file=train_file, dev_file=dev_file, test_file, maxIter=10)