# Resample DataFrame

In [8]:
!pkill -9 java

In [1]:
from code.common import *

In [2]:
import time
import os
import pprint
from pyspark import SparkContext, SQLContext

In [3]:
for file in os.listdir('data'): 
    if 'packed' in file:
        print(file)

train.parquet.normed.filled.masked-60000.encode.picked-987.packed
valid.parquet.normed.filled.masked-60000.encode.picked-987.packed
tests.parquet.normed.filled.masked-60000.encode.picked-987.packed


### Run Logistic Regression with Weighting

In [3]:
from pyspark import SparkContext, SQLContext
sc = SparkContext.getOrCreate()
sqlContext = SQLContext(sc)

In [4]:
train = sqlContext.read.parquet('data/train.parquet.normed.filled.masked-60000.encode.picked-987.packed')

In [5]:
train.take(1)

[Row(label=0, features=SparseVector(1000, {0: 0.0529, 1: -0.0128, 2: 0.0151, 5: 0.0444, 8: -0.1634, 9: 0.1, 10: -0.1923, 14: 1.0, 137: 1.0, 192: 1.0, 202: 1.0, 218: 1.0, 264: 1.0, 267: 1.0, 322: 1.0, 428: 1.0, 444: 1.0, 532: 1.0, 545: 1.0, 695: 1.0, 710: 1.0, 822: 1.0, 827: 1.0, 847: 1.0, 893: 1.0, 902: 1.0, 906: 1.0, 969: 1.0, 974: 1.0, 992: 1.0}), weight=0.2561962930260516)]

In [5]:
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier
from pyspark.ml.evaluation     import BinaryClassificationEvaluator

In [7]:
estimator = LogisticRegression(featuresCol='features', labelCol='label', 
                                   maxIter = 10, family = 'binomial',
                              weightCol='weight')
    
start_train = time.time()
model = estimator.fit(train)
end_train = time.time()

In [12]:
dev = sqlContext.read.parquet('data/valid.parquet.normed.filled.masked-60000.encode.picked-987.packed')

# Make Evaluations
start_prediction = time.time()
transformed_train = model.transform(train)
transformed_dev = model.transform(dev)
end_prediction = time.time()

evaluator = BinaryClassificationEvaluator()

auc_train = evaluator.evaluate(transformed_train)
auc_dev = evaluator.evaluate(transformed_dev)

In [13]:
print(f'Random Forests - AUC on train is: {auc_train * 100:.2f}')
print(f'Random Forests - AUC on dev is: {auc_dev * 100:.2f}')
print(f'Training completed in {(end_train - start_train)/60:.2f} minutes')
print(f'Prediction Completed in {(end_prediction - start_prediction)/60:.2f} minutes')
print('\n')

Random Forests - AUC on train is: 73.30
Random Forests - AUC on dev is: 73.29
Training completed in 0.93 minutes
Prediction Completed in 0.00 minutes




### Run Random Forests before resampling

In [None]:
estimator = RandomForestClassifier(featuresCol='features', labelCol='label', 
                                   numTrees = 10)
    
start_train = time.time()
model = estimator.fit(train)
end_train = time.time()

### Resample the Train data

In [4]:
from pyspark import SparkContext, SQLContext
sc = SparkContext.getOrCreate()
sqlContext = SQLContext(sc)

#### Using Masked 100 Train Data

In [6]:
train = sqlContext.read.parquet('data/train.parquet.normed.filled.masked-60000.encode.picked-987.packed')

In [7]:
neg_count = train.filter(train['label']==0).count()
neg_count

27274974

In [8]:
pos_count = train.filter(train['label']==1).count()
pos_count

9394612

In [9]:
total_count = pos_count + neg_count
print(f'Total number of train examples is {total_count}')
print (f'Percentage of negative examples is {neg_count*100/total_count}')
print (f'Percentage of postive examples is {pos_count*100/total_count}')

Total number of train examples is 36669586
Percentage of negative examples is 74.38037069739484
Percentage of postive examples is 25.619629302605162


### Oversampling

In [10]:
# Number of extra data points
extra = neg_count - pos_count
print(f'Extra data points = {extra}')

# Resample rate
resample_rate = extra/pos_count
resample_rate
print(f'The resample rate is {resample_rate}')

Extra data points = 17880362
The resample rate is 1.9032571009851178


In [11]:
extra_positives = train.filter(train['label']==1).sample(withReplacement=True, fraction=1.9)

In [12]:
extra_positives.show(5)

+-----+--------------------+------------------+
|label|            features|            weight|
+-----+--------------------+------------------+
|    1|(1000,[0,1,2,4,5,...|0.7438037069739484|
|    1|(1000,[0,1,2,4,5,...|0.7438037069739484|
|    1|(1000,[0,1,2,4,5,...|0.7438037069739484|
|    1|(1000,[0,1,2,4,5,...|0.7438037069739484|
|    1|(1000,[0,1,2,4,5,...|0.7438037069739484|
+-----+--------------------+------------------+
only showing top 5 rows



In [13]:
oversampled_train = train.union(extra_positives)
oversampled_train = oversampled_train.sample(withReplacement = False, fraction = 1.0)

In [14]:
sampled_neg_count = oversampled_train.filter(oversampled_train['label']==0).count()
sampled_pos_count = oversampled_train.filter(oversampled_train['label']==1).count()

sampled_total_count = sampled_pos_count + sampled_neg_count
print(f'Total number of train examples is {sampled_total_count}')
print (f'Percentage of negative examples is {sampled_neg_count*100/sampled_total_count}')
print (f'Percentage of postive examples is {sampled_pos_count*100/sampled_total_count}')

Total number of train examples is 54508313
Percentage of negative examples is 50.03819142228819
Percentage of postive examples is 49.96180857771181


In [15]:
oversampled_train.select('features').first().features.size

1000

In [15]:
del extra_positives, train

##### Save the Oversampled df

In [16]:
oversampled_train.write.parquet('data/resampled_data/train.parquet.normed.filled.masked-60000.encode.picked-987.packed.oversampled')

##### Training the model without weighting

In [17]:
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier
from pyspark.ml.evaluation     import BinaryClassificationEvaluator

In [22]:
estimator = LogisticRegression(featuresCol='features', labelCol='label', maxIter = 10, regParam = 0.0, family = 'binomial')

In [23]:
evaluator = BinaryClassificationEvaluator()

In [20]:
start_train = time.time()
model = estimator.fit(oversampled_train)
end_train = time.time()

In [24]:
print(f'Time to train is {(end_train-start_train)/60} minutes')

Time to train is 1.3215594331423441 minutes


##### Evaluate the model on Dev Set

In [26]:
dev = sqlContext.read.parquet('data/valid.parquet.normed.filled.masked-60000.encode.picked-987.packed')

In [27]:
# Make Evaluations
start_prediction = time.time()
transformed_train = model.transform(oversampled_train)
transformed_dev = model.transform(dev)
end_prediction = time.time()

In [28]:
print(f'Time to predict is {(end_prediction - start_prediction)/60}')

Time to predict is 0.0013621369997660318


In [29]:
auc_train = evaluator.evaluate(transformed_train)
auc_dev = evaluator.evaluate(transformed_dev)

In [30]:
print(f'Logistic Regression - AUC on train is: {auc_train * 100:.2f}')
print(f'Logistic Regression - AUC on dev is: {auc_dev * 100:.2f}')

Logistic Regression - AUC on train is: 73.31
Logistic Regression - AUC on dev is: 73.29


### Use Regularization and Cross Validation

In [31]:
oversampled_train = sqlContext.read.parquet('data/oversampled_data/train.parquet.normed.filled.masked-60000.encode.picked-987.packed.oversampled')

In [7]:
dev = sqlContext.read.parquet('data/criteo.parquet.df.dev.normed.filled.masked-100.encode.packed')

In [32]:
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier
from pyspark.ml.evaluation     import BinaryClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

In [33]:
estimator = LogisticRegression(featuresCol='features', labelCol='label', maxIter = 10, regParam = 0.01, family = 'binomial')
evaluator = BinaryClassificationEvaluator()

In [34]:
start_train = time.time()
model = estimator.fit(oversampled_train)
end_train = time.time()

In [35]:
regularization_values = [0.01, 0.1, 0.5, 1.0]

start_reg_search = time.time()

for reg_value in regularization_values:
    
    estimator = LogisticRegression(featuresCol='features', labelCol='label', 
                                   maxIter = 10, regParam = reg_value, family = 'binomial')
    
    model = estimator.fit(oversampled_train)
    
    # Make Evaluations

    transformed_train = model.transform(oversampled_train)
    transformed_dev = model.transform(dev)
    end_prediction = time.time()

    # Make Evaluations
    transformed_train = model.transform(oversampled_train)
    transformed_dev = model.transform(dev)


    # Get the AUC
    auc_train = evaluator.evaluate(transformed_train)
    auc_dev = evaluator.evaluate(transformed_dev)

    # Print the AUC
    print(f'Logistic Regression - Regulaization: {reg_value} - AUC on train is: {auc_train * 100:.2f}')
    print(f'Logistic Regression - Regularization: {reg_value} - AUC on dev is: {auc_dev * 100:.2f}')
    
end_reg_search = time.time()

print(f'Completed Regulaization parameter search in {(end_reg_search - start_reg_search)/60} minutes')

Logistic Regression - Regulaization: 0.01 - AUC on train is: 73.28
Logistic Regression - Regularization: 0.01 - AUC on dev is: 73.26
Logistic Regression - Regulaization: 0.1 - AUC on train is: 72.84
Logistic Regression - Regularization: 0.1 - AUC on dev is: 72.83
Logistic Regression - Regulaization: 0.5 - AUC on train is: 71.62
Logistic Regression - Regularization: 0.5 - AUC on dev is: 71.62
Logistic Regression - Regulaization: 1.0 - AUC on train is: 70.95
Logistic Regression - Regularization: 1.0 - AUC on dev is: 70.96
Completed Regulaization parameter search in 7.8684061646461485 minutes


In [18]:
paramGrid = (ParamGridBuilder()
             .addGrid(estimator.regParam, [0.01, 0.1, 1.0])
             #.addGrid(estimator.maxIter, [5, 10])
             .build())

In [None]:
cv = CrossValidator(estimator=estimator, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=5)

# train using the crossvalidator
start_cv_train = time.time()
cvModel = cv.fit(oversampled_train)
end_cv_train = time.time()

In [None]:
print(f'Training with the cross validator took {(end_cv_train - start_cv_train)/60):.2f} seconds')

In [None]:
# Make Predictions on train and dev
train_predictions = cvModel.transform(oversampled_train)
test_predictions = cvModel.transform(dev)

# Calculate the AUC for train and dev
auc_train = evaluator.evaluate(train_predictions)
auc_dev = evaluator.evaluate(test_predictions)

print(f'Logistic Regression - AUC on train is: {auc_train * 100:.2f}')
print(f'Logistic Regression - AUC on dev is: {auc_dev * 100:.2f}')

#### Using Random Forests

In [36]:
rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=10)

start_train = time.time()
model = rf.fit(oversampled_train)
end_train = time.time()

#evaluator = BinaryClassificationEvaluator()

In [41]:
start_predictions = time.time()
train_predictions = model.transform(oversampled_train)
test_predictions = model.transform(dev)
end_predictions = time.time()

#Calculate the AUC for train and dev
auc_train = evaluator.evaluate(train_predictions)
auc_dev = evaluator.evaluate(test_predictions)

print(f'Random Forests - AUC on train is: {auc_train * 100:.2f}')
print(f'Random Forests - AUC on dev is: {auc_dev * 100:.2f}')
print(f'Time to train the Random Forests model is {(end_train-start_train)/60} minutes')
print(f'Time to run predictions is {(end_predictions-start_predictions)/60:.2f} minutes')

Random Forests - AUC on train is: 70.48
Random Forests - AUC on dev is: 70.47
Time to train the Random Forests model is 5.293456252415975 minutes
Time to run predictions is 0.00 minutes


### Undersampling

In [42]:
train = sqlContext.read.parquet('data/train.parquet.normed.filled.masked-60000.encode.picked-987.packed')

neg_count = train.filter(train['label']==0).count()

pos_count = train.filter(train['label']==1).count()

In [43]:
# Resample rate
resample_rate = pos_count/neg_count
resample_rate
print(f'The resample rate is {resample_rate}')

The resample rate is 0.34444073163919425


In [44]:
new_negatives = train.filter(train['label']==0).sample(withReplacement=False, fraction =0.344)

In [45]:
undersampled_train = train.filter(train['label']==1).union(new_negatives)
undersampled_train = undersampled_train.sample(withReplacement = False, fraction = 1.0)

In [46]:
sampled_neg_count = undersampled_train.filter(train['label']==0).count()
sampled_pos_count = undersampled_train.filter(train['label']==1).count()

sampled_total_count = sampled_pos_count + sampled_neg_count
print(f'Total number of train examples is {sampled_total_count}')
print (f'Percentage of negative examples is {sampled_neg_count*100/sampled_total_count}')
print (f'Percentage of postive examples is {sampled_pos_count*100/sampled_total_count}')

Total number of train examples is 18781446
Percentage of negative examples is 49.979293394129506
Percentage of postive examples is 50.020706605870494


In [48]:
undersampled_train.write.parquet('data/resampled_data/train.parquet.normed.filled.masked-60000.encode.picked-987.packed.undersampled')


In [49]:
undersampled_train.cache()
del train

#### Training the model

In [7]:
#dev = sqlContext.read.parquet('data/criteo.parquet.df.dev.normed.filled.masked-100.encode.packed')

from pyspark.ml.classification import LogisticRegression, RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

In [50]:
estimator = LogisticRegression(featuresCol='features', labelCol='label', maxIter = 10, regParam = 0.0, family = 'binomial')

evaluator = BinaryClassificationEvaluator()


start_train = time.time()
model = estimator.fit(undersampled_train)
end_train = time.time()

In [52]:
# Make Evaluations
start_prediction = time.time()
transformed_train = model.transform(undersampled_train)
transformed_dev = model.transform(dev)
end_prediction = time.time()

auc_train = evaluator.evaluate(transformed_train)
auc_dev = evaluator.evaluate(transformed_dev)


print(f'Logistic Regression - AUC on train is: {auc_train * 100:.2f}')
print(f'Logistic Regression - AUC on dev is: {auc_dev * 100:.2f}')
print(f'Training completed in {(end_train - start_train)/60:.2f} minutes')
print(f'Prediction Completed in {(end_prediction - start_prediction)/60:.2f} minutes')

Logistic Regression - AUC on train is: 73.32
Logistic Regression - AUC on dev is: 73.30
Training completed in 0.65 minutes
Prediction Completed in 0.02 minutes


#### Random Forests on the undersampled data

In [54]:
rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=10)

start_train = time.time()
model = rf.fit(undersampled_train)
end_train = time.time()

In [55]:
# Make Evaluations
start_prediction = time.time()
transformed_train = model.transform(undersampled_train)
transformed_dev = model.transform(dev)
end_prediction = time.time()

auc_train = evaluator.evaluate(transformed_train)
auc_dev = evaluator.evaluate(transformed_dev)


print(f'Logistic Regression - AUC on train is: {auc_train * 100:.2f}')
print(f'Logistic Regression - AUC on dev is: {auc_dev * 100:.2f}')
print(f'Training completed in {(end_train - start_train)/60:.2f} minutes')
print(f'Prediction Completed in {(end_prediction - start_prediction)/60:.2f} minutes')

Logistic Regression - AUC on train is: 70.40
Logistic Regression - AUC on dev is: 70.39
Training completed in 0.93 minutes
Prediction Completed in 0.00 minutes


#### Random Forests find ideal number of trees

In [5]:
oversampled_train = sqlContext.read.parquet('data/resampled_data/train.parquet.normed.filled.masked-60000.encode.picked-987.packed.oversampled')

In [None]:
num_trees_list = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]

for num_trees in num_trees_list:
    
    rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=num_trees)
    
    start_train = time.time()
    model = rf.fit(oversampled_train)
    end_train = time.time()
    
    # Make predictions on Traina and Dev
    start_predictions = time.time()
    train_predictions = model.transform(oversampled_data)
    test_predictions = model.transform(dev)
    end_predictions = time.time()

    # Calculate the AUC for train and dev
    auc_train = evaluator.evaluate(train_predictions)
    auc_dev = evaluator.evaluate(test_predictions)

    print(f'Random Forests - AUC on train is: {auc_train * 100:.2f}')
    print(f'Random Forests - AUC on dev is: {auc_dev * 100:.2f}')
    print(f'Training completed in {(end_train - start_train)/60:.2f} minutes')
    print(f'Prediction Completed in {(end_prediction - start_prediction)/60:.2f} minutes')
    print('\n')