In [1]:
from pyspark.sql import SparkSession 
from pyspark.sql import functions as fn, Row
# Start (or reuse) the Spark session for this notebook and keep a handle on
# its underlying SparkContext.
spark = (
    SparkSession.builder
    .appName('project_model')
    .getOrCreate()
)
sc = spark.sparkContext

In [2]:
# Load the modelling table from CSV; the first row holds the column names and
# column types are inferred from the data.
df = (
    spark.read
    .format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("ml_df_test.csv")
)

In [3]:
# 60/30/10 train/validation/test split; the fixed seed keeps the split stable
# across runs.
training_df, validation_df, testing_df = df.randomSplit([0.6, 0.3, 0.1], seed=0)

# Report the row count of every split.
for label, split_df in (
    ("# points in training: ", training_df),
    ("# points in validation: ", validation_df),
    ("# points in testing: ", testing_df),
):
    print(label, split_df.count())

# points in training:  126066
# points in validation:  63009
# points in testing:  21138


In [4]:
# Feature columns = every column of df except the label column 'TARGET'.
#
# df.columns returns a freshly built list on each access, so removing the
# label from it is safe. The previous code removed the entry from
# df.schema.names, which is the list stored on the DataFrame's cached schema
# object: that mutated the schema itself and made the cell fail with
# ValueError when run a second time.
variables = list(df.columns)
variables.remove('TARGET')  # still raises ValueError if the label column is missing
len(variables)

170

In [11]:
from pyspark.ml import feature, regression, Pipeline, classification, pipeline, evaluation
from pyspark import sql
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer, VectorAssembler
from pyspark.ml.feature import PCA, StandardScaler
from pyspark.ml.classification import LogisticRegression

# Pipeline: assemble the feature columns into one vector -> standardise
# (centred, withMean=True) -> project onto 22 principal components ->
# logistic regression on the projected features.
assemblerInputs = variables
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol='features')
scaler = StandardScaler(withMean=True, inputCol='features', outputCol='zfeatures')
pca = PCA(k=22, inputCol="zfeatures", outputCol="pcaFeatures")

# NOTE(review): regParam is not set here, so elasticNetParam presumably has no
# effect (no penalty to mix) -- confirm whether regularisation was intended.
lr2 = LogisticRegression(
    labelCol="TARGET",
    featuresCol="pcaFeatures",
    maxIter=100,
    elasticNetParam=.8,
)

stages_2 = [assembler, scaler, pca, lr2]

# Fit every stage on the training split only.
lr_pipe_2 = Pipeline(stages=stages_2).fit(training_df)

# Score the fitted pipeline on the held-out splits.
evaluator2 = evaluation.BinaryClassificationEvaluator(labelCol='TARGET')
for label, split_df in (
    ("Model 2 AUC Val:", validation_df),
    ("Model 2 AUC Test:", testing_df),
):
    print(label, evaluator2.evaluate(lr_pipe_2.transform(split_df)))

Model 2 AUC Val: 0.6957276289469746
Model 2 AUC Test: 0.696638207247368


In [None]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml import feature, regression, Pipeline, classification, pipeline, evaluation
from pyspark import sql
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer, VectorAssembler
from pyspark.ml.feature import PCA, StandardScaler

# Same preprocessing as the logistic-regression cell (assemble -> standardise
# -> 22-component PCA), followed by a 100-tree random forest.
assemblerInputs = variables
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol='features')
scaler = StandardScaler(withMean=True, inputCol='features', outputCol='zfeatures')
pca = PCA(k=22, inputCol="zfeatures", outputCol="pcaFeatures")

# Random forest classifier trained on the PCA-projected features.
rf = RandomForestClassifier(labelCol="TARGET", featuresCol="pcaFeatures", numTrees=100)

stages_2 = [assembler, scaler, pca, rf]

# Fit every stage on the training split only. The variable name and the
# printed "Model 2" label are reused from the logistic-regression cell.
lr_pipe_2 = Pipeline(stages=stages_2).fit(training_df)

# Score the fitted pipeline on the held-out splits.
evaluator2 = evaluation.BinaryClassificationEvaluator(labelCol='TARGET')
for label, split_df in (
    ("Model 2 AUC Val:", validation_df),
    ("Model 2 AUC Test:", testing_df),
):
    print(label, evaluator2.evaluate(lr_pipe_2.transform(split_df)))

In [None]:
from pyspark.ml import feature, regression, Pipeline, classification, pipeline, evaluation
from pyspark import sql
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer, VectorAssembler
from pyspark.ml.feature import PCA, StandardScaler
from pyspark.ml.classification import GBTClassifier

# Same preprocessing as the other model cells (assemble -> standardise ->
# 22-component PCA), followed by a gradient-boosted tree classifier.
assemblerInputs = variables
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol='features')
scaler = StandardScaler(withMean=True, inputCol='features', outputCol='zfeatures')
pca = PCA(k=22, inputCol="zfeatures", outputCol="pcaFeatures")

# Gradient-boosted trees trained on the PCA-projected features.
gbt = GBTClassifier(
    labelCol="TARGET",
    featuresCol="pcaFeatures",
    maxIter=100,
    maxBins=50,
    maxDepth=10,
)

stages_2 = [assembler, scaler, pca, gbt]

# Fit every stage on the training split only. The variable name and the
# printed "Model 2" label are reused from the logistic-regression cell.
lr_pipe_2 = Pipeline(stages=stages_2).fit(training_df)

# Score the fitted pipeline on the held-out splits.
evaluator2 = evaluation.BinaryClassificationEvaluator(labelCol='TARGET')
for label, split_df in (
    ("Model 2 AUC Val:", validation_df),
    ("Model 2 AUC Test:", testing_df),
):
    print(label, evaluator2.evaluate(lr_pipe_2.transform(split_df)))

In [None]:
# train the model
from pyspark.ml import feature, regression, Pipeline, classification, pipeline, evaluation
from pyspark import sql
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer, VectorAssembler
from pyspark.ml.feature import PCA, StandardScaler
from pyspark.ml.classification import MultilayerPerceptronClassifier

# Same preprocessing as the other model cells (assemble -> standardise ->
# 22-component PCA), followed by a multilayer perceptron classifier.
stages_2 = []

assemblerInputs = variables
assembler = VectorAssembler(inputCols = assemblerInputs, outputCol='features')
stages_2 += [assembler]

scaler = StandardScaler(withMean=True, inputCol='features', outputCol='zfeatures')
stages_2 += [scaler]

pca = PCA(k=22, inputCol="zfeatures", outputCol="pcaFeatures")
stages_2+=[pca]

# Layer sizes for the neural network.
# The input layer must match the size of the vectors in featuresCol. The
# network is fed 'pcaFeatures', so its input size is the number of principal
# components (pca.getK()), not the 170 raw feature columns. Two hidden layers
# of 30 and 20 units follow, and the output layer has 2 units (binary TARGET).
layers = [pca.getK(), 30, 20, 2]

# create the trainer and set its parameters
trainer = MultilayerPerceptronClassifier(featuresCol='pcaFeatures',labelCol='TARGET',maxIter=100,
                                         layers=layers, blockSize=128, seed=1234)
stages_2 += [trainer]

# Fit every stage on the training split only. The variable name and the
# printed "Model 2" label are reused from the logistic-regression cell.
lr_pipe_2 = Pipeline(stages = stages_2).fit(training_df)

# NOTE(review): BinaryClassificationEvaluator reads the 'rawPrediction' column
# by default; on Spark versions before 3.0 the MLP model may not emit that
# column -- confirm against the Spark version in use.
evaluator2 = evaluation.BinaryClassificationEvaluator(labelCol='TARGET')
print("Model 2 AUC Val:", evaluator2.evaluate(lr_pipe_2.transform(validation_df)))
print("Model 2 AUC Test:", evaluator2.evaluate(lr_pipe_2.transform(testing_df)))

In [None]:
import pandas as pd

# Build a table of PCA loadings: one row per input feature, one column per
# principal component (first 10 components only).
#
# The fitted PCA model is stage index 2 of the pipeline (assembler, scaler,
# pca, classifier). The pipeline fitted in this notebook is lr_pipe_2; the
# previous reference to lr_pipe_3 pointed at a name that is never defined.
pca_model = lr_pipe_2.stages[2]

# Matrix with one row per input feature and one column per component.
principal_components = pca_model.pc.toArray()
print(principal_components)

# Number of leading components to expose in the table.
n_components_shown = 10

# Slice the leading components instead of listing each column by hand, and
# avoid rebinding `pca`, which other cells use for the PCA estimator.
pca_df = pd.DataFrame(
    principal_components[:, :n_components_shown],
    columns=['pca_%d' % i for i in range(n_components_shown)],
)
pca_df.insert(0, 'feature', variables)