In [48]:
import pyspark
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("NLP Practice").getOrCreate()
spark

In [49]:
from pyspark.ml.feature import *
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.ml.classification import *
from pyspark.ml.evaluation import *
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# For pipeline development
from pyspark.ml import Pipeline

In [50]:
path = "/home/randi/Documents/pyspark_udemy/Jupyter+Notebooks+and+Datasets+AS+of+16MAY21/Jupyter_Notebooks_and_Datasets_AS_of_16MAY21/Practice/Datasets/"

df = spark.read.csv(path+'fake_job_postings.csv', inferSchema = True, header=True)

In [51]:
df.limit(4).toPandas()

Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,1,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...","Food52, a fast-growing, James Beard Award-winn...",Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0
1,2,Customer Service - Cloud Video Production,"NZ, , Auckland",Success,,"90 Seconds, the worlds Cloud Video Production ...",Organised - Focused - Vibrant - Awesome!Do you...,What we expect from you:Your key responsibilit...,What you will get from usThrough being part of...,0,1,0,Full-time,Not Applicable,,Marketing and Advertising,Customer Service,0
2,3,Commissioning Machinery Assistant (CMA),"US, IA, Wever",,,Valor Services provides Workforce Solutions th...,"Our client, located in Houston, is actively se...",Implement pre-commissioning and commissioning ...,,0,1,0,,,,,,0
3,4,Account Executive - Washington DC,"US, DC, Washington",Sales,,Our passion for improving quality of life thro...,THE COMPANY: ESRI – Environmental Systems Rese...,"EDUCATION: Bachelor’s or Master’s in GIS, busi...",Our culture is anything but corporate—we have ...,0,1,0,Full-time,Mid-Senior level,Bachelor's Degree,Computer Software,Sales,0


In [52]:
df.filter("fraudulent=1").show(1, False)

+------+-----------------+---------------+----------+------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [53]:
df.printSchema()

root
 |-- job_id: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- location: string (nullable = true)
 |-- department: string (nullable = true)
 |-- salary_range: string (nullable = true)
 |-- company_profile: string (nullable = true)
 |-- description: string (nullable = true)
 |-- requirements: string (nullable = true)
 |-- benefits: string (nullable = true)
 |-- telecommuting: string (nullable = true)
 |-- has_company_logo: string (nullable = true)
 |-- has_questions: string (nullable = true)
 |-- employment_type: string (nullable = true)
 |-- required_experience: string (nullable = true)
 |-- required_education: string (nullable = true)
 |-- industry: string (nullable = true)
 |-- function: string (nullable = true)
 |-- fraudulent: string (nullable = true)



In [54]:
df.count()

17880

In [55]:
def null_value_calc(df):
    null_columns_counts = []
    numRows = df.count()
    for k in df.columns:
        nullRows = df.where(col(k).isNull()).count()
        if(nullRows>0):
            temp = k,nullRows,(nullRows/numRows)*100
            null_columns_counts.append(temp)
    return(null_columns_counts)

null_columns_calc_list = null_value_calc(df)
spark.createDataFrame(null_columns_calc_list, ['Column_Name', 'Null_Values_Count','Null_Value_Percent']).show()


+-------------------+-----------------+--------------------+
|        Column_Name|Null_Values_Count|  Null_Value_Percent|
+-------------------+-----------------+--------------------+
|           location|              346|  1.9351230425055927|
|         department|            11547|   64.58053691275167|
|       salary_range|            15011|   83.95413870246085|
|    company_profile|             3308|  18.501118568232663|
|        description|                1|0.005592841163310962|
|       requirements|             2573|  14.390380313199106|
|           benefits|             6966|   38.95973154362416|
|      telecommuting|               89| 0.49776286353467564|
|   has_company_logo|               29|  0.1621923937360179|
|      has_questions|               30| 0.16778523489932887|
|    employment_type|             3292|   18.41163310961969|
|required_experience|             6723|  37.600671140939596|
| required_education|             7748|  43.333333333333336|
|           industry|   

In [56]:
og_len = df.count()
drop_len = df.na.drop().count()
print("Total Null Rows:", og_len-drop_len)
print("Percentage Null Rows:", (og_len-drop_len)/og_len)

Total Null Rows: 17094
Percentage Null Rows: 0.9560402684563758


In [57]:
#df = df.select("description","fraudulent")
#df.show(1, False)
df = df.na.drop(subset=["fraudulent","description"])

In [58]:
df.count()

17704

In [59]:
df.groupBy("fraudulent").count().orderBy(col("count").desc()).show()

+--------------------+-----+
|          fraudulent|count|
+--------------------+-----+
|                   0|16080|
|                   1|  886|
|           Full-time|   73|
|Hospital & Health...|   55|
|   Bachelor's Degree|   53|
|         Engineering|   26|
| perform quality ...|   17|
|    Mid-Senior level|   15|
|         Unspecified|   15|
|           Associate|   14|
|               Sales|   14|
| passionate about...|   13|
|Information Techn...|   13|
|           Marketing|   13|
|            Internet|   12|
|   Computer Software|   12|
|We offer an excel...|   11|
|      Not Applicable|   11|
| además con el fi...|   10|
|    Customer Service|    7|
+--------------------+-----+
only showing top 20 rows



In [60]:
df = df.filter("fraudulent IN(0,1)")

In [61]:
df.groupBy("fraudulent").count().orderBy(col("count").desc()).show()

+----------+-----+
|fraudulent|count|
+----------+-----+
|         0|16080|
|         1|  886|
+----------+-----+



In [62]:
df = df.sampleBy("fraudulent", fractions={"0":0.4, "1":1.0}, seed = 10)
df.groupBy("fraudulent").count().orderBy(col("count").desc()).show()

+----------+-----+
|fraudulent|count|
+----------+-----+
|         0| 6323|
|         1|  886|
+----------+-----+



In [63]:
df.select("description").show(5, False)

+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [64]:
df = df.withColumn("description", regexp_replace(col("description"), "[^A-Za-z ]+", ""))

In [65]:
df = df.withColumn("description", regexp_replace(col("description"), " +", " "))

In [66]:
df = df.withColumn("description", lower(col("description")))

In [67]:
df.limit(5).toPandas()

Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,has_company_logo,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent
0,1,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...",food a fastgrowing james beard awardwinning on...,Experience with content management systems a m...,,0,1,0,Other,Internship,,,Marketing,0
1,5,Bill Review Manager,"US, FL, Fort Worth",,,SpotSource Solutions LLC is a Global Human Cap...,job title itemization review managerlocation f...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,1,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0
2,6,Accounting Clerk,"US, MD,",,,,job overviewapex is an environmental consultin...,,,0,0,0,,,,,,0
3,10,Customer Service Associate - Part Time,"US, AZ, Phoenix",,,"Novitex Enterprise Solutions, formerly Pitney ...",the customer service associate will be based i...,Minimum Requirements:Minimum of 6 months custo...,,0,1,0,Part-time,Entry level,High School or equivalent,Financial Services,Customer Service,0
4,12,Talent Sourcer (6 months fixed-term contract),"GB, LND, London",HR,,Want to build a 21st century financial service...,transferwise is the clever new way to move mon...,We’re looking for someone who:Proven track rec...,You will join one of Europe’s most hotly tippe...,0,1,0,,,,,,0


In [68]:
regex_tokenizer = RegexTokenizer(inputCol="description", outputCol = "words", pattern = "\\W")
remover = StopWordsRemover(inputCol=regex_tokenizer.getOutputCol(), outputCol = "filtered")
indexer = StringIndexer(inputCol = "fraudulent", outputCol="label")

pipeline = Pipeline(stages = [regex_tokenizer, remover, indexer])
data_prep_pl = pipeline.fit(df)

feature_data = data_prep_pl.transform(df)

feature_data.show(1, False)

+------+----------------+----------------+----------+------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------

In [69]:
feature_data.limit(5).toPandas()

Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,...,has_questions,employment_type,required_experience,required_education,industry,function,fraudulent,words,filtered,label
0,1,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...",food a fastgrowing james beard awardwinning on...,Experience with content management systems a m...,,0,...,0,Other,Internship,,,Marketing,0,"[food, a, fastgrowing, james, beard, awardwinn...","[food, fastgrowing, james, beard, awardwinning...",0.0
1,5,Bill Review Manager,"US, FL, Fort Worth",,,SpotSource Solutions LLC is a Global Human Cap...,job title itemization review managerlocation f...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,...,1,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0,"[job, title, itemization, review, managerlocat...","[job, title, itemization, review, managerlocat...",0.0
2,6,Accounting Clerk,"US, MD,",,,,job overviewapex is an environmental consultin...,,,0,...,0,,,,,,0,"[job, overviewapex, is, an, environmental, con...","[job, overviewapex, environmental, consulting,...",0.0
3,10,Customer Service Associate - Part Time,"US, AZ, Phoenix",,,"Novitex Enterprise Solutions, formerly Pitney ...",the customer service associate will be based i...,Minimum Requirements:Minimum of 6 months custo...,,0,...,0,Part-time,Entry level,High School or equivalent,Financial Services,Customer Service,0,"[the, customer, service, associate, will, be, ...","[customer, service, associate, based, phoenix,...",0.0
4,12,Talent Sourcer (6 months fixed-term contract),"GB, LND, London",HR,,Want to build a 21st century financial service...,transferwise is the clever new way to move mon...,We’re looking for someone who:Proven track rec...,You will join one of Europe’s most hotly tippe...,0,...,0,,,,,,0,"[transferwise, is, the, clever, new, way, to, ...","[transferwise, clever, new, way, move, money, ...",0.0


In [70]:
hashingTF = HashingTF(inputCol = 'filtered', outputCol='rawfeatures', numFeatures=20)
HTFfeaturizedData = hashingTF.transform(feature_data)

HTFfeaturizedData.show(1, False)

+------+----------------+----------------+----------+------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------

In [71]:
idf = IDF(inputCol='rawfeatures', outputCol='features')
idfModel = idf.fit(HTFfeaturizedData)
TFIDFfeaturizedData = idfModel.transform(HTFfeaturizedData)

TFIDFfeaturizedData.limit(4).toPandas()

Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,...,required_experience,required_education,industry,function,fraudulent,words,filtered,label,rawfeatures,features
0,1,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...",food a fastgrowing james beard awardwinning on...,Experience with content management systems a m...,,0,...,Internship,,,Marketing,0,"[food, a, fastgrowing, james, beard, awardwinn...","[food, fastgrowing, james, beard, awardwinning...",0.0,"(6.0, 6.0, 8.0, 2.0, 4.0, 4.0, 4.0, 5.0, 5.0, ...","(0.5140544507846239, 0.6469756304009069, 0.413..."
1,5,Bill Review Manager,"US, FL, Fort Worth",,,SpotSource Solutions LLC is a Global Human Cap...,job title itemization review managerlocation f...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,...,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0,"[job, title, itemization, review, managerlocat...","[job, title, itemization, review, managerlocat...",0.0,"(9.0, 4.0, 9.0, 18.0, 7.0, 4.0, 3.0, 6.0, 5.0,...","(0.7710816761769358, 0.43131708693393794, 0.46..."
2,6,Accounting Clerk,"US, MD,",,,,job overviewapex is an environmental consultin...,,,0,...,,,,,0,"[job, overviewapex, is, an, environmental, con...","[job, overviewapex, environmental, consulting,...",0.0,"(10.0, 4.0, 24.0, 24.0, 18.0, 10.0, 11.0, 22.0...","(0.8567574179743731, 0.43131708693393794, 1.23..."
3,10,Customer Service Associate - Part Time,"US, AZ, Phoenix",,,"Novitex Enterprise Solutions, formerly Pitney ...",the customer service associate will be based i...,Minimum Requirements:Minimum of 6 months custo...,,0,...,Entry level,High School or equivalent,Financial Services,Customer Service,0,"[the, customer, service, associate, will, be, ...","[customer, service, associate, based, phoenix,...",0.0,"(8.0, 4.0, 8.0, 1.0, 2.0, 5.0, 6.0, 4.0, 9.0, ...","(0.6854059343794985, 0.43131708693393794, 0.41..."


In [72]:
HTFfeaturizedData = HTFfeaturizedData.withColumnRenamed("rawfeatures", "features")

In [73]:
Word2Vec = Word2Vec(vectorSize=3, minCount=0, inputCol='filtered', outputCol='features')
model = Word2Vec.fit(feature_data)
W2VfeaturizedData = model.transform(feature_data)

W2VfeaturizedData.limit(4).toPandas()

                                                                                

Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,...,employment_type,required_experience,required_education,industry,function,fraudulent,words,filtered,label,features
0,1,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...",food a fastgrowing james beard awardwinning on...,Experience with content management systems a m...,,0,...,Other,Internship,,,Marketing,0,"[food, a, fastgrowing, james, beard, awardwinn...","[food, fastgrowing, james, beard, awardwinning...",0.0,"[-0.03416690329396272, 0.19158863980040142, 0...."
1,5,Bill Review Manager,"US, FL, Fort Worth",,,SpotSource Solutions LLC is a Global Human Cap...,job title itemization review managerlocation f...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,...,Full-time,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0,"[job, title, itemization, review, managerlocat...","[job, title, itemization, review, managerlocat...",0.0,"[-0.14776798056029694, 0.36256654325842375, -0..."
2,6,Accounting Clerk,"US, MD,",,,,job overviewapex is an environmental consultin...,,,0,...,,,,,,0,"[job, overviewapex, is, an, environmental, con...","[job, overviewapex, environmental, consulting,...",0.0,"[0.14494441262012536, 0.23038675352949606, 0.1..."
3,10,Customer Service Associate - Part Time,"US, AZ, Phoenix",,,"Novitex Enterprise Solutions, formerly Pitney ...",the customer service associate will be based i...,Minimum Requirements:Minimum of 6 months custo...,,0,...,Part-time,Entry level,High School or equivalent,Financial Services,Customer Service,0,"[the, customer, service, associate, will, be, ...","[customer, service, associate, based, phoenix,...",0.0,"[0.10557597251165482, -0.0541582547349944, -1...."


In [74]:
scaler = MinMaxScaler(inputCol="features", outputCol="scaledFeatures")
scalerModel = scaler.fit(W2VfeaturizedData)
scaled_data = scalerModel.transform(W2VfeaturizedData)
scaled_data.limit(4).toPandas()

                                                                                

Unnamed: 0,job_id,title,location,department,salary_range,company_profile,description,requirements,benefits,telecommuting,...,required_experience,required_education,industry,function,fraudulent,words,filtered,label,features,scaledFeatures
0,1,Marketing Intern,"US, NY, New York",Marketing,,"We're Food52, and we've created a groundbreaki...",food a fastgrowing james beard awardwinning on...,Experience with content management systems a m...,,0,...,Internship,,,Marketing,0,"[food, a, fastgrowing, james, beard, awardwinn...","[food, fastgrowing, james, beard, awardwinning...",0.0,"[-0.03416690329396272, 0.19158863980040142, 0....","[0.7918278264473418, 0.7151323094426304, 0.614..."
1,5,Bill Review Manager,"US, FL, Fort Worth",,,SpotSource Solutions LLC is a Global Human Cap...,job title itemization review managerlocation f...,QUALIFICATIONS:RN license in the State of Texa...,Full Benefits Offered,0,...,Mid-Senior level,Bachelor's Degree,Hospital & Health Care,Health Care Provider,0,"[job, title, itemization, review, managerlocat...","[job, title, itemization, review, managerlocat...",0.0,"[-0.14776798056029694, 0.36256654325842375, -0...","[0.7590790529154867, 0.7963977606180913, 0.550..."
2,6,Accounting Clerk,"US, MD,",,,,job overviewapex is an environmental consultin...,,,0,...,,,,,0,"[job, overviewapex, is, an, environmental, con...","[job, overviewapex, environmental, consulting,...",0.0,"[0.14494441262012536, 0.23038675352949606, 0.1...","[0.8434618079465764, 0.7335729743074082, 0.656..."
3,10,Customer Service Associate - Part Time,"US, AZ, Phoenix",,,"Novitex Enterprise Solutions, formerly Pitney ...",the customer service associate will be based i...,Minimum Requirements:Minimum of 6 months custo...,,0,...,Entry level,High School or equivalent,Financial Services,Customer Service,0,"[the, customer, service, associate, will, be, ...","[customer, service, associate, based, phoenix,...",0.0,"[0.10557597251165482, -0.0541582547349944, -1....","[0.8321127242896313, 0.5983293082624782, 0.004..."


In [75]:
W2VfeaturizedData = scaled_data.select("description","fraudulent", "label", "scaledFeatures")
W2VfeaturizedData = W2VfeaturizedData.withColumnRenamed("scaledFeatures", "features")

In [76]:
W2VfeaturizedData.limit(3).toPandas()

Unnamed: 0,description,fraudulent,label,features
0,food a fastgrowing james beard awardwinning on...,0,0.0,"[0.7918278264473418, 0.7151323094426304, 0.614..."
1,job title itemization review managerlocation f...,0,0.0,"[0.7590790529154867, 0.7963977606180913, 0.550..."
2,job overviewapex is an environmental consultin...,0,0.0,"[0.8434618079465764, 0.7335729743074082, 0.656..."


In [77]:
HTFfeaturizedData.name = 'HTFfeaturizedData'
TFIDFfeaturizedData.name = 'TFIDFfeaturizedData'
W2VfeaturizedData.name = "W2VfeaturizedData"

In [78]:
def ClassTrainEval(classifier,features,classes,train,test):

    def FindMtype(classifier):
        # Intstantiate Model
        M = classifier
        # Learn what it is
        Mtype = type(M).__name__
        
        return Mtype
    
    Mtype = FindMtype(classifier)
    

    def IntanceFitModel(Mtype,classifier,classes,features,train):
        
        if Mtype == "OneVsRest":
            # instantiate the base classifier.
            lr = LogisticRegression()
            # instantiate the One Vs Rest Classifier.
            OVRclassifier = OneVsRest(classifier=lr)
#             fitModel = OVRclassifier.fit(train)
            # Add parameters of your choice here:
            paramGrid = ParamGridBuilder() \
                .addGrid(lr.regParam, [0.1, 0.01]) \
                .build()
            #Cross Validator requires the following parameters:
            crossval = CrossValidator(estimator=OVRclassifier,
                                      estimatorParamMaps=paramGrid,
                                      evaluator=MulticlassClassificationEvaluator(),
                                      numFolds=2) # 3 is best practice
            # Run cross-validation, and choose the best set of parameters.
            fitModel = crossval.fit(train)
            return fitModel
        if Mtype == "MultilayerPerceptronClassifier":
            # specify layers for the neural network:
            # input layer of size features, two intermediate of features+1 and same size as features
            # and output of size number of classes
            # Note: crossvalidator cannot be used here
            features_count = len(features[0][0])
            layers = [features_count, features_count+1, features_count, classes]
            MPC_classifier = MultilayerPerceptronClassifier(maxIter=100, layers=layers, blockSize=128, seed=1234)
            fitModel = MPC_classifier.fit(train)
            return fitModel
        if Mtype in("LinearSVC","GBTClassifier") and classes != 2: # These classifiers currently only accept binary classification
            print(Mtype," could not be used because PySpark currently only accepts binary classification data for this algorithm")
            return
        if Mtype in("LogisticRegression","NaiveBayes","RandomForestClassifier","GBTClassifier","LinearSVC","DecisionTreeClassifier"):
  
            # Add parameters of your choice here:
            if Mtype in("LogisticRegression"):
                paramGrid = (ParamGridBuilder() \
#                              .addGrid(classifier.regParam, [0.1, 0.01]) \
                             .addGrid(classifier.maxIter, [10, 15,20])
                             .build())
                
            # Add parameters of your choice here:
            if Mtype in("NaiveBayes"):
                paramGrid = (ParamGridBuilder() \
                             .addGrid(classifier.smoothing, [0.0, 0.2, 0.4, 0.6]) \
                             .build())
                
            # Add parameters of your choice here:
            if Mtype in("RandomForestClassifier"):
                paramGrid = (ParamGridBuilder() \
                               .addGrid(classifier.maxDepth, [2, 5, 10])
#                                .addGrid(classifier.maxBins, [5, 10, 20])
#                                .addGrid(classifier.numTrees, [5, 20, 50])
                             .build())
                
            # Add parameters of your choice here:
            if Mtype in("GBTClassifier"):
                paramGrid = (ParamGridBuilder() \
#                              .addGrid(classifier.maxDepth, [2, 5, 10, 20, 30]) \
#                              .addGrid(classifier.maxBins, [10, 20, 40, 80, 100]) \
                             .addGrid(classifier.maxIter, [10, 15,50,100])
                             .build())
                
            # Add parameters of your choice here:
            if Mtype in("LinearSVC"):
                paramGrid = (ParamGridBuilder() \
                             .addGrid(classifier.maxIter, [10, 15]) \
                             .addGrid(classifier.regParam, [0.1, 0.01]) \
                             .build())
            
            # Add parameters of your choice here:
            if Mtype in("DecisionTreeClassifier"):
                paramGrid = (ParamGridBuilder() \
#                              .addGrid(classifier.maxDepth, [2, 5, 10, 20, 30]) \
                             .addGrid(classifier.maxBins, [10, 20, 40, 80, 100]) \
                             .build())
            
            #Cross Validator requires all of the following parameters:
            crossval = CrossValidator(estimator=classifier,
                                      estimatorParamMaps=paramGrid,
                                      evaluator=MulticlassClassificationEvaluator(),
                                      numFolds=2) # 3 + is best practice
            # Fit Model: Run cross-validation, and choose the best set of parameters.
            fitModel = crossval.fit(train)
            return fitModel
    
    fitModel = IntanceFitModel(Mtype,classifier,classes,features,train)
    
    # Print feature selection metrics
    if fitModel is not None:
        
        if Mtype in("OneVsRest"):
            # Get Best Model
            BestModel = fitModel.bestModel
            print(" ")
            print('\033[1m' + Mtype + '\033[0m')
            # Extract list of binary models
            models = BestModel.models
            for model in models:
                print('\033[1m' + 'Intercept: '+ '\033[0m',model.intercept,'\033[1m' + '\nCoefficients:'+ '\033[0m',model.coefficients)

        if Mtype == "MultilayerPerceptronClassifier":
            print("")
            print('\033[1m' + Mtype," Weights"+ '\033[0m')
            print('\033[1m' + "Model Weights: "+ '\033[0m',fitModel.weights.size)
            print("")

        if Mtype in("DecisionTreeClassifier", "GBTClassifier","RandomForestClassifier"):
            # FEATURE IMPORTANCES
            # Estimate of the importance of each feature.
            # Each feature’s importance is the average of its importance across all trees 
            # in the ensemble The importance vector is normalized to sum to 1. 
            # Get Best Model
            BestModel = fitModel.bestModel
            print(" ")
            print('\033[1m' + Mtype," Feature Importances"+ '\033[0m')
            print("(Scores add up to 1)")
            print("Lowest score is the least important")
            print(" ")
            print(BestModel.featureImportances)
            
            if Mtype in("DecisionTreeClassifier"):
                global DT_featureimportances
                DT_featureimportances = BestModel.featureImportances.toArray()
                global DT_BestModel
                DT_BestModel = BestModel
            if Mtype in("GBTClassifier"):
                global GBT_featureimportances
                GBT_featureimportances = BestModel.featureImportances.toArray()
                global GBT_BestModel
                GBT_BestModel = BestModel
            if Mtype in("RandomForestClassifier"):
                global RF_featureimportances
                RF_featureimportances = BestModel.featureImportances.toArray()
                global RF_BestModel
                RF_BestModel = BestModel

        if Mtype in("LogisticRegression"):
            # Get Best Model
            BestModel = fitModel.bestModel
            print(" ")
            print('\033[1m' + Mtype," Coefficient Matrix"+ '\033[0m')
            print("You should compares these relative to eachother")
            print("Coefficients: \n" + str(BestModel.coefficientMatrix))
            print("Intercept: " + str(BestModel.interceptVector))
            global LR_coefficients
            LR_coefficients = BestModel.coefficientMatrix.toArray()
            global LR_BestModel
            LR_BestModel = BestModel

        if Mtype in("LinearSVC"):
            # Get Best Model
            BestModel = fitModel.bestModel
            print(" ")
            print('\033[1m' + Mtype," Coefficients"+ '\033[0m')
            print("You should compares these relative to eachother")
            print("Coefficients: \n" + str(BestModel.coefficients))
            global LSVC_coefficients
            LSVC_coefficients = BestModel.coefficients.toArray()
            global LSVC_BestModel
            LSVC_BestModel = BestModel
        
   
    # Set the column names to match the external results dataframe that we will join with later:
    columns = ['Classifier', 'Result']
    
    if Mtype in("LinearSVC","GBTClassifier") and classes != 2:
        Mtype = [Mtype] # make this a list
        score = ["N/A"]
        result = spark.createDataFrame(zip(Mtype,score), schema=columns)
    else:
        predictions = fitModel.transform(test)
        MC_evaluator = MulticlassClassificationEvaluator(metricName="accuracy") # redictionCol="prediction",
        accuracy = (MC_evaluator.evaluate(predictions))*100
        Mtype = [Mtype] # make this a string
        score = [str(accuracy)] #make this a string and convert to a list
        result = spark.createDataFrame(zip(Mtype,score), schema=columns)
        result = result.withColumn('Result',result.Result.substr(0, 5))
        
    return result
    #Also returns the fit model important scores or p values

In [79]:
# from pyspark.ml.classification import *
# from pyspark.ml.evaluation import *
# from pyspark.sql import functions
# from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

# Comment out Naive Bayes if your data still contains negative values
classifiers = [
                LogisticRegression()
                ,OneVsRest()
               ,LinearSVC()
               ,NaiveBayes()
               ,RandomForestClassifier()
               ,GBTClassifier()
               ,DecisionTreeClassifier()
               ,MultilayerPerceptronClassifier()
              ] 

featureDF_list = [HTFfeaturizedData,TFIDFfeaturizedData,W2VfeaturizedData]

In [80]:
for featureDF in featureDF_list:
    print(featureDF.name)
    train, test = featureDF.randomSplit([0.7,0.3], seed=11)

    features = featureDF.select(['features']).collect()
    class_count = featureDF.select(countDistinct("label")).collect()
    classes = class_count[0][0]

    columns = ['Classifier','Result']
    vals = [("Place Holder", "N/A")]
    results = spark.createDataFrame(vals, columns)

    for classifier in classifiers:
        new_result = ClassTrainEval(classifier, features, classes, train, test)
        results = results.union(new_result)
    results = results.where("Classifier!='Place Holder'")
    print(results.show(truncate=False))

HTFfeaturizedData


                                                                                

 
[1mLogisticRegression  Coefficient Matrix[0m
You should compares these relative to eachother
Coefficients: 
DenseMatrix([[-0.02084135, -0.04647101,  0.00528349,  0.04161692, -0.00721138,
              -0.00366765,  0.08073108, -0.01817296, -0.0345073 , -0.00674497,
               0.04157756, -0.06228243, -0.06560296, -0.01242676, -0.03218051,
               0.0797224 ,  0.00129737, -0.02077709,  0.05564467, -0.00402408]])
Intercept: [-1.7538739056298198]


                                                                                

 
[1mOneVsRest[0m
[1mIntercept: [0m 1.7673949621788079 [1m
Coefficients:[0m [0.013699596001929845,0.03755394320856413,-0.00462593218446697,-0.03147622716879421,0.005101493774547054,0.005126544029019827,-0.0639974596152406,0.015410993392439851,0.025565327050193622,0.0054473139872066876,-0.0300303486520928,0.049126269247389234,0.048413640562864194,0.008049224970391615,0.027620729129808022,-0.06299882273386277,0.0009178213634422868,0.016836208364222085,-0.043130006238049995,0.002684962218062481]
[1mIntercept: [0m -1.7673949621632732 [1m
Coefficients:[0m [-0.013699596007044582,-0.03755394321013064,0.004625932186119564,0.03147622716546956,-0.005101493786944669,-0.005126544025416942,0.06399745962515363,-0.015410993390901752,-0.02556532705838482,-0.005447313981212244,0.03003034865751989,-0.049126269247860045,-0.04841364057525717,-0.008049224978918175,-0.027620729127103713,0.06299882275216419,-0.000917821367479112,-0.0168362083582626,0.04313000623779297,-0.002684962220644138]


                                                                                

 
[1mLinearSVC  Coefficients[0m
You should compares these relative to eachother
Coefficients: 
[-0.0009364279131072919,-0.002188197116421925,-0.0004688209052870512,0.0021319078724470596,-0.0007381186382698889,0.0,0.006153152018787303,-0.0018894398823381567,-0.0022515257801326025,-0.0006627715779285882,0.0002856478820824608,-0.002339193136432494,-0.0048573439571895595,-0.0009169301540093623,-0.001167481098443248,0.0047825000172187545,-0.00023310913804416297,-0.0013679016541576282,0.0026932745010846437,-0.0008042571915765041]


                                                                                

 
[1mRandomForestClassifier  Feature Importances[0m
(Scores add up to 1)
Lowest score is the least important
 
(20,[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19],[0.06613042512617784,0.04554856645813327,0.04719591687905851,0.04140765443892019,0.034164432874391845,0.042871097910953684,0.06420658048650599,0.04686836957596832,0.05540390641663283,0.05090374929963417,0.05375084590879853,0.05101024042835781,0.06062667383624228,0.0496448940342742,0.05223915749530694,0.05252174969723981,0.04588961324156382,0.05333072241519955,0.04791405599142485,0.03837134748521561])


                                                                                

 
[1mGBTClassifier  Feature Importances[0m
(Scores add up to 1)
Lowest score is the least important
 
(20,[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19],[0.061852117682657186,0.05646359791559172,0.054984625282304016,0.054444630977027245,0.05155710766257805,0.04584960779191782,0.0407729632370736,0.04691462371224818,0.05839682980358466,0.03370322316309605,0.059457128899880786,0.0408023574834652,0.058872672513295275,0.06658470539856765,0.04312828366850234,0.05770503959909279,0.031165564155839424,0.054886326764443244,0.0358415129212004,0.04661708136763444])
 
[1mDecisionTreeClassifier  Feature Importances[0m
(Scores add up to 1)
Lowest score is the least important
 
(20,[0,1,2,3,5,6,7,9,10,11,12,13,17,18,19],[0.01219252435063646,0.010973271915572819,0.02608308547888155,0.06641598244412432,0.04906178458796776,0.09266576422763101,0.059419720309872026,0.0573583186594524,0.18542535864891077,0.09312826951256573,0.07997182189770888,0.05853368926386936,0.03577652008412855,0.0755165351711

                                                                                

23/01/27 09:16:37 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
23/01/27 09:16:37 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS

[1mMultilayerPerceptronClassifier  Weights[0m
[1mModel Weights: [0m 923

+------------------------------+------+
|Classifier                    |Result|
+------------------------------+------+
|LogisticRegression            |88.81 |
|OneVsRest                     |88.86 |
|LinearSVC                     |88.90 |
|NaiveBayes                    |88.44 |
|RandomForestClassifier        |91.97 |
|GBTClassifier                 |91.87 |
|DecisionTreeClassifier        |89.28 |
|MultilayerPerceptronClassifier|89.46 |
+------------------------------+------+

None
TFIDFfeaturizedData


                                                                                

 
[1mLogisticRegression  Coefficient Matrix[0m
You should compares these relative to eachother
Coefficients: 
DenseMatrix([[-0.24325844, -0.43096839,  0.10227758,  0.39384314, -0.06556127,
              -0.04469701,  0.8670754 , -0.32004126, -0.3521491 , -0.05515062,
               0.63494538, -0.64669185, -1.31449645, -0.23075988, -0.26792781,
               0.96448225,  0.01336484, -0.29772685,  0.37715412, -0.02591433]])
Intercept: [-1.7538739056298192]


                                                                                

 
[1mOneVsRest[0m
[1mIntercept: [0m 1.7673949621648837 [1m
Coefficients:[0m [0.1599005239195457,0.34827225118179705,-0.08954858374732327,-0.2978763267708491,0.04637955862392531,0.06247629368217365,-0.6873514098113032,0.2714006345877066,0.260895691126102,0.04454027758584908,-0.4586039288753766,0.5100885733357495,0.9700714638723311,0.14947078071953426,0.2299640491058222,-0.7621602426435224,0.009454943209947008,0.24125572626618685,-0.2923309432059009,0.01729065506650216]
[1mIntercept: [0m -1.7673949621780003 [1m
Coefficients:[0m [-0.15990052397256985,-0.3482722511702772,0.08954858375031641,0.29787632678385406,-0.046379558670859015,-0.06247629364508799,0.6873514098458156,-0.27140063458999036,-0.26089569116992006,-0.044540277607035765,0.4586039288745568,-0.5100885733426728,-0.9700714639418248,-0.1494707807468574,-0.22996404910165322,0.7621602428268337,-0.009454943273051927,-0.24125572628190456,0.29233094322216435,-0.01729065507845685]


                                                                                

 
[1mLinearSVC  Coefficients[0m
You should compares these relative to eachother
Coefficients: 
[-0.010929907269682336,-0.020293164195994206,-0.009075413651003499,0.02017538133441982,-0.006710508365455348,0.0,0.0660866500052867,-0.03327463519343255,-0.022976955207577456,-0.005419190108458824,0.004362228441219143,-0.0242883432429562,-0.09732733808330131,-0.01702701396190561,-0.009720188029660452,0.057858722046219654,-0.0024013754269523718,-0.019601450632380883,0.018254749857899363,-0.005179266059204237]


                                                                                

 
[1mRandomForestClassifier  Feature Importances[0m
(Scores add up to 1)
Lowest score is the least important
 
(20,[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19],[0.06613042512617784,0.04554856645813327,0.04719591687905851,0.04140765443892019,0.034164432874391845,0.042871097910953684,0.06420658048650599,0.04686836957596832,0.05540390641663283,0.05090374929963417,0.05375084590879853,0.05101024042835781,0.06062667383624228,0.0496448940342742,0.05223915749530694,0.05252174969723981,0.04588961324156382,0.05333072241519955,0.04791405599142485,0.03837134748521561])


                                                                                

 
[1mGBTClassifier  Feature Importances[0m
(Scores add up to 1)
Lowest score is the least important
 
(20,[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19],[0.061852117682657186,0.05646359791559172,0.054984625282304016,0.054444630977027245,0.05155710766257805,0.04584960779191782,0.0407729632370736,0.04691462371224818,0.05839682980358466,0.03370322316309605,0.059457128899880786,0.0408023574834652,0.058872672513295275,0.06658470539856765,0.04312828366850234,0.05770503959909279,0.031165564155839424,0.054886326764443244,0.0358415129212004,0.04661708136763444])


                                                                                

 
[1mDecisionTreeClassifier  Feature Importances[0m
(Scores add up to 1)
Lowest score is the least important
 
(20,[0,1,2,3,5,6,7,9,10,11,12,13,17,18,19],[0.01219252435063646,0.010973271915572819,0.02608308547888155,0.06641598244412432,0.04906178458796776,0.09266576422763101,0.059419720309872026,0.0573583186594524,0.18542535864891077,0.09312826951256573,0.07997182189770888,0.05853368926386936,0.03577652008412855,0.07551653517110311,0.09747735344757529])


                                                                                


[1mMultilayerPerceptronClassifier  Weights[0m
[1mModel Weights: [0m 923



                                                                                

+------------------------------+------+
|Classifier                    |Result|
+------------------------------+------+
|LogisticRegression            |88.81 |
|OneVsRest                     |88.86 |
|LinearSVC                     |88.90 |
|NaiveBayes                    |88.90 |
|RandomForestClassifier        |91.97 |
|GBTClassifier                 |91.87 |
|DecisionTreeClassifier        |89.23 |
|MultilayerPerceptronClassifier|89.37 |
+------------------------------+------+

None
W2VfeaturizedData


                                                                                

 
[1mLogisticRegression  Coefficient Matrix[0m
You should compares these relative to eachother
Coefficients: 
DenseMatrix([[ 3.85934837, -1.41808566, -1.12117483]])

Intercept: [-3.3550912783378832]


                                                                                

 
[1mOneVsRest[0m
[1mIntercept: [0m 2.3768682659007454 [1m
Coefficients:[0m [-1.0918787936125087,0.1980821336818687,0.5022155208565903]
[1mIntercept: [0m -2.3768682659007427 [1m
Coefficients:[0m [1.0918787936125092,-0.19808213368187286,-0.5022155208565906]


                                                                                

 
[1mLinearSVC  Coefficients[0m
You should compares these relative to eachother
Coefficients: 
[0.0753034072084865,0.04108527323041721,-0.0]


                                                                                

 
[1mRandomForestClassifier  Feature Importances[0m
(Scores add up to 1)
Lowest score is the least important
 
(3,[0,1,2],[0.3922397532530658,0.28145060980316217,0.326309636943772])


                                                                                

 
[1mGBTClassifier  Feature Importances[0m
(Scores add up to 1)
Lowest score is the least important
 
(3,[0,1,2],[0.3710975763811599,0.3402639322645396,0.2886384913543005])


                                                                                

 
[1mDecisionTreeClassifier  Feature Importances[0m
(Scores add up to 1)
Lowest score is the least important
 
(3,[0,1,2],[0.5230415693356344,0.27846937738997785,0.1984890532743877])


                                                                                

23/01/27 09:23:35 ERROR StrongWolfeLineSearch: Encountered bad values in function evaluation. Decreasing step size to 0.5
23/01/27 09:23:35 ERROR StrongWolfeLineSearch: Encountered bad values in function evaluation. Decreasing step size to 0.5
23/01/27 09:23:35 ERROR StrongWolfeLineSearch: Encountered bad values in function evaluation. Decreasing step size to 0.25
23/01/27 09:23:35 ERROR StrongWolfeLineSearch: Encountered bad values in function evaluation. Decreasing step size to 0.125
23/01/27 09:23:35 ERROR StrongWolfeLineSearch: Encountered bad values in function evaluation. Decreasing step size to 0.0625
23/01/27 09:23:35 ERROR StrongWolfeLineSearch: Encountered bad values in function evaluation. Decreasing step size to 0.03125
23/01/27 09:23:35 ERROR StrongWolfeLineSearch: Encountered bad values in function evaluation. Decreasing step size to 0.03515625
23/01/27 09:23:35 ERROR LBFGS: Failure! Resetting history: breeze.optimize.FirstOrderException: Line search zoom failed

[1mMult

                                                                                

+------------------------------+------+
|Classifier                    |Result|
+------------------------------+------+
|LogisticRegression            |87.51 |
|OneVsRest                     |87.51 |
|LinearSVC                     |87.51 |
|NaiveBayes                    |87.51 |
|RandomForestClassifier        |90.58 |
|GBTClassifier                 |90.81 |
|DecisionTreeClassifier        |89.41 |
|MultilayerPerceptronClassifier|87.51 |
+------------------------------+------+

None


In [82]:
classifier = RandomForestClassifier()
featureDF = HTFfeaturizedData

train, test = featureDF.randomSplit([0.7, 0.3], seed=11)
features = featureDF.select(['features']).collect()

class_count = featureDF.select(countDistinct('label')).collect()
classes = class_count[0] [0]

ClassTrainEval(classifier, features, classes, train, test)

                                                                                

 
[1mRandomForestClassifier  Feature Importances[0m
(Scores add up to 1)
Lowest score is the least important
 
(20,[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19],[0.06613042512617784,0.04554856645813327,0.04719591687905851,0.04140765443892019,0.034164432874391845,0.042871097910953684,0.06420658048650599,0.04686836957596832,0.05540390641663283,0.05090374929963417,0.05375084590879853,0.05101024042835781,0.06062667383624228,0.0496448940342742,0.05223915749530694,0.05252174969723981,0.04588961324156382,0.05333072241519955,0.04791405599142485,0.03837134748521561])


DataFrame[Classifier: string, Result: string]

In [83]:
predictions = RF_BestModel.transform(test)
print("Predicted Non Fraudulent:")
predictions.select("fraudulent","label","description").filter("prediction=0").orderBy(predictions["prediction"].desc()).show(4, False)
print("Predicted Fraudulent:")
predictions.select("fraudulent","label","description",).filter("prediction=1").orderBy(predictions["prediction"].desc()).show(4, False)

Predicted Non Fraudulent:


                                                                                

+----------+-----+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------



+----------+-----+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|fraudulent|label|description                                                                                                     

                                                                                

In [84]:
spark.stop()