In [1]:
from pyspark.sql import SparkSession

import sys
sys.path.append('..')
from utils.pysparkutils import *

spark = SparkSession.builder.appName('income').getOrCreate()

In [2]:
from pyspark.sql.types import *

# schema = StructType([
#     StructField("age", IntegerType(), True), 
#     StructField("workclass", StringType(), True),
#     StructField("fnlwgt", FloatType(), True),
#     StructField("education", StringType(), True),
#     StructField("education-num", FloatType(), True),
#     StructField("marital-status", StringType(), True),
#     StructField("occupation", StringType(), True),
#     StructField("relationship", StringType(), True),
#     StructField("race", StringType(), True),
#     StructField("sex", StringType(), True),
#     StructField("capital-gain", FloatType(), True),
#     StructField("capital-loss", FloatType(), True),
#     StructField("hours-per-week", FloatType(), True),
#     StructField("native-country", StringType(), True),
#     StructField("class", StringType(), True)]
# )

# train = spark.read.csv('./adult.data.txt', schema=schema, inferSchema='true')

headers = ["age", "workclass", "fnlwgt", "education", "education-num",
           "marital-status", "occupation", "relationship", "race", "sex",
           "capital-gain", "capital-loss", "hours-per-week", "native-country",
           "class"]

train = spark.read.csv('./adult.data.txt',
                       inferSchema='true', 
                       ignoreLeadingWhiteSpace='true',
                       ignoreTrailingWhiteSpace='true').toDF(*headers)
train.printSchema()

root
 |-- age: integer (nullable = true)
 |-- workclass: string (nullable = true)
 |-- fnlwgt: integer (nullable = true)
 |-- education: string (nullable = true)
 |-- education-num: integer (nullable = true)
 |-- marital-status: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- relationship: string (nullable = true)
 |-- race: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- capital-gain: integer (nullable = true)
 |-- capital-loss: integer (nullable = true)
 |-- hours-per-week: integer (nullable = true)
 |-- native-country: string (nullable = true)
 |-- class: string (nullable = true)



In [3]:
from pyspark.sql.functions import udf, monotonically_increasing_id

train = train.withColumn('index', monotonically_increasing_id())

In [4]:
labelCol = 'class'
train.count()

32561

# Exploratory Data Analysis

In [5]:
train.groupby(labelCol).count().show()

+-----+-----+
|class|count|
+-----+-----+
|<=50K|24720|
| >50K| 7841|
+-----+-----+



We can see there is a class imbalance problem in our training set.

In [6]:
findMissingValuesCols(train)

[]

In [7]:
train.select('age').distinct().show()

+---+
|age|
+---+
| 31|
| 85|
| 65|
| 53|
| 78|
| 34|
| 81|
| 28|
| 76|
| 27|
| 26|
| 44|
| 22|
| 47|
| 52|
| 86|
| 40|
| 20|
| 57|
| 54|
+---+
only showing top 20 rows



In [8]:
from pyspark.sql.types import DoubleType
from pyspark.sql.functions import format_number, lit

def crosstabPercentage(df, col1, col2):
    df2 = df.groupby(col1, col2).count()
    count = df.count()
    df2 = df2.withColumn('percentage', col('count') / count * 100)
    return df2


In [9]:
percentageCol = 'percentage'
df = crosstabPercentage(train, 'race', labelCol).orderBy(percentageCol)
df = df.orderBy(percentageCol).withColumn(percentageCol, 
                                    format_number(df[percentageCol], 2))
df.show()

+------------------+-----+-----+----------+
|              race|class|count|percentage|
+------------------+-----+-----+----------+
|             Other| >50K|   25|      0.08|
|Amer-Indian-Eskimo| >50K|   36|      0.11|
|             Other|<=50K|  246|      0.76|
|Amer-Indian-Eskimo|<=50K|  275|      0.84|
|Asian-Pac-Islander| >50K|  276|      0.85|
|             Black| >50K|  387|      1.19|
|Asian-Pac-Islander|<=50K|  763|      2.34|
|             Black|<=50K| 2737|      8.41|
|             White| >50K| 7117|     21.86|
|             White|<=50K|20699|     63.57|
+------------------+-----+-----+----------+



In [10]:
df = crosstabPercentage(train, 'age', labelCol).orderBy(percentageCol)
df = df.orderBy('ageClass').withColumn('percentage-of->50K', 
                                    format_number(df['percentage-of->50K'], 2))
df.show(df.count())

AnalysisException: "cannot resolve '`ageClass`' given input columns: [age, class, count, percentage];;\n'Sort ['ageClass ASC NULLS FIRST], true\n+- AnalysisBarrier\n      +- Sort [percentage#628 ASC NULLS FIRST], true\n         +- Project [age#40, class#54, count#603L, ((cast(count#603L as double) / cast(32561 as double)) * cast(100 as double)) AS percentage#628]\n            +- Aggregate [age#40, class#54], [age#40, class#54, count(1) AS count#603L]\n               +- Project [age#40, workclass#41, fnlwgt#42, education#43, education-num#44, marital-status#45, occupation#46, relationship#47, race#48, sex#49, capital-gain#50, capital-loss#51, hours-per-week#52, native-country#53, class#54, monotonically_increasing_id() AS index#70L]\n                  +- Project [_c0#10 AS age#40, _c1#11 AS workclass#41, _c2#12 AS fnlwgt#42, _c3#13 AS education#43, _c4#14 AS education-num#44, _c5#15 AS marital-status#45, _c6#16 AS occupation#46, _c7#17 AS relationship#47, _c8#18 AS race#48, _c9#19 AS sex#49, _c10#20 AS capital-gain#50, _c11#21 AS capital-loss#51, _c12#22 AS hours-per-week#52, _c13#23 AS native-country#53, _c14#24 AS class#54]\n                     +- Relation[_c0#10,_c1#11,_c2#12,_c3#13,_c4#14,_c5#15,_c6#16,_c7#17,_c8#18,_c9#19,_c10#20,_c11#21,_c12#22,_c13#23,_c14#24] csv\n"

In [None]:
df = crosstabPercentage(train, 'sex', labelCol)
df = df.orderBy('percentage-of->50K').withColumn('percentage-of->50K', 
                                    format_number(df['percentage-of->50K'], 2))
df.show()


In [None]:
df = crosstabPercentage(train, 'education', labelCol).orderBy('percentage-of->50K')
df = df.orderBy('percentage-of->50K').withColumn('percentage-of->50K', 
                                    format_number(df['percentage-of->50K'], 2))
df.show()


In [None]:
educationNumClass = crosstabPercentage(train, 'education-num', labelCol)
educationNumClass = educationNumClass.withColumn('percentage-of->50K', 
                                    format_number(educationNumClass['percentage-of->50K'], 2))
educationNumClass = educationNumClass.withColumn('education-numClassF', educationNumClass['education-numClass'].cast(DoubleType()))\
                                     .orderBy('education-numClassF').drop('education-numClass')
cols = educationNumClass.columns
cols.remove('education-numClassF')
cols.insert(0, 'education-numClassF')
educationNumClass = educationNumClass.select(cols)
educationNumClass.show()


In [None]:
df = train.crosstab('education-num', 'education')
df.show()

We can see above that this is a sparse matrix, it's hard to find the non-zero values. So we will only focus on non-zero values to find out whether there is any relationship between these features and one of them is redundant.

In [None]:
from pyspark.sql.functions import coalesce, lit, when

iterator = df.toLocalIterator()
d = {}
for row in iterator:
    rowDict = row.asDict()
    educationNum = rowDict['education-num_education']
    for k, v in rowDict.items():
        if k != 'education-num_education' and v != 0:
            d[educationNum+'_'+k] = v

import json
s = json.dumps(d, indent=4)
print(s)

We can see it's obvious that these features are redundant. Only one of them should suffice for our classification task.

Let's try more rigorous chi square test instead of something hand-wavy.

First we will define an utility method that'll index the catgorical string columns, encodes them into one-hot-encoded vectors, and finally assemble all the feature vectos into once vector for later downstream analysis.

In [None]:
from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer, VectorAssembler
from pyspark.ml.stat import ChiSquareTest
from pyspark.ml import Pipeline

def autoIndexer(df, lableCol, outputCol='assembled'):
    stringTypes = [dtype[0] for dtype in df.dtypes if dtype[1] == 'string']
    indexedTypes = [stringType+'Indexed' for stringType in stringTypes]
    try:
        indexedTypes.remove(lableCol+'Indexed')
    except:
        pass
    indexers = [StringIndexer(inputCol=stringType, outputCol=stringType+'Indexed') for stringType in stringTypes]
    oheTypes = [indexedType+'OneHotEncoded' for indexedType in indexedTypes]
    ohe = OneHotEncoderEstimator(inputCols=indexedTypes, outputCols=oheTypes)
    assembler = VectorAssembler(inputCols=oheTypes, outputCol=outputCol)
    pipeline = Pipeline(stages=[*indexers, ohe, assembler])    
    indexed = pipeline.fit(df).transform(df)
    return stringTypes, oheTypes, indexed

In [None]:
from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer, VectorAssembler
from pyspark.ml.stat import ChiSquareTest
from pyspark.ml import Pipeline

indexed = train.select('education-num', 'education')

indexer = StringIndexer(inputCol='education', outputCol='educationIndexed')
indexed = indexer.fit(indexed).transform(indexed)
ohe = OneHotEncoderEstimator(inputCols=['education-num',], outputCols=['education-numOHE',])
indexed = ohe.fit(indexed).transform(indexed)

# The null hypothesis is that the occurrence of the outcomes is statistically independent.
# In general, small p-values (1% to 5%) would cause you to reject the null hypothesis. 
# This very large p-value (92.65%) means that the null hypothesis should not be rejected.
testResult = ChiSquareTest.test(indexed, 'education-numOHE', 'educationIndexed')
r = testResult.head()
print("pValues: " + str(r.pValues))

We can accept the hypothesis that features are dependent. We will drop the 'education' feature since the info. is covered

In [None]:
train = train.drop('education')

In [None]:
df = crosstabPercentage(train, 'workclass', labelCol).orderBy('percentage-of->50K')
df = df.orderBy('percentage-of->50K').withColumn('percentage-of->50K', 
                                    format_number(df['percentage-of->50K'], 2))
df.show()

In [None]:
df = crosstabPercentage(train, 'hours-per-week', labelCol).orderBy('percentage-of->50K')
df = df.orderBy('percentage-of->50K').withColumn('percentage-of->50K', 
                                    format_number(df['percentage-of->50K'], 2))
df.show()

In [None]:
_, indexedTypes, indexedDf = autoIndexer(train, labelCol)
# The null hypothesis is that the occurrence of the outcomes is statistically independent.
# In general, small p-values (1% to 5%) would cause you to reject the null hypothesis. 
# This very large p-value (92.65%) means that the null hypothesis should not be rejected.
testResult = ChiSquareTest.test(indexedDf, 'assembled', 'classIndexed')
r = testResult.head()
print("pValues: " + str(r.pValues))
print("degreesOfFreedom: " + str(r.degreesOfFreedom))
print("statistics: " + str(r.statistics))


In [None]:
from pyspark.ml.clustering import KMeans

kmeans = KMeans(k=2, featuresCol='assembled')
model = kmeans.fit(indexedDf)
indexedDf = model.transform(indexedDf)

In [None]:
indexedDf.select('prediction', 'classIndexed').show()

# Classification

In [None]:
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline

stringTypes = [dtype[0] for dtype in train.dtypes if dtype[1] == 'string']
indexedTypes = [stringType+'Indexed' for stringType in stringTypes]

indexers = [StringIndexer(inputCol=stringType, outputCol=stringType+'Indexed', handleInvalid='skip') \
            for stringType in stringTypes]

In [None]:
from pyspark.ml.feature import OneHotEncoderEstimator, VectorAssembler
from pyspark.ml.classification import GBTClassifier

oheTypes = [indexedType+'OneHotEncoded' for indexedType in indexedTypes]
ohe = OneHotEncoderEstimator(inputCols=indexedTypes, outputCols=oheTypes)

# Fix columns
oheTypes.remove('classIndexedOneHotEncoded')
cols = train.columns[:]
for oheType in oheTypes:
    cols.append(oheType)
for stringType in stringTypes:
    cols.remove(stringType)

cols.remove('index')

assembler = VectorAssembler(inputCols=cols, outputCol='assembled')
classifier = GBTClassifier(featuresCol='assembled', labelCol='classIndexed')
pipeline = Pipeline(stages=[*indexers, ohe, assembler, classifier])
model = pipeline.fit(train)
train = model.transform(train)
train

Since we have class imbalance problem, that's why we will use area under ROC curve as metric.

In [None]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator(labelCol='classIndexed', metricName='areaUnderROC')
metric = evaluator.evaluate(train)
metric

In [None]:
classifier = model.stages[-1]
ohe = model.stages[-3]
ohe

In [None]:
headers = ["age", "workclass", "fnlwgt", "education", "education-num",
           "marital-status", "occupation", "relationship", "race", "sex",
           "capital-gain", "capital-loss", "hours-per-week", "native-country",
           "class"]

test = spark.read.csv('./adult.test.txt',
                      inferSchema='true', 
                      ignoreLeadingWhiteSpace='true',
                      ignoreTrailingWhiteSpace='true').toDF(*headers)
test.select('class').show()

We can see the class labels in the test dataset are different than in train - '>50K' and '>50K.'. So we have to remove the extrac dot from the class label, before evaluating.

In [None]:
from pyspark.sql.types import StringType
stripDot = udf(lambda s: s[:-1], StringType())

test = test.withColumn('classTrailed', stripDot(test['class'])).drop('class').withColumnRenamed('classTrailed', 'class')
test.select('class').show()

In [None]:
test = model.transform(test)
metric = evaluator.evaluate(test)
metric
