In [1]:
from pyspark.sql import SparkSession
# Reuse the active Spark session if there is one, otherwise start one named 'income'.
spark = SparkSession.builder.appName('income').getOrCreate()

In [2]:
from pyspark.sql.types import *

# schema = StructType([
#     StructField("age", IntegerType(), True), 
#     StructField("workclass", StringType(), True),
#     StructField("fnlwgt", FloatType(), True),
#     StructField("education", StringType(), True),
#     StructField("education-num", FloatType(), True),
#     StructField("marital-status", StringType(), True),
#     StructField("occupation", StringType(), True),
#     StructField("relationship", StringType(), True),
#     StructField("race", StringType(), True),
#     StructField("sex", StringType(), True),
#     StructField("capital-gain", FloatType(), True),
#     StructField("capital-loss", FloatType(), True),
#     StructField("hours-per-week", FloatType(), True),
#     StructField("native-country", StringType(), True),
#     StructField("class", StringType(), True)]
# )

# train = spark.read.csv('./adult.data.txt', schema=schema, inferSchema='true')

# Column names, applied positionally to the loaded frame via toDF() below.
headers = ["age", "workclass", "fnlwgt", "education", "education-num",
           "marital-status", "occupation", "relationship", "race", "sex",
           "capital-gain", "capital-loss", "hours-per-week", "native-country",
           "class"]

# Column types are inferred from the data rather than taken from the
# commented-out explicit schema above.
# NOTE(review): string values appear to keep a leading space (the crosstab
# column names printed further down show one) - confirm before comparing
# against literals such as '>50K'.
train = spark.read.csv('./adult.data.txt', inferSchema='true').toDF(*headers)
train.printSchema()

root
 |-- age: integer (nullable = true)
 |-- workclass: string (nullable = true)
 |-- fnlwgt: double (nullable = true)
 |-- education: string (nullable = true)
 |-- education-num: double (nullable = true)
 |-- marital-status: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- relationship: string (nullable = true)
 |-- race: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- capital-gain: double (nullable = true)
 |-- capital-loss: double (nullable = true)
 |-- hours-per-week: double (nullable = true)
 |-- native-country: string (nullable = true)
 |-- class: string (nullable = true)



In [3]:
from pyspark.sql.functions import udf, monotonically_increasing_id

# Add an 'index' column of generated row ids; it is removed from the model
# features in the classification section.
train = train.withColumn('index', monotonically_increasing_id())

In [4]:
# Name of the target column, reused by the analysis cells below.
labelCol = 'class'
# Number of rows in the training set.
train.count()

32561

# Exploratory Data Analysis

In [5]:
# Row count per value of the target column.
train.groupby(labelCol).count().show()

+------+-----+
| class|count|
+------+-----+
|  >50K| 7841|
| <=50K|24720|
+------+-----+



We can see there is a class imbalance problem in our training set.

In [6]:
from pyspark.sql.functions import col

def findMissingValuesCols(df):
    """Return the names of the columns of ``df`` that contain null values.

    All columns are checked in a single aggregation instead of running one
    filter/count job per column.

    Args:
        df: Spark DataFrame to inspect.

    Returns:
        list of str: names of the columns holding at least one null, in
        ``df.columns`` order. Empty when no column has nulls.
    """
    # Local import so the helper does not depend on names imported by other cells.
    from pyspark.sql import functions as F

    if not df.columns:
        return []

    # when() without otherwise() yields null for non-matching rows and count()
    # skips nulls, so each aggregate is the number of null cells in that column.
    nullCounts = df.select(
        [F.count(F.when(F.col(column).isNull(), 1)).alias(column)
         for column in df.columns]
    ).first().asDict()

    # Report column names (the previous version appended the row count instead).
    return [column for column in df.columns if nullCounts[column] > 0]

findMissingValuesCols(train)

[]

In [7]:
# Peek at the distinct 'age' values (the output below is truncated to 20 rows).
train.select('age').distinct().show()

+---+
|age|
+---+
| 31|
| 85|
| 65|
| 53|
| 78|
| 34|
| 81|
| 28|
| 76|
| 27|
| 26|
| 44|
| 22|
| 47|
| 52|
| 86|
| 40|
| 20|
| 57|
| 54|
+---+
only showing top 20 rows



In [8]:
from pyspark.sql.types import DoubleType
from pyspark.sql.functions import format_number

def crosstabPercentage(df, col1, col2, positiveLabel='>50K', negativeLabel='<=50K'):
    """Cross-tabulate ``col1`` against ``col2`` and add the share of one label.

    Args:
        df: Spark DataFrame holding both columns.
        col1: column whose distinct values become the rows of the table.
        col2: column whose distinct values become the count columns.
        positiveLabel: value of ``col2`` (after whitespace stripping) whose
            share is computed. Defaults to '>50K'.
        negativeLabel: the other value of ``col2`` (after whitespace
            stripping). Defaults to '<=50K'.

    Returns:
        DataFrame with the columns '<col1><Col2>' (the ``col1`` values), one
        count column per ``col2`` value with surrounding whitespace removed
        from its name, and 'percentage-of-<positiveLabel>' holding
        positive / (negative + positive) * 100.
    """
    ctabDf = df.crosstab(col1, col2)
    # crosstab names its first column '<col1>_<col2>'; expose it as
    # '<col1><Col2>' and sort the table on it.
    ctabCol = col1 + '_' + col2
    ctabNewCol = col1 + col2.title()
    ctabDf = ctabDf.withColumn(ctabNewCol, ctabDf[ctabCol])\
                   .orderBy(ctabNewCol).drop(ctabCol)

    # Column names come from the data values and may carry surrounding
    # whitespace; re-add each such column under its stripped name.
    for column in ctabDf.columns:
        columnStripped = column.strip()
        if column != columnStripped:
            ctabDf = ctabDf.withColumn(columnStripped, ctabDf[column])\
                           .drop(column)

    positive = ctabDf[positiveLabel]
    total = ctabDf[negativeLabel] + positive
    ctabDf = ctabDf.withColumn('percentage-of-' + positiveLabel, positive / total * 100)
    return ctabDf


In [9]:
# Share of >50K earners per race, lowest share first.
df = crosstabPercentage(train, 'race', labelCol)
# Sort once, on the numeric value, before format_number() turns the column
# into a formatted string (the extra sort that used to precede this was
# discarded work).
df = df.orderBy('percentage-of->50K').withColumn('percentage-of->50K',
                                    format_number(df['percentage-of->50K'], 2))
df.show()

+-------------------+-----+----+------------------+
|          raceClass|<=50K|>50K|percentage-of->50K|
+-------------------+-----+----+------------------+
|              Other|  246|  25|              9.23|
| Amer-Indian-Eskimo|  275|  36|             11.58|
|              Black| 2737| 387|             12.39|
|              White|20699|7117|             25.59|
| Asian-Pac-Islander|  763| 276|             26.56|
+-------------------+-----+----+------------------+



In [10]:
# Share of >50K earners per age, in increasing age order.
df = crosstabPercentage(train, 'age', labelCol)
# The crosstab key column holds the ages as strings; cast it so the ordering
# is numeric rather than lexicographic (which would misplace ages >= 100).
df = df.orderBy(df['ageClass'].cast('int')).withColumn('percentage-of->50K',
                                    format_number(df['percentage-of->50K'], 2))
# Show every row instead of the default first 20.
df.show(df.count())

+--------+-----+----+------------------+
|ageClass|<=50K|>50K|percentage-of->50K|
+--------+-----+----+------------------+
|      17|  395|   0|              0.00|
|      18|  550|   0|              0.00|
|      19|  710|   2|              0.28|
|      20|  753|   0|              0.00|
|      21|  717|   3|              0.42|
|      22|  752|  13|              1.70|
|      23|  865|  12|              1.37|
|      24|  767|  31|              3.88|
|      25|  788|  53|              6.30|
|      26|  722|  63|              8.03|
|      27|  754|  81|              9.70|
|      28|  748| 119|             13.73|
|      29|  679| 134|             16.48|
|      30|  690| 171|             19.86|
|      31|  705| 183|             20.61|
|      32|  639| 189|             22.83|
|      33|  684| 191|             21.83|
|      34|  643| 243|             27.43|
|      35|  659| 217|             24.77|
|      36|  635| 263|             29.29|
|      37|  566| 292|             34.03|
|      38|  545|

In [11]:
# Share of >50K earners per sex, lowest share first.
df = crosstabPercentage(train, 'sex', labelCol)
# Sort on the numeric value, then replace it with a 2-decimal formatted value.
df = df.orderBy('percentage-of->50K').withColumn('percentage-of->50K', 
                                    format_number(df['percentage-of->50K'], 2))
df.show()


+--------+-----+----+------------------+
|sexClass|<=50K|>50K|percentage-of->50K|
+--------+-----+----+------------------+
|  Female| 9592|1179|             10.95|
|    Male|15128|6662|             30.57|
+--------+-----+----+------------------+



In [12]:
# Share of >50K earners per education level, lowest share first.
df = crosstabPercentage(train, 'education', labelCol)
# Sort once, on the numeric value, before format_number() turns the column
# into a formatted string (the extra sort that used to precede this was
# discarded work).
df = df.orderBy('percentage-of->50K').withColumn('percentage-of->50K',
                                    format_number(df['percentage-of->50K'], 2))
df.show()


+--------------+-----+----+------------------+
|educationClass|<=50K|>50K|percentage-of->50K|
+--------------+-----+----+------------------+
|     Preschool|   51|   0|              0.00|
|       1st-4th|  162|   6|              3.57|
|       5th-6th|  317|  16|              4.80|
|          11th| 1115|  60|              5.11|
|           9th|  487|  27|              5.25|
|       7th-8th|  606|  40|              6.19|
|          10th|  871|  62|              6.65|
|          12th|  400|  33|              7.62|
|       HS-grad| 8826|1675|             15.95|
|  Some-college| 5904|1387|             19.02|
|    Assoc-acdm|  802| 265|             24.84|
|     Assoc-voc| 1021| 361|             26.12|
|     Bachelors| 3134|2221|             41.48|
|       Masters|  764| 959|             55.66|
|   Prof-school|  153| 423|             73.44|
|     Doctorate|  107| 306|             74.09|
+--------------+-----+----+------------------+



In [13]:
# Share of >50K earners per numeric education level, ordered by level.
educationNumClass = crosstabPercentage(train, 'education-num', labelCol)

# Replace the raw percentage with its 2-decimal formatted value.
pctFormatted = format_number(educationNumClass['percentage-of->50K'], 2)
educationNumClass = educationNumClass.withColumn('percentage-of->50K', pctFormatted)

# Swap the crosstab key column for a double-typed copy and sort on it.
levelAsDouble = educationNumClass['education-numClass'].cast(DoubleType())
educationNumClass = educationNumClass.withColumn('education-numClassF', levelAsDouble)\
                                     .orderBy('education-numClassF')\
                                     .drop('education-numClass')

# Move the key column to the front for display.
cols = ['education-numClassF'] + [name for name in educationNumClass.columns
                                  if name != 'education-numClassF']
educationNumClass = educationNumClass.select(cols)
educationNumClass.show()


+-------------------+-----+----+------------------+
|education-numClassF|<=50K|>50K|percentage-of->50K|
+-------------------+-----+----+------------------+
|                1.0|   51|   0|              0.00|
|                2.0|  162|   6|              3.57|
|                3.0|  317|  16|              4.80|
|                4.0|  606|  40|              6.19|
|                5.0|  487|  27|              5.25|
|                6.0|  871|  62|              6.65|
|                7.0| 1115|  60|              5.11|
|                8.0|  400|  33|              7.62|
|                9.0| 8826|1675|             15.95|
|               10.0| 5904|1387|             19.02|
|               11.0| 1021| 361|             26.12|
|               12.0|  802| 265|             24.84|
|               13.0| 3134|2221|             41.48|
|               14.0|  764| 959|             55.66|
|               15.0|  153| 423|             73.44|
|               16.0|  107| 306|             74.09|
+-----------

In [14]:
# Contingency table of 'education-num' (rows) against 'education' (columns);
# the next code cell reads `df` to list the non-zero cells.
df = train.crosstab('education-num', 'education')
df.show()

+-----------------------+-----+-----+-----+--------+--------+--------+----+-----------+----------+----------+----------+--------+--------+----------+------------+-------------+
|education-num_education| 10th| 11th| 12th| 1st-4th| 5th-6th| 7th-8th| 9th| Assoc-acdm| Assoc-voc| Bachelors| Doctorate| HS-grad| Masters| Preschool| Prof-school| Some-college|
+-----------------------+-----+-----+-----+--------+--------+--------+----+-----------+----------+----------+----------+--------+--------+----------+------------+-------------+
|                    5.0|    0|    0|    0|       0|       0|       0| 514|          0|         0|         0|         0|       0|       0|         0|           0|            0|
|                   10.0|    0|    0|    0|       0|       0|       0|   0|          0|         0|         0|         0|       0|       0|         0|           0|         7291|
|                   14.0|    0|    0|    0|       0|       0|       0|   0|          0|         0|         0|      

The table above is a sparse matrix, which makes the non-zero values hard to spot. We will therefore list only the non-zero cells, to find out whether the two features are related and whether one of them is redundant.

In [15]:
from pyspark.sql.functions import coalesce, lit, when

# Collect the non-zero cells of the crosstab, keyed '<education-num>_<education>'.
keyCol = 'education-num_education'
d = {
    rowDict[keyCol] + '_' + k: v
    for rowDict in (row.asDict() for row in df.toLocalIterator())
    for k, v in rowDict.items()
    if k != keyCol and v != 0
}

import json
s = json.dumps(d, indent=4)
print(s)

{
    "5.0_ 9th": 514,
    "10.0_ Some-college": 7291,
    "14.0_ Masters": 1723,
    "1.0_ Preschool": 51,
    "6.0_ 10th": 933,
    "9.0_ HS-grad": 10501,
    "13.0_ Bachelors": 5355,
    "2.0_ 1st-4th": 168,
    "12.0_ Assoc-acdm": 1067,
    "7.0_ 11th": 1175,
    "3.0_ 5th-6th": 333,
    "16.0_ Doctorate": 413,
    "11.0_ Assoc-voc": 1382,
    "8.0_ 12th": 433,
    "4.0_ 7th-8th": 646,
    "15.0_ Prof-school": 576
}


We can see it's obvious that these features are redundant. Only one of them should suffice for our classification task.

Let's try a more rigorous chi-square test instead of something hand-wavy.

First we will define a utility method that indexes the categorical string columns, encodes them into one-hot-encoded vectors, and finally assembles all the feature vectors into one vector for later downstream analysis.

In [16]:
from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer, VectorAssembler
from pyspark.ml.stat import ChiSquareTest
from pyspark.ml import Pipeline

def autoIndexer(df, lableCol, outputCol='assembled'):
    """Index, one-hot encode and assemble the string columns of ``df``.

    Every string column gets a '<name>Indexed' column from a StringIndexer.
    All of those except the label's are one-hot encoded into
    '<name>IndexedOneHotEncoded' and assembled into one vector column.

    Args:
        df: Spark DataFrame to transform.
        lableCol: name of the label column. It is indexed but left out of the
            one-hot encoding and of the assembled vector. A value that is not
            a string, or not a string column of ``df``, excludes nothing.
        outputCol: name of the assembled feature vector column.

    Returns:
        tuple: (names of the string columns, names of the one-hot encoded
        columns, transformed DataFrame).
    """
    stringTypes = [name for name, dtype in df.dtypes if dtype == 'string']
    indexers = [StringIndexer(inputCol=name, outputCol=name + 'Indexed')
                for name in stringTypes]

    # Explicit filtering replaces list.remove() wrapped in a bare `except`,
    # which also swallowed unrelated errors. A non-string label is still
    # tolerated and simply excludes nothing.
    labelIndexed = lableCol + 'Indexed' if isinstance(lableCol, str) else None
    indexedTypes = [name + 'Indexed' for name in stringTypes
                    if name + 'Indexed' != labelIndexed]

    oheTypes = [indexedType + 'OneHotEncoded' for indexedType in indexedTypes]
    ohe = OneHotEncoderEstimator(inputCols=indexedTypes, outputCols=oheTypes)
    assembler = VectorAssembler(inputCols=oheTypes, outputCol=outputCol)
    pipeline = Pipeline(stages=indexers + [ohe, assembler])
    indexed = pipeline.fit(df).transform(df)
    return stringTypes, oheTypes, indexed

In [None]:
from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer, VectorAssembler
from pyspark.ml.stat import ChiSquareTest
from pyspark.ml import Pipeline

# Work on just the two features suspected of being redundant.
indexed = train.select('education-num', 'education')

# 'education' becomes the numeric label column and 'education-num' is one-hot
# encoded into the feature vector column passed to ChiSquareTest.
indexer = StringIndexer(inputCol='education', outputCol='educationIndexed')
indexed = indexer.fit(indexed).transform(indexed)
ohe = OneHotEncoderEstimator(inputCols=['education-num',], outputCols=['education-numOHE',])
indexed = ohe.fit(indexed).transform(indexed)

# The null hypothesis is that the occurrence of the outcomes is statistically independent.
# In general, small p-values (1% to 5%) would cause you to reject the null hypothesis
# and conclude that the two features are dependent.
# Interpret the pValues printed below; no result is recorded in this notebook.
testResult = ChiSquareTest.test(indexed, 'education-numOHE', 'educationIndexed')
r = testResult.head()
print("pValues: " + str(r.pValues))

We can reject the null hypothesis of independence, i.e. the two features are dependent. We will drop the 'education' feature since its information is already covered by 'education-num'.

In [None]:
# 'education' maps one-to-one onto 'education-num' (see the non-zero crosstab
# cells printed above), so keep only the numeric column.
train = train.drop('education')

In [None]:
# Share of >50K earners per work class, lowest share first.
df = crosstabPercentage(train, 'workclass', labelCol)
# Sort once, on the numeric value, before format_number() turns the column
# into a formatted string (the extra sort that used to precede this was
# discarded work).
df = df.orderBy('percentage-of->50K').withColumn('percentage-of->50K',
                                    format_number(df['percentage-of->50K'], 2))
df.show()

In [None]:
# Share of >50K earners per weekly working hours, lowest share first.
df = crosstabPercentage(train, 'hours-per-week', labelCol)
# Sort once, on the numeric value, before format_number() turns the column
# into a formatted string (the extra sort that used to precede this was
# discarded work).
df = df.orderBy('percentage-of->50K').withColumn('percentage-of->50K',
                                    format_number(df['percentage-of->50K'], 2))
df.show()

In [None]:
# autoIndexer's second return value holds the one-hot encoded column names
# (bound to `indexedTypes` here); indexedDf carries the 'assembled' feature
# vector and the indexed label 'classIndexed'.
_, indexedTypes, indexedDf = autoIndexer(train, labelCol)
# The null hypothesis is that the occurrence of the outcomes is statistically independent.
# In general, small p-values (1% to 5%) would cause you to reject the null hypothesis,
# i.e. to treat that component of the feature vector as dependent on the label.
# Interpret the values printed below; no result is recorded in this notebook.
testResult = ChiSquareTest.test(indexedDf, 'assembled', 'classIndexed')
r = testResult.head()
print("pValues: " + str(r.pValues))
print("degreesOfFreedom: " + str(r.degreesOfFreedom))
print("statistics: " + str(r.statistics))


In [None]:
from pyspark.ml.clustering import KMeans

# Cluster the assembled categorical features into two groups. The seed is
# fixed because k-means initialisation is random; without it the cluster
# assignment can change from run to run.
kmeans = KMeans(k=2, featuresCol='assembled', seed=42)
model = kmeans.fit(indexedDf)
# NOTE(review): rebinding indexedDf adds a 'prediction' column to it, so
# re-running this cell without re-running the cell that built indexedDf will
# likely fail on the existing column.
indexedDf = model.transform(indexedDf)

In [None]:
# Cluster assignment next to the indexed class label, for a visual comparison.
indexedDf.select('prediction', 'classIndexed').show()

# Classification

In [None]:
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline

# Names of the string-typed columns of the training frame.
stringTypes = [name for name, dtype in train.dtypes if dtype == 'string']
# Matching output column names for the indexed versions.
indexedTypes = ['{}Indexed'.format(name) for name in stringTypes]

# One StringIndexer per string column, writing to '<name>Indexed'.
indexers = [StringIndexer(inputCol=source, outputCol=target)
            for source, target in zip(stringTypes, indexedTypes)]

In [None]:
from pyspark.ml.feature import OneHotEncoderEstimator, VectorAssembler
from pyspark.ml.classification import GBTClassifier

# Only the feature columns are one-hot encoded; the indexed label is consumed
# directly by the classifier. The lists are built up front so nothing handed
# to the encoder is mutated afterwards.
labelIndexed = 'classIndexed'
featureIndexedTypes = [indexedType for indexedType in indexedTypes
                       if indexedType != labelIndexed]
oheTypes = [indexedType+'OneHotEncoded' for indexedType in featureIndexedTypes]
ohe = OneHotEncoderEstimator(inputCols=featureIndexedTypes, outputCols=oheTypes)

# Model inputs: every non-string column except the generated row id,
# followed by the one-hot encoded vectors.
cols = [column for column in train.columns
        if column not in stringTypes and column != 'index'] + oheTypes

assembler = VectorAssembler(inputCols=cols, outputCol='assembled')
classifier = GBTClassifier(featuresCol='assembled', labelCol='classIndexed')
pipeline = Pipeline(stages=[*indexers, ohe, assembler, classifier])
model = pipeline.fit(train)
# NOTE(review): `train` is rebound to the transformed frame (the evaluation
# cell reads it), so this cell cannot be re-run without reloading the data.
train = model.transform(train)
train

In [None]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
# No metricName is given, so the evaluator's default metric is used
# (areaUnderROC according to the Spark docs - confirm for the installed version).
evaluator = BinaryClassificationEvaluator(labelCol='classIndexed')
# NOTE(review): `train` is the same data the pipeline was fitted on (there is
# no hold-out split), so this score is an optimistic estimate.
metric = evaluator.evaluate(train)
metric

In [None]:
# Pull the fitted stages back out of the pipeline model. The stage order was
# [*indexers, ohe, assembler, classifier], so -1 is the fitted classifier and
# -3 the fitted one-hot encoder.
classifier = model.stages[-1]
ohe = model.stages[-3]
ohe