In [1]:
from pyspark.sql import SparkSession
# Create (or reuse, via getOrCreate) the SparkSession used by the cells below; 'income' is the application name.
spark = SparkSession.builder.appName('income').getOrCreate()

In [2]:
from pyspark.sql.types import *

# schema = StructType([
#     StructField("age", IntegerType(), True), 
#     StructField("workclass", StringType(), True),
#     StructField("fnlwgt", FloatType(), True),
#     StructField("education", StringType(), True),
#     StructField("education-num", FloatType(), True),
#     StructField("marital-status", StringType(), True),
#     StructField("occupation", StringType(), True),
#     StructField("relationship", StringType(), True),
#     StructField("race", StringType(), True),
#     StructField("sex", StringType(), True),
#     StructField("capital-gain", FloatType(), True),
#     StructField("capital-loss", FloatType(), True),
#     StructField("hours-per-week", FloatType(), True),
#     StructField("native-country", StringType(), True),
#     StructField("class", StringType(), True)]
# )

# train = spark.read.csv('./adult.data.txt', schema=schema, inferSchema='true')

# Column names for the CSV, in file order (the file is read without a header row).
headers = [
    "age", "workclass", "fnlwgt", "education", "education-num",
    "marital-status", "occupation", "relationship", "race", "sex",
    "capital-gain", "capital-loss", "hours-per-week", "native-country",
    "class",
]

# Let Spark infer the column types, then attach the names above.
train = (
    spark.read
    .csv('./adult.data.txt', inferSchema='true')
    .toDF(*headers)
)
train.printSchema()

root
 |-- age: integer (nullable = true)
 |-- workclass: string (nullable = true)
 |-- fnlwgt: double (nullable = true)
 |-- education: string (nullable = true)
 |-- education-num: double (nullable = true)
 |-- marital-status: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- relationship: string (nullable = true)
 |-- race: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- capital-gain: double (nullable = true)
 |-- capital-loss: double (nullable = true)
 |-- hours-per-week: double (nullable = true)
 |-- native-country: string (nullable = true)
 |-- class: string (nullable = true)



In [3]:
# Name of the target (label) column, referenced by the cells below.
labelCol = 'class'
# Number of rows in the training set.
train.count()

32561

# EDA

In [4]:
# Row count per value of the label column.
train.groupby(labelCol).count().show()

+------+-----+
| class|count|
+------+-----+
|  >50K| 7841|
| <=50K|24720|
+------+-----+



The training set is imbalanced: 24,720 rows (about 76%) are `<=50K` and only 7,841 (about 24%) are `>50K`.

In [5]:
from pyspark.sql.functions import col

def findMissingValuesCols(df):
    """Return the names of the columns of ``df`` that contain at least one null.

    Only real nulls are detected; placeholder strings that stand in for a
    missing value (for example a literal '?') are ordinary values as far as
    this check is concerned.

    Args:
        df: A DataFrame exposing ``columns``, ``filter`` and ``count``.

    Returns:
        list of str: Column names, in ``df.columns`` order, that hold a null
        in at least one row. Empty when no column has nulls.
    """
    # One filtered count per column: a column qualifies as soon as any of its
    # rows is null. The column *name* is collected (the previous version
    # appended the non-null row count instead).
    return [
        column
        for column in df.columns
        if df.filter(df[column].isNull()).count() > 0
    ]

# Check the training frame for columns that contain nulls.
findMissingValuesCols(train)

[]

In [6]:
# Distinct age values; show() prints only the first 20 rows by default.
train.select('age').distinct().show()

+---+
|age|
+---+
| 31|
| 85|
| 65|
| 53|
| 78|
| 34|
| 81|
| 28|
| 76|
| 27|
| 26|
| 44|
| 22|
| 47|
| 52|
| 86|
| 40|
| 20|
| 57|
| 54|
+---+
only showing top 20 rows



In [7]:
from pyspark.sql.types import DoubleType
from pyspark.sql.functions import format_number

def crosstabPercentage(df, col1, col2, positiveLabel='>50K', negativeLabel='<=50K'):
    """Cross-tabulate ``col1`` against ``col2`` and add a positive-share column.

    The crosstab's first column (named ``<col1>_<col2>`` by Spark) is renamed
    to ``col1 + col2.title()`` and the rows are sorted by it. Surrounding
    whitespace is stripped from every column name, so labels such as
    ' >50K' become '>50K'. A final column ``'percentage-of-' + positiveLabel``
    holds ``positive / (negative + positive) * 100`` for each row.

    Args:
        df: DataFrame to cross-tabulate.
        col1: Column whose distinct values become the rows.
        col2: Column whose distinct values become the count columns.
        positiveLabel: Stripped name of the count column used as numerator.
        negativeLabel: Stripped name of the other count column.

    Returns:
        The crosstab DataFrame with the renamed first column, the stripped
        count columns and the percentage column appended last.

    Raises:
        ValueError: If either label is not a column of the crosstab.
    """
    classCol = col1 + col2.title()
    # Rename in place rather than copy + drop, so the column order does not
    # depend on which names happened to need stripping.
    ctabDf = df.crosstab(col1, col2).withColumnRenamed(col1 + '_' + col2, classCol)
    ctabDf = ctabDf.orderBy(classCol)

    # Strip extra whitespace from column names.
    for column in ctabDf.columns:
        stripped = column.strip()
        if stripped != column:
            ctabDf = ctabDf.withColumnRenamed(column, stripped)

    for label in (positiveLabel, negativeLabel):
        if label not in ctabDf.columns:
            raise ValueError(
                "label %r is not a value of column %r (crosstab columns: %s)"
                % (label, col2, ctabDf.columns))

    positive = ctabDf[positiveLabel]
    total = ctabDf[negativeLabel] + positive
    return ctabDf.withColumn('percentage-of-' + positiveLabel, positive / total * 100)


In [8]:
# Share of >50K earners per race, lowest first. Sorting happens on the numeric
# column; format_number is applied afterwards purely for display.
df = crosstabPercentage(train, 'race', labelCol).orderBy('percentage-of->50K')
df = df.withColumn(
    'percentage-of->50K',
    format_number(df['percentage-of->50K'], 2),
)
df.show()

+-------------------+-----+----+------------------+
|          raceClass|<=50K|>50K|percentage-of->50K|
+-------------------+-----+----+------------------+
|              Other|  246|  25|              9.23|
| Amer-Indian-Eskimo|  275|  36|             11.58|
|              Black| 2737| 387|             12.39|
|              White|20699|7117|             25.59|
| Asian-Pac-Islander|  763| 276|             26.56|
+-------------------+-----+----+------------------+



In [9]:
# Share of >50K earners per age, listed in age order and printed in full.
# NOTE(review): 'ageClass' comes out of crosstab, so this ordering is
# presumably on string values - confirm it matches numeric order for all ages.
df = crosstabPercentage(train, 'age', labelCol)
df = df.orderBy('ageClass')
df = df.withColumn(
    'percentage-of->50K',
    format_number(df['percentage-of->50K'], 2),
)
df.show(df.count())

+--------+-----+----+------------------+
|ageClass|<=50K|>50K|percentage-of->50K|
+--------+-----+----+------------------+
|      17|  395|   0|              0.00|
|      18|  550|   0|              0.00|
|      19|  710|   2|              0.28|
|      20|  753|   0|              0.00|
|      21|  717|   3|              0.42|
|      22|  752|  13|              1.70|
|      23|  865|  12|              1.37|
|      24|  767|  31|              3.88|
|      25|  788|  53|              6.30|
|      26|  722|  63|              8.03|
|      27|  754|  81|              9.70|
|      28|  748| 119|             13.73|
|      29|  679| 134|             16.48|
|      30|  690| 171|             19.86|
|      31|  705| 183|             20.61|
|      32|  639| 189|             22.83|
|      33|  684| 191|             21.83|
|      34|  643| 243|             27.43|
|      35|  659| 217|             24.77|
|      36|  635| 263|             29.29|
|      37|  566| 292|             34.03|
|      38|  545|

In [10]:
# Share of >50K earners per sex, lowest first; rounded to 2 decimals for display.
df = crosstabPercentage(train, 'sex', labelCol)
df = df.orderBy('percentage-of->50K')
df = df.withColumn(
    'percentage-of->50K',
    format_number(df['percentage-of->50K'], 2),
)
df.show()


+--------+-----+----+------------------+
|sexClass|<=50K|>50K|percentage-of->50K|
+--------+-----+----+------------------+
|  Female| 9592|1179|             10.95|
|    Male|15128|6662|             30.57|
+--------+-----+----+------------------+



In [11]:
# Share of >50K earners per education level, lowest first; rounded for display.
df = crosstabPercentage(train, 'education', labelCol).orderBy('percentage-of->50K')
df = df.withColumn(
    'percentage-of->50K',
    format_number(df['percentage-of->50K'], 2),
)
df.show()


+--------------+-----+----+------------------+
|educationClass|<=50K|>50K|percentage-of->50K|
+--------------+-----+----+------------------+
|     Preschool|   51|   0|              0.00|
|       1st-4th|  162|   6|              3.57|
|       5th-6th|  317|  16|              4.80|
|          11th| 1115|  60|              5.11|
|           9th|  487|  27|              5.25|
|       7th-8th|  606|  40|              6.19|
|          10th|  871|  62|              6.65|
|          12th|  400|  33|              7.62|
|       HS-grad| 8826|1675|             15.95|
|  Some-college| 5904|1387|             19.02|
|    Assoc-acdm|  802| 265|             24.84|
|     Assoc-voc| 1021| 361|             26.12|
|     Bachelors| 3134|2221|             41.48|
|       Masters|  764| 959|             55.66|
|   Prof-school|  153| 423|             73.44|
|     Doctorate|  107| 306|             74.09|
+--------------+-----+----+------------------+



In [12]:
# Share of >50K earners per education-num value.
educationNumClass = crosstabPercentage(train, 'education-num', labelCol)
educationNumClass = educationNumClass.withColumn(
    'percentage-of->50K',
    format_number(educationNumClass['percentage-of->50K'], 2),
)

# Sort on a double-typed copy of the class column and keep that copy in place
# of the original column.
educationNumClass = (
    educationNumClass
    .withColumn('education-numClassF',
                educationNumClass['education-numClass'].cast(DoubleType()))
    .orderBy('education-numClassF')
    .drop('education-numClass')
)

# Move the numeric class column to the front; the other columns keep their order.
cols = ['education-numClassF'] + [
    name for name in educationNumClass.columns if name != 'education-numClassF'
]
educationNumClass = educationNumClass.select(cols)
educationNumClass.show()


+-------------------+-----+----+------------------+
|education-numClassF|<=50K|>50K|percentage-of->50K|
+-------------------+-----+----+------------------+
|                1.0|   51|   0|              0.00|
|                2.0|  162|   6|              3.57|
|                3.0|  317|  16|              4.80|
|                4.0|  606|  40|              6.19|
|                5.0|  487|  27|              5.25|
|                6.0|  871|  62|              6.65|
|                7.0| 1115|  60|              5.11|
|                8.0|  400|  33|              7.62|
|                9.0| 8826|1675|             15.95|
|               10.0| 5904|1387|             19.02|
|               11.0| 1021| 361|             26.12|
|               12.0|  802| 265|             24.84|
|               13.0| 3134|2221|             41.48|
|               14.0|  764| 959|             55.66|
|               15.0|  153| 423|             73.44|
|               16.0|  107| 306|             74.09|
+-----------

In [13]:
# Cross-tabulate education-num against education to see how the numeric code lines up with the text label.
train.crosstab('education-num', 'education').show()


+-----------------------+-----+-----+-----+--------+--------+--------+----+-----------+----------+----------+----------+--------+--------+----------+------------+-------------+
|education-num_education| 10th| 11th| 12th| 1st-4th| 5th-6th| 7th-8th| 9th| Assoc-acdm| Assoc-voc| Bachelors| Doctorate| HS-grad| Masters| Preschool| Prof-school| Some-college|
+-----------------------+-----+-----+-----+--------+--------+--------+----+-----------+----------+----------+----------+--------+--------+----------+------------+-------------+
|                    5.0|    0|    0|    0|       0|       0|       0| 514|          0|         0|         0|         0|       0|       0|         0|           0|            0|
|                   10.0|    0|    0|    0|       0|       0|       0|   0|          0|         0|         0|         0|       0|       0|         0|           0|         7291|
|                   14.0|    0|    0|    0|       0|       0|       0|   0|          0|         0|         0|      

In [14]:
# Share of >50K earners per workclass, lowest first; rounded for display.
df = crosstabPercentage(train, 'workclass', labelCol).orderBy('percentage-of->50K')
df = df.withColumn(
    'percentage-of->50K',
    format_number(df['percentage-of->50K'], 2),
)
df.show()

+-----------------+-----+----+------------------+
|   workclassClass|<=50K|>50K|percentage-of->50K|
+-----------------+-----+----+------------------+
|     Never-worked|    7|   0|              0.00|
|      Without-pay|   14|   0|              0.00|
|                ?| 1645| 191|             10.40|
|          Private|17733|4963|             21.87|
|        State-gov|  945| 353|             27.20|
| Self-emp-not-inc| 1817| 724|             28.49|
|        Local-gov| 1476| 617|             29.48|
|      Federal-gov|  589| 371|             38.65|
|     Self-emp-inc|  494| 622|             55.73|
+-----------------+-----+----+------------------+



In [21]:
# Share of >50K earners per distinct capital-gain value, lowest first; rounded for display.
df = crosstabPercentage(train, 'capital-gain', labelCol).orderBy('percentage-of->50K')
df = df.withColumn(
    'percentage-of->50K',
    format_number(df['percentage-of->50K'], 2),
)
df.show()

+-----------------+-----+----+------------------+
|capital-gainClass|<=50K|>50K|percentage-of->50K|
+-----------------+-----+----+------------------+
|           2062.0|    2|   0|              0.00|
|           2036.0|    4|   0|              0.00|
|           2050.0|    5|   0|              0.00|
|           2009.0|    3|   0|              0.00|
|           1455.0|    1|   0|              0.00|
|           1848.0|    6|   0|              0.00|
|           1831.0|    7|   0|              0.00|
|           1151.0|    8|   0|              0.00|
|            114.0|    6|   0|              0.00|
|           1424.0|    3|   0|              0.00|
|           1409.0|    7|   0|              0.00|
|           1506.0|   15|   0|              0.00|
|           1471.0|    7|   0|              0.00|
|           1639.0|    1|   0|              0.00|
|           1797.0|    7|   0|              0.00|
|          10566.0|    6|   0|              0.00|
|           1055.0|   25|   0|              0.00|


In [15]:
from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer, VectorAssembler
from pyspark.ml.stat import ChiSquareTest
from pyspark.ml import Pipeline

def autoIndexer(df, lableCol):
    """Index and one-hot encode the string columns of ``df``.

    Every string-typed column (the label included) gets a ``<name>Indexed``
    column from a StringIndexer. Every indexed column except the label's is
    then one-hot encoded into ``<name>IndexedOneHotEncoded``, and those
    encoded vectors are concatenated into a single ``assembled`` column.

    Args:
        df: DataFrame to transform; its own ``dtypes`` decide which columns
            are treated as categorical.
        lableCol: Name of the label column. It is indexed but left out of the
            one-hot encoding and of ``assembled``. If it is not a string
            column, nothing is excluded.

    Returns:
        tuple: ``(stringTypes, oheTypes, indexed)`` - the string column names,
        the one-hot output column names, and the transformed DataFrame.
    """
    # Inspect the frame that was passed in, not the notebook-global `train`.
    stringTypes = [name for name, dtype in df.dtypes if dtype == 'string']
    # Keep the label out of the feature encoding; filtering replaces the
    # earlier remove() wrapped in a bare except.
    indexedTypes = [name + 'Indexed' for name in stringTypes if name != lableCol]
    oheTypes = [indexedType + 'OneHotEncoded' for indexedType in indexedTypes]

    indexers = [StringIndexer(inputCol=name, outputCol=name + 'Indexed')
                for name in stringTypes]
    # NOTE(review): OneHotEncoderEstimator is the Spark 2.3/2.4 name; newer
    # releases expose it as OneHotEncoder - confirm the target Spark version.
    ohe = OneHotEncoderEstimator(inputCols=indexedTypes, outputCols=oheTypes)
    assembler = VectorAssembler(inputCols=oheTypes, outputCol='assembled')
    pipeline = Pipeline(stages=[*indexers, ohe, assembler])

    indexed = pipeline.fit(df).transform(df)
    return stringTypes, oheTypes, indexed

# Encode the categorical columns; note that `indexedTypes` receives the
# one-hot output column names returned by autoIndexer.
_, indexedTypes, indexedDf = autoIndexer(train, labelCol)

# Chi-square independence test of every slot of 'assembled' against the
# indexed label. The null hypothesis is that the feature and the label are
# statistically independent; a small p-value (below roughly 1% to 5%) is
# evidence against independence, a large one means it cannot be rejected.
testResult = ChiSquareTest.test(indexedDf, 'assembled', 'classIndexed')
r = testResult.head()
for field in ("pValues", "degreesOfFreedom", "statistics"):
    print(field + ": " + str(getattr(r, field)))


pValues: [0.0,6.06621672894e-08,2.35683250693e-09,0.0,0.0073969884446,0.0,0.0,0.0350527043814,0.0,0.0,0.0,0.0,0.0698181823541,0.0,0.557536965197,0.0,0.0,0.0,0.0,7.77156117238e-16,0.0,1.11022302463e-16,4.56571558338e-10,0.0,0.0,0.0,0.0,0.0,1.65423230669e-14,0.0,0.0232540498908,0.0,0.0,1.91608792943e-05,0.0,0.0,0.0,0.000106491974853,0.0,0.0,3.53831313693e-06,3.89777829346e-07,2.10630402009e-11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0571069627625,2.18791128326e-07,0.0,4.9695980664e-10,0.0,0.583599768678,0.0263766737841,0.0274947609427,0.035664584289,0.000697135482754,0.000169808165581,0.000192370213887,0.609920556916,0.0398097636079,0.0133898002533,0.392700772659,0.600053722635,0.0419896247007,3.21813000361e-05,0.00144941904399,0.000281102267653,0.0070067904419,0.459306125248,0.000198893471588,0.0114124246819,0.0199611745997,0.00636173094476,0.0589039315422,0.0130248843455,0.0216336827864,0.0292840513559,0.658726919143,0.225228126282,0.709716719993,0.535731237288,0.166902075353,0.193152005373,0.19

In [16]:
# Echo the ChiSquareTest result DataFrame (displays its schema only, not the values).
testResult

DataFrame[pValues: vector, degreesOfFreedom: array<int>, statistics: vector]

In [17]:
from pyspark.ml.linalg import Vectors

# Toy check: VectorAssembler concatenates vector-valued input columns into one
# vector. A single row holding the same dense vector in columns a, b and c.
df = spark.createDataFrame(
    [tuple(Vectors.dense(1.0, 0.0, 3.0) for _ in range(3))],
    ["a", "b", "c"],
)
vecAssembler = VectorAssembler(outputCol="features", inputCols=["a", "b", "c"])
df = vecAssembler.transform(df)
df.select("features").show()

+--------------------+
|            features|
+--------------------+
|[1.0,0.0,3.0,1.0,...|
+--------------------+



In [18]:
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline

# Index every string column of `train` into a numeric '<name>Indexed' column.
stringTypes = [name for name, dtype in train.dtypes if dtype == 'string']
indexedTypes = [name + 'Indexed' for name in stringTypes]

# `train` is reassigned below, so re-running this cell would otherwise fail on
# output columns that already exist. Dropping them first keeps the cell
# re-runnable; drop() ignores names that are not present.
train = train.drop(*indexedTypes)

indexers = [StringIndexer(inputCol=name, outputCol=name + 'Indexed')
            for name in stringTypes]
pipeline = Pipeline(stages=indexers)
train = pipeline.fit(train).transform(train)
train.show()

+---+-----------------+--------+-------------+-------------+--------------------+------------------+--------------+-------------------+-------+------------+------------+--------------+--------------+------+----------------+----------------+---------------------+-----------------+-------------------+-----------+----------+---------------------+------------+
|age|        workclass|  fnlwgt|    education|education-num|      marital-status|        occupation|  relationship|               race|    sex|capital-gain|capital-loss|hours-per-week|native-country| class|workclassIndexed|educationIndexed|marital-statusIndexed|occupationIndexed|relationshipIndexed|raceIndexed|sexIndexed|native-countryIndexed|classIndexed|
+---+-----------------+--------+-------------+-------------+--------------------+------------------+--------------+-------------------+-------+------------+------------+--------------+--------------+------+----------------+----------------+---------------------+-----------------+--

In [19]:
# Print the schema again to confirm the new *Indexed columns were appended.
train.printSchema()

root
 |-- age: integer (nullable = true)
 |-- workclass: string (nullable = true)
 |-- fnlwgt: double (nullable = true)
 |-- education: string (nullable = true)
 |-- education-num: double (nullable = true)
 |-- marital-status: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- relationship: string (nullable = true)
 |-- race: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- capital-gain: double (nullable = true)
 |-- capital-loss: double (nullable = true)
 |-- hours-per-week: double (nullable = true)
 |-- native-country: string (nullable = true)
 |-- class: string (nullable = true)
 |-- workclassIndexed: double (nullable = false)
 |-- educationIndexed: double (nullable = false)
 |-- marital-statusIndexed: double (nullable = false)
 |-- occupationIndexed: double (nullable = false)
 |-- relationshipIndexed: double (nullable = false)
 |-- raceIndexed: double (nullable = false)
 |-- sexIndexed: double (nullable = false)
 |-- native-countryIndexed: doubl

In [20]:
from pyspark.ml.feature import OneHotEncoderEstimator, VectorAssembler

# One-hot encode every '<name>Indexed' column into '<name>IndexedOneHotEncoded'.
# NOTE(review): `indexedTypes` still contains the label's 'classIndexed', so
# the label is one-hot encoded too and listed in the assembler inputs below -
# confirm this is intended before using the assembled vector as features.
oheTypes = [indexedType + 'OneHotEncoded' for indexedType in indexedTypes]

# `train` is reassigned below; drop encoded columns left by an earlier run so
# the cell can be re-executed. drop() ignores names that are not present.
train = train.drop(*oheTypes)

ohe = OneHotEncoderEstimator(inputCols=indexedTypes, outputCols=oheTypes)
# NOTE(review): `assembler` and the empty `pipeline` are defined here but not
# used by the visible cells.
assembler = VectorAssembler(inputCols=oheTypes, outputCol='assembled')
pipeline = Pipeline(stages=[])
train = ohe.fit(train).transform(train)
train

DataFrame[age: int, workclass: string, fnlwgt: double, education: string, education-num: double, marital-status: string, occupation: string, relationship: string, race: string, sex: string, capital-gain: double, capital-loss: double, hours-per-week: double, native-country: string, class: string, workclassIndexed: double, educationIndexed: double, marital-statusIndexed: double, occupationIndexed: double, relationshipIndexed: double, raceIndexed: double, sexIndexed: double, native-countryIndexed: double, classIndexed: double, workclassIndexedOneHotEncoded: vector, raceIndexedOneHotEncoded: vector, occupationIndexedOneHotEncoded: vector, relationshipIndexedOneHotEncoded: vector, educationIndexedOneHotEncoded: vector, native-countryIndexedOneHotEncoded: vector, marital-statusIndexedOneHotEncoded: vector, sexIndexedOneHotEncoded: vector, classIndexedOneHotEncoded: vector]