In [1]:
from pyspark.sql import SparkSession
# Create (or reuse) the Spark session that every later cell reads from.
spark = (
    SparkSession.builder
    .appName('income')
    .getOrCreate()
)

In [2]:
from pyspark.sql.types import *

# schema = StructType([
#     StructField("age", IntegerType(), True), 
#     StructField("workclass", StringType(), True),
#     StructField("fnlwgt", FloatType(), True),
#     StructField("education", StringType(), True),
#     StructField("education-num", FloatType(), True),
#     StructField("marital-status", StringType(), True),
#     StructField("occupation", StringType(), True),
#     StructField("relationship", StringType(), True),
#     StructField("race", StringType(), True),
#     StructField("sex", StringType(), True),
#     StructField("capital-gain", FloatType(), True),
#     StructField("capital-loss", FloatType(), True),
#     StructField("hours-per-week", FloatType(), True),
#     StructField("native-country", StringType(), True),
#     StructField("class", StringType(), True)]
# )

# train = spark.read.csv('./adult.data.txt', schema=schema, inferSchema='true')

# Column names to attach to the CSV columns, in file order.
headers = [
    "age", "workclass", "fnlwgt", "education", "education-num",
    "marital-status", "occupation", "relationship", "race", "sex",
    "capital-gain", "capital-loss", "hours-per-week", "native-country",
    "class",
]

# Read the file with inferred column types, then rename the columns.
train = spark.read.csv('./adult.data.txt', inferSchema='true')
train = train.toDF(*headers)
train.printSchema()

root
 |-- age: integer (nullable = true)
 |-- workclass: string (nullable = true)
 |-- fnlwgt: double (nullable = true)
 |-- education: string (nullable = true)
 |-- education-num: double (nullable = true)
 |-- marital-status: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- relationship: string (nullable = true)
 |-- race: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- capital-gain: double (nullable = true)
 |-- capital-loss: double (nullable = true)
 |-- hours-per-week: double (nullable = true)
 |-- native-country: string (nullable = true)
 |-- class: string (nullable = true)



In [3]:
# Name of the target column; a later cell passes it to autoIndexer.
labelCol = 'class'
# Number of rows in the training frame (displayed as the cell output).
train.count()

32561

# EDA

In [4]:
# Class balance: number of rows for each value of 'class'.
(
    train
    .groupBy('class')
    .count()
    .show()
)

+------+-----+
| class|count|
+------+-----+
|  >50K| 7841|
| <=50K|24720|
+------+-----+



In [5]:
from pyspark.sql.functions import col

def findMissingValuesCols(df):
    """Return the names of the columns of ``df`` that contain a NULL value.

    For every column, the number of rows with a non-NULL value is compared
    with the total row count; a column is reported when the two differ.

    Args:
        df: DataFrame-like object exposing ``columns``, ``count()``,
            ``filter()`` and ``df[name].isNotNull()``.

    Returns:
        List of column names, in ``df.columns`` order, that have at least one
        NULL. Empty when no column has NULLs.
    """
    numRows = df.count()
    nullCols = []
    for column in df.columns:
        # df[column] keeps the check self-contained (no module-level `col`).
        nonNullCount = df.filter(df[column].isNotNull()).count()
        if nonNullCount != numRows:
            # Report the column itself, not its non-null row count.
            nullCols.append(column)
    return nullCols

# Run the missing-value check on the training frame; an empty list means no
# column contained a NULL.
# NOTE(review): only real NULLs are detected — placeholder strings would not
# be caught; confirm how the source file encodes missing values.
findMissingValuesCols(train)

[]

In [None]:
# Show the distinct values found in the 'age' column.
(
    train
    .select('age')
    .distinct()
    .show()
)

In [None]:
from pyspark.sql.types import DoubleType
from pyspark.sql.functions import format_number

def crosstabPercentage(df, col1, col2):
    """Cross-tabulate ``col1`` against ``col2`` and add a '>50K' percentage.

    The crosstab's key column (``<col1>_<col2>``) is re-created under the name
    ``col1 + col2.title()`` and the table is ordered by it. Any column whose
    name has surrounding whitespace is re-created under the stripped name.
    A ``percentage-of->50K`` column is then added, computed as
    ``>50K / (<=50K + >50K) * 100`` and formatted to two decimals.

    Args:
        df: DataFrame to cross-tabulate.
        col1: Name of the column whose values become the rows.
        col2: Name of the column whose values become the count columns; the
            stripped count columns must include '>50K' and '<=50K'.

    Returns:
        The cross-tabulated DataFrame with the percentage column added.
    """
    pairedName = col1 + '_' + col2
    labelName = col1 + col2.title()

    table = df.crosstab(col1, col2)
    table = table.withColumn(labelName, table[pairedName])
    table = table.orderBy(labelName).drop(pairedName)

    # Re-create whitespace-padded columns under their stripped names.
    for rawName in table.columns:
        cleanName = rawName.strip()
        if rawName == cleanName:
            continue
        table = table.withColumn(cleanName, table[rawName]).drop(rawName)

    share = table['>50K'] / (table['<=50K'] + table['>50K']) * 100
    return table.withColumn('percentage-of->50K', format_number(share, 2))


In [None]:
# Class counts per 'race' value, with the '>50K' percentage column.
df = crosstabPercentage(train, 'race', 'class')
df.show()

In [None]:
# Class counts per 'sex' value, with the '>50K' percentage column.
df = crosstabPercentage(train, 'sex', 'class')
df.show()


In [None]:
# Class counts per 'education' value, with the '>50K' percentage column.
df = crosstabPercentage(train, 'education', 'class')
df.show()


In [None]:
# Class counts per 'education-num' value, with the '>50K' percentage column.
educationNumClass = crosstabPercentage(train, 'education-num', 'class')
# Replace the key column with a double-typed copy and order the rows by it.
educationNumClass = (
    educationNumClass
    .withColumn('education-numClassF',
                educationNumClass['education-numClass'].cast(DoubleType()))
    .orderBy('education-numClassF')
    .drop('education-numClass')
)
# Put the key column first, keeping the remaining columns in their order.
cols = ['education-numClassF'] + [
    name for name in educationNumClass.columns if name != 'education-numClassF'
]
educationNumClass = educationNumClass.select(cols)
educationNumClass.show()


In [None]:
# train.crosstab('workclass', 'class').show()
# Class counts per 'workclass' value, with the '>50K' percentage column.
df = crosstabPercentage(train, 'workclass', 'class')
df.show()

In [None]:
from pyspark.ml.feature import OneHotEncoderEstimator, StringIndexer, VectorAssembler
from pyspark.ml.stat import ChiSquareTest
from pyspark.ml import Pipeline

def autoIndexer(df, lableCol):
    """Index and one-hot encode the string columns of ``df``.

    A StringIndexer is built for every string-typed column of ``df``, the
    label included, writing to ``<name>Indexed``. Every indexed column except
    the label's is one-hot encoded to ``<name>IndexedOneHotEncoded``, and the
    encoded columns are assembled into a single ``assembled`` vector column.

    Args:
        df: DataFrame to transform.
        lableCol: Name of the label column; its indexed column is kept out of
            the one-hot encoded / assembled features. (Parameter spelling is
            kept as-is for existing callers.)

    Returns:
        Tuple ``(stringTypes, oheTypes, indexed)``: the string-typed column
        names, the one-hot encoded column names, and the transformed
        DataFrame.
    """
    # Read the schema from `df` itself so the function works for any frame,
    # not only the notebook-global `train`.
    stringTypes = [name for name, dtype in df.dtypes if dtype == 'string']

    # Exclude the label explicitly instead of swallowing a failed remove().
    labelIndexed = lableCol + 'Indexed'
    indexedTypes = [name + 'Indexed' for name in stringTypes
                    if name + 'Indexed' != labelIndexed]

    indexers = [StringIndexer(inputCol=name, outputCol=name + 'Indexed')
                for name in stringTypes]
    oheTypes = [indexedType + 'OneHotEncoded' for indexedType in indexedTypes]
    ohe = OneHotEncoderEstimator(inputCols=indexedTypes, outputCols=oheTypes)
    assembler = VectorAssembler(inputCols=oheTypes, outputCol='assembled')
    pipeline = Pipeline(stages=[*indexers, ohe, assembler])

    indexed = pipeline.fit(df).transform(df)
    return stringTypes, oheTypes, indexed

# Encode the categorical columns; the first return value is not needed here.
_, indexedTypes, indexedDf = autoIndexer(train, labelCol)
# Chi-square test of the assembled feature vector against the indexed label.
# Null hypothesis: the occurrence of the outcomes is statistically independent.
# Small p-values (1% to 5%) lead to rejecting the null hypothesis; larger
# p-values mean it should not be rejected.
# NOTE(review): this comment previously quoted a p-value of 92.65%, but no
# output is recorded for this cell — confirm against the printed pValues.
testResult = ChiSquareTest.test(indexedDf, 'assembled', 'classIndexed')
r = testResult.head()
print("pValues: ", r.pValues, sep="")
print("degreesOfFreedom: ", r.degreesOfFreedom, sep="")
print("statistics: ", r.statistics, sep="")


In [None]:
# Echo the chi-square result object as the cell output.
testResult

In [None]:
from pyspark.ml.linalg import Vectors

# Toy example: one row with three identical dense-vector columns, assembled
# into a single "features" vector column.
df = spark.createDataFrame(
    [tuple(Vectors.dense(1.0, 0.0, 3.0) for _ in range(3))],
    ["a", "b", "c"],
)
vecAssembler = VectorAssembler(
    inputCols=["a", "b", "c"],
    outputCol="features",
)
df = vecAssembler.transform(df)
df.select("features").show()

In [None]:
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline

# Every string-typed column of `train` gets a '<name>Indexed' companion column.
stringTypes = [dtype[0] for dtype in train.dtypes if dtype[1] == 'string']
indexedTypes = [stringType+'Indexed' for stringType in stringTypes]

# Build indexers only for columns that have not been indexed yet, so the cell
# can be re-run: StringIndexer rejects an output column that already exists.
indexers = [StringIndexer(inputCol=stringType, outputCol=stringType+'Indexed')
            for stringType in stringTypes
            if stringType+'Indexed' not in train.columns]
pipeline = Pipeline(stages=indexers)
if indexers:
    train = pipeline.fit(train).transform(train)
train.show()

In [None]:
# Print the schema of `train` after the StringIndexer pipeline was applied.
train.printSchema()

In [None]:
from pyspark.ml.feature import OneHotEncoderEstimator, VectorAssembler

# One-hot encode every indexed column listed in `indexedTypes`.
oheTypes = [indexedType+'OneHotEncoded' for indexedType in indexedTypes]
ohe = OneHotEncoderEstimator(inputCols=indexedTypes, outputCols=oheTypes)
# Assemble the encoded columns into one 'assembled' vector. The label's
# encoding is left out so the target does not end up among the features.
assembler = VectorAssembler(
    inputCols=[oheType for oheType in oheTypes
               if oheType != labelCol+'IndexedOneHotEncoded'],
    outputCol='assembled')
# Run both stages through the pipeline; it was previously built empty and the
# assembler was never applied.
pipeline = Pipeline(stages=[ohe, assembler])
train = pipeline.fit(train).transform(train)
train