In [64]:
import pandas as pd
import numpy as np
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.ml.feature import StringIndexer
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml import Pipeline
from pyspark.sql.functions import when

In [2]:
inputDF = pd.read_csv("multipleChoiceResponses.csv")
inputDF.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,GenderSelect,Country,Age,EmploymentStatus,StudentStatus,LearningDataScience,CodeWriter,CareerSwitcher,CurrentJobTitleSelect,TitleFit,...,JobFactorExperienceLevel,JobFactorDepartment,JobFactorTitle,JobFactorCompanyFunding,JobFactorImpact,JobFactorRemote,JobFactorIndustry,JobFactorLeaderReputation,JobFactorDiversity,JobFactorPublishingOpportunity
0,"Non-binary, genderqueer, or gender non-conforming",,,Employed full-time,,,Yes,,DBA/Database Engineer,Fine,...,,,,,,,,,,
1,Female,United States,30.0,"Not employed, but looking for work",,,,,,,...,,,,,,,,Somewhat important,,
2,Male,Canada,28.0,"Not employed, but looking for work",,,,,,,...,Very Important,Very Important,Very Important,Very Important,Very Important,Very Important,Very Important,Very Important,Very Important,Very Important
3,Male,United States,56.0,"Independent contractor, freelancer, or self-em...",,,Yes,,Operations Research Practitioner,Poorly,...,,,,,,,,,,
4,Male,Taiwan,38.0,Employed full-time,,,Yes,,Computer Scientist,Fine,...,,,,,,,,,,


In [53]:
catColumns = ['GenderSelect','Country','EmploymentStatus']
smallDF = inputDF[catColumns]
dummyDF = pd.get_dummies(data=smallDF, columns=catColumns)

In [54]:
dummyDF.head()

Unnamed: 0,GenderSelect_A different identity,GenderSelect_Female,GenderSelect_Male,"GenderSelect_Non-binary, genderqueer, or gender non-conforming",Country_Argentina,Country_Australia,Country_Belarus,Country_Belgium,Country_Brazil,Country_Canada,...,Country_United Kingdom,Country_United States,Country_Vietnam,EmploymentStatus_Employed full-time,EmploymentStatus_Employed part-time,EmploymentStatus_I prefer not to say,"EmploymentStatus_Independent contractor, freelancer, or self-employed","EmploymentStatus_Not employed, and not looking for work","EmploymentStatus_Not employed, but looking for work",EmploymentStatus_Retired
0,0,0,0,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,0,1,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,1,0
2,0,0,1,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,1,0
3,0,0,1,0,0,0,0,0,0,0,...,0,1,0,0,0,0,1,0,0,0
4,0,0,1,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [43]:
sparkConf = SparkConf(loadDefaults=True).setAppName("Encoding")

In [44]:
spark = SparkSession.builder.master("local").config(conf=sparkConf).getOrCreate()

In [49]:
sparkDF = spark.read.csv('multipleChoiceResponses.csv',header=True)

In [72]:
sparkDF = sparkDF.select(catColumns)

In [73]:
for col in catColumns:
    sparkDF = sparkDF.withColumn(col, when(sparkDF[col].isNull(),"__NA__").otherwise(sparkDF[col]))

In [74]:
indexers = [StringIndexer(inputCol=col, outputCol=col+"_encoded").fit(sparkDF) for col in catColumns]

In [75]:
encoders = [OneHotEncoder(inputCol=col+"_encoded", outputCol=col+"_onehot") for col in catColumns]

In [76]:
p1 = Pipeline(stages=indexers)
indexedDF = p1.fit(sparkDF).transform(sparkDF)
p2 = Pipeline(stages=encoders)
encodedDF = p2.fit(indexedDF).transform(indexedDF)

In [80]:
encodedDF.select('Country_onehot').show()

+---------------+
| Country_onehot|
+---------------+
|(84,[21],[1.0])|
| (84,[0],[1.0])|
| (84,[9],[1.0])|
| (84,[0],[1.0])|
|(84,[13],[1.0])|
| (84,[6],[1.0])|
| (84,[0],[1.0])|
| (84,[1],[1.0])|
|(84,[10],[1.0])|
| (84,[3],[1.0])|
| (84,[3],[1.0])|
| (84,[1],[1.0])|
| (84,[6],[1.0])|
|(84,[15],[1.0])|
|(84,[13],[1.0])|
| (84,[0],[1.0])|
|(84,[14],[1.0])|
| (84,[4],[1.0])|
| (84,[0],[1.0])|
| (84,[6],[1.0])|
+---------------+
only showing top 20 rows



In [81]:
oneHotPandasDF = encodedDF.toPandas()

In [85]:
for col in catColumns:
    
    col_sep = '_'
    numCat = len(oneHotPandasDF[col+"_onehot"][0])
    recodedCols = []
    
    for i in range(0,numCat):
        recodedCols.append(col+"_onehot"+col_sep+str(i))
    
    tempDF = pd.DataFrame(np.asarray(list(oneHotPandasDF[col+"_onehot"].values)), columns=recodedCols)
    tempDF1 = oneHotPandasDF.drop(col+"_onehot", axis=1)
    
    finalDF = pd.concat([tempDF1,tempDF], axis=1)

In [86]:
finalDF.head()

Unnamed: 0,GenderSelect,Country,EmploymentStatus,GenderSelect_encoded,Country_encoded,EmploymentStatus_encoded,GenderSelect_onehot,Country_onehot,EmploymentStatus_onehot_0,EmploymentStatus_onehot_1,...,EmploymentStatus_onehot_14,EmploymentStatus_onehot_15,EmploymentStatus_onehot_16,EmploymentStatus_onehot_17,EmploymentStatus_onehot_18,EmploymentStatus_onehot_19,EmploymentStatus_onehot_20,EmploymentStatus_onehot_21,EmploymentStatus_onehot_22,EmploymentStatus_onehot_23
0,"Non-binary, genderqueer, or gender non-conforming",__NA__,Employed full-time,4.0,21.0,0.0,"(0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Female,United States,"Not employed, but looking for work",1.0,0.0,1.0,"(0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Male,Canada,"Not employed, but looking for work",0.0,9.0,1.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Male,United States,"Independent contractor, freelancer, or self-em...",0.0,0.0,2.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Male,Taiwan,Employed full-time,0.0,13.0,0.0,"(1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","(0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
