### Spark Session Initialization

In [1]:
from pyspark.sql import SparkSession

In [2]:
# Build (or reuse) the Spark session that every later cell relies on.
session_builder = SparkSession.builder.appName("Cat-2-adult-Project")
spark = session_builder.getOrCreate()

### Reading Data

In [3]:
df=spark.read.csv("adult_data_new.csv",header=True,inferSchema=True)

In [4]:
df.show()

+---+-----------------+------+-------------+-------------+--------------------+------------------+--------------+-------------------+-------+------------+------------+--------------+--------------+------+
|age|        workclass|fnlwgt|    education|education_num|      marital_status|        occupation|  relationship|               race|    sex|capital_gain|capital_loss|hours_per_week|native_country|income|
+---+-----------------+------+-------------+-------------+--------------------+------------------+--------------+-------------------+-------+------------+------------+--------------+--------------+------+
| 39|        State-gov| 77516|    Bachelors|           13|       Never-married|      Adm-clerical| Not-in-family|              White|   Male|        2174|           0|            40| United-States| <=50K|
| 50| Self-emp-not-inc| 83311|    Bachelors|           13|  Married-civ-spouse|   Exec-managerial|       Husband|              White|   Male|           0|           0|            1

In [5]:
# Print every column's name, inferred type and nullability.
df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- workclass: string (nullable = true)
 |-- fnlwgt: integer (nullable = true)
 |-- education: string (nullable = true)
 |-- education_num: integer (nullable = true)
 |-- marital_status: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- relationship: string (nullable = true)
 |-- race: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- capital_gain: integer (nullable = true)
 |-- capital_loss: integer (nullable = true)
 |-- hours_per_week: integer (nullable = true)
 |-- native_country: string (nullable = true)
 |-- income: string (nullable = true)



In [6]:
# Summary statistics (count, mean, ...) for every column of the raw data.
des = df.describe()
des.show()

+-------+------------------+------------+------------------+-------------+-----------------+--------------+-----------------+------------+-------------------+-------+------------------+----------------+------------------+--------------+------+
|summary|               age|   workclass|            fnlwgt|    education|    education_num|marital_status|       occupation|relationship|               race|    sex|      capital_gain|    capital_loss|    hours_per_week|native_country|income|
+-------+------------------+------------+------------------+-------------+-----------------+--------------+-----------------+------------+-------------------+-------+------------------+----------------+------------------+--------------+------+
|  count|             32561|       32561|             32561|        32561|            32561|         32561|            32561|       32561|              32561|  32561|             32561|           32561|             32561|         32561| 32561|
|   mean| 38.58164675532

### Exploratory data analysis using SQL queries

In [7]:
df.createOrReplaceTempView("adultdata")

In [8]:
result1=spark.sql("SELECT workclass, avg(capital_gain) FROM adultdata GROUP BY workclass")

In [9]:
result1.show()

+-----------------+------------------+
|        workclass| avg(capital_gain)|
+-----------------+------------------+
|        State-gov| 701.6995377503852|
|      Federal-gov| 833.2322916666667|
| Self-emp-not-inc|1886.0617866981504|
|        Local-gov|  880.202580028667|
|          Private| 868.0810370128811|
|     Self-emp-inc| 4875.693548387097|
|      Without-pay|487.85714285714283|
|     Never-worked|               0.0|
+-----------------+------------------+



In [10]:
result2=spark.sql("SELECT workclass, avg(hours_per_week), avg(capital_loss),avg(capital_gain) FROM adultdata GROUP BY workclass")

In [11]:
result2.show()

+-----------------+-------------------+------------------+------------------+
|        workclass|avg(hours_per_week)| avg(capital_loss)| avg(capital_gain)|
+-----------------+-------------------+------------------+------------------+
|        State-gov|  39.03158705701079| 83.25654853620955| 701.6995377503852|
|      Federal-gov|  41.37916666666667|         112.26875| 833.2322916666667|
| Self-emp-not-inc| 44.421881149153876|116.63164108618655|1886.0617866981504|
|        Local-gov|  40.98279980888677|109.85427615862399|  880.202580028667|
|          Private|  39.64234469264634| 78.56815587803685| 868.0810370128811|
|     Self-emp-inc|  48.81810035842294|155.13888888888889| 4875.693548387097|
|      Without-pay| 32.714285714285715|               0.0|487.85714285714283|
|     Never-worked| 28.428571428571427|               0.0|               0.0|
+-----------------+-------------------+------------------+------------------+



In [12]:
result3=spark.sql("SELECT income,avg(hours_per_week), avg(capital_loss),avg(capital_gain) FROM adultdata GROUP BY income")

In [13]:
result3.show()

+------+-------------------+------------------+------------------+
|income|avg(hours_per_week)| avg(capital_loss)| avg(capital_gain)|
+------+-------------------+------------------+------------------+
|  >50K| 45.473026399693914|195.00153041703865| 4006.142456319347|
| <=50K| 38.840210355987054| 53.14292071197411|148.75246763754046|
+------+-------------------+------------------+------------------+



In [14]:
result4=spark.sql("SELECT occupation, avg(hours_per_week),avg(capital_gain),avg(capital_loss) FROM adultdata GROUP BY occupation")

In [15]:
result4.show()

+------------------+-------------------+------------------+------------------+
|        occupation|avg(hours_per_week)| avg(capital_gain)| avg(capital_loss)|
+------------------+-------------------+------------------+------------------+
|   Farming-fishing| 46.989939637826964| 589.7263581488934| 63.07545271629779|
| Handlers-cleaners| 37.947445255474456| 257.5729927007299|45.635766423357666|
|    Prof-specialty|  39.15828179842888|2072.9755975263247|112.84857095102791|
|      Adm-clerical|  37.55835543766578| 495.9549071618037|60.794429708222815|
|   Exec-managerial|   44.9877029021151|2262.7729955730447|138.83841613379244|
|      Craft-repair|  42.30422054159551| 649.5128080019517| 88.46523542327397|
|             Sales|  40.78109589041096| 1319.829315068493| 98.30054794520548|
|      Tech-support| 39.432112068965516| 673.5528017241379| 98.66594827586206|
|  Transport-moving|  44.65623043206011| 490.3237319974953| 81.48090169067001|
|   Protective-serv|  42.87057010785824| 708.0986132

In [16]:
result5=spark.sql("SELECT sex, avg(hours_per_week),avg(capital_gain),avg(capital_loss) FROM adultdata GROUP BY sex")

In [17]:
result5.show()

+-------+-------------------+------------------+------------------+
|    sex|avg(hours_per_week)| avg(capital_gain)| avg(capital_loss)|
+-------+-------------------+------------------+------------------+
|   Male|  42.42808627810923|1329.3700780174393|100.21330885727397|
| Female| 36.410361154953115| 568.4105468387336| 61.18763346021725|
+-------+-------------------+------------------+------------------+



In [18]:
result6=spark.sql("SELECT education, avg(hours_per_week),avg(capital_gain),avg(capital_loss) FROM adultdata GROUP BY education")

In [19]:
result6.show()

+-------------+-------------------+------------------+------------------+
|    education|avg(hours_per_week)| avg(capital_gain)| avg(capital_loss)|
+-------------+-------------------+------------------+------------------+
|  Prof-school|  47.42534722222222|10414.416666666666|        231.203125|
|         10th| 37.052518756698824|404.57449088960345|56.845659163987136|
|      7th-8th|  39.36687306501548|233.93962848297213|  65.6687306501548|
|      5th-6th|   38.8978978978979|176.02102102102103| 68.25225225225225|
|   Assoc-acdm| 40.504217432052485| 640.3992502343018| 93.41893158388004|
|    Assoc-voc|  41.61070911722142| 715.0513748191028| 72.75470332850941|
|      Masters|  43.83633197910621| 2562.563551944283|166.71967498549043|
|         12th|  35.78060046189376| 284.0877598152425| 32.33718244803695|
|    Preschool|  36.64705882352941| 898.3921568627451| 66.49019607843137|
|          9th|  38.04474708171206|342.08949416342415| 28.99805447470817|
|    Bachelors| 42.614005602240894| 17

In [20]:
from pyspark.sql.functions import isnan, when, count, col, isnull

In [21]:
# Count NULL values per column.
# `count` ignores NULL inputs, so the value produced by `when` must be a non-null marker:
# counting the column itself (`when(isnull(c), c)`) yields NULL exactly when the condition
# holds and therefore always reports 0, even when NULLs are present.
df.select([count(when(isnull(c), 1)).alias(c) for c in df.columns]).show()

+---+---------+------+---------+-------------+--------------+----------+------------+----+---+------------+------------+--------------+--------------+------+
|age|workclass|fnlwgt|education|education_num|marital_status|occupation|relationship|race|sex|capital_gain|capital_loss|hours_per_week|native_country|income|
+---+---------+------+---------+-------------+--------------+----------+------------+----+---+------------+------------+--------------+--------------+------+
|  0|        0|     0|        0|            0|             0|         0|           0|   0|  0|           0|           0|             0|             0|     0|
+---+---------+------+---------+-------------+--------------+----------+------------+----+---+------------+------------+--------------+--------------+------+



In [22]:
from pyspark.sql.functions import countDistinct

In [23]:
# Number of distinct values in each of the listed string columns.
categorical_cols = ["workclass", "education", "marital_status", "occupation",
                    "relationship", "race", "native_country"]
distinct_counts = [countDistinct(name).alias(name) for name in categorical_cols]
df.select(distinct_counts).show()

+---------+---------+--------------+----------+------------+----+--------------+
|workclass|education|marital_status|occupation|relationship|race|native_country|
+---------+---------+--------------+----------+------------+----+--------------+
|        8|       16|             7|        14|           6|   5|            41|
+---------+---------+--------------+----------+------------+----+--------------+



### String Indexing, One-Hot Encoding and Vector Assembling

In [24]:
from pyspark.ml.feature import (VectorAssembler,VectorIndexer,OneHotEncoder,StringIndexer)

In [25]:
# Index the workclass labels, then expand the index into an encoded vector column.
workclass_indexer = StringIndexer(inputCol='workclass', outputCol='workclass_Index')
workclass_indexer_model = workclass_indexer.fit(df)
df = workclass_indexer_model.transform(df)

workclass_encoder = OneHotEncoder(inputCol='workclass_Index', outputCol='workclass_Vec')
workclass_model = workclass_encoder.fit(df)
df = workclass_model.transform(df)

# Confirm that the two new columns were appended.
df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- workclass: string (nullable = true)
 |-- fnlwgt: integer (nullable = true)
 |-- education: string (nullable = true)
 |-- education_num: integer (nullable = true)
 |-- marital_status: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- relationship: string (nullable = true)
 |-- race: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- capital_gain: integer (nullable = true)
 |-- capital_loss: integer (nullable = true)
 |-- hours_per_week: integer (nullable = true)
 |-- native_country: string (nullable = true)
 |-- income: string (nullable = true)
 |-- workclass_Index: double (nullable = false)
 |-- workclass_Vec: vector (nullable = true)



In [26]:
# Preview the encoded workclass vectors.
df.select("workclass_Vec").show()

+-------------+
|workclass_Vec|
+-------------+
|(7,[3],[1.0])|
|(7,[1],[1.0])|
|(7,[0],[1.0])|
|(7,[0],[1.0])|
|(7,[0],[1.0])|
|(7,[0],[1.0])|
|(7,[0],[1.0])|
|(7,[1],[1.0])|
|(7,[0],[1.0])|
|(7,[0],[1.0])|
|(7,[0],[1.0])|
|(7,[3],[1.0])|
|(7,[0],[1.0])|
|(7,[0],[1.0])|
|(7,[0],[1.0])|
|(7,[0],[1.0])|
|(7,[1],[1.0])|
|(7,[0],[1.0])|
|(7,[0],[1.0])|
|(7,[1],[1.0])|
+-------------+
only showing top 20 rows



In [27]:
# Index the education labels, then expand the index into an encoded vector column.
education_indexer = StringIndexer(inputCol='education', outputCol='education_Index')
education_indexer_model = education_indexer.fit(df)
df = education_indexer_model.transform(df)

education_encoder = OneHotEncoder(inputCol='education_Index', outputCol='education_Vec')
education_model = education_encoder.fit(df)
df = education_model.transform(df)

# Confirm that the two new columns were appended.
df.printSchema()

root
 |-- age: integer (nullable = true)
 |-- workclass: string (nullable = true)
 |-- fnlwgt: integer (nullable = true)
 |-- education: string (nullable = true)
 |-- education_num: integer (nullable = true)
 |-- marital_status: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- relationship: string (nullable = true)
 |-- race: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- capital_gain: integer (nullable = true)
 |-- capital_loss: integer (nullable = true)
 |-- hours_per_week: integer (nullable = true)
 |-- native_country: string (nullable = true)
 |-- income: string (nullable = true)
 |-- workclass_Index: double (nullable = false)
 |-- workclass_Vec: vector (nullable = true)
 |-- education_Index: double (nullable = false)
 |-- education_Vec: vector (nullable = true)



In [28]:
# Preview the encoded education vectors.
df.select("education_Vec").show()

+---------------+
|  education_Vec|
+---------------+
| (15,[2],[1.0])|
| (15,[2],[1.0])|
| (15,[0],[1.0])|
| (15,[5],[1.0])|
| (15,[2],[1.0])|
| (15,[3],[1.0])|
|(15,[10],[1.0])|
| (15,[0],[1.0])|
| (15,[3],[1.0])|
| (15,[2],[1.0])|
| (15,[1],[1.0])|
| (15,[2],[1.0])|
| (15,[2],[1.0])|
| (15,[6],[1.0])|
| (15,[4],[1.0])|
| (15,[8],[1.0])|
| (15,[0],[1.0])|
| (15,[0],[1.0])|
| (15,[5],[1.0])|
| (15,[3],[1.0])|
+---------------+
only showing top 20 rows



In [29]:
# Index the marital_status labels, then expand the index into an encoded vector column.
marital_status_indexer = StringIndexer(inputCol='marital_status', outputCol='marital_status_Index')
marital_status_indexer_model = marital_status_indexer.fit(df)
df = marital_status_indexer_model.transform(df)

marital_status_encoder = OneHotEncoder(inputCol='marital_status_Index', outputCol='marital_status_Vec')
marital_status_model = marital_status_encoder.fit(df)
df = marital_status_model.transform(df)

df.show()

+---+-----------------+------+-------------+-------------+--------------------+------------------+--------------+-------------------+-------+------------+------------+--------------+--------------+------+---------------+-------------+---------------+---------------+--------------------+------------------+
|age|        workclass|fnlwgt|    education|education_num|      marital_status|        occupation|  relationship|               race|    sex|capital_gain|capital_loss|hours_per_week|native_country|income|workclass_Index|workclass_Vec|education_Index|  education_Vec|marital_status_Index|marital_status_Vec|
+---+-----------------+------+-------------+-------------+--------------------+------------------+--------------+-------------------+-------+------------+------------+--------------+--------------+------+---------------+-------------+---------------+---------------+--------------------+------------------+
| 39|        State-gov| 77516|    Bachelors|           13|       Never-married|

In [30]:
# Index the occupation labels, then expand the index into an encoded vector column.
occupation_indexer = StringIndexer(inputCol='occupation', outputCol='occupation_Index')
occupation_indexer_model = occupation_indexer.fit(df)
df = occupation_indexer_model.transform(df)

occupation_encoder = OneHotEncoder(inputCol='occupation_Index', outputCol='occupation_Vec')
occupation_model = occupation_encoder.fit(df)
df = occupation_model.transform(df)

In [31]:
# Index the relationship labels, then expand the index into an encoded vector column.
relationship_indexer = StringIndexer(inputCol='relationship', outputCol='relationship_Index')
relationship_indexer_model = relationship_indexer.fit(df)
df = relationship_indexer_model.transform(df)

relationship_encoder = OneHotEncoder(inputCol='relationship_Index', outputCol='relationship_Vec')
relationship_model = relationship_encoder.fit(df)
df = relationship_model.transform(df)

In [32]:
# Index the race labels, then expand the index into an encoded vector column.
race_indexer = StringIndexer(inputCol='race', outputCol='race_Index')
race_indexer_model = race_indexer.fit(df)
df = race_indexer_model.transform(df)

race_encoder = OneHotEncoder(inputCol='race_Index', outputCol='race_Vec')
race_model = race_encoder.fit(df)
df = race_model.transform(df)

In [33]:
# Index the sex labels, then expand the index into an encoded vector column.
sex_indexer = StringIndexer(inputCol='sex', outputCol='sex_Index')
sex_indexer_model = sex_indexer.fit(df)
df = sex_indexer_model.transform(df)

sex_encoder = OneHotEncoder(inputCol='sex_Index', outputCol='sex_Vec')
sex_model = sex_encoder.fit(df)
df = sex_model.transform(df)

In [34]:
# Index the native_country labels, then expand the index into an encoded vector column.
native_country_indexer = StringIndexer(inputCol='native_country', outputCol='native_country_Index')
native_country_indexer_model = native_country_indexer.fit(df)
df = native_country_indexer_model.transform(df)

native_country_encoder = OneHotEncoder(inputCol='native_country_Index', outputCol='native_country_Vec')
native_country_model = native_country_encoder.fit(df)
df = native_country_model.transform(df)

In [35]:
# Turn the income label into the numeric 'income_Index' column that the classifiers use as
# their label; no vector encoding is applied to the target.
income_indexer = StringIndexer(inputCol='income', outputCol='income_Index')
income_indexer_model = income_indexer.fit(df)
df = income_indexer_model.transform(df)


In [36]:
# List every column now on df, including the *_Index / *_Vec columns added by the cells above.
df.columns

['age',
 'workclass',
 'fnlwgt',
 'education',
 'education_num',
 'marital_status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'capital_gain',
 'capital_loss',
 'hours_per_week',
 'native_country',
 'income',
 'workclass_Index',
 'workclass_Vec',
 'education_Index',
 'education_Vec',
 'marital_status_Index',
 'marital_status_Vec',
 'occupation_Index',
 'occupation_Vec',
 'relationship_Index',
 'relationship_Vec',
 'race_Index',
 'race_Vec',
 'sex_Index',
 'sex_Vec',
 'native_country_Index',
 'native_country_Vec',
 'income_Index']

In [37]:
assembler = VectorAssembler(inputCols=[
 'workclass_Vec','age','fnlwgt','education_Vec','education_num','marital_status_Vec','capital_gain', 'capital_loss', 'hours_per_week', 'occupation_Vec',
    'relationship_Vec','race_Vec','sex_Vec','native_country_Vec',
],outputCol='features')

In [38]:
df_assembled = assembler.transform(df)

In [39]:
df_assembled.columns

['age',
 'workclass',
 'fnlwgt',
 'education',
 'education_num',
 'marital_status',
 'occupation',
 'relationship',
 'race',
 'sex',
 'capital_gain',
 'capital_loss',
 'hours_per_week',
 'native_country',
 'income',
 'workclass_Index',
 'workclass_Vec',
 'education_Index',
 'education_Vec',
 'marital_status_Index',
 'marital_status_Vec',
 'occupation_Index',
 'occupation_Vec',
 'relationship_Index',
 'relationship_Vec',
 'race_Index',
 'race_Vec',
 'sex_Index',
 'sex_Vec',
 'native_country_Index',
 'native_country_Vec',
 'income_Index',
 'features']

In [40]:
# Preview the assembled feature vectors next to the numeric income label.
df_assembled.select("features", "income_Index").show()

+--------------------+------------+
|            features|income_Index|
+--------------------+------------+
|(97,[3,7,8,11,24,...|         0.0|
|(97,[1,7,8,11,24,...|         0.0|
|(97,[0,7,8,9,24,2...|         0.0|
|(97,[0,7,8,14,24,...|         0.0|
|(97,[0,7,8,11,24,...|         0.0|
|(97,[0,7,8,12,24,...|         0.0|
|(97,[0,7,8,19,24,...|         0.0|
|(97,[1,7,8,9,24,2...|         1.0|
|(97,[0,7,8,12,24,...|         1.0|
|(97,[0,7,8,11,24,...|         1.0|
|(97,[0,7,8,10,24,...|         1.0|
|(97,[3,7,8,11,24,...|         1.0|
|(97,[0,7,8,11,24,...|         0.0|
|(97,[0,7,8,15,24,...|         0.0|
|(97,[0,7,8,13,24,...|         1.0|
|(97,[0,7,8,17,24,...|         0.0|
|(97,[1,7,8,9,24,2...|         0.0|
|(97,[0,7,8,9,24,2...|         0.0|
|(97,[0,7,8,14,24,...|         0.0|
|(97,[1,7,8,12,24,...|         1.0|
+--------------------+------------+
only showing top 20 rows



### Scaling

In [41]:
from pyspark.ml.feature import StandardScaler

In [42]:
scaler = StandardScaler(inputCol="features",outputCol="scaledFeatures")

In [43]:
scalerModel = scaler.fit(df_assembled)

In [44]:
df_assembled_scaled = scalerModel.transform(df_assembled)

In [45]:
df_assembled_scaled.printSchema()

root
 |-- age: integer (nullable = true)
 |-- workclass: string (nullable = true)
 |-- fnlwgt: integer (nullable = true)
 |-- education: string (nullable = true)
 |-- education_num: integer (nullable = true)
 |-- marital_status: string (nullable = true)
 |-- occupation: string (nullable = true)
 |-- relationship: string (nullable = true)
 |-- race: string (nullable = true)
 |-- sex: string (nullable = true)
 |-- capital_gain: integer (nullable = true)
 |-- capital_loss: integer (nullable = true)
 |-- hours_per_week: integer (nullable = true)
 |-- native_country: string (nullable = true)
 |-- income: string (nullable = true)
 |-- workclass_Index: double (nullable = false)
 |-- workclass_Vec: vector (nullable = true)
 |-- education_Index: double (nullable = false)
 |-- education_Vec: vector (nullable = true)
 |-- marital_status_Index: double (nullable = false)
 |-- marital_status_Vec: vector (nullable = true)
 |-- occupation_Index: double (nullable = false)
 |-- occupation_Vec: vector (n

In [46]:
# Preview the scaled feature vectors.
df_assembled_scaled.select("scaledFeatures").show()

+--------------------+
|      scaledFeatures|
+--------------------+
|(97,[3,7,8,11,24,...|
|(97,[1,7,8,11,24,...|
|(97,[0,7,8,9,24,2...|
|(97,[0,7,8,14,24,...|
|(97,[0,7,8,11,24,...|
|(97,[0,7,8,12,24,...|
|(97,[0,7,8,19,24,...|
|(97,[1,7,8,9,24,2...|
|(97,[0,7,8,12,24,...|
|(97,[0,7,8,11,24,...|
|(97,[0,7,8,10,24,...|
|(97,[3,7,8,11,24,...|
|(97,[0,7,8,11,24,...|
|(97,[0,7,8,15,24,...|
|(97,[0,7,8,13,24,...|
|(97,[0,7,8,17,24,...|
|(97,[1,7,8,9,24,2...|
|(97,[0,7,8,9,24,2...|
|(97,[0,7,8,14,24,...|
|(97,[1,7,8,12,24,...|
+--------------------+
only showing top 20 rows



In [47]:
final_data = df_assembled_scaled.select("scaledFeatures", "income_Index")

In [48]:
train_data,test_data = final_data.randomSplit([0.7,0.3])

### Logistic Regression

In [49]:
from pyspark.ml.classification import LogisticRegression
lr = LogisticRegression(labelCol='income_Index',featuresCol="scaledFeatures")

In [50]:
lrModel = lr.fit(train_data)

In [51]:
print("Coefficients: \n{} \nIntercept: \n{}".format(lrModel.coefficients,lrModel.intercept))

Coefficients: 
[-0.9824086758898924,-0.7419872626158511,-0.5696549640009633,-0.4872050015671399,-0.3622481980005578,-0.2931993894038852,-0.2414035052402708,0.3211477188318392,0.059304112937285755,0.4741803933646044,0.3397726737672524,0.05916998270565335,-0.002782861330639354,0.1129485725229431,0.2662560752687501,0.007707666570440325,0.2790233382910743,0.29791585059551207,0.002569840768920795,0.2741688568854555,0.10113237033482995,-0.03818334571910119,0.2997229801375503,0.2425372619393289,1.3457251587502115,-0.4565859177120672,-1.5133056229207147,-0.92623978777601,-0.48605925296667624,-0.45761778185378427,-0.28426503699603584,2.1504157161292037,0.25517000712829485,0.39382396317983553,-0.19579704915140617,-0.21352510731089314,0.01918611464940211,-0.2319230629567449,-0.13811599958808637,-0.46862212386863883,-0.21615167607286626,-0.1631074407606077,-0.27679077415186215,-0.29339477109512335,-0.009100706718297167,-0.019754254171037884,-0.3125841957731438,0.3242494546879176,0.3433970085067035

In [52]:
# Score both splits with the fitted logistic-regression model.
train_results, test_results = (
    lrModel.transform(split) for split in (train_data, test_data)
)

In [53]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

In [54]:
lr_eval = BinaryClassificationEvaluator(rawPredictionCol='prediction',
                                       labelCol='income_Index')

In [55]:
AUC_train = lr_eval.evaluate(train_results)
AUC_test = lr_eval.evaluate(test_results)

In [56]:
print("Training accuracy for Logistic Regression is {}".format(AUC_train))
print("Test accuracy for Logistic Regression is {}".format(AUC_test))

Training accuracy for Logistic Regression is 0.7653573049612654
Test accuracy for Logistic Regression is 0.7654543626196985


### Linear SVC

In [57]:
from pyspark.ml.classification import LinearSVC

In [58]:
lsvc = LinearSVC(maxIter=10, regParam=0.1,labelCol="income_Index",featuresCol="scaledFeatures")

In [59]:
lsvcModel=lsvc.fit(train_data)

In [60]:
print("Coefficients: \n{} \nIntercept: \n{}".format(lsvcModel.coefficients,lsvcModel.intercept))

Coefficients: 
[-0.05398949212795263,-0.042739332053325335,-0.028621306157245063,-0.030755987528638645,0.054231340029988674,-0.002395048290037922,-0.004461474675371362,-0.025822317041086278,-0.02409129725042768,-0.07431102024310013,-0.040829173116395626,0.05934529474239168,0.09059722755393178,-0.00872207151916139,-0.03627057921781257,-0.004870384598170359,-0.037425823713904065,-0.04692088519296775,0.10113587002890148,-0.031638873976989765,-0.020014774609857462,0.0923644872590346,-0.028427412523941025,-0.02177526466444487,-0.019779236301192802,0.02751699882578852,-0.09520307116740967,-0.032005896974550266,-0.018486419477346683,-0.015672628089233257,-0.013920710266179828,0.19174011186534876,0.08348650123948233,-0.006031612901741015,0.003034721817812371,-0.04240443945491047,0.08292910717202932,-0.02575055753045966,-0.004101142632983752,-0.042789010725545674,-0.02951365937615953,-0.026108319739360335,-0.028699618337521177,-0.03137972735831251,0.0010295901251363949,-0.004433033665302738,-0.

In [61]:
train_results_lsvc =lsvcModel.transform(train_data)
test_results_lsvc = lsvcModel.transform(test_data)

In [62]:
lsvc_eval = BinaryClassificationEvaluator(rawPredictionCol='prediction',
                                       labelCol='income_Index')

In [63]:
AUC_train_lsvc = lsvc_eval.evaluate(train_results)
AUC_test_lsvc = lsvc_eval.evaluate(test_results)

In [64]:
print("Training accuracy for Linear support vector classifier is {}".format(AUC_train_lsvc))
print("Test accuracy for Linear support vector classifier is {}".format(AUC_test_lsvc))

Training accuracy for Linear support vector classifier is 0.7653573049612654
Test accuracy for Linear support vector classifier is 0.7654543626196985


### Decision tree

In [65]:
from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [66]:
dtree = DecisionTreeClassifier(labelCol="income_Index", featuresCol="scaledFeatures")

In [67]:
dtreeModel = dtree.fit(train_data)

In [68]:
train_results_dtree = dtreeModel.transform(train_data)

In [69]:
train_results_dtree.select("prediction", "income_Index", "scaledFeatures").show(5)

+----------+------------+--------------------+
|prediction|income_Index|      scaledFeatures|
+----------+------------+--------------------+
|       1.0|         1.0|(97,[0,7,8,9,24,2...|
|       0.0|         0.0|(97,[0,7,8,9,24,2...|
|       1.0|         1.0|(97,[0,7,8,9,24,2...|
|       1.0|         1.0|(97,[0,7,8,9,24,2...|
|       0.0|         0.0|(97,[0,7,8,9,24,2...|
+----------+------------+--------------------+
only showing top 5 rows



In [70]:
test_results_dtree = dtreeModel.transform(test_data)

In [71]:
test_results_dtree.select("prediction", "income_Index", "scaledFeatures").show(5)

+----------+------------+--------------------+
|prediction|income_Index|      scaledFeatures|
+----------+------------+--------------------+
|       0.0|         1.0|(97,[0,7,8,9,24,2...|
|       0.0|         0.0|(97,[0,7,8,9,24,2...|
|       0.0|         0.0|(97,[0,7,8,9,24,2...|
|       0.0|         1.0|(97,[0,7,8,9,24,2...|
|       1.0|         1.0|(97,[0,7,8,9,24,2...|
+----------+------------+--------------------+
only showing top 5 rows



In [72]:
dtree_evaluator = MulticlassClassificationEvaluator(
    labelCol="income_Index", predictionCol="prediction", metricName="accuracy")
accuracy_train = dtree_evaluator.evaluate(train_results_dtree)
accuracy_test = dtree_evaluator.evaluate(test_results_dtree)

In [73]:
print("Training accuracy for decision tree is %g " % (accuracy_train))
print("Test accuracy for decision tree is %g " % (accuracy_test))

Training accuracy for decision tree is 0.839551 
Test accuracy for decision tree is 0.83959 


### Random Forest

In [74]:
from pyspark.ml.classification import RandomForestClassifier

In [75]:
rfc = RandomForestClassifier(labelCol="income_Index", featuresCol="scaledFeatures",numTrees=200,maxDepth=15)

In [76]:
rfc_model = rfc.fit(train_data)

In [77]:
train_results_rfc = rfc_model.transform(train_data)

In [78]:
test_results_rfc = rfc_model.transform(test_data)

In [79]:
rfc_evaluator = MulticlassClassificationEvaluator(
    labelCol="income_Index", predictionCol="prediction", metricName="accuracy")
accuracy_rfc_train = rfc_evaluator.evaluate(train_results_rfc)
accuracy_rfc_test = rfc_evaluator.evaluate(test_results_rfc)

In [80]:

print("Training accuracy for Random forest with 200 trees is %g " % (accuracy_rfc_train))
print("Trest accuracy for Random forest with 200 trees is %g " % (accuracy_rfc_test))

Training accuracy for Random forest with 200 trees is 0.876151 
Trest accuracy for Random forest with 200 trees is 0.853693 


### Gradient Boost Classifier

In [81]:
from pyspark.ml.classification import GBTClassifier

In [82]:
gbt = GBTClassifier(labelCol="income_Index", featuresCol="scaledFeatures",maxDepth=15)

In [83]:
gbtModel=gbt.fit(train_data)

In [84]:
train_results_gbt = gbtModel.transform(train_data)

In [85]:
test_results_gbt = gbtModel.transform(test_data)

In [86]:
gbt_evaluator = MulticlassClassificationEvaluator(
    labelCol="income_Index", predictionCol="prediction", metricName="accuracy")
accuracy_gbt_train = gbt_evaluator.evaluate(train_results_gbt)
accuracy_gbt_test = gbt_evaluator.evaluate(test_results_gbt)

In [87]:
print("Training accuracy for gradient boosted tree is %g " % (accuracy_gbt_train))
print("Test accuracy for gradient boosted tree  %g " % (accuracy_gbt_test))

Training accuracy for gradient boosted tree is 0.960141 
Test accuracy for gradient boosted tree  0.829343 
