# Introduction
In the following notebook, we will use Pyspark to perform various analyses on our salary-related data for *Science, Technology, Engi

In [2]:
# Run findspark's initialisation before anything else; pyspark itself is only
# imported in the following cell.
# NOTE(review): presumably this is what makes the local Spark install importable — confirm.
import findspark
findspark.init()

In [3]:
import pyspark
from pyspark.sql import SparkSession, Row
from pyspark.sql.types import (
    FloatType,
    IntegerType,
    StringType,
    StructField,
    StructType,
)

In [5]:
# Obtain the Spark session for this notebook (an existing one is reused,
# otherwise a new one is created) with the master set to "local".
spark = (
    SparkSession.builder
    .master("local")
    .getOrCreate()
)
# Leave the session as the cell's last expression so that it is displayed.
spark

In [7]:
# Explicit schema for the salary file, so every column gets a fixed type
# instead of being inferred. Each StructField is (column name, type, nullable).
schema = StructType([
    StructField("timestamp", StringType(), True),
    StructField("company", StringType(), True),
    StructField("level", StringType(), True),
    StructField("title", StringType(), True),
    StructField("totalyearlycompensation", IntegerType(), False),
    StructField("location", StringType(), True),
    StructField("yearsofexperience", FloatType(), False),
    StructField("yearsatcompany", FloatType(), False),
    StructField("tag", StringType(), True),
    StructField("basesalary", IntegerType(), False),
    StructField("stockgrantvalue", IntegerType(), False),
    StructField("bonus", IntegerType(), False),
    StructField("gender", StringType(), True),
    StructField("cityid", StringType(), True),
    StructField("dmaid", StringType(), True),
    StructField("race", StringType(), True),
    StructField("education", StringType(), True),
])

# Load and parse the data file, converting it to a DataFrame.
# The file is tab-delimited and has no header row, so the column names come
# from the schema above.
data = (
    spark.read.format("csv")
    .option("header", "false")
    .option("delimiter", "\t")
    .schema(schema)
    .load("data/cleaned.txt")
)

# Preview the first five rows as a sanity check on the parse.
data.show(n=5)

+------------------+---------+-----+--------------------+-----------------------+-----------------+-----------------+--------------+---+----------+---------------+-----+------+------+-----+----+---------+
|         timestamp|  company|level|               title|totalyearlycompensation|         location|yearsofexperience|yearsatcompany|tag|basesalary|stockgrantvalue|bonus|gender|cityid|dmaid|race|education|
+------------------+---------+-----+--------------------+-----------------------+-----------------+-----------------+--------------+---+----------+---------------+-----+------+------+-----+----+---------+
|  06/07/2017 11:33|   ORACLE|   L3|     Product Manager|                 127000| Redwood City, CA|              1.5|           1.5| NA|    107000|          20000|10000|    NA|  7392|  807|  NA|       NA|
|  06/10/2017 17:11|     EBAY| SE 2|   Software Engineer|                 100000|San Francisco, CA|              5.0|           3.0| NA|    141907|              0|    0|    NA|  74

[Stage 0:>                                                          (0 + 1) / 1]                                                                                

In [10]:
# List the column names of the loaded DataFrame, in schema order.
data.columns

['timestamp',
 'company',
 'level',
 'title',
 'totalyearlycompensation',
 'location',
 'yearsofexperience',
 'yearsatcompany',
 'tag',
 'basesalary',
 'stockgrantvalue',
 'bonus',
 'gender',
 'cityid',
 'dmaid',
 'race',
 'education']

## Correlation

In [14]:
# Correlation between total yearly compensation and base salary, using the
# default method of DataFrame.stat.corr (no method argument given).
data.stat.corr(
    col1="totalyearlycompensation",
    col2="basesalary",
)



0.7576764284758558

In [22]:
# Same correlation, this time requesting the Pearson coefficient explicitly.
data.stat.corr(
    col1="totalyearlycompensation",
    col2="basesalary",
    method="pearson",
)

0.7576764284758558

In [23]:
# DataFrame.stat.corr rejects method="spearman" with a ValueError (only the
# Pearson coefficient is supported). Catch the error and print its message so
# the limitation is still demonstrated but "Restart & Run All" is not halted
# at this cell.
try:
    spearman_corr = data.stat.corr("totalyearlycompensation", "basesalary", "spearman")
except ValueError as err:
    spearman_corr = None
    print("ValueError:", err)

# Displays the coefficient if the call succeeded; shows nothing when it is None.
spearman_corr

ValueError: Currently only the calculation of the Pearson Correlation coefficient is supported.

## DataFrame Generic Guide

In [27]:
# describe() returns another DataFrame (columns: summary, basesalary — both strings);
# as a bare expression only its repr is displayed, not the statistics themselves.
data.describe("basesalary")

DataFrame[summary: string, basesalary: string]

In [28]:
# Keep the summary-statistics DataFrame for basesalary and print it with show():
# count, mean, stddev, min and max.
summary = data.describe("basesalary")
summary.show()

+-------+------------------+
|summary|        basesalary|
+-------+------------------+
|  count|             62642|
|   mean|141906.68246543853|
| stddev| 55251.72216070826|
|    min|              1000|
|    max|           1659870|
+-------+------------------+

