In [1]:
import findspark
findspark.init("/u/cs451/packages/spark")

In [2]:
from pyspark.sql import SparkSession
import random
spark = SparkSession.builder.appName("YourTest").master("local[2]").config('spark.ui.port', random.randrange(4000,5000)).getOrCreate()

In [10]:
spark.version

'2.3.1'

In [8]:
df = spark.read\
.option("wholeFile", True)\
.option("inferSchema", True)\
.option("header", True)\
.option("multiline", True)\
.option('quote', '"')\
.option('escape', '"')\
.csv("alldata.csv")
display(df)

DataFrame[position: string, company: string, description: string, reviews: int, location
: string]

In [30]:
df.printSchema()

root
 |-- position: string (nullable = true)
 |-- company: string (nullable = true)
 |-- description: string (nullable = true)
 |-- reviews: integer (nullable = true)
 |-- location: string (nullable = true)



In [35]:
type(df)

pyspark.sql.dataframe.DataFrame

In [36]:
df.count()

6964

In [38]:
len(df.columns), df.columns

(5, ['position', 'company', 'description', 'reviews', 'location\r'])

In [31]:
df.show(3)

+--------------------+------------------+--------------------+-------+------------------+
|            position|           company|         description|reviews|         location|
+--------------------+------------------+--------------------+-------+------------------+
|Development Director|           ALS TDI|Development Direc...|   null|Atlanta, GA 30301 |
|An Ostentatiously...|The Hexagon Lavish|Job Description

...|   null|       Atlanta, GA|
|      Data Scientist|    Xpert Staffing|Growing company l...|   null|       Atlanta, GA|
+--------------------+------------------+--------------------+-------+------------------+
only showing top 3 rows



In [39]:
df.describe().show()

+-------+--------------------+-----------------+--------------------+------------------+-----------------+
|summary|            position|          company|         description|           reviews|        location|
+-------+--------------------+-----------------+--------------------+------------------+-----------------+
|  count|                6953|             6953|                6953|              5326|             6964|
|   mean|                null|           1238.0|                null|3179.0891851295532|             null|
| stddev|                null|895.1971849821691|                null| 8448.515056351427|             null|
|    min|(Contract) Busine...|     10x Genomics|
2+ years of expe...|                 2|                |
|    max|senior data scien...|               쿠팡|职位:数据分析工程师工作地点:加州...|            148114|Yonkers, NY 10701|
+-------+--------------------+-----------------+--------------------+------------------+-----------------+



In [22]:
df.select('company').groupby('company').count().orderBy('count', ascending=0).show(5)

+------------------+-----+
|           company|count|
+------------------+-----+
|        Amazon.com|  358|
|    Ball Aerospace|  187|
|         Microsoft|  137|
|            Google|  134|
|NYU Langone Health|   77|
+------------------+-----+
only showing top 5 rows



In [25]:
df.select('position').groupby('position').count().orderBy('count', ascending=0).show(5)

+--------------------+-----+
|            position|count|
+--------------------+-----+
|      Data Scientist|  351|
|Senior Data Scien...|   96|
|    Research Analyst|   64|
|       Data Engineer|   60|
|Machine Learning ...|   56|
+--------------------+-----+
only showing top 5 rows



In [33]:
df.na.fill({'reviews': 0}).show(5)

+--------------------+------------------+--------------------+-------+------------------+
|            position|           company|         description|reviews|         location|
+--------------------+------------------+--------------------+-------+------------------+
|Development Director|           ALS TDI|Development Direc...|      0|Atlanta, GA 30301 |
|An Ostentatiously...|The Hexagon Lavish|Job Description

...|      0|       Atlanta, GA|
|      Data Scientist|    Xpert Staffing|Growing company l...|      0|       Atlanta, GA|
|        Data Analyst|    Operation HOPE|DEPARTMENT: Progr...|     44|Atlanta, GA 30303 |
|Assistant Profess...|  Emory University|DESCRIPTION
The E...|    550|       Atlanta, GA|
+--------------------+------------------+--------------------+-------+------------------+
only showing top 5 rows



In [34]:
df.select('reviews').show(5)

+-------+
|reviews|
+-------+
|   null|
|   null|
|   null|
|     44|
|    550|
+-------+
only showing top 5 rows



In [41]:
from pyspark.ml.feature import Tokenizer
from pyspark.ml import Pipeline

tokenizer = Tokenizer(inputCol="description", outputCol="words")

pipeline = Pipeline(stages=[tokenizer])

pipelineFit = pipeline.fit(df)
df = pipelineFit.transform(df)
#val_df = pipelineFit.transform(val_set)
#df.show(5)

+--------------------+------------------+--------------------+-------+------------------+--------------------+
|            position|           company|         description|reviews|         location|               words|
+--------------------+------------------+--------------------+-------+------------------+--------------------+
|Development Director|           ALS TDI|Development Direc...|   null|Atlanta, GA 30301 |[development, dir...|
|An Ostentatiously...|The Hexagon Lavish|Job Description

...|   null|       Atlanta, GA|[job, description...|
|      Data Scientist|    Xpert Staffing|Growing company l...|   null|       Atlanta, GA|[growing, company...|
|        Data Analyst|    Operation HOPE|DEPARTMENT: Progr...|     44|Atlanta, GA 30303 |[department:, pro...|
|Assistant Profess...|  Emory University|DESCRIPTION
The E...|    550|       Atlanta, GA|[description, the...|
+--------------------+------------------+--------------------+-------+------------------+--------------------+
o

In [44]:
df.select('words').take(2)

[Row(words=['development', 'director', 'als', 'therapy', 'development', 'institute', 'has', 'an', 'immediate', 'opening', 'for', 'development', 'directors.', 'reporting', 'directly', 'to', 'the', 'senior', 'development', 'director,', 'the', 'development', 'director', 'at', 'als', 'tdi', 'is', 'a', 'senior', 'fundraising', 'position', 'working', 'to', 'identifying', 'potential', 'prospects', 'and', 'cultivating', 'solicitation', 'strategies', 'and', 'in', 'closing', 'asks', 'with', 'donors', 'including', 'individuals', 'and', 'corporations', 'by', 'building', 'networks', 'via', 'events,', 'generating', 'awareness', 'of', 'als', 'tdi;', 'outreach', 'including', 'attending', 'and', 'speaking', 'at', 'events', 'as', 'well', 'as', 'personally', 'cultivates', 'relationships', 'with', 'patients,', 'prospects', 'and', 'donors.', 'this', 'position', 'will', 'be', 'responsible', 'for', 'generating', 'and', 'managing', 'a', 'portfolio', 'of', 'at', 'least', 'two', 'million', 'to', 'five', 'millio

In [11]:
import pandas as pd
df_pandas = pd.read_csv("alldata.csv")

In [12]:
df_pandas.head()

Unnamed: 0,position,company,description,reviews,location
0,Development Director,ALS TDI,Development Director\nALS Therapy Development ...,,"Atlanta, GA 30301"
1,An Ostentatiously-Excitable Principal Research...,The Hexagon Lavish,"Job Description\n\n""The road that leads to acc...",,"Atlanta, GA"
2,Data Scientist,Xpert Staffing,"Growing company located in the Atlanta, GA are...",,"Atlanta, GA"
3,Data Analyst,Operation HOPE,DEPARTMENT: Program OperationsPOSITION LOCATIO...,44.0,"Atlanta, GA 30303"
4,Assistant Professor -TT - Signal Processing & ...,Emory University,DESCRIPTION\nThe Emory University Department o...,550.0,"Atlanta, GA"
