In [179]:
import pyspark
import csv

In [180]:
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.types import StructField, StructType, StringType, LongType, IntegerType, FloatType
from pyspark.sql.functions import lower, col, column, regexp_replace, concat_ws, udf
from pyspark.sql.functions import expr, size
from pyspark.sql.functions import split, trim
from pyspark.sql import Row

In [181]:
# Obtain the Spark session for this notebook under the application name "Semdis".
ss = (
    SparkSession.builder
    .appName("Semdis")
    .getOrCreate()
)

In [182]:
# Load the story ratings CSV: the first line supplies the column names and
# Spark infers each column's type from the data.
text_DF = ss.read.csv(
    "/storage/home/sqs6406/StoryResults.csv",
    header=True,
    inferSchema=True,
)
text_DF.printSchema()
# Last expression of the cell: displays the first row.
text_DF.first()

root
 |-- ID: string (nullable = true)
 |-- EnglishFirstLanguage: string (nullable = true)
 |-- Story: string (nullable = true)
 |-- Rater1: integer (nullable = true)
 |-- Rater2: integer (nullable = true)
 |-- Rater3: integer (nullable = true)
 |-- Rater4: double (nullable = true)
 |-- Rater5: integer (nullable = true)
 |-- RaterMean: double (nullable = true)



Row(ID='1', EnglishFirstLanguage='n', Story='My lover has left. I miss him so much. I write a letter with his favorite stamp pattern. I send it to an unknown place where he may stay.', Rater1=3, Rater2=3, Rater3=1, Rater4=2.0, Rater5=3, RaterMean=2.4)

In [183]:
# Peek at the first value of the free-text "Story" column.
text_DF.select(col("Story")).show(n=1)

+--------------------+
|               Story|
+--------------------+
|My lover has left...|
+--------------------+
only showing top 1 row



In [184]:
# Keep only the "Story" column for the text-cleaning steps.
Story_DF = text_DF.select(col("Story"))
Story_DF.show(n=2)

+--------------------+
|               Story|
+--------------------+
|My lover has left...|
|My mom was sendin...|
+--------------------+
only showing top 2 rows



In [185]:
def lower_clean_str(x):
    """Return ``x`` lowercased with every ASCII punctuation character removed.

    All other characters (letters, digits, whitespace) are kept, in their
    lowercased form.
    """
    punc = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
    # Mapping each punctuation character to None makes translate() delete it.
    strip_table = str.maketrans('', '', punc)
    return x.lower().translate(strip_table)
def removePunctuation(column):
    """Build a cleaned-text column expression from ``column``.

    Every character that is neither an ASCII letter nor whitespace is removed
    (so digits and punctuation are dropped), the result is lowercased and
    trimmed, and the final expression is aliased ``'Story'``.

    Args:
        column: The column to clean. It is handed unchanged to
            ``regexp_replace``, so it may be whatever that function accepts
            (a ``Column`` or a column name).

    Returns:
        The cleaned column expression, named ``'Story'``.
    """
    # Clean the column that was actually passed in rather than a hard-coded
    # "Story". The raw string keeps ``\s`` intact for the regex engine without
    # triggering Python's invalid-escape-sequence warning.
    letters_and_space_only = regexp_replace(column, r'[^\sa-zA-Z]', '')
    return trim(lower(letters_and_space_only)).alias('Story')
# Trim the raw text first, then replace the column with its cleaned form
# as produced by removePunctuation.
Trim_DF = Story_DF.withColumn(
    "Story",
    trim(col("Story")),
)
cleaned_story = removePunctuation(col('Story'))
Strip_DF = Trim_DF.select(cleaned_story)
Strip_DF.show(n=20)

+--------------------+
|               Story|
+--------------------+
|my lover has left...|
|my mom was sendin...|
|i had to buy a st...|
|i decided it was ...|
|rachel was sittin...|
|she finally found...|
|once upon a time ...|
|i was trying to s...|
|kristen wanted to...|
|last summer i wro...|
|the mob boss had ...|
|i used a stamp wi...|
|when i received a...|
|i want to send a ...|
|david needed to s...|
|i am living in en...|
|i ran into a dile...|
|she misses him so...|
|in a day in summe...|
|the parent was se...|
+--------------------+
only showing top 20 rows



In [186]:
# Tokenise on runs of whitespace rather than on a single literal space.
# Splitting on " " produces empty-string "words" wherever two spaces are
# adjacent (e.g. where " - " had its punctuation stripped) and leaves tabs and
# newlines inside tokens, both of which distort any word count taken from
# the resulting array.
Story_DF2 = Strip_DF.select(split(col("Story"), r"\s+").alias("Story"))
Story_DF2.show()

+--------------------+
|               Story|
+--------------------+
|[my, lover, has, ...|
|[my, mom, was, se...|
|[i, had, to, buy,...|
|[i, decided, it, ...|
|[rachel, was, sit...|
|[she, finally, fo...|
|[once, upon, a, t...|
|[i, was, trying, ...|
|[kristen, wanted,...|
|[last, summer, i,...|
|[the, mob, boss, ...|
|[i, used, a, stam...|
|[when, i, receive...|
|[i, want, to, sen...|
|[david, needed, t...|
|[i, am, living, i...|
|[i, ran, into, a,...|
|[she, misses, him...|
|[in, a, day, in, ...|
|[the, parent, was...|
+--------------------+
only showing top 20 rows



In [187]:
# Append the number of elements in each row's "Story" array as Story_Length.
Count_DF = Story_DF2.withColumn("Story_Length", size(col("Story")))
Count_DF.show(n=20)
# The aggregate yields a single row with a single column: the largest Story_Length.
max_row = Count_DF.agg({"Story_Length": "max"}).first()
max_value = max_row[0]
print(max_value)

+--------------------+------------+
|               Story|Story_Length|
+--------------------+------------+
|[my, lover, has, ...|          29|
|[my, mom, was, se...|          60|
|[i, had, to, buy,...|          45|
|[i, decided, it, ...|          79|
|[rachel, was, sit...|          69|
|[she, finally, fo...|          39|
|[once, upon, a, t...|          97|
|[i, was, trying, ...|          43|
|[kristen, wanted,...|          56|
|[last, summer, i,...|          65|
|[the, mob, boss, ...|          62|
|[i, used, a, stam...|          59|
|[when, i, receive...|          47|
|[i, want, to, sen...|          50|
|[david, needed, t...|          16|
|[i, am, living, i...|          96|
|[i, ran, into, a,...|         104|
|[she, misses, him...|          41|
|[in, a, day, in, ...|          48|
|[the, parent, was...|          39|
+--------------------+------------+
only showing top 20 rows

104


In [188]:
# Spread the "Story" array across max_value columns, one array position per
# column (positions 0 .. max_value - 1).
story_words = Count_DF["Story"]
word_columns = [story_words[position] for position in range(max_value)]
Split_DF = Count_DF.select(word_columns)

In [189]:
# Preview the first 20 rows of the one-word-per-column frame (the output is very wide).
Split_DF.show(n=20)

+--------+--------+--------+--------+---------+--------+---------+--------+------------+---------+---------+-----------+---------+---------+---------+---------+---------+---------+---------+-----------+---------+---------+-------------+---------+-----------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+----------+---------+---------+-----------+---------+---------+---------+---------+---------+---------+-----------+---------+---------+-----------+---------+---------+---------+---------+---------+------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+----------+---------+---------+---------+---------+---------+---------+-----------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+------------+---------+---------+---------+------

In [194]:
# Spark's default save mode raises an error when the target path already
# exists, so this cell would fail every time the notebook is re-run.
# Overwrite the previous export instead.
# Note: Spark writes 'Words.csv' as a directory of part files, not a single file.
Split_DF.write.csv('Words.csv', mode='overwrite')