In [None]:
# Evaluate the Spark context handle so the notebook displays it; a quick check
# that it exists before the examples use it.
# NOTE(review): `sc` is not created in this notebook — presumably the kernel injects it; confirm.
sc

 #  RDD API Examples

## Word Count
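In this example, we use a few transformations to build a dataset of (String, Int) pairs called `counts`, sort it by frequency, and display the 50 most common words. (Saving the result to a file is left commented out at the end of the cell.)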
```
sc.textFile(name, minPartitions=None, use_unicode=True)
Read a text file from HDFS, a local file system (available on all nodes), or any Hadoop-supported file system URI, and return it as an RDD of Strings.
```


In [None]:
import os

# Alternative that requires the file to be available on every worker node:
#   text_file = sc.textFile(os.getcwd()+"/../datasets/quijote.txt")
# To avoid copying a local file to all workers, the file is read on the driver
# and its lines are distributed with parallelize() instead.

# Strip the trailing newline from every line, as sc.textFile() would do.
# Otherwise the last word of each line keeps its "\n" and is counted as a
# different word from the same word appearing mid-line.
with open('../datasets/quijote.txt') as my_file:
    lines = [line.rstrip("\n") for line in my_file]
text_file = sc.parallelize(lines)

# Classic word count: emit one (word, 1) pair per token, then sum per word.
counts = text_file.flatMap(lambda line: line.split(" ")) \
             .map(lambda word: (word, 1)) \
             .reduceByKey(lambda a, b: a + b)

# Order by descending frequency.
# NOTE: sortBy is not as efficient as sortByKey since it involves keying by the values,
# sorting by the keys, and then grabbing the values
counts = counts.sortBy(lambda a: a[1], ascending=False)

# Last expression of the cell: the 50 most frequent (word, count) pairs.
counts.take(50)

#counts.saveAsTextFile(os.path.join("/notebooks/","quixote-counts.txt"))


## Pi Estimation

Spark can also be used for compute-intensive tasks. This code estimates pi by "throwing darts" at a circle. We pick random points in the unit square ((0, 0) to (1,1)) and see how many fall in the unit circle. The fraction should be pi / 4, so we use this to get our estimate.

In [None]:
import random

# Number of random points ("darts") drawn for the Monte Carlo estimate of pi below.
NUM_SAMPLES=12000000

def inside(p):
    """Throw one random dart at the unit square and report whether it hit the circle.

    Args:
        p: Ignored. It is only present so the function can be used as a
            one-argument predicate (e.g. with ``filter``).

    Returns:
        bool: True when the random point (x, y), drawn uniformly from
        [0, 1) x [0, 1), lies strictly inside the circle of radius 1
        centred at the origin.
    """
    x = random.random()
    y = random.random()
    squared_distance = x*x + y*y
    return squared_distance < 1

# One dart per element of the range; the element values themselves are ignored
# by `inside`, only how many there are matters.
count = (
    sc.parallelize(range(NUM_SAMPLES))
    .filter(inside)
    .count()
)
# Hits / throws approximates pi / 4, so scale by 4.
print("Pi is roughly {}".format(4.0 * count / NUM_SAMPLES))

# DataFrame API Examples

### Testing conversion to/from pandas with Arrow

In [None]:
# numpy and pandas are used below but are not imported anywhere earlier in the
# notebook, so import them here to keep the cell runnable on a fresh kernel.
import numpy as np
import pandas as pd

# Enable Arrow-based columnar data transfers
# NOTE(review): Spark 3.x deprecates this key in favour of
# "spark.sql.execution.arrow.pyspark.enabled" — confirm against the cluster's Spark version.
spark.conf.set("spark.sql.execution.arrow.enabled", "true")

# Generate a Pandas DataFrame (100 rows x 3 columns of uniform random floats)
pdf = pd.DataFrame(np.random.rand(100, 3))

# Create a Spark DataFrame from a Pandas DataFrame using Arrow
df = spark.createDataFrame(pdf)

# Convert the Spark DataFrame back to a Pandas DataFrame using Arrow
result_pdf = df.select("*").toPandas()


In this example, we count all the lines of the Quijote that mention Dulcinea.

In [None]:
from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import col


# Wrap every text line in a Row so the RDD becomes a DataFrame with a single "line" column
df = text_file.map(lambda text: Row(text)).toDF(["line"])

# Column predicates (SQL LIKE patterns) shared by the queries below
mentions_dulcinea = col("line").like("%Dulcinea%")
mentions_quijote = col("line").like("%Quijote%")

# Number of lines that mention Dulcinea
dulcinea_lines = df.filter(mentions_dulcinea)
print("There are {} lines with 'Dulcinea'".format(dulcinea_lines.count()))

# Number of those lines that also mention Quijote
dulcinea_and_quijote_lines = dulcinea_lines.filter(mentions_quijote)
print("There are {} lines with 'Dulcinea' and 'Quijote'".format(
    dulcinea_and_quijote_lines.count()))

# Bring the matching rows back to the driver (displayed as the cell output)
dulcinea_and_quijote_lines.collect()

### Exploring the superheroes dataset

In [None]:
from pyspark.sql.types import *

# pandas is used below but is not imported anywhere earlier in the notebook.
import pandas as pd

# To avoid copying a local file to all workers, we create a pandas DataFrame at
# the driver and convert it to a Spark DataFrame.
# Enable Arrow-based columnar data transfers
spark.conf.set("spark.sql.execution.arrow.enabled", "true")
superhero_pdf = pd.read_csv("../datasets/superheroes_info.csv",index_col='Index')


# We explicitly set the schema to avoid problems with mapping pandas NaN strings to a Spark DataFrame.
# If not set, Spark will try to convert NaN to DoubleType with error ->
# Can not merge type <class 'pyspark.sql.types.StringType'> and <class 'pyspark.sql.types.DoubleType'>
# Only the pandas dtypes listed here are supported; any other dtype raises KeyError.
mapping = {'object': StringType, 'float64': FloatType}

# Series.items() is used instead of iteritems(): iteritems() was removed in
# pandas 2.0, while items() is available in both old and new versions.
superhero_schema = StructType([
    StructField(name, mapping[dtype.name]())
    for name, dtype in superhero_pdf.dtypes.items()
])
superhero_df = spark.createDataFrame(superhero_pdf, schema=superhero_schema)

superhero_df.show(10)


In [None]:
from pyspark.sql.functions import isnan, when, count, col
# Short alias for the superheroes DataFrame (possibly used by later cells).
df = superhero_df

# Number of rows per publisher. Keep the aggregated DataFrame in `publisher_df`
# and display it separately: show() only prints the table and returns None, so
# chaining it into the assignment would leave `publisher_df` set to None.
publisher_df = superhero_df.groupby("Publisher").count()
publisher_df.show()

### Spark SQL Example

In [None]:
# Expose the superheroes DataFrame to Spark SQL under the name "superhero_table"
superhero_df.createOrReplaceTempView("superhero_table")

# Query the view with plain SQL and print the resulting rows
name_gender_status_df = spark.sql("select Name,Gender,Status from superhero_table")
name_gender_status_df.show()