In [2]:
from pyspark.sql import SparkSession
from pyspark.sql import *
from pyspark.ml import Pipeline

In [3]:
# Application name and master URL for the Spark session.
appname = "large_read_tar"
# NOTE(review): `master` is defined but never handed to the builder below;
# confirm whether `.master(master)` was intended.
master = "local"

# Create (or reuse) the Spark session.
spark = SparkSession.builder.appName(appname).getOrCreate()

# Later cells call sc.parallelize(...), but no visible cell defines `sc`.
# Take it from the session so the notebook runs on a fresh kernel.
sc = spark.sparkContext

In [6]:
from pyspark.sql import Row
from pyspark.sql.functions import *
# Two sample rows: a string column and a column holding a list of
# (letter, number) tuples.
test_rows = [
    Row(col1="abc12edf", col2=[('a', 14), ('b', 10)]),
    Row(col1="ddddd12bldasd", col2=[('a', 26), ('c', 20)]),
]
df_test = spark.createDataFrame(test_rows)
df_test.show()

+-------------+----------------+
|         col1|            col2|
+-------------+----------------+
|     abc12edf|[[a,14], [b,10]]|
|ddddd12bldasd|[[a,26], [c,20]]|
+-------------+----------------+



In [3]:
# Pair RDD of (number, letter); the key 8 appears three times.
number_letter_pairs = [
    (1, 'a'), (1, 'b'), (2, 'c'), (3, 'd'),
    (5, 'e'), (8, '&'), (8, '&'), (8, '&'),
]
rdd = sc.parallelize(number_letter_pairs)

# Scratch RDD mixing pairs with a bare int; it is only collected here.
r = sc.parallelize([("a", 1), ("b", 1), ("a", 1), 123])
r.collect()  # not the cell's last expression, so this result is not displayed

# reduceByKey hands the lambda two *values* that share a key, so the
# letters belonging to one number are concatenated.
a = rdd.reduceByKey(lambda left, right: left + right)
a.take(1)

[(8, '&&&')]

In [81]:
# Re-key every (number, letter) pair by its letter; the whole original
# pair becomes the value.
k_rdd = rdd.keyBy(lambda pair: pair[1])
k_rdd.collect()

[('a', (1, 'a')),
 ('b', (1, 'b')),
 ('c', (2, 'c')),
 ('d', (3, 'd')),
 ('e', (5, 'e')),
 ('&', (8, '&')),
 ('&', (8, '&')),
 ('&', (8, '&'))]

In [90]:
from operator import add
# For each letter key, combine its (number, letter) values element-wise:
# the numbers are added and the letters concatenated.
k_rdd.reduceByKey(
    lambda left, right: (left[0] + right[0], left[1] + right[1])
).collect()

[('a', (1, 'a')),
 ('c', (2, 'c')),
 ('e', (5, 'e')),
 ('b', (1, 'b')),
 ('d', (3, 'd')),
 ('&', (24, '&&&'))]

In [5]:
# Three 3-field records; the key "a" occurs twice.
x = sc.parallelize([("a", 1, 100), ("b", 3, 300), ("a", 2, 2000)])
# Key each record by its first field, keeping the full record as the value.
x_k = x.keyBy(lambda record: record[0])
x_k.take(1)

[('a', ('a', 1, 100))]

In [14]:
# Turns the first raw row seen for a key into the initial combiner;
# the result is still a tuple.
def createCombiner(row):
    """Build the initial combiner from a raw row.

    Prints the number of fields in ``row`` and returns a tuple of
    ``row[1]``, ``row[2]`` and that field count.
    """
    field_count = len(row)
    print(field_count)
    first, second = row[1], row[2]
    return (first, second, field_count)


# Merges a raw value V into an existing combiner C.
def mergeValue(row, new_row):
    """Fold one raw record into the running combiner.

    combineByKey calls this as ``mergeValue(C, V)``:

    row     -- the combiner built so far; slots 0 and 1 hold the two
               running sums (any further slots are ignored).
    new_row -- the raw keyed record ``(key, v1, v2)``.

    Returns a 2-tuple ``(sum1 + v1, sum2 + v2)``.

    The previous version indexed the two arguments the wrong way round
    (``row[1] + new_row[0]``), which added a running sum to the record's
    key.
    """
    return (row[0] + new_row[1], row[1] + new_row[2])
    

# Combines two combiners C into one.
def mergeCombiners(r1, r2):
    """Add the first two slots of two combiners element-wise.

    Returns a 2-tuple; any slots beyond index 1 are dropped.
    """
    total_first = r1[0] + r2[0]
    total_second = r1[1] + r2[1]
    return (total_first, total_second)


In [17]:
# Aggregate the keyed records with the three combiner callbacks and
# peek at a single (key, combined) result.
r = x_k.combineByKey(
    createCombiner=createCombiner,
    mergeValue=mergeValue,
    mergeCombiners=mergeCombiners,
)
r.take(1)

[('a', (3, 2100))]

In [80]:
# Gather all values sharing a key, then turn each group into a plain
# list so it displays readably.
grouped = k_rdd.groupByKey()
grouped.mapValues(list).collect()

[('a', [(1, 'a')]),
 ('c', [(2, 'c')]),
 ('e', [(5, 'e')]),
 ('b', [(1, 'b')]),
 ('d', [(3, 'd')]),
 ('&', [(8, '&'), (8, '&'), (8, '&')])]

In [23]:
# Bring every element of `rdd` back to the driver for display.
# NOTE(review): the recorded output below is a flat list of ints, which
# does not match the pair RDD built earlier in this notebook; presumably
# it came from an older definition of `rdd`. Re-run to confirm.
rdd.collect()

[1, 1, 2, 3, 5, 8]

In [15]:
# Sequence op for aggregate(): returns rez as a tuple
# (collected_rows, count_tweets).
def seqOp(rez, row):
    """Fold one row into the partial result of a partition.

    rez -- the running ``(list, count)`` tuple.
    row -- the next element of the RDD.

    Returns a new tuple with ``row`` appended to the list and the count
    incremented; ``rez`` itself is not mutated.

    NOTE(review): the original body was the unfinished expression
    ``rez[0] +`` (a syntax error). Appending the whole row is an
    assumption; confirm what should be collected.
    """
    collected, count = rez[0], rez[1]
    return (collected + [row], count + 1)
def comOp(p1, p2):
    """Combine two partial ``(list, count)`` results into one.

    Concatenates the two lists and adds the two counts, returning a new
    tuple. The previous stub returned ``None``, which discarded every
    partial result.
    """
    return (p1[0] + p2[0], p1[1] + p2[1])

# Fold the rows of df_test into a single value: start from the zero
# value ([], 0), apply seqOp to each row, and merge partial results
# with comOp.
r = df_test.rdd.aggregate(([],0), seqOp, comOp)


MapPartitionsRDD[16] at javaToPython at NativeMethodAccessorImpl.java:0

In [109]:
# Test Space

# Column expression that splits col1 wherever a run of digits occurs.
# This only builds the expression; it is evaluated when used in a select.
rez = split(df_test['col1'], '[0-9]+')
rez

Column<split(col1, [0-9]+)>

In [110]:
# Display df_test for reference before the split expression is applied.
# NOTE(review): the recorded output shows an integer col2, unlike the
# list-of-tuples col2 built earlier in this notebook; presumably df_test
# was redefined in a cell that is no longer present. Re-run to confirm.
df_test.show()

+-------------+----+
|         col1|col2|
+-------------+----+
|     abc12edf|  11|
|ddddd12bldasd|  22|
+-------------+----+



In [111]:
# Project the split expression into its own column, then show the
# source frame again.
split_view = df_test.select(rez.alias('split_op'))
split_view.show()
print("df_test is not affected ")
df_test.show()

+---------------+
|       split_op|
+---------------+
|     [abc, edf]|
|[ddddd, bldasd]|
+---------------+

df_test is not affected 
+-------------+----+
|         col1|col2|
+-------------+----+
|     abc12edf|  11|
|ddddd12bldasd|  22|
+-------------+----+



In [112]:
def map_function(partition):
    """Walk a partition, appending 'dota2' to field 1 of every row.

    The concatenated value is neither stored nor returned, so the
    function always returns None.
    """
    suffix = 'dota2'
    for record in partition:
        record[1] + suffix  # result is discarded
    
# NOTE(review): `t1` is not defined in any visible cell; confirm where
# it comes from before relying on this cell.
# foreachPartition runs map_function on each partition and returns
# nothing; the frame is then displayed again.
t1.foreachPartition(map_function)
t1.show()


+---+-----+
|age| name|
+---+-----+
|  1|Alice|
| 11|  Bob|
+---+-----+



In [113]:
# Show df_test once more.
df_test.show()

+-------------+----+
|         col1|col2|
+-------------+----+
|     abc12edf|  11|
|ddddd12bldasd|  22|
+-------------+----+



In [114]:
# Print the column names, types and nullability of df_test.
df_test.printSchema()

root
 |-- col1: string (nullable = true)
 |-- col2: long (nullable = true)



In [115]:
# Single-column DataFrame holding only col1.
a = df_test.select(df_test['col1'])

In [118]:
# View the single-column DataFrame as an RDD and peek at its first element.
ar = a.rdd
ar.take(1)

[Row(col1=u'abc12edf')]

In [31]:
# type(a) --> pyspark.sql.dataframe.DataFrame
# When mapping, wrap the result in a tuple (note the trailing comma):
# every element of `b` is a one-item tuple.
# NOTE(review): this cell appears to target Python 2; on Python 3
# .encode() returns bytes, and adding a str to bytes raises TypeError.
b = a.rdd.map(lambda rec: (rec[0].encode('utf-8') + 'dota2',))
type(b)
#b.collect()

pyspark.rdd.PipelinedRDD

In [35]:
# Materialise the mapped RDD on the driver as a Python list.
# NOTE(review): an assignment displays nothing, so the list recorded
# below presumably came from an earlier version of this cell.
t = b.collect()

[('abc12edfdota2',), ('ddddd12bldasddota2',)]

In [43]:
# Append '_added' to the string inside each one-item tuple of `b`.
# There is no trailing comma here, so each result is a plain string,
# not a tuple.
r = b.map(lambda item: item[0] + '_added')
r.collect()

['abc12edfdota2_added', 'ddddd12bldasddota2_added']

In [41]:
# import all possible types
from pyspark.sql.types import *

# StructField signature: StructField(name, dataType, nullable=True,
# metadata=None)
# Schema with a single nullable string column named "col2".
col2_field = StructField("col2", StringType(), True)
schema = StructType([col2_field])
df_modified = spark.createDataFrame(b, schema)

In [34]:
# Display the DataFrame rebuilt from the mapped RDD.
df_modified.show()

+------------------+
|              col2|
+------------------+
|     abc12edfdota2|
|ddddd12bldasddota2|
+------------------+

