## Importing pyspark, findspark

In [1]:
import findspark

# findspark.init() puts the local Spark installation on sys.path,
# which is why it runs before pyspark is imported below.
findspark.init()

from pyspark.sql import SparkSession

# Reuse an already-running session if there is one, otherwise start a new one.
spark = (
    SparkSession.builder
    .appName('Pyspark_examples_02')
    .getOrCreate()
)

## Creating a DataFrame using spark.createDataFrame()

In [2]:
# Sample records, one tuple per person: (firstname, lastname, country, state).
data = [
    ("James", "Smith", "USA", "CA"),
    ("Michael", "Rose", "USA", "NY"),
    ("Robert", "Williams", "USA", "CA"),
    ("Maria", "Jones", "USA", "FL"),
]

# Column names, listed in the same order as the fields of each tuple above.
columns = ["firstname", "lastname", "country", "state"]

In [4]:
# Build a Spark DataFrame from the in-memory tuples; `columns` supplies the column names.
df = spark.createDataFrame(data, schema=columns)

# show() prints a tabular preview; collect() returns the rows as a list of Row objects.
df.show()
df.collect()

+---------+--------+-------+-----+
|firstname|lastname|country|state|
+---------+--------+-------+-----+
|    James|   Smith|    USA|   CA|
|  Michael|    Rose|    USA|   NY|
|   Robert|Williams|    USA|   CA|
|    Maria|   Jones|    USA|   FL|
+---------+--------+-------+-----+



[Row(firstname='James', lastname='Smith', country='USA', state='CA'),
 Row(firstname='Michael', lastname='Rose', country='USA', state='NY'),
 Row(firstname='Robert', lastname='Williams', country='USA', state='CA'),
 Row(firstname='Maria', lastname='Jones', country='USA', state='FL')]

## Selecting a specific column by position

In [6]:
# Go through the underlying RDD and take field index 3 of every Row,
# which is the "state" column given the column order used to build `df`.
states_data = (
    df.rdd
    .map(lambda row: row[3])
    .collect()
)
states_data

['CA', 'NY', 'CA', 'FL']

## Removing Duplicates from the output

In [7]:
from collections import OrderedDict

# OrderedDict.fromkeys keeps only the first occurrence of each key, in insertion
# order, so unpacking its keys yields the distinct states in their original order.
res = [*OrderedDict.fromkeys(states_data)]
res

['CA', 'NY', 'FL']

## Selecting a specific column by name

In [8]:
# Same extraction as the positional version, but reading the field by its name.
states2 = (
    df.rdd
    .map(lambda row: row.state)
    .collect()
)
print(states2)

['CA', 'NY', 'CA', 'FL']


In [9]:
# Selecting through the DataFrame API and collecting returns Row objects,
# not bare values.
states3 = df.select(df.state).collect()
print(states3)

[Row(state='CA'), Row(state='NY'), Row(state='CA'), Row(state='FL')]


In [10]:
# Each selected Row holds a single field; flatMap iterates over every Row,
# so the collected result is a flat list of the state values.
states4 = (
    df.select(df.state)
    .rdd
    .flatMap(lambda row: row)
    .collect()
)
print(states4)

['CA', 'NY', 'CA', 'FL']


## Converting a Spark DataFrame to a pandas DataFrame

In [11]:
# Convert the single selected column to pandas and pick it out as a Series.
states5 = df.select(df.state).toPandas()['state']

# Materialise the Series values as a plain Python list.
states6 = [state for state in states5]
print(states6)

['CA', 'NY', 'CA', 'FL']


In [12]:
# Convert two Spark columns to a pandas DataFrame in one go.
pandDF = df.select(df.state, df.firstname).toPandas()

# Print each pandas column as a plain Python list.
print(pandDF['state'].tolist())
print(pandDF['firstname'].tolist())

['CA', 'NY', 'CA', 'FL']
['James', 'Michael', 'Robert', 'Maria']


In [15]:
# Display the pandas DataFrame (state and firstname columns) as the cell's output.
pandDF

Unnamed: 0,state,firstname
0,CA,James
1,NY,Michael
2,CA,Robert
3,FL,Maria


## Creating a DataFrame using pyspark.sql.types, pyspark.sql.functions

In [16]:
from pyspark.sql.functions import col
from pyspark.sql.functions import to_timestamp, current_timestamp
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, LongType

In [17]:
# Explicit schema: a single nullable string column called "seq".
temp_schema = StructType([
    StructField("seq", StringType(), nullable=True),
])

# One row with one field; the row is written as a 1-tuple (note the trailing comma).
new_data = [('1',)]

new_df = spark.createDataFrame(new_data, schema=temp_schema)
new_df.show()

+---+
|seq|
+---+
|  1|
+---+



## Creating a pandas DataFrame using pd.DataFrame

In [24]:
import pandas as pd

# [name, age] records used to seed the pandas DataFrame.
new_data1 = [
    ['Scott', 50],
    ['Jeff', 45],
    ['Thomas', 54],
    ['Ann', 34],
]

pandasDF = pd.DataFrame(new_data1, columns=['Name', 'age'])
pandasDF

Unnamed: 0,Name,age
0,Scott,50
1,Jeff,45
2,Thomas,54
3,Ann,34


In [26]:
# Spark accepts a pandas DataFrame directly; column names carry over
# and the column types are inferred from the pandas data.
sparkDF1 = spark.createDataFrame(data=pandasDF)
sparkDF1.show()

+------+---+
|  Name|age|
+------+---+
| Scott| 50|
|  Jeff| 45|
|Thomas| 54|
|   Ann| 34|
+------+---+



In [27]:
# Print the column names, types and nullability that Spark inferred from the pandas DataFrame.
sparkDF1.printSchema()

root
 |-- Name: string (nullable = true)
 |-- age: long (nullable = true)



### Using collect() to return all rows of the DataFrame

In [30]:
# collect() returns every row as a list of Row objects; the list is shown as the cell output.
sparkDF1.collect()

[Row(Name='Scott', age=50),
 Row(Name='Jeff', age=45),
 Row(Name='Thomas', age=54),
 Row(Name='Ann', age=34)]

### Using count() to get the number of rows in the DataFrame

In [31]:
# count() returns the number of rows in the DataFrame.
sparkDF1.count()

4

### Using take(n) to return the first n rows of the DataFrame

In [32]:
# take(2) returns the first two rows as a list of Row objects.
sparkDF1.take(2)

[Row(Name='Scott', age=50), Row(Name='Jeff', age=45)]

### Using first() to return the first row of the DataFrame

In [33]:
# first() returns the first row as a single Row object (not a list).
sparkDF1.first()

Row(Name='Scott', age=50)

### Getting the SparkContext for parallelize

In [34]:
from pyspark import SparkContext
# getOrCreate() hands back the SparkContext that is already running if there is one,
# otherwise it creates a new one; `sc` is what the RDD cells below call parallelize on.
sc = SparkContext.getOrCreate()

### Using parallelize and reduce

In [35]:
# Distribute the integers 1 through 6 as an RDD, then fold them together with addition.
reduced_rdd = sc.parallelize(list(range(1, 7)))
print(reduced_rdd.reduce(lambda left, right: left + right))

21


### Using saveAsTextFile

In [None]:
# Write the RDD's elements out as text under the path 'file'.
# NOTE(review): Spark/Hadoop normally refuses to write to a path that already
# exists, so re-running this cell will probably fail once 'file' is there.
# Confirm, and clean up the target (or pick a fresh path) before a Restart & Run All.
reduced_rdd.saveAsTextFile('file')

### Selecting multiple columns

In [43]:
# select() also accepts the column names as separate arguments instead of a list.
multi_cols = sparkDF1.select('Name', 'age')
multi_cols.show()

+------+---+
|  Name|age|
+------+---+
| Scott| 50|
|  Jeff| 45|
|Thomas| 54|
|   Ann| 34|
+------+---+

