<a href="https://colab.research.google.com/github/saishshinde15/PySpark_Codes/blob/main/Reading_And_Writing_CSV%2CJSON_In_PySpark.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install pyspark



In [2]:
from pyspark.sql import SparkSession

In [3]:
# Build the notebook's Spark session named "Reading_CSV"
# (getOrCreate hands back an already-running session if there is one).
spark = (
    SparkSession.builder
    .appName("Reading_CSV")
    .getOrCreate()
)

## Reading a CSV using PySpark SQL


In [4]:
# Location of the sample CSV inside the Colab runtime.
csv_path = '/content/property-sales.csv'

# Load the CSV using its first row as the header. No schema is supplied or
# inferred, so every column comes back as a string, and header names are kept
# verbatim (including trailing spaces such as 'Address ' and 'City ').
df_csv = spark.read.option("header", True).csv(csv_path)

display(df_csv)

DataFrame[Address : string, Type: string, City : string, SalePrice ($): string, Agent: string]

In [6]:
# List the (column name, type) pairs of the freshly loaded frame.
df_csv.dtypes

[('Address ', 'string'),
 ('Type', 'string'),
 ('City ', 'string'),
 ('SalePrice ($)', 'string'),
 ('Agent', 'string')]

In [8]:
from pyspark.sql.types import IntegerType

# The CSV reader loaded every column as a string; cast the sale price to an
# integer so it is stored as a numeric column.
df_csv = df_csv.withColumn("SalePrice ($)", df_csv["SalePrice ($)"].cast(IntegerType()))

# A bare `df_csv.dtypes` in the middle of a cell is evaluated and thrown away
# (only the last expression is rendered), so it was removed; display() below
# already shows the updated column types.
display(df_csv)


DataFrame[Address : string, Type: string, City : string, SalePrice ($): int, Agent: string]

In [11]:
# Print the rows of df_csv as a text table.
df_csv.show()

+-------------------+--------------+-----------+-------------+-----------------+
|           Address |          Type|      City |SalePrice ($)|            Agent|
+-------------------+--------------+-----------+-------------+-----------------+
|   1 Rowley Street |Detached House|   New York|       745000|Penelope Pullman |
|13a lollipop avenue|     Apartment|Los Angeles|       345000|      Jack Smith |
|       34 the drive|         House|    Atlanta|       459000|     Sheila Sammi|
+-------------------+--------------+-----------+-------------+-----------------+



## Write JSON files (convert the CSV to JSON)

In [9]:
# Save the frame as JSON, replacing any output already present at this path.
# NOTE: Spark writes a directory of part files at this location rather than a
# single .json file.
df_csv.write.mode("overwrite").json("/content/Different_Files/json/property-sales.json")

# Read the JSON file

In [10]:
# Read the JSON output back into a Spark DataFrame; the schema is inferred
# from the data (the price comes back as bigint, columns in alphabetical order).
df_json = spark.read.format("json").load("/content/Different_Files/json/property-sales.json")
display(df_json)

DataFrame[Address : string, Agent: string, City : string, SalePrice ($): bigint, Type: string]

In [13]:
# Print the rows read back from JSON as a text table.
df_json.show()

+-------------------+-----------------+-----------+-------------+--------------+
|           Address |            Agent|      City |SalePrice ($)|          Type|
+-------------------+-----------------+-----------+-------------+--------------+
|   1 Rowley Street |Penelope Pullman |   New York|       745000|Detached House|
|13a lollipop avenue|      Jack Smith |Los Angeles|       345000|     Apartment|
|       34 the drive|     Sheila Sammi|    Atlanta|       459000|         House|
+-------------------+-----------------+-----------+-------------+--------------+



# Using the pandas API on Spark (pyspark.pandas)

In [17]:
import pandas as pd
import numpy as np
import pyspark.pandas as ps



In [21]:
# Read the same CSV through the pandas API on Spark. Reuse `csv_path` from the
# Spark SQL section instead of repeating the literal, so both readers always
# point at the same file.
df_csv_pandas = ps.read_csv(csv_path)

In [22]:
# Render the pandas-on-Spark DataFrame as the cell output.
df_csv_pandas

Unnamed: 0,Address,Type,City,SalePrice ($),Agent
0,1 Rowley Street,Detached House,New York,745000,Penelope Pullman
1,13a lollipop avenue,Apartment,Los Angeles,345000,Jack Smith
2,34 the drive,House,Atlanta,459000,Sheila Sammi


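### Above, pyspark.pandas was only used to read the file, not to write it (i.e. convert it to JSON or another format).
### To write it with the Spark `DataFrameWriter` API used earlier (`df.write.json(...)`), convert the pandas-on-Spark DataFrame back to a Spark DataFrame first. (pandas-on-Spark also provides its own writers such as `to_json` and `to_csv`.)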

# Convert the pandas-on-Spark DataFrame to a Spark DataFrame

In [23]:
# Turn the pandas-on-Spark frame into a regular Spark DataFrame so the
# Spark reader/writer API can be used on it.
df_csv_convert = df_csv_pandas.to_spark()




In [24]:
# Show the column names and types of the converted Spark DataFrame.
display(df_csv_convert)

DataFrame[Address : string, Type: string, City : string, SalePrice ($): int, Agent: string]

In [25]:
# Print the rows of the converted Spark DataFrame as a text table.
df_csv_convert.show()

+-------------------+--------------+-----------+-------------+-----------------+
|           Address |          Type|      City |SalePrice ($)|            Agent|
+-------------------+--------------+-----------+-------------+-----------------+
|   1 Rowley Street |Detached House|   New York|       745000|Penelope Pullman |
|13a lollipop avenue|     Apartment|Los Angeles|       345000|      Jack Smith |
|       34 the drive|         House|    Atlanta|       459000|     Sheila Sammi|
+-------------------+--------------+-----------+-------------+-----------------+



In [27]:
# Save the converted frame as JSON, replacing any output already present at
# this path. NOTE: Spark writes a directory of part files at this location
# rather than a single .json file.
df_csv_convert.write.mode("overwrite").json("/content/Different_Files/json/property-sales1.json")

In [28]:
# Read the second JSON output back into a Spark DataFrame; the schema is
# inferred from the data. (The variable name keeps its original spelling
# because the following cell refers to it.)
df_json_after_converstion = spark.read.format("json").load("/content/Different_Files/json/property-sales1.json")
display(df_json_after_converstion)

DataFrame[Address : string, Agent: string, City : string, SalePrice ($): bigint, Type: string]

In [29]:
# Print the rows read back from the second JSON output as a text table.
df_json_after_converstion.show()

+-------------------+-----------------+-----------+-------------+--------------+
|           Address |            Agent|      City |SalePrice ($)|          Type|
+-------------------+-----------------+-----------+-------------+--------------+
|   1 Rowley Street |Penelope Pullman |   New York|       745000|Detached House|
|13a lollipop avenue|      Jack Smith |Los Angeles|       345000|     Apartment|
|       34 the drive|     Sheila Sammi|    Atlanta|       459000|         House|
+-------------------+-----------------+-----------+-------------+--------------+

