<a href="https://colab.research.google.com/github/saishshinde15/PySpark_Codes/blob/main/Basic_Opertions_DataFrame_Pandas.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
pip install pyspark



In [2]:
from pyspark.sql import SparkSession

In [3]:
spark = SparkSession.builder.appName("Basic_DataFrame_Opertions").getOrCreate()


In [6]:
df=spark.read.csv('/content/property-sales.csv',header=True)

In [7]:
# show a printed representation of the DataFrame
df.show()

+-------------------+--------------+-----------+-------------+-----------------+
|           Address |          Type|      City |SalePrice ($)|            Agent|
+-------------------+--------------+-----------+-------------+-----------------+
|   1 Rowley Street |Detached House|   New York|       745000|Penelope Pullman |
|13a lollipop avenue|     Apartment|Los Angeles|       345000|      Jack Smith |
|       34 the drive|         House|    Atlanta|       459000|     Sheila Sammi|
+-------------------+--------------+-----------+-------------+-----------------+



In [8]:
# interactive view of the DataFrame
display(df)

DataFrame[Address : string, Type: string, City : string, SalePrice ($): string, Agent: string]

In [9]:
# Show me just the first two rows of my DataFrame
display(df.head(2))

[Row(Address ='1 Rowley Street ', Type='Detached House', City ='New York', SalePrice ($)='745000', Agent='Penelope Pullman '),
 Row(Address ='13a lollipop avenue', Type='Apartment', City ='Los Angeles', SalePrice ($)='345000', Agent='Jack Smith ')]

## Schemas

In [10]:
df.printSchema()

root
 |-- Address : string (nullable = true)
 |-- Type: string (nullable = true)
 |-- City : string (nullable = true)
 |-- SalePrice ($): string (nullable = true)
 |-- Agent: string (nullable = true)



In [11]:
# get just the data types (not full schema)
df.dtypes

[('Address ', 'string'),
 ('Type', 'string'),
 ('City ', 'string'),
 ('SalePrice ($)', 'string'),
 ('Agent', 'string')]

In [12]:
# the actual schema can be accessed using df.schema
df.schema

StructType([StructField('Address ', StringType(), True), StructField('Type', StringType(), True), StructField('City ', StringType(), True), StructField('SalePrice ($)', StringType(), True), StructField('Agent', StringType(), True)])

In [14]:
# this is sometimes useful as we might have to do something like this
source_schema = df.schema

# this saves us having to explicitly write out our the schema for a new df, if we have one that already exists.
new_df_with_existing_schema = spark.read.csv( schema=source_schema,path='/content/property-sales.csv',header=True)

In [15]:
new_df_with_existing_schema.show()

+-------------------+--------------+-----------+-------------+-----------------+
|           Address |          Type|      City |SalePrice ($)|            Agent|
+-------------------+--------------+-----------+-------------+-----------------+
|   1 Rowley Street |Detached House|   New York|       745000|Penelope Pullman |
|13a lollipop avenue|     Apartment|Los Angeles|       345000|      Jack Smith |
|       34 the drive|         House|    Atlanta|       459000|     Sheila Sammi|
+-------------------+--------------+-----------+-------------+-----------------+



## Columns

In [16]:
df.columns

['Address ', 'Type', 'City ', 'SalePrice ($)', 'Agent']

In [19]:
#selecting just a single column
df.select('Type').show()

+--------------+
|          Type|
+--------------+
|Detached House|
|     Apartment|
|         House|
+--------------+



In [20]:
#renaming existing columns
df = df.withColumnRenamed('Address ', 'Address')
df.select('Address').show()

+-------------------+
|            Address|
+-------------------+
|   1 Rowley Street |
|13a lollipop avenue|
|       34 the drive|
+-------------------+



In [21]:
# selecting a few columns
df.select(['Address','Type']).show()

+-------------------+--------------+
|            Address|          Type|
+-------------------+--------------+
|   1 Rowley Street |Detached House|
|13a lollipop avenue|     Apartment|
|       34 the drive|         House|
+-------------------+--------------+



In [22]:
# To add new column
df = df.withColumn('2x_SalePrice', df['SalePrice ($)'] * 2)
df.show()


+-------------------+--------------+-----------+-------------+-----------------+------------+
|            Address|          Type|      City |SalePrice ($)|            Agent|2x_SalePrice|
+-------------------+--------------+-----------+-------------+-----------------+------------+
|   1 Rowley Street |Detached House|   New York|       745000|Penelope Pullman |   1490000.0|
|13a lollipop avenue|     Apartment|Los Angeles|       345000|      Jack Smith |    690000.0|
|       34 the drive|         House|    Atlanta|       459000|     Sheila Sammi|    918000.0|
+-------------------+--------------+-----------+-------------+-----------------+------------+



In [23]:
# To Remove Column
df = df.drop('2x_SalePrice')
df.show()

+-------------------+--------------+-----------+-------------+-----------------+
|            Address|          Type|      City |SalePrice ($)|            Agent|
+-------------------+--------------+-----------+-------------+-----------------+
|   1 Rowley Street |Detached House|   New York|       745000|Penelope Pullman |
|13a lollipop avenue|     Apartment|Los Angeles|       345000|      Jack Smith |
|       34 the drive|         House|    Atlanta|       459000|     Sheila Sammi|
+-------------------+--------------+-----------+-------------+-----------------+



In [24]:
# Renaming Multiple Columns
df_new = df.selectExpr("Address as ADD","'SalePrice ($)' as SalesPrice_USD","'City ' as MyCity")
df_new.show()

+-------------------+--------------+------+
|                ADD|SalesPrice_USD|MyCity|
+-------------------+--------------+------+
|   1 Rowley Street | SalePrice ($)| City |
|13a lollipop avenue| SalePrice ($)| City |
|       34 the drive| SalePrice ($)| City |
+-------------------+--------------+------+



## Using Pandas

In [25]:
import pandas as pd
import numpy as np
import pyspark.pandas as ps



In [26]:
df2=ps.read_csv('/content/property-sales.csv')



In [27]:
df2

Unnamed: 0,Address,Type,City,SalePrice ($),Agent
0,1 Rowley Street,Detached House,New York,745000,Penelope Pullman
1,13a lollipop avenue,Apartment,Los Angeles,345000,Jack Smith
2,34 the drive,House,Atlanta,459000,Sheila Sammi


In [29]:
#Selecting a column
df_column=df2['SalePrice ($)']
df_column

0    745000
1    345000
2    459000
Name: SalePrice ($), dtype: int32

In [30]:
# Renaming Columns
df_rename=df2.rename(columns={'Address ':'Address1'})
df_rename

Unnamed: 0,Address1,Type,City,SalePrice ($),Agent
0,1 Rowley Street,Detached House,New York,745000,Penelope Pullman
1,13a lollipop avenue,Apartment,Los Angeles,345000,Jack Smith
2,34 the drive,House,Atlanta,459000,Sheila Sammi


In [31]:
# Droping Columns
df_drop=df2.drop(columns=['Address '])
df_drop

Unnamed: 0,Type,City,SalePrice ($),Agent
0,Detached House,New York,745000,Penelope Pullman
1,Apartment,Los Angeles,345000,Jack Smith
2,House,Atlanta,459000,Sheila Sammi
