### PySpark Introduction

In [None]:
!pip install pyspark

In [None]:
!pip install pandas numpy

In [1]:
import pyspark

In [2]:
import pandas as pd

# Load the CSV with pandas and preview its first rows.
df = pd.read_csv('Test1.csv')
df.head()

Unnamed: 0,Name,Age
0,Krish,31
1,Sudanshu,30
2,Sunny,29


In [3]:
from pyspark.sql import SparkSession

In [4]:
# Create a Spark session named 'Practise', or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName('Practise')
    .getOrCreate()
)

In [5]:
# Evaluate the session as the cell's last expression so the notebook displays it.
spark

In [6]:
# Read the CSV with no options. The header line is not recognised, so the
# columns are auto-named _c0, _c1 and "Name"/"Age" is loaded as a data row.
df_pyspark = spark.read.csv('Test1.csv')

In [8]:
# Print the DataFrame as a table; note the header line shows up as the first data row.
df_pyspark.show()

+--------+---+
|     _c0|_c1|
+--------+---+
|    Name|Age|
|   Krish| 31|
|Sudanshu| 30|
|   Sunny| 29|
+--------+---+



In [10]:
# Re-read with the header option so the first line becomes the column names.
# The result is only displayed, not assigned: df_pyspark still holds the
# headerless read from earlier.
spark.read.option('header','true').csv('Test1.csv').show()

+--------+---+
|    Name|Age|
+--------+---+
|   Krish| 31|
|Sudanshu| 30|
|   Sunny| 29|
+--------+---+



In [11]:
# Check the object type: this is a PySpark DataFrame, not a pandas one.
type(df_pyspark)

pyspark.sql.dataframe.DataFrame

In [13]:
# head(3) returns the first three rows as a list of Row objects. The header
# line is the first Row because df_pyspark was read without the header option.
df_pyspark.head(3)

[Row(_c0='Name', _c1='Age'),
 Row(_c0='Krish', _c1='31'),
 Row(_c0='Sudanshu', _c1='30')]

In [14]:
# Print column names and types. Both columns come back as string here, since
# the file was read without requesting schema inference.
df_pyspark.printSchema()

root
 |-- _c0: string (nullable = true)
 |-- _c1: string (nullable = true)



## DataFrame

In [1]:
from pyspark.sql import SparkSession

In [2]:
# Create a Spark session named 'Dataframe', or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName('Dataframe')
    .getOrCreate()
)

In [3]:
# Evaluate the session as the cell's last expression so the notebook displays it.
spark

In [11]:
# Read the dataset, treating the first line as the header and letting Spark
# infer each column's type instead of defaulting everything to string.
df_pyspark = spark.read.option('header','true').csv('Test1.csv',inferSchema=True)

In [12]:
# Verify the inferred schema: Age and Experience should now be integers.
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)



In [14]:
# Same read expressed purely with keyword arguments to csv(): use the first
# line as the header and infer column types, then display the result.
df_pyspark = spark.read.csv(
    'Test1.csv',
    header=True,
    inferSchema=True,
)
df_pyspark.show()

+--------+---+----------+
|    Name|Age|Experience|
+--------+---+----------+
|   Krish| 31|        10|
|Sudanshu| 30|         8|
|   Sunny| 29|         6|
+--------+---+----------+



In [15]:
# Confirm the keyword-argument read produced the same typed schema.
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)



In [16]:
# List the column names as a plain Python list of strings.
df_pyspark.columns

['Name', 'Age', 'Experience']

In [17]:
# First three rows as a list of Row objects; numeric fields are now ints, not strings.
df_pyspark.head(3)

[Row(Name='Krish', Age=31, Experience=10),
 Row(Name='Sudanshu', Age=30, Experience=8),
 Row(Name='Sunny', Age=29, Experience=6)]

In [18]:
# Show the DataFrame as a formatted table.
df_pyspark.show()

+--------+---+----------+
|    Name|Age|Experience|
+--------+---+----------+
|   Krish| 31|        10|
|Sudanshu| 30|         8|
|   Sunny| 29|         6|
+--------+---+----------+



In [20]:
# Select a subset of columns (here Name and Experience) and display them.
df_pyspark.select('Name', 'Experience').show()

+--------+----------+
|    Name|Experience|
+--------+----------+
|   Krish|        10|
|Sudanshu|         8|
|   Sunny|         6|
+--------+----------+



In [22]:
# dtypes gives a list of (column name, type name) pairs.
df_pyspark.dtypes

[('Name', 'string'), ('Age', 'int'), ('Experience', 'int')]

In [25]:
# Summary statistics (count, mean, stddev, min, max) for every column.
# mean/stddev are null for the string column Name.
df_pyspark.describe().show()

+-------+-----+----+----------+
|summary| Name| Age|Experience|
+-------+-----+----+----------+
|  count|    3|   3|         3|
|   mean| null|30.0|       8.0|
| stddev| null| 1.0|       2.0|
|    min|Krish|  29|         6|
|    max|Sunny|  31|        10|
+-------+-----+----+----------+



In [26]:
# Add a derived column to the DataFrame. withColumn is not in-place: it
# returns a new DataFrame, so the result is assigned back to df_pyspark.
df_pyspark = df_pyspark.withColumn('Experience After 2 Yr',df_pyspark['Experience']+2)

In [27]:
# Confirm the new 'Experience After 2 Yr' column is present.
df_pyspark.show()

+--------+---+----------+---------------------+
|    Name|Age|Experience|Experience After 2 Yr|
+--------+---+----------+---------------------+
|   Krish| 31|        10|                   12|
|Sudanshu| 30|         8|                   10|
|   Sunny| 29|         6|                    8|
+--------+---+----------+---------------------+



In [28]:
# Drop the derived column; the resulting DataFrame is assigned back to df_pyspark.
df_pyspark = df_pyspark.drop('Experience After 2 Yr')

In [29]:
# Confirm the column has been removed.
df_pyspark.show()

+--------+---+----------+
|    Name|Age|Experience|
+--------+---+----------+
|   Krish| 31|        10|
|Sudanshu| 30|         8|
|   Sunny| 29|         6|
+--------+---+----------+



In [30]:
# Rename the 'Name' column to 'New Name'; the resulting DataFrame is assigned
# back to df_pyspark.
df_pyspark = df_pyspark.withColumnRenamed('Name','New Name')

In [31]:
# Confirm the column now appears as 'New Name'.
df_pyspark.show()

+--------+---+----------+
|New Name|Age|Experience|
+--------+---+----------+
|   Krish| 31|        10|
|Sudanshu| 30|         8|
|   Sunny| 29|         6|
+--------+---+----------+

