'c:\\Users\\Shankii\\anaconda3\\envs\\pyspark\\lib\\site-packages\\pyspark'

In [3]:
import pyspark
from pyspark.sql import SparkSession

In [4]:
## Start (or reuse) the Spark session that the following cells read from
session_builder = SparkSession.builder.appName("DataFramePractise")
spark = session_builder.getOrCreate()

## Method 01: set the header via `option()` on the reader, then call `csv()`

In [5]:
## Load the CSV with the header option set on the reader and schema inference turned on

csv_reader = spark.read.option('header','True')
df_pyspark = csv_reader.csv('test1.csv', inferSchema=True)
df_pyspark.show()

## If you don't pass inferSchema=True, every column is read as a string

## Datatypes of columns

In [143]:
## Print each column's name, inferred type and nullability
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)



## Method 02: pass `header` and `inferSchema` as `csv()` keyword arguments

In [144]:
## Read the CSV in one call, giving header and inferSchema as keyword arguments
df_pyspark = spark.read.csv(
    'test1.csv',
    header=True,
    inferSchema=True,
)
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)



In [145]:
## Check which class the reader returned
type(df_pyspark)

pyspark.sql.dataframe.DataFrame

## Selecting Columns

In [146]:
## Print the DataFrame's rows as a text table
df_pyspark.show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



In [147]:
## Project a single column by name, then display it
name_only = df_pyspark.select("Name")
name_only.show()

+---------+
|     Name|
+---------+
|    Krish|
|Sudhanshu|
|    Sunny|
|     Paul|
|   Harsha|
|  Shubham|
+---------+



In [148]:
## Project several columns; the names are passed as separate arguments
df_pyspark.select('Name', 'Experience').show()

+---------+----------+
|     Name|Experience|
+---------+----------+
|    Krish|        10|
|Sudhanshu|         8|
|    Sunny|         4|
|     Paul|         3|
|   Harsha|         1|
|  Shubham|         2|
+---------+----------+



## Slicing in DataFrame

In [149]:
## df1: three rows taken from df_pyspark (no explicit ordering is applied here)
df1 = df_pyspark.limit(3)
## df2: the rows of df_pyspark that are not in df1.
## exceptAll (EXCEPT ALL) keeps repeated rows, whereas subtract (EXCEPT DISTINCT)
## would silently collapse duplicates in the remainder.
## The remainder may come back in a different row order than df_pyspark.
df2 = df_pyspark.exceptAll(df1)

In [150]:
## show() prints the table and returns None, so each call is its own statement;
## joining them with commas would make the cell echo a tuple of Nones.
df1.show()
df2.show()
df_pyspark.show()


+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
+---------+---+----------+------+

+-------+---+----------+------+
|   Name|age|Experience|Salary|
+-------+---+----------+------+
|Shubham| 23|         2| 18000|
|   Paul| 24|         3| 20000|
| Harsha| 21|         1| 15000|
+-------+---+----------+------+

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



(None, None, None)

In [151]:
## List the (column name, type) pair of every column
df_pyspark.dtypes

[('Name', 'string'), ('age', 'int'), ('Experience', 'int'), ('Salary', 'int')]

In [152]:
## Summary statistics per column; min/max of the string column (Name) follow lexicographic order
summary_stats = df_pyspark.describe()
summary_stats.show()


+-------+------+------------------+-----------------+------------------+
|summary|  Name|               age|       Experience|            Salary|
+-------+------+------------------+-----------------+------------------+
|  count|     6|                 6|                6|                 6|
|   mean|  null|26.333333333333332|4.666666666666667|21333.333333333332|
| stddev|  null| 4.179314138308661|3.559026084010437| 5354.126134736337|
|    min|Harsha|                21|                1|             15000|
|    max| Sunny|                31|               10|             30000|
+-------+------+------------------+-----------------+------------------+



## Adding columns in pyspark dataframe

In [153]:
### withColumn is not an in-place operation: the result is only displayed, df_pyspark is unchanged
experience_plus_two = df_pyspark['Experience'] + 2
df_pyspark.withColumn("Experience after 2 years", experience_plus_two).show()

+---------+---+----------+------+------------------------+
|     Name|age|Experience|Salary|Experience after 2 years|
+---------+---+----------+------+------------------------+
|    Krish| 31|        10| 30000|                      12|
|Sudhanshu| 30|         8| 25000|                      10|
|    Sunny| 29|         4| 20000|                       6|
|     Paul| 24|         3| 20000|                       5|
|   Harsha| 21|         1| 15000|                       3|
|  Shubham| 23|         2| 18000|                       4|
+---------+---+----------+------+------------------------+



In [154]:
from pyspark.sql.functions import lit

## lit(1) supplies the same constant value for every row of the column

constant_one = lit(1)
df_pyspark = df_pyspark.withColumn("Experience after 2 years", constant_one)
df_pyspark.show()

+---------+---+----------+------+------------------------+
|     Name|age|Experience|Salary|Experience after 2 years|
+---------+---+----------+------+------------------------+
|    Krish| 31|        10| 30000|                       1|
|Sudhanshu| 30|         8| 25000|                       1|
|    Sunny| 29|         4| 20000|                       1|
|     Paul| 24|         3| 20000|                       1|
|   Harsha| 21|         1| 15000|                       1|
|  Shubham| 23|         2| 18000|                       1|
+---------+---+----------+------+------------------------+



## Dropping the Columns

In [155]:
## Drop the column; the result is only displayed, not stored, so df_pyspark keeps the column

without_extra_column = df_pyspark.drop('Experience after 2 years')
without_extra_column.show()

+---------+---+----------+------+
|     Name|age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



## Renaming the column names

In [157]:
## Rename 'Name' to 'New Name' and keep the result under the same variable
df_pyspark = df_pyspark.withColumnRenamed(existing='Name', new='New Name')
df_pyspark.show()

+---------+---+----------+------+------------------------+
| New Name|age|Experience|Salary|Experience after 2 years|
+---------+---+----------+------+------------------------+
|    Krish| 31|        10| 30000|                       1|
|Sudhanshu| 30|         8| 25000|                       1|
|    Sunny| 29|         4| 20000|                       1|
|     Paul| 24|         3| 20000|                       1|
|   Harsha| 21|         1| 15000|                       1|
|  Shubham| 23|         2| 18000|                       1|
+---------+---+----------+------+------------------------+

