In [1]:
import pyspark

In [2]:
from pyspark.sql import SparkSession

In [3]:
from pyspark.sql.functions import *

In [6]:
# Build the notebook's SparkSession; getOrCreate() hands back an existing
# session when one is already running instead of starting a second one.
spark = (
    SparkSession.builder
    .appName('Dataframeaa')
    .getOrCreate()
)

In [28]:
# Echo the session object so the notebook shows it as the cell's result.
spark

In [6]:
# Load test1.csv, treating the first line as column names and letting Spark
# infer each column's type from the data.
df_spark = (
    spark.read
    .option('header', 'true')
    .csv('test1.csv', inferSchema=True)
)

In [7]:
# Print the column names, types and nullability of df_spark as a tree.
df_spark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)



In [8]:
# Read test1.csv with the header and schema-inference settings passed as
# keyword arguments, then print the rows.
df_spark = spark.read.csv(
    'test1.csv',
    header=True,
    inferSchema=True,
)
df_spark.show()

+---------+---+----------+------+
|     Name|Age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sundhansh| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 21|         2| 18000|
+---------+---+----------+------+



In [9]:
# Print the schema of df_spark: column names, types and nullability.
df_spark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)



In [10]:
# Number of rows in df_spark.
df_spark.count()

6

In [11]:
# Show the Python class of df_spark.
type(df_spark)

pyspark.sql.dataframe.DataFrame

In [12]:
# List the column names of df_spark.
df_spark.columns

['Name', 'Age', 'Experience', 'Salary']

In [13]:
# First three rows, returned as a list of Row objects.
df_spark.head(3)

[Row(Name='Krish', Age=31, Experience=10, Salary=30000),
 Row(Name='Sundhansh', Age=30, Experience=8, Salary=25000),
 Row(Name='Sunny', Age=29, Experience=4, Salary=20000)]

In [14]:
# Last row, returned as a one-element list of Row objects.
df_spark.tail(1)

[Row(Name='Shubham', Age=21, Experience=2, Salary=18000)]

In [15]:
# select() returns a new single-column DataFrame; the cell result is its
# repr (column name and type), not the rows themselves.
df_spark.select('Name')

DataFrame[Name: string]

In [16]:
# Keep the Name-only projection in its own variable, then print its rows.
df_name = df_spark.select('Name')
df_name.show()

+---------+
|     Name|
+---------+
|    Krish|
|Sundhansh|
|    Sunny|
|     Paul|
|   Harsha|
|  Shubham|
+---------+



In [17]:
# Select the Name column and print its rows in one chained expression.
df_spark.select('Name').show()

+---------+
|     Name|
+---------+
|    Krish|
|Sundhansh|
|    Sunny|
|     Paul|
|   Harsha|
|  Shubham|
+---------+



In [18]:
# Print only the Name and Experience columns.
df_spark.select('Name', 'Experience').show()

+---------+----------+
|     Name|Experience|
+---------+----------+
|    Krish|        10|
|Sundhansh|         8|
|    Sunny|         4|
|     Paul|         3|
|   Harsha|         1|
|  Shubham|         2|
+---------+----------+



In [19]:
# Label each employee with a seniority band derived from Experience:
#   Experience >= 5      -> "Senior"
#   2 < Experience < 5   -> "Associate"
#   anything else        -> "Junior"
# The bands are written so they do not overlap, so the result does not rely on
# the order in which the when() branches are listed.
df_spark.select(
    "Name",
    when(df_spark.Experience >= 5, "Senior")
    .when((df_spark.Experience > 2) & (df_spark.Experience < 5), "Associate")
    .otherwise("Junior")
    .alias("exp_name"),  # was " exp_name": the leading space became part of the column name
).show()

+---------+---------+
|     Name| exp_name|
+---------+---------+
|    Krish|   Senior|
|Sundhansh|   Senior|
|    Sunny|Associate|
|     Paul|Associate|
|   Harsha|   Junior|
|  Shubham|   Junior|
+---------+---------+



In [20]:
# Column names paired with their type names, as a list of (name, type) tuples.
df_spark.dtypes

[('Name', 'string'), ('Age', 'int'), ('Experience', 'int'), ('Salary', 'int')]

In [21]:
# describe() returns a DataFrame of summary statistics; every column in it,
# including the numeric ones, is typed as string.
df_spark.describe()

DataFrame[summary: string, Name: string, Age: string, Experience: string, Salary: string]

In [22]:
# Print the summary statistics: count, mean, stddev, min and max per column.
df_spark.describe().show()

+-------+------+-----------------+-----------------+------------------+
|summary|  Name|              Age|       Experience|            Salary|
+-------+------+-----------------+-----------------+------------------+
|  count|     6|                6|                6|                 6|
|   mean|  null|             26.0|4.666666666666667|21333.333333333332|
| stddev|  null|4.560701700396552|3.559026084010437| 5354.126134736337|
|    min|Harsha|               21|                1|             15000|
|    max| Sunny|               31|               10|             30000|
+-------+------+-----------------+-----------------+------------------+



In [23]:
# Append a derived column holding each person's Experience plus two,
# then print the widened DataFrame.
df_spark = df_spark.withColumn(
    'Experience After 2 years',
    df_spark['Experience'] + 2,
)
df_spark.show()

+---------+---+----------+------+------------------------+
|     Name|Age|Experience|Salary|Experience After 2 years|
+---------+---+----------+------+------------------------+
|    Krish| 31|        10| 30000|                      12|
|Sundhansh| 30|         8| 25000|                      10|
|    Sunny| 29|         4| 20000|                       6|
|     Paul| 24|         3| 20000|                       5|
|   Harsha| 21|         1| 15000|                       3|
|  Shubham| 21|         2| 18000|                       4|
+---------+---+----------+------+------------------------+



In [24]:
# Remove the derived column again, leaving Name, Age, Experience and Salary,
# then print the result.
df_spark = df_spark.drop('Experience After 2 years')
df_spark.show()

+---------+---+----------+------+
|     Name|Age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sundhansh| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 21|         2| 18000|
+---------+---+----------+------+



In [25]:
# Rename the Name column to 'New Name' and print the result.
# From here on df_spark no longer has a column called 'Name'.
df_spark = df_spark.withColumnRenamed('Name','New Name')
df_spark.show()

+---------+---+----------+------+
| New Name|Age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sundhansh| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 21|         2| 18000|
+---------+---+----------+------+



In [26]:
# Group by (New Name, Salary) and take the max of every numeric column.
# NOTE(review): in the data shown, each (New Name, Salary) pair occurs once, so
# every group is a single row and max(Salary) just repeats the Salary grouping
# key -- confirm whether a coarser grouping key was intended.
df_spark.groupBy("New Name", "Salary").max().show()

+---------+------+--------+---------------+-----------+
| New Name|Salary|max(Age)|max(Experience)|max(Salary)|
+---------+------+--------+---------------+-----------+
|Sundhansh| 25000|      30|              8|      25000|
|     Paul| 20000|      24|              3|      20000|
|    Sunny| 20000|      29|              4|      20000|
|   Harsha| 15000|      21|              1|      15000|
|    Krish| 30000|      31|             10|      30000|
|  Shubham| 18000|      21|              2|      18000|
+---------+------+--------+---------------+-----------+

