In [2]:
import pyspark

In [3]:
from pyspark.sql import SparkSession

In [4]:
from pyspark.sql.functions import *

In [5]:
spark = SparkSession.builder.appName("Yudi Dataframe").getOrCreate()

In [6]:
spark

In [8]:
filepath = r'D:\python\meTryPython\learnspark\first\resources\test1.csv'

In [9]:
df_spark = spark.read.option('header','true').csv(filepath,inferSchema=True)
df_spark.show()

+---------+---+----------+------+
|     Name|Age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sundhansh| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 21|         2| 18000|
+---------+---+----------+------+



In [10]:
df_spark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)



In [11]:
df_spark=spark.read.csv(filepath,header=True,inferSchema=True)
df_spark.show()

+---------+---+----------+------+
|     Name|Age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sundhansh| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 21|         2| 18000|
+---------+---+----------+------+



In [12]:
df_spark.count()

6

In [13]:
type(df_spark)

pyspark.sql.dataframe.DataFrame

In [15]:
df_spark.columns

['Name', 'Age', 'Experience', 'Salary']

In [17]:
df_spark.select('Age').show()

+---+
|Age|
+---+
| 31|
| 30|
| 29|
| 24|
| 21|
| 21|
+---+



In [18]:
df_spark.select(['Name','Experience']).show()

+---------+----------+
|     Name|Experience|
+---------+----------+
|    Krish|        10|
|Sundhansh|         8|
|    Sunny|         4|
|     Paul|         3|
|   Harsha|         1|
|  Shubham|         2|
+---------+----------+



In [20]:
df_spark.select("Name",when(df_spark['Experience'] >=5,"Senior")
                .when((df_spark['Experience'] <=5) & (df_spark['Experience']  >2),"Associate")\
                .otherwise("Junior")
                .alias(" exp_name")).show()

+---------+---------+
|     Name| exp_name|
+---------+---------+
|    Krish|   Senior|
|Sundhansh|   Senior|
|    Sunny|Associate|
|     Paul|Associate|
|   Harsha|   Junior|
|  Shubham|   Junior|
+---------+---------+



In [21]:
df_spark.select("Name", 
                when(df_spark.Experience >= 5, "Senior")
                .when((df_spark.Experience <= 5) & (df_spark.Experience > 2), "Associate")
                .otherwise("Junior")
                .alias(" exp_name"),
                ).show()

+---------+---------+
|     Name| exp_name|
+---------+---------+
|    Krish|   Senior|
|Sundhansh|   Senior|
|    Sunny|Associate|
|     Paul|Associate|
|   Harsha|   Junior|
|  Shubham|   Junior|
+---------+---------+



In [23]:
df_spark.describe()

DataFrame[summary: string, Name: string, Age: string, Experience: string, Salary: string]

In [24]:
df_spark.dtypes

[('Name', 'string'), ('Age', 'int'), ('Experience', 'int'), ('Salary', 'int')]

In [28]:
df_spark.show()

+---------+---+----------+------+
|     Name|Age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sundhansh| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 21|         2| 18000|
+---------+---+----------+------+



In [25]:
import pyspark.sql.functions as F

In [35]:
agg_df_spark = df_spark.agg(
    F.mean("Salary").alias("mean_column1"),
    F.stddev("Salary").alias("stddev_column1"),
    F.min("Salary").alias("min_column1"),
    F.max("Salary").alias("max_column1"))

agg_df_spark.show()

+------------------+-----------------+-----------+-----------+
|      mean_column1|   stddev_column1|min_column1|max_column1|
+------------------+-----------------+-----------+-----------+
|21333.333333333332|5354.126134736337|      15000|      30000|
+------------------+-----------------+-----------+-----------+



In [39]:
df_spark_2year = df_spark.withColumn('Experience After 2 years',
                               df_spark['Experience']+2)
df_spark_2year.show()

+---------+---+----------+------+------------------------+
|     Name|Age|Experience|Salary|Experience After 2 years|
+---------+---+----------+------+------------------------+
|    Krish| 31|        10| 30000|                      12|
|Sundhansh| 30|         8| 25000|                      10|
|    Sunny| 29|         4| 20000|                       6|
|     Paul| 24|         3| 20000|                       5|
|   Harsha| 21|         1| 15000|                       3|
|  Shubham| 21|         2| 18000|                       4|
+---------+---+----------+------+------------------------+



In [41]:
df_spark_newName = df_spark.withColumnRenamed('Name','New Name')
df_spark_newName.show()
df_spark.show()

+---------+---+----------+------+
| New Name|Age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sundhansh| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 21|         2| 18000|
+---------+---+----------+------+

+---------+---+----------+------+
|     Name|Age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sundhansh| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 21|         2| 18000|
+---------+---+----------+------+



In [103]:
from pyspark.sql import Row
import findspark
findspark.init()

# new_row = Row(Name='Sundhansh', Age=45, Experience=20, Salary= 55000)
# new_row['Sundhansh',45,20,55000]
# new_df = spark.createDataFrame([new_row], schema=df_spark.schema)
df_spark2 = df_spark.union(spark.createDataFrame([new_row]).toDF('Name', 'Age', 'Experience', 'Salary'))

In [105]:
type(df_spark2)

pyspark.sql.dataframe.DataFrame

In [107]:
# error on calling method show() to appended dataframe from 2 cell above
# df_spark2.show()

In [51]:
df_spark_newName.groupBy("New Name", 
                         "Salary",
                        ).max().show()

+---------+------+--------+---------------+-----------+
| New Name|Salary|max(Age)|max(Experience)|max(Salary)|
+---------+------+--------+---------------+-----------+
|Sundhansh| 25000|      30|              8|      25000|
|     Paul| 20000|      24|              3|      20000|
|    Sunny| 20000|      29|              4|      20000|
|   Harsha| 15000|      21|              1|      15000|
|    Krish| 30000|      31|             10|      30000|
|  Shubham| 18000|      21|              2|      18000|
+---------+------+--------+---------------+-----------+

