In [116]:
import pyspark
from pyspark.sql import SparkSession
import traceback

# Creating a Spark session

In [117]:
spark = SparkSession.builder.appName('practice').getOrCreate()
spark

# Reading a csv file

In [119]:
df = spark.read.csv('test1.csv', header= True, inferSchema=True)
# inferSchemastr or bool, optional. infers the input schema automatically from data.

In [120]:
df.show()

+---+-----+---+----------+
|_c0|names|age|experience|
+---+-----+---+----------+
|  0|  sai| 12|         1|
|  1|pawan| 24|         2|
|  2|    d| 36|         3|
+---+-----+---+----------+



24/02/25 20:41:46 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , names, age, experience
 Schema: _c0, names, age, experience
Expected: _c0 but found: 
CSV file: file:///Users/dsaipawan/Documents/python-learing/big-data/test1.csv


In [121]:
df.printSchema()

root
 |-- _c0: integer (nullable = true)
 |-- names: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- experience: integer (nullable = true)



In [122]:
df_name_exp = df.select(['names', 'experience'])
df_name_exp.show()

print(type(df_name_exp))

+-----+----------+
|names|experience|
+-----+----------+
|  sai|         1|
|pawan|         2|
|    d|         3|
+-----+----------+

<class 'pyspark.sql.dataframe.DataFrame'>


In [123]:
df.dtypes

[('_c0', 'int'), ('names', 'string'), ('age', 'int'), ('experience', 'int')]

In [124]:
df.describe().show()

24/02/25 20:41:46 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , names, age, experience
 Schema: _c0, names, age, experience
Expected: _c0 but found: 
CSV file: file:///Users/dsaipawan/Documents/python-learing/big-data/test1.csv


+-------+---+-----+----+----------+
|summary|_c0|names| age|experience|
+-------+---+-----+----+----------+
|  count|  3|    3|   3|         3|
|   mean|1.0| NULL|24.0|       2.0|
| stddev|1.0| NULL|12.0|       1.0|
|    min|  0|    d|  12|         1|
|    max|  2|  sai|  36|         3|
+-------+---+-----+----+----------+



# Columns in DataFrame

In [125]:
# Picking up a singel column will change the datatype to column not dataframe.
names = df_name_exp['names']
print(names, type(names))

try:
    # Show only works on the dataframe not on columns
    names.show()
except TypeError as e:
    print(traceback.print_exception(e))



Column<'names'> <class 'pyspark.sql.column.Column'>
None


Traceback (most recent call last):
  File "/var/folders/71/c7pdh8bj3p1chk4tv474y1g80000gn/T/ipykernel_16639/4111862367.py", line 9, in <module>
    names.show()
TypeError: 'Column' object is not callable


### Adding the columns

In [126]:
df_new_clm = df.withColumn('Experience after 2 years', df['experience'] + 2)
df_new_clm.show()


+---+-----+---+----------+------------------------+
|_c0|names|age|experience|Experience after 2 years|
+---+-----+---+----------+------------------------+
|  0|  sai| 12|         1|                       3|
|  1|pawan| 24|         2|                       4|
|  2|    d| 36|         3|                       5|
+---+-----+---+----------+------------------------+



24/02/25 20:41:46 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , names, age, experience
 Schema: _c0, names, age, experience
Expected: _c0 but found: 
CSV file: file:///Users/dsaipawan/Documents/python-learing/big-data/test1.csv


### Droping the Column

In [127]:
df_new_clm.drop('Experience after 2 years').show()

+---+-----+---+----------+
|_c0|names|age|experience|
+---+-----+---+----------+
|  0|  sai| 12|         1|
|  1|pawan| 24|         2|
|  2|    d| 36|         3|
+---+-----+---+----------+



24/02/25 20:41:47 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , names, age, experience
 Schema: _c0, names, age, experience
Expected: _c0 but found: 
CSV file: file:///Users/dsaipawan/Documents/python-learing/big-data/test1.csv


### Renaming the Column

In [128]:
df.withColumnRenamed('names', 'name').show()

+---+-----+---+----------+
|_c0| name|age|experience|
+---+-----+---+----------+
|  0|  sai| 12|         1|
|  1|pawan| 24|         2|
|  2|    d| 36|         3|
+---+-----+---+----------+



24/02/25 20:41:47 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , names, age, experience
 Schema: _c0, names, age, experience
Expected: _c0 but found: 
CSV file: file:///Users/dsaipawan/Documents/python-learing/big-data/test1.csv


### Adding new rows using Union

In [129]:
from pyspark.sql import Row

new_rows = spark.createDataFrame(
    [
        (3, 'saipawan', 34, 4),
        (4, 'pawand', None, 5),
        (5, 'said', None, None),
        (None, None, None, None)
    ],
     df.columns
    )
new_rows.show()

df = df.union(new_rows)

df.show()


+----+--------+----+----------+
| _c0|   names| age|experience|
+----+--------+----+----------+
|   3|saipawan|  34|         4|
|   4|  pawand|NULL|         5|
|   5|    said|NULL|      NULL|
|NULL|    NULL|NULL|      NULL|
+----+--------+----+----------+

+----+--------+----+----------+
| _c0|   names| age|experience|
+----+--------+----+----------+
|   0|     sai|  12|         1|
|   1|   pawan|  24|         2|
|   2|       d|  36|         3|
|   3|saipawan|  34|         4|
|   4|  pawand|NULL|         5|
|   5|    said|NULL|      NULL|
|NULL|    NULL|NULL|      NULL|
+----+--------+----+----------+



24/02/25 20:41:47 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , names, age, experience
 Schema: _c0, names, age, experience
Expected: _c0 but found: 
CSV file: file:///Users/dsaipawan/Documents/python-learing/big-data/test1.csv


# Manipulating NULL values

In [130]:
df.show()

24/02/25 20:41:47 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , names, age, experience
 Schema: _c0, names, age, experience
Expected: _c0 but found: 
CSV file: file:///Users/dsaipawan/Documents/python-learing/big-data/test1.csv


+----+--------+----+----------+
| _c0|   names| age|experience|
+----+--------+----+----------+
|   0|     sai|  12|         1|
|   1|   pawan|  24|         2|
|   2|       d|  36|         3|
|   3|saipawan|  34|         4|
|   4|  pawand|NULL|         5|
|   5|    said|NULL|      NULL|
|NULL|    NULL|NULL|      NULL|
+----+--------+----+----------+



### Droping Null Values

In [131]:
# This will drop all the rows with Null values in it.
df.na.drop().show()

24/02/25 20:41:47 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , names, age, experience
 Schema: _c0, names, age, experience
Expected: _c0 but found: 
CSV file: file:///Users/dsaipawan/Documents/python-learing/big-data/test1.csv


+---+--------+---+----------+
|_c0|   names|age|experience|
+---+--------+---+----------+
|  0|     sai| 12|         1|
|  1|   pawan| 24|         2|
|  2|       d| 36|         3|
|  3|saipawan| 34|         4|
+---+--------+---+----------+



In [132]:
df.na.drop(how='all').show()

df.na.drop(how='any').show()  # this is the default values

24/02/25 20:41:47 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , names, age, experience
 Schema: _c0, names, age, experience
Expected: _c0 but found: 
CSV file: file:///Users/dsaipawan/Documents/python-learing/big-data/test1.csv


+---+--------+----+----------+
|_c0|   names| age|experience|
+---+--------+----+----------+
|  0|     sai|  12|         1|
|  1|   pawan|  24|         2|
|  2|       d|  36|         3|
|  3|saipawan|  34|         4|
|  4|  pawand|NULL|         5|
|  5|    said|NULL|      NULL|
+---+--------+----+----------+



24/02/25 20:41:48 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , names, age, experience
 Schema: _c0, names, age, experience
Expected: _c0 but found: 
CSV file: file:///Users/dsaipawan/Documents/python-learing/big-data/test1.csv


+---+--------+---+----------+
|_c0|   names|age|experience|
+---+--------+---+----------+
|  0|     sai| 12|         1|
|  1|   pawan| 24|         2|
|  2|       d| 36|         3|
|  3|saipawan| 34|         4|
+---+--------+---+----------+



In [133]:
df.na.drop(how='any',thresh=2).show()
# This will delete all the rows with dosent have more than <thresh> non null values.

+---+--------+----+----------+
|_c0|   names| age|experience|
+---+--------+----+----------+
|  0|     sai|  12|         1|
|  1|   pawan|  24|         2|
|  2|       d|  36|         3|
|  3|saipawan|  34|         4|
|  4|  pawand|NULL|         5|
|  5|    said|NULL|      NULL|
+---+--------+----+----------+



24/02/25 20:41:48 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , names, age, experience
 Schema: _c0, names, age, experience
Expected: _c0 but found: 
CSV file: file:///Users/dsaipawan/Documents/python-learing/big-data/test1.csv


In [134]:
df.na.drop(how='any', subset=['experience']).show()
# Specify which columns you want to focus on.

24/02/25 20:41:48 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , names, age, experience
 Schema: _c0, names, age, experience
Expected: _c0 but found: 
CSV file: file:///Users/dsaipawan/Documents/python-learing/big-data/test1.csv


+---+--------+----+----------+
|_c0|   names| age|experience|
+---+--------+----+----------+
|  0|     sai|  12|         1|
|  1|   pawan|  24|         2|
|  2|       d|  36|         3|
|  3|saipawan|  34|         4|
|  4|  pawand|NULL|         5|
+---+--------+----+----------+



### Filling the missing values

In [135]:
df.na.fill('missing values', subset=['age', 'experience']).show()

24/02/25 20:41:48 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , names, age, experience
 Schema: _c0, names, age, experience
Expected: _c0 but found: 
CSV file: file:///Users/dsaipawan/Documents/python-learing/big-data/test1.csv


+----+--------+----+----------+
| _c0|   names| age|experience|
+----+--------+----+----------+
|   0|     sai|  12|         1|
|   1|   pawan|  24|         2|
|   2|       d|  36|         3|
|   3|saipawan|  34|         4|
|   4|  pawand|NULL|         5|
|   5|    said|NULL|      NULL|
|NULL|    NULL|NULL|      NULL|
+----+--------+----+----------+



In [137]:
from pyspark.ml.feature import Imputer

cols = ['age', 'experience']

imputer = Imputer(
    inputCols = cols,
    outputCols = [ f'{col}_impute' for col in cols]
).setStrategy('mean')

imputer.fit(df).transform(df).show()

+----+--------+----+----------+----------+-----------------+
| _c0|   names| age|experience|age_impute|experience_impute|
+----+--------+----+----------+----------+-----------------+
|   0|     sai|  12|         1|        12|                1|
|   1|   pawan|  24|         2|        24|                2|
|   2|       d|  36|         3|        36|                3|
|   3|saipawan|  34|         4|        34|                4|
|   4|  pawand|NULL|         5|        26|                5|
|   5|    said|NULL|      NULL|        26|                3|
|NULL|    NULL|NULL|      NULL|        26|                3|
+----+--------+----+----------+----------+-----------------+



24/02/25 20:41:59 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , names, age, experience
 Schema: _c0, names, age, experience
Expected: _c0 but found: 
CSV file: file:///Users/dsaipawan/Documents/python-learing/big-data/test1.csv


# Filter on DataFrame

In [143]:
df.show()


df.filter(
    (df['age'] > 15) & (df['age'] < 35)
).show()
# other operations are &,|, ==, ~

df.filter(
    ~(df['age'] < 15)
).show()

24/02/25 20:55:37 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , names, age, experience
 Schema: _c0, names, age, experience
Expected: _c0 but found: 
CSV file: file:///Users/dsaipawan/Documents/python-learing/big-data/test1.csv
24/02/25 20:55:37 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , names, age, experience
 Schema: _c0, names, age, experience
Expected: _c0 but found: 
CSV file: file:///Users/dsaipawan/Documents/python-learing/big-data/test1.csv


+----+--------+----+----------+
| _c0|   names| age|experience|
+----+--------+----+----------+
|   0|     sai|  12|         1|
|   1|   pawan|  24|         2|
|   2|       d|  36|         3|
|   3|saipawan|  34|         4|
|   4|  pawand|NULL|         5|
|   5|    said|NULL|      NULL|
|NULL|    NULL|NULL|      NULL|
+----+--------+----+----------+

+---+--------+---+----------+
|_c0|   names|age|experience|
+---+--------+---+----------+
|  1|   pawan| 24|         2|
|  3|saipawan| 34|         4|
+---+--------+---+----------+

+---+--------+---+----------+
|_c0|   names|age|experience|
+---+--------+---+----------+
|  1|   pawan| 24|         2|
|  2|       d| 36|         3|
|  3|saipawan| 34|         4|
+---+--------+---+----------+



24/02/25 20:55:37 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , names, age, experience
 Schema: _c0, names, age, experience
Expected: _c0 but found: 
CSV file: file:///Users/dsaipawan/Documents/python-learing/big-data/test1.csv


# GroupBy and Aggrigate Functions

In [149]:
data = [['Krish', 'Data Science', 10000],
['Krish', 'IOT', 5000],
['Mahesh', 'Big Data', 4000],
['Krish', 'Big Data', 4000],
['Mahesh', 'Data Science', 3000],
['Sudhanshu', 'Data Science', 20000],
['Sudhanshu', 'IOT', 10000],
['Sudhanshu', 'Big Data', 5000],
['Sunny', 'Data Science', 10000],
['Sunny', 'Big Data', 2000]]

df = spark.createDataFrame(data, ['Name', 'Department', 'Salary'],)
df.show()


+---------+------------+------+
|     Name|  Department|Salary|
+---------+------------+------+
|    Krish|Data Science| 10000|
|    Krish|         IOT|  5000|
|   Mahesh|    Big Data|  4000|
|    Krish|    Big Data|  4000|
|   Mahesh|Data Science|  3000|
|Sudhanshu|Data Science| 20000|
|Sudhanshu|         IOT| 10000|
|Sudhanshu|    Big Data|  5000|
|    Sunny|Data Science| 10000|
|    Sunny|    Big Data|  2000|
+---------+------------+------+



In [161]:
df.groupBy('Name').sum().show()

df.groupBy('Department').avg().sort('avg(Salary)').show()

+---------+-----------+
|     Name|sum(Salary)|
+---------+-----------+
|    Krish|      19000|
|   Mahesh|       7000|
|Sudhanshu|      35000|
|    Sunny|      12000|
+---------+-----------+

+------------+-----------+
|  Department|avg(Salary)|
+------------+-----------+
|    Big Data|     3750.0|
|         IOT|     7500.0|
|Data Science|    10750.0|
+------------+-----------+



In [169]:
df.agg({'Salary':'max'}).show()

+-----------+
|max(Salary)|
+-----------+
|      20000|
+-----------+

