In [1]:
!pip install pyspark



In [2]:
import pyspark

In [3]:
import pandas as pd
pd.read_csv('test1.csv')

Unnamed: 0,Name,Age
0,PR,22
1,CS,35
2,JX,23
3,Tony,23


In [4]:
from pyspark.sql import SparkSession

In [5]:
spark = SparkSession.builder.appName('Practice').getOrCreate()

In [6]:
spark

In [7]:
df_pyspark = spark.read.csv('test1.csv',header = True, inferSchema = True)

In [8]:
type(df_pyspark)

pyspark.sql.dataframe.DataFrame

In [9]:
df_pyspark.head(3)

[Row(Name='PR', Age=22), Row(Name='CS', Age=35), Row(Name='JX', Age=23)]

In [10]:
df_pyspark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)



In [11]:
df_pyspark.columns

['Name', 'Age']

In [12]:
df_pyspark.show()

+----+---+
|Name|Age|
+----+---+
|  PR| 22|
|  CS| 35|
|  JX| 23|
|Tony| 23|
+----+---+



In [13]:
df_pyspark.select(['Name','Age']).show()

+----+---+
|Name|Age|
+----+---+
|  PR| 22|
|  CS| 35|
|  JX| 23|
|Tony| 23|
+----+---+



In [14]:
df_pyspark.dtypes

[('Name', 'string'), ('Age', 'int')]

In [15]:
df_pyspark.describe().show()

+-------+----+-----------------+
|summary|Name|              Age|
+-------+----+-----------------+
|  count|   4|                4|
|   mean|null|            25.75|
| stddev|null|6.184658438426491|
|    min|  CS|               22|
|    max|Tony|               35|
+-------+----+-----------------+



In [16]:
### Add columns in data frame
df_pyspark = df_pyspark.withColumn('Age After 2 years',df_pyspark['Age'] + 2)

In [17]:
df_pyspark.show()

+----+---+-----------------+
|Name|Age|Age After 2 years|
+----+---+-----------------+
|  PR| 22|               24|
|  CS| 35|               37|
|  JX| 23|               25|
|Tony| 23|               25|
+----+---+-----------------+



In [18]:
### Drop the columns
df_pyspark = df_pyspark.drop('Age after 2 years')

In [19]:
df_pyspark.show()

+----+---+
|Name|Age|
+----+---+
|  PR| 22|
|  CS| 35|
|  JX| 23|
|Tony| 23|
+----+---+



In [20]:
### Rename the columns
df_pyspark.withColumnRenamed('Name','New Name').show()

+--------+---+
|New Name|Age|
+--------+---+
|      PR| 22|
|      CS| 35|
|      JX| 23|
|    Tony| 23|
+--------+---+



In [21]:
### Handling missing values
df_spark1 = spark.read.csv('test2.csv',header = True, inferSchema = True)

In [22]:
df_spark1.show()

+-----+----+----+------+
|Name | Age| Exp|Salary|
+-----+----+----+------+
|   PR|  31|  10| 20000|
|   JX|  23|   2| 10000|
| TONY|  23|   3| 40000|
|   CS|  21|   6| 40000|
|   KW|  33|   7| 20000|
|   SZ|null|null| 70000|
| null|  34|   8| 80000|
| null|  35|null|  null|
+-----+----+----+------+



In [23]:
df_spark1.na.drop().show()

+-----+---+---+------+
|Name |Age|Exp|Salary|
+-----+---+---+------+
|   PR| 31| 10| 20000|
|   JX| 23|  2| 10000|
| TONY| 23|  3| 40000|
|   CS| 21|  6| 40000|
|   KW| 33|  7| 20000|
+-----+---+---+------+



In [24]:
### all==how
df_spark1.na.drop(how = 'all').show()

+-----+----+----+------+
|Name | Age| Exp|Salary|
+-----+----+----+------+
|   PR|  31|  10| 20000|
|   JX|  23|   2| 10000|
| TONY|  23|   3| 40000|
|   CS|  21|   6| 40000|
|   KW|  33|   7| 20000|
|   SZ|null|null| 70000|
| null|  34|   8| 80000|
| null|  35|null|  null|
+-----+----+----+------+



In [25]:
### how == any
df_spark1.na.drop(how = 'any').show()

+-----+---+---+------+
|Name |Age|Exp|Salary|
+-----+---+---+------+
|   PR| 31| 10| 20000|
|   JX| 23|  2| 10000|
| TONY| 23|  3| 40000|
|   CS| 21|  6| 40000|
|   KW| 33|  7| 20000|
+-----+---+---+------+



In [26]:
### Threshhold
df_spark1.na.drop(how = 'any',thresh = 2).show()

+-----+----+----+------+
|Name | Age| Exp|Salary|
+-----+----+----+------+
|   PR|  31|  10| 20000|
|   JX|  23|   2| 10000|
| TONY|  23|   3| 40000|
|   CS|  21|   6| 40000|
|   KW|  33|   7| 20000|
|   SZ|null|null| 70000|
| null|  34|   8| 80000|
+-----+----+----+------+



In [27]:
### Subset
df_spark1.na.drop(how = 'any', subset = ['Age']).show()

+-----+---+----+------+
|Name |Age| Exp|Salary|
+-----+---+----+------+
|   PR| 31|  10| 20000|
|   JX| 23|   2| 10000|
| TONY| 23|   3| 40000|
|   CS| 21|   6| 40000|
|   KW| 33|   7| 20000|
| null| 34|   8| 80000|
| null| 35|null|  null|
+-----+---+----+------+



In [28]:
df_spark1.na.fill('2',['Exp','Age']).show()

+-----+----+----+------+
|Name | Age| Exp|Salary|
+-----+----+----+------+
|   PR|  31|  10| 20000|
|   JX|  23|   2| 10000|
| TONY|  23|   3| 40000|
|   CS|  21|   6| 40000|
|   KW|  33|   7| 20000|
|   SZ|null|null| 70000|
| null|  34|   8| 80000|
| null|  35|null|  null|
+-----+----+----+------+



In [29]:
from pyspark.ml.feature import Imputer

imputer = Imputer(
    inputCols = ['Age','Exp','Salary'],
    outputCols = ['{}_imputed'.format(c) for c in ['Age','Exp','Salary']]
).setStrategy('mean')

In [30]:
imputer.fit(df_spark1).transform(df_spark1).show()

+-----+----+----+------+-----------+-----------+--------------+
|Name | Age| Exp|Salary|Age_imputed|Exp_imputed|Salary_imputed|
+-----+----+----+------+-----------+-----------+--------------+
|   PR|  31|  10| 20000|         31|         10|         20000|
|   JX|  23|   2| 10000|         23|          2|         10000|
| TONY|  23|   3| 40000|         23|          3|         40000|
|   CS|  21|   6| 40000|         21|          6|         40000|
|   KW|  33|   7| 20000|         33|          7|         20000|
|   SZ|null|null| 70000|         28|          6|         70000|
| null|  34|   8| 80000|         34|          8|         80000|
| null|  35|null|  null|         35|          6|         40000|
+-----+----+----+------+-----------+-----------+--------------+



In [31]:
df_spark2 = spark.read.csv('test3.csv',header = True, inferSchema = True)

In [32]:
df_spark2.show()

+----+---+---+------+
|Name|Age|Exp|Salary|
+----+---+---+------+
|  PR| 31| 10| 20000|
|  JX| 23|  2| 10000|
|TONY| 23|  3| 40000|
|  CS| 21|  6| 40000|
|  KW| 33|  7| 20000|
|  SZ| 29|  8| 70000|
+----+---+---+------+



In [33]:
df_spark2.filter((df_spark2['Salary'] <= 20000) |
                 (df_spark2['Salary'] >= 40000)
                ).show()

+----+---+---+------+
|Name|Age|Exp|Salary|
+----+---+---+------+
|  PR| 31| 10| 20000|
|  JX| 23|  2| 10000|
|TONY| 23|  3| 40000|
|  CS| 21|  6| 40000|
|  KW| 33|  7| 20000|
|  SZ| 29|  8| 70000|
+----+---+---+------+

