## Installing PySpark

In [4]:
#%pip install pyspark

## Loading PySpaRK And Starting

In [5]:
import pyspark

In [6]:
import pandas as pd
pd.read_csv('Dataset/Test/test1.csv')

Unnamed: 0,Name,age,Experience
0,Krish,31,10
1,Sudhanshu,30,8
2,Sunny,29,4


In [7]:
from pyspark.sql import SparkSession

In [8]:
spark = SparkSession.builder.appName('Practice').getOrCreate()

In [9]:
spark

### Reading Dataset

In [10]:
#df_spark = spark.read.csv('Dataset/Test/Test/test1.csv')

## without Inferschema, all data will be of String datatype

#df_spark = spark.read.option('header','true').csv('Dataset/Test/test1.csv',inferSchema = True)
df_spark = spark.read.csv('Dataset/Test/test1.csv',inferSchema = True,header=True)


In [11]:
df_spark.show()

+---------+---+----------+
|     Name|age|Experience|
+---------+---+----------+
|    Krish| 31|        10|
|Sudhanshu| 30|         8|
|    Sunny| 29|         4|
+---------+---+----------+



In [12]:
type(df_spark)

## Dataframe is a datastructure in which various actions can be performed\

pyspark.sql.dataframe.DataFrame

In [13]:
# Check Schema
df_spark.printSchema()

root
 |-- Name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- Experience: integer (nullable = true)



### Select

In [14]:
### Get columns
df_spark.columns

['Name', 'age', 'Experience']

In [15]:
### Get top 2 rows
df_spark.head(2)

#output is returned as a list , where as in pandas it is returned as dataframe

[Row(Name='Krish', age=31, Experience=10),
 Row(Name='Sudhanshu', age=30, Experience=8)]

In [16]:
### Get columns 
df_spark.select('Name')

df_spark.select(['Name','Experience'])

## Return type : DataFrame

DataFrame[Name: string, Experience: int]

In [17]:
### using this will return a column

df_spark['Name']

Column<'Name'>

### Check Datatype

In [18]:
df_spark.dtypes

[('Name', 'string'), ('age', 'int'), ('Experience', 'int')]

### Describe

In [19]:
df_spark.describe().show()

+-------+-----+----+-----------------+
|summary| Name| age|       Experience|
+-------+-----+----+-----------------+
|  count|    3|   3|                3|
|   mean| NULL|30.0|7.333333333333333|
| stddev| NULL| 1.0|3.055050463303893|
|    min|Krish|  29|                4|
|    max|Sunny|  31|               10|
+-------+-----+----+-----------------+



### Adding and Removing

In [20]:
### Adding columns

df_spark = df_spark.withColumn('Experience after 2 years', df_spark['Experience']+2)

In [21]:
df_spark

DataFrame[Name: string, age: int, Experience: int, Experience after 2 years: int]

In [22]:
### Dropping columns

df_spark = df_spark.drop('Experience after 2 years')

In [23]:
### Renaming columns

df_spark = df_spark.withColumnRenamed('Name','New Name')

In [24]:
df_spark.show()

+---------+---+----------+
| New Name|age|Experience|
+---------+---+----------+
|    Krish| 31|        10|
|Sudhanshu| 30|         8|
|    Sunny| 29|         4|
+---------+---+----------+



## Dealing with Mising Values

In [25]:
df2_spark = spark.read.csv('Dataset/Test/test2.csv',header=True,inferSchema=True)

In [26]:
df2_spark.show()

+---------+----+----------+------+
|     Name| Age|Experience|Salary|
+---------+----+----------+------+
|    Krish|  31|        10| 30000|
|Sudhanshu|  30|         8| 25000|
|    Sunny|  29|         4| 20000|
|     Paul|  24|         3| 20000|
|   Harsha|  21|         2| 18000|
|   Mahesh|NULL|      NULL| 40000|
|     NULL|  34|        10| 38000|
|     NULL|  36|      NULL|  NULL|
+---------+----+----------+------+



### Removing rows with null 

In [27]:
df2_spark.na.drop().show()

+---------+---+----------+------+
|     Name|Age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         2| 18000|
+---------+---+----------+------+



In [28]:
df2_spark.na.drop(how='all').show()

+---------+----+----------+------+
|     Name| Age|Experience|Salary|
+---------+----+----------+------+
|    Krish|  31|        10| 30000|
|Sudhanshu|  30|         8| 25000|
|    Sunny|  29|         4| 20000|
|     Paul|  24|         3| 20000|
|   Harsha|  21|         2| 18000|
|   Mahesh|NULL|      NULL| 40000|
|     NULL|  34|        10| 38000|
|     NULL|  36|      NULL|  NULL|
+---------+----+----------+------+



In [29]:
df2_spark.na.drop(how='any').show()

+---------+---+----------+------+
|     Name|Age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         2| 18000|
+---------+---+----------+------+



In [30]:
df2_spark.na.drop(how="any",thresh=2).show()

## max 2 null values must be present

+---------+----+----------+------+
|     Name| Age|Experience|Salary|
+---------+----+----------+------+
|    Krish|  31|        10| 30000|
|Sudhanshu|  30|         8| 25000|
|    Sunny|  29|         4| 20000|
|     Paul|  24|         3| 20000|
|   Harsha|  21|         2| 18000|
|   Mahesh|NULL|      NULL| 40000|
|     NULL|  34|        10| 38000|
+---------+----+----------+------+



In [31]:
df2_spark.na.drop(how="any",subset=['Name']).show()

+---------+----+----------+------+
|     Name| Age|Experience|Salary|
+---------+----+----------+------+
|    Krish|  31|        10| 30000|
|Sudhanshu|  30|         8| 25000|
|    Sunny|  29|         4| 20000|
|     Paul|  24|         3| 20000|
|   Harsha|  21|         2| 18000|
|   Mahesh|NULL|      NULL| 40000|
+---------+----+----------+------+



In [32]:
df2_spark.na.fill('Missing Values','Experience').show()

+---------+----+----------+------+
|     Name| Age|Experience|Salary|
+---------+----+----------+------+
|    Krish|  31|        10| 30000|
|Sudhanshu|  30|         8| 25000|
|    Sunny|  29|         4| 20000|
|     Paul|  24|         3| 20000|
|   Harsha|  21|         2| 18000|
|   Mahesh|NULL|      NULL| 40000|
|     NULL|  34|        10| 38000|
|     NULL|  36|      NULL|  NULL|
+---------+----+----------+------+



In [33]:
from pyspark.ml.feature import Imputer

imputer = Imputer(
    inputCols = ['Age','Experience','Salary'],
    outputCols = ["{}_imputed".format(c) for c in ['Age','Experience','Salary']]
).setStrategy("mean")

In [34]:
imputer.fit(df2_spark).transform(df2_spark).show()

+---------+----+----------+------+-----------+------------------+--------------+
|     Name| Age|Experience|Salary|Age_imputed|Experience_imputed|Salary_imputed|
+---------+----+----------+------+-----------+------------------+--------------+
|    Krish|  31|        10| 30000|         31|                10|         30000|
|Sudhanshu|  30|         8| 25000|         30|                 8|         25000|
|    Sunny|  29|         4| 20000|         29|                 4|         20000|
|     Paul|  24|         3| 20000|         24|                 3|         20000|
|   Harsha|  21|         2| 18000|         21|                 2|         18000|
|   Mahesh|NULL|      NULL| 40000|         29|                 6|         40000|
|     NULL|  34|        10| 38000|         34|                10|         38000|
|     NULL|  36|      NULL|  NULL|         36|                 6|         27285|
+---------+----+----------+------+-----------+------------------+--------------+



In [35]:
imputer = Imputer(
    inputCols = ['Age','Experience','Salary'],
    outputCols = ["{}_imputed".format(c) for c in ['Age','Experience','Salary']]
).setStrategy("median")
imputer.fit(df2_spark).transform(df2_spark).show()

+---------+----+----------+------+-----------+------------------+--------------+
|     Name| Age|Experience|Salary|Age_imputed|Experience_imputed|Salary_imputed|
+---------+----+----------+------+-----------+------------------+--------------+
|    Krish|  31|        10| 30000|         31|                10|         30000|
|Sudhanshu|  30|         8| 25000|         30|                 8|         25000|
|    Sunny|  29|         4| 20000|         29|                 4|         20000|
|     Paul|  24|         3| 20000|         24|                 3|         20000|
|   Harsha|  21|         2| 18000|         21|                 2|         18000|
|   Mahesh|NULL|      NULL| 40000|         30|                 4|         40000|
|     NULL|  34|        10| 38000|         34|                10|         38000|
|     NULL|  36|      NULL|  NULL|         36|                 4|         25000|
+---------+----+----------+------+-----------+------------------+--------------+



## Filtering

In [36]:
df3 = spark.read.csv('Dataset/Test/test3.csv',header=True)
df3.show()

+---------+---+----------+------+
|     Name|Age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



### Filter Operations

In [37]:
# Salary of the people of less than 20000
df3.filter("Salary <= 20000").show()

+-------+---+----------+------+
|   Name|Age|Experience|Salary|
+-------+---+----------+------+
|  Sunny| 29|         4| 20000|
|   Paul| 24|         3| 20000|
| Harsha| 21|         1| 15000|
|Shubham| 23|         2| 18000|
+-------+---+----------+------+



In [38]:
df3.filter("Salary <= 20000").select(['Name','Age']).show()

+-------+---+
|   Name|Age|
+-------+---+
|  Sunny| 29|
|   Paul| 24|
| Harsha| 21|
|Shubham| 23|
+-------+---+



In [39]:
df3.filter(df3['Salary']<=20000).show()

+-------+---+----------+------+
|   Name|Age|Experience|Salary|
+-------+---+----------+------+
|  Sunny| 29|         4| 20000|
|   Paul| 24|         3| 20000|
| Harsha| 21|         1| 15000|
|Shubham| 23|         2| 18000|
+-------+---+----------+------+



In [40]:
df3.filter((df3['Salary']<=20000) &
        (df3['Experience']>2)).show()

+-----+---+----------+------+
| Name|Age|Experience|Salary|
+-----+---+----------+------+
|Sunny| 29|         4| 20000|
| Paul| 24|         3| 20000|
+-----+---+----------+------+



## Group By and Aggregate

In [41]:
df4 = spark.read.csv('Dataset/Test/test4.csv',header=True,inferSchema=True)

In [42]:
df4.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Departments: string (nullable = true)
 |-- Salary: integer (nullable = true)



In [43]:
df4.show()

+---------+------------+------+
|     Name| Departments|Salary|
+---------+------------+------+
|    Krish|Data Science| 10000|
|    Krish|         IOT|  5000|
|   Mahesh|    Big Data|  4000|
|    Krish|    Big Data|  4000|
|   Mahesh|Data Science|  3000|
|Sudhanshu|Data Science| 20000|
|Sudhanshu|         IOT| 10000|
|Sudhanshu|    Big Data|  5000|
|    Sunny|Data Science| 10000|
|    Sunny|    Big Data|  2000|
+---------+------------+------+



In [44]:
## Group by

## grouped to find max salary
df4.groupBy('Name').sum().show()

+---------+-----------+
|     Name|sum(Salary)|
+---------+-----------+
|Sudhanshu|      35000|
|    Sunny|      12000|
|    Krish|      19000|
|   Mahesh|       7000|
+---------+-----------+



In [45]:
## group by department giving max salary

df4.groupBy('Departments').sum().show()

+------------+-----------+
| Departments|sum(Salary)|
+------------+-----------+
|         IOT|      15000|
|    Big Data|      15000|
|Data Science|      43000|
+------------+-----------+



In [46]:
df4.groupBy('Departments').mean().show()

+------------+-----------+
| Departments|avg(Salary)|
+------------+-----------+
|         IOT|     7500.0|
|    Big Data|     3750.0|
|Data Science|    10750.0|
+------------+-----------+



In [47]:
df4.groupBy('Departments').count().show()

+------------+-----+
| Departments|count|
+------------+-----+
|         IOT|    2|
|    Big Data|    4|
|Data Science|    4|
+------------+-----+



In [48]:
df4.agg({'Salary':'sum'}).show()

+-----------+
|sum(Salary)|
+-----------+
|      73000|
+-----------+



## Spark ML

In [49]:
training = spark.read.csv('Dataset/Test/test3.csv',header=True,inferSchema=True)

In [50]:
training.show()

+---------+---+----------+------+
|     Name|Age|Experience|Salary|
+---------+---+----------+------+
|    Krish| 31|        10| 30000|
|Sudhanshu| 30|         8| 25000|
|    Sunny| 29|         4| 20000|
|     Paul| 24|         3| 20000|
|   Harsha| 21|         1| 15000|
|  Shubham| 23|         2| 18000|
+---------+---+----------+------+



In [51]:
training.printSchema()

root
 |-- Name: string (nullable = true)
 |-- Age: integer (nullable = true)
 |-- Experience: integer (nullable = true)
 |-- Salary: integer (nullable = true)



In [52]:
training.columns

['Name', 'Age', 'Experience', 'Salary']

In [55]:
from pyspark.ml.feature import VectorAssembler
featureassembler = VectorAssembler(inputCols=["Age","Experience"],outputCol="Independent Features")

In [56]:
output = featureassembler.transform(training)

In [57]:
output.show()

+---------+---+----------+------+--------------------+
|     Name|Age|Experience|Salary|Independent Features|
+---------+---+----------+------+--------------------+
|    Krish| 31|        10| 30000|         [31.0,10.0]|
|Sudhanshu| 30|         8| 25000|          [30.0,8.0]|
|    Sunny| 29|         4| 20000|          [29.0,4.0]|
|     Paul| 24|         3| 20000|          [24.0,3.0]|
|   Harsha| 21|         1| 15000|          [21.0,1.0]|
|  Shubham| 23|         2| 18000|          [23.0,2.0]|
+---------+---+----------+------+--------------------+



In [59]:
finalized_data = output.select("Independent Features","Salary")

In [61]:
finalized_data.show()

+--------------------+------+
|Independent Features|Salary|
+--------------------+------+
|         [31.0,10.0]| 30000|
|          [30.0,8.0]| 25000|
|          [29.0,4.0]| 20000|
|          [24.0,3.0]| 20000|
|          [21.0,1.0]| 15000|
|          [23.0,2.0]| 18000|
+--------------------+------+



In [70]:
from pyspark.ml.regression import LinearRegression
## train test split
train_data,test_data = finalized_data.randomSplit([0.75,0.25])
regressor=LinearRegression(featuresCol='Independent Features',labelCol='Salary')
regressor=regressor.fit(train_data)

In [71]:
#Coefficients
regressor.coefficients

DenseVector([-518.2482, 2094.8905])

In [72]:
# Intercepts
regressor.intercept

24605.839416054776

In [73]:
# Predictions
pred_results =regressor.evaluate(test_data)

In [74]:
pred_results.predictions.show()

+--------------------+------+------------------+
|Independent Features|Salary|        prediction|
+--------------------+------+------------------+
|          [24.0,3.0]| 20000| 18452.55474452559|
|          [29.0,4.0]| 20000|17956.204379562776|
+--------------------+------+------------------+



In [75]:
pred_results.meanAbsoluteError,pred_results.meanAbsoluteError

(1795.6204379558167, 1795.6204379558167)