In [86]:
import pyspark
from pyspark.sql import SparkSession
# Create the Spark session for this notebook, or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName("working_datasets")
    .getOrCreate()
)

In [87]:
# Load the marks CSV: the first row is the header, column types are inferred from the data.
data1 = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("Marks_data.csv")
)

In [88]:
# Print the loaded rows as a text table to eyeball the raw data.
data1.show()

+----+--------+--------+----+
|Name|M1 Score|M2 Score| age|
+----+--------+--------+----+
|Alex|      62|      80|NULL|
|Brad|      45|      56|NULL|
|Joey|      85|      98|NULL|
|NULL|      54|      79|NULL|
|abhi|    NULL|    NULL|NULL|
+----+--------+--------+----+



In [89]:
# Last expression of the cell: displays the DataFrame's column names as a list of strings.

data1.columns

['Name', 'M1 Score', 'M2 Score', 'age']

In [90]:
# With no argument, head() returns only the first row, as a Row object.
data1.head()

Row(Name='Alex', M1 Score=62, M2 Score=80, age=None)

In [91]:
# Confirm that spark.read.csv produced a PySpark DataFrame.
type(data1)

pyspark.sql.dataframe.DataFrame

In [92]:
# Project a single column. The CSV header spells it "Name"; the lower-case
# "name" used here still resolves, and the displayed header follows the spelling passed in.
column_var = data1.select(["name"])
column_var.show()

+----+
|name|
+----+
|Alex|
|Brad|
|Joey|
|NULL|
|abhi|
+----+



In [93]:
# Project two columns; select() accepts the names as separate arguments as well as a list.
multi_cols = data1.select("name", "M1 Score")
multi_cols.show()

+----+--------+
|name|M1 Score|
+----+--------+
|Alex|      62|
|Brad|      45|
|Joey|      85|
|NULL|      54|
|abhi|    NULL|
+----+--------+



In [94]:
# reading text file in spark
# data2 = spark.read.text("extex.txt")
# data2.show()

In [95]:
# data3 = data2.describe().show()

In [96]:
# Add a derived column "total marks" = "M1 Score" + "M2 Score".
# withColumn() returns a NEW DataFrame; data1 itself is left unchanged.
# A NULL in either score makes the total NULL for that row.
add_col = data1.withColumn("total marks", data1["M1 Score"] + data1["M2 Score"])

# show() only prints and returns None, so it is called separately instead of
# being chained onto the assignment (which left add_col set to None).
add_col.show()

+----+--------+--------+----+-----------+
|Name|M1 Score|M2 Score| age|total marks|
+----+--------+--------+----+-----------+
|Alex|      62|      80|NULL|        142|
|Brad|      45|      56|NULL|        101|
|Joey|      85|      98|NULL|        183|
|NULL|      54|      79|NULL|        133|
|abhi|    NULL|    NULL|NULL|       NULL|
+----+--------+--------+----+-----------+



In [97]:
# dropping columns
# drop("column_name") returns a new DataFrame without that column.

# data1 never received the "total marks" column (withColumn() does not modify
# data1 in place), and drop() silently ignores names that are not present.
# Build the column first so that the drop actually removes something.
marks_with_total = data1.withColumn("total marks", data1["M1 Score"] + data1["M2 Score"])
remo_cols = marks_with_total.drop("total marks")  # "total marks" is dropped

remo_cols.show()


+----+--------+--------+----+
|Name|M1 Score|M2 Score| age|
+----+--------+--------+----+
|Alex|      62|      80|NULL|
|Brad|      45|      56|NULL|
|Joey|      85|      98|NULL|
|NULL|      54|      79|NULL|
|abhi|    NULL|    NULL|NULL|
+----+--------+--------+----+



In [98]:
# Renaming a column: withColumnRenamed("existing name", "new name")
# returns a new DataFrame; data1 keeps its original column names.
rename_col = data1.withColumnRenamed("name", "full name")

print("after rename:")
rename_col.show()

after rename:
+---------+--------+--------+----+
|full name|M1 Score|M2 Score| age|
+---------+--------+--------+----+
|     Alex|      62|      80|NULL|
|     Brad|      45|      56|NULL|
|     Joey|      85|      98|NULL|
|     NULL|      54|      79|NULL|
|     abhi|    NULL|    NULL|NULL|
+---------+--------+--------+----+



In [99]:
from pyspark.sql import SparkSession
# NOTE(review): repeats the session setup from the first cell. getOrCreate() should hand back
# the session that is already running, so this looks redundant — confirm before removing.
spark = SparkSession.builder.appName("working_datasets").getOrCreate()



In [100]:
# Re-display data1: still the original four columns, because the earlier steps assigned their results to new variables.
data1.show()

+----+--------+--------+----+
|Name|M1 Score|M2 Score| age|
+----+--------+--------+----+
|Alex|      62|      80|NULL|
|Brad|      45|      56|NULL|
|Joey|      85|      98|NULL|
|NULL|      54|      79|NULL|
|abhi|    NULL|    NULL|NULL|
+----+--------+--------+----+



In [101]:
# Drop the "age" column using drop().
# drop("column_name") returns a new DataFrame without that column.
drop_col = data1.drop("age")
drop_col.show()

+----+--------+--------+
|Name|M1 Score|M2 Score|
+----+--------+--------+
|Alex|      62|      80|
|Brad|      45|      56|
|Joey|      85|      98|
|NULL|      54|      79|
|abhi|    NULL|    NULL|
+----+--------+--------+



In [102]:
# data1 still has the "age" column: the result of drop("age") was stored in drop_col, not in data1.
data1.show()

+----+--------+--------+----+
|Name|M1 Score|M2 Score| age|
+----+--------+--------+----+
|Alex|      62|      80|NULL|
|Brad|      45|      56|NULL|
|Joey|      85|      98|NULL|
|NULL|      54|      79|NULL|
|abhi|    NULL|    NULL|NULL|
+----+--------+--------+----+



In [103]:
# Remove rows in which every column is NULL. No row qualifies here (a name or a
# score is always present), so all five rows are kept.
drop_col2 = data1.dropna(how="all")
drop_col2.show()

+----+--------+--------+----+
|Name|M1 Score|M2 Score| age|
+----+--------+--------+----+
|Alex|      62|      80|NULL|
|Brad|      45|      56|NULL|
|Joey|      85|      98|NULL|
|NULL|      54|      79|NULL|
|abhi|    NULL|    NULL|NULL|
+----+--------+--------+----+



In [104]:
# Remove rows containing at least one NULL. "age" is NULL in every row, so nothing is left.
drop_col3 = data1.dropna(how="any")
drop_col3.show()

+----+--------+--------+---+
|Name|M1 Score|M2 Score|age|
+----+--------+--------+---+
+----+--------+--------+---+



In [105]:
# subset restricts the NULL check to the listed columns:
# only rows whose "M1 Score" is NULL are removed.
m1_drop_col = data1.dropna(how="any", subset=["M1 Score"])
m1_drop_col.show()

+----+--------+--------+----+
|Name|M1 Score|M2 Score| age|
+----+--------+--------+----+
|Alex|      62|      80|NULL|
|Brad|      45|      56|NULL|
|Joey|      85|      98|NULL|
|NULL|      54|      79|NULL|
+----+--------+--------+----+



In [106]:
# Fill missing values: replace NULL in the "Name" column with the string "missing".
# Note that data1 is rebound to the filled DataFrame from here on.
data1 = data1.fillna("missing", subset=["Name"])
data1.show()

+-------+--------+--------+----+
|   Name|M1 Score|M2 Score| age|
+-------+--------+--------+----+
|   Alex|      62|      80|NULL|
|   Brad|      45|      56|NULL|
|   Joey|      85|      98|NULL|
|missing|      54|      79|NULL|
|   abhi|    NULL|    NULL|NULL|
+-------+--------+--------+----+



In [113]:
# Rename the score columns to names without spaces.
data1 = (
    data1
    .withColumnRenamed("M1 Score", "M1_score")
    .withColumnRenamed("M2 Score", "M2_score")
)

In [131]:
# Fill missing scores with a column statistic. Only the mean strategy is used here.

from pyspark.ml.feature import Imputer

# Each input column gets a matching "<name>_imputed" output column.
score_cols = ['M1_score', 'M2_score']
imputer = Imputer(
    strategy="mean",
    inputCols=score_cols,
    outputCols=[f"{c}_imputed" for c in score_cols],
)


In [132]:
# Fit the imputer on data1, then display data1 with the two "*_imputed" columns appended.
# The transformed DataFrame is only shown, not stored; data1 is unchanged.
imputer_model = imputer.fit(data1)
imputer_model.transform(data1).show()

+-------+--------+--------+----+----------------+----------------+
|   Name|M1_score|M2_score| age|M1_score_imputed|M2_score_imputed|
+-------+--------+--------+----+----------------+----------------+
|   Alex|      62|      80|NULL|              62|              80|
|   Brad|      45|      56|NULL|              45|              56|
|   Joey|      85|      98|NULL|              85|              98|
|missing|      54|      79|NULL|              54|              79|
|   abhi|    NULL|    NULL|NULL|              61|              78|
+-------+--------+--------+----+----------------+----------------+



In [133]:
# data1 has no "*_imputed" columns: the imputer's transformed output was displayed but never assigned.
data1.show()

+-------+--------+--------+----+
|   Name|M1_score|M2_score| age|
+-------+--------+--------+----+
|   Alex|      62|      80|NULL|
|   Brad|      45|      56|NULL|
|   Joey|      85|      98|NULL|
|missing|      54|      79|NULL|
|   abhi|    NULL|    NULL|NULL|
+-------+--------+--------+----+



In [139]:
# Filter operations.
# Conditions can be combined with and (&), or (|) and negated with not (~).

# Keep rows whose M1_score is strictly greater than 55 (where() is an alias of filter()).
m1_above_55 = data1["M1_score"] > 55
filter_cols = data1.where(m1_above_55)
filter_cols.show()

+----+--------+--------+----+
|Name|M1_score|M2_score| age|
+----+--------+--------+----+
|Alex|      62|      80|NULL|
|Joey|      85|      98|NULL|
+----+--------+--------+----+



In [143]:
# Keep rows where both scores are at least 55.
m1_ok = data1["M1_score"] >= 55
m2_ok = data1["M2_score"] >= 55
filter_cols2 = data1.where(m1_ok & m2_ok)
filter_cols2.show()

+----+--------+--------+----+
|Name|M1_score|M2_score| age|
+----+--------+--------+----+
|Alex|      62|      80|NULL|
|Joey|      85|      98|NULL|
+----+--------+--------+----+



In [145]:
# not (~) inverts a condition. The row with a NULL M1_score does not appear in
# the inverted result either, as the output shows.
m1_at_least_55 = data1["M1_score"] >= 55
filter_cols3 = data1.where(~m1_at_least_55)
filter_cols3.show()

+-------+--------+--------+----+
|   Name|M1_score|M2_score| age|
+-------+--------+--------+----+
|   Brad|      45|      56|NULL|
|missing|      54|      79|NULL|
+-------+--------+--------+----+



In [155]:
# group by

# Maximum M1_score per name. The aggregated DataFrame is kept in groupby_cols;
# show() is called separately because it returns None.
# NOTE(review): the saved output also lists max(M2_score), which
# max("M1_score") alone would not produce — it looks stale; re-run to refresh.
groupby_cols = data1.groupBy("name").max("M1_score")
groupby_cols.show()

+-------+-------------+-------------+
|   name|max(M1_score)|max(M2_score)|
+-------+-------------+-------------+
|   Joey|           85|           98|
|   abhi|         NULL|         NULL|
|   Alex|           62|           80|
|missing|           54|           79|
|   Brad|           45|           56|
+-------+-------------+-------------+



In [152]:
# Number of rows for each distinct age value.
# NOTE(review): the saved output lists ages 19-21 while earlier cells show age as NULL —
# the outputs appear to come from different runs; re-run the notebook top to bottom.
groupby_cols2 = data1.groupby("age").count()
groupby_cols2.show()

+---+-----+
|age|count|
+---+-----+
| 19|    1|
| 20|    3|
| 21|    1|
+---+-----+



In [154]:
# Sum of M1_score over the whole DataFrame (no grouping key, hence one output row).
groupby_cols3 = data1.groupBy().agg({"M1_score": "sum"})
groupby_cols3.show()

+-------------+
|sum(M1_score)|
+-------------+
|          246|
+-------------+



In [161]:
# group by

# Maximum of every numeric column per name. Keep the DataFrame in groupby_cols
# and print it separately: show() returns None, so chaining it onto the
# assignment stored None instead of the result.
groupby_cols = data1.groupBy('name').max()
groupby_cols.show()

+-------+-------------+-------------+
|   name|max(M1_score)|max(M2_score)|
+-------+-------------+-------------+
|   Joey|           85|           98|
|   abhi|         NULL|         NULL|
|   Alex|           62|           80|
|missing|           54|           79|
|   Brad|           45|           56|
+-------+-------------+-------------+



In [163]:
# Average of every numeric column per age. show() returns None, so it is
# called on its own line to keep the DataFrame in groupby_cols.
groupby_cols = data1.groupBy('age').avg()
groupby_cols.show()


+---+-------------+-------------+
|age|avg(M1_score)|avg(M2_score)|
+---+-------------+-------------+
| 19|         45.0|         56.0|
| 20|         58.0|         79.5|
| 21|         85.0|         98.0|
+---+-------------+-------------+

