In [24]:
import findspark

In [25]:
findspark.init()

In [26]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
import pandas as pd

In [27]:
spark = SparkSession.builder.getOrCreate()
#Enable eager evaluation
# Normally, you'd have to call df.show() to display the DataFrame.
# With eagerEval enabled, Spark will display the result automatically.

spark.conf.set('spark.sql.repl.eagerEval.enabled', True)
type(spark)

pyspark.sql.session.SparkSession

In [28]:
df=spark.read.csv('artists.csv',header=True)
df.printSchema()
df.dtypes
df.count()

#converting pyspark dataframe to pandas dataframe 
df = df.toPandas()
df=spark.createDataFrame(df)
df.show()

root
 |-- id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- artist_popularity: string (nullable = true)
 |-- followers: string (nullable = true)
 |-- genre: string (nullable = true)

+--------------------+--------------------+-----------------+---------+--------------------+
|                  id|                name|artist_popularity|followers|               genre|
+--------------------+--------------------+-----------------+---------+--------------------+
|6GkSLJj5CGLIckLIb...|   Late Night Fights|                0|      184|    minneapolis punk|
|1N0VwQy5PRJymrRvl...|    Sub Par All Star|                0|      491|       antiviral pop|
|4Ch4BIKKwMJsiu1oV...|Nirvana Meditatio...|                0|       56|                  8d|
|5h2iINTOvhghVcpIz...|   Red Hot Rebellion|                0|      394|         action rock|
|7dE2MLL2SaI6MujpU...|Nirvana Tribute Band|                0|      230|             tribute|
|2cdjmWomWpKyQ0Hqg...|              Wyli P|          

In [29]:
# here filter acts as a where clause which is in SQL 
df.select("id")
df.select("id").filter("followers <200")



id
6GkSLJj5CGLIckLIb...
4Ch4BIKKwMJsiu1oV...
06hu6DFeD2bfdBAts...
3DnNVpLZI0vFQCYIi...
1o9N4cb5eZ5x2HLOO...
68D0NSkqx99hNYB26...
6pE6WjaGLjkatxCY7...
2SyevwE69Tq3LYIes...
34u51KUR8vtXeHTAy...
6n0xdBqSuPm9UcdOA...


In [30]:
# To execute SQL queries on DataFrames, Spark needs to know how to refer to the DataFrame within the SQL context.
#By using createOrReplaceTempView(), you can assign a name to the DataFrame that SQL queries can reference.
df.createOrReplaceTempView("artists")
spark.sql("select * from artists where followers < 200 order by followers")

id,name,artist_popularity,followers,genre
6FzRSeefoHkEu10zy...,2kuttfrmda4,0,0,
0KMWXoBLZ1zpXqIee...,India Loreto,15,0,
2O5hweLKIlxmKi7uq...,Brandi Doonan,24,0,
3LPF6Jl3IOJk6z5y1...,Dj Conuco,18,0,
0wh4mHyVDQjqMFNXJ...,DON DADA KE,0,0,
0aRDu4MCHCgwLQtnG...,Frank Tye,16,0,
0gD8udvIhqbta8jfO...,Breezy,26,0,
15jSp9iFAUv3yh3So...,Miguel ztr,16,0,
0bP0kTSAd6KMRiEk5...,Jorgie Francis,1,0,
2FyI02wZq4ueemjbT...,Mega Tron-XS,17,0,


In [31]:
df

id,name,artist_popularity,followers,genre
6GkSLJj5CGLIckLIb...,Late Night Fights,0,184,minneapolis punk
1N0VwQy5PRJymrRvl...,Sub Par All Star,0,491,antiviral pop
4Ch4BIKKwMJsiu1oV...,Nirvana Meditatio...,0,56,8d
5h2iINTOvhghVcpIz...,Red Hot Rebellion,0,394,action rock
7dE2MLL2SaI6MujpU...,Nirvana Tribute Band,0,230,tribute
2cdjmWomWpKyQ0Hqg...,Wyli P,0,7479,memphis hip hop
5g1ztx52qgchXwiRZ...,Caul,1,688,dark ambient
3NjmnKYEuN92YZiWV...,Chicago Blues All...,1,730,chicago blues
3hqLYpiTCdvz5lG3Y...,Lew Lewis & The T...,1,500,neo-rockabilly
06hu6DFeD2bfdBAts...,Lower Life Forms,1,169,deep east coast h...


In [32]:
df.select('artist_popularity').distinct().count()

94

In [33]:
df.describe('artist_popularity','followers').show()

+-------+------------------+------------------+
|summary| artist_popularity|         followers|
+-------+------------------+------------------+
|  count|             37012|             37012|
|   mean|15.440073489679023|222465.43015778667|
| stddev| 18.95825607271792| 2192633.202182601|
|    min|                 0|                 0|
|    max|                95|              9999|
+-------+------------------+------------------+



In [34]:
df.count()

37012

In [35]:
duplicate=df.dropDuplicates()

In [36]:
duplicate.count()

37012

In [37]:
df.groupby("artist_popularity").count()

artist_popularity,count
7,667
51,164
15,480
54,153
11,562
29,381
69,60
42,218
73,46
64,88


In [38]:
spark.sql("select artist_popularity,count(artist_popularity) as count from artists group by artist_popularity order by count desc")

artist_popularity,count
0,10093
1,2799
2,1581
3,1265
4,1009
5,856
6,801
7,667
8,642
9,574


In [39]:
spark.sql("select artist_popularity,count(artist_popularity) as count from artists group by artist_popularity order by count desc").show(100)

+-----------------+-----+
|artist_popularity|count|
+-----------------+-----+
|                0|10093|
|                1| 2799|
|                2| 1581|
|                3| 1265|
|                4| 1009|
|                5|  856|
|                6|  801|
|                7|  667|
|                8|  642|
|                9|  574|
|               11|  562|
|               10|  552|
|               13|  512|
|               12|  507|
|               16|  484|
|               15|  480|
|               14|  473|
|               18|  461|
|               22|  442|
|               17|  434|
|               21|  434|
|               23|  426|
|               20|  426|
|               26|  415|
|               19|  400|
|               28|  386|
|               29|  381|
|               25|  361|
|               27|  358|
|               24|  358|
|               32|  341|
|               30|  324|
|               31|  324|
|               34|  321|
|               33|  297|
|           

In [40]:
# Rename columns to uppercase using toDF
#toDF(): This method renames all columns at once, making it more efficient than looping with withColumnRenamed().
df = df.toDF(*[col.upper() for col in df.columns])
df

ID,NAME,ARTIST_POPULARITY,FOLLOWERS,GENRE
6GkSLJj5CGLIckLIb...,Late Night Fights,0,184,minneapolis punk
1N0VwQy5PRJymrRvl...,Sub Par All Star,0,491,antiviral pop
4Ch4BIKKwMJsiu1oV...,Nirvana Meditatio...,0,56,8d
5h2iINTOvhghVcpIz...,Red Hot Rebellion,0,394,action rock
7dE2MLL2SaI6MujpU...,Nirvana Tribute Band,0,230,tribute
2cdjmWomWpKyQ0Hqg...,Wyli P,0,7479,memphis hip hop
5g1ztx52qgchXwiRZ...,Caul,1,688,dark ambient
3NjmnKYEuN92YZiWV...,Chicago Blues All...,1,730,chicago blues
3hqLYpiTCdvz5lG3Y...,Lew Lewis & The T...,1,500,neo-rockabilly
06hu6DFeD2bfdBAts...,Lower Life Forms,1,169,deep east coast h...


In [41]:
df1 = df.withColumnRenamed("NAME", "name_1")

In [42]:
df

ID,NAME,ARTIST_POPULARITY,FOLLOWERS,GENRE
6GkSLJj5CGLIckLIb...,Late Night Fights,0,184,minneapolis punk
1N0VwQy5PRJymrRvl...,Sub Par All Star,0,491,antiviral pop
4Ch4BIKKwMJsiu1oV...,Nirvana Meditatio...,0,56,8d
5h2iINTOvhghVcpIz...,Red Hot Rebellion,0,394,action rock
7dE2MLL2SaI6MujpU...,Nirvana Tribute Band,0,230,tribute
2cdjmWomWpKyQ0Hqg...,Wyli P,0,7479,memphis hip hop
5g1ztx52qgchXwiRZ...,Caul,1,688,dark ambient
3NjmnKYEuN92YZiWV...,Chicago Blues All...,1,730,chicago blues
3hqLYpiTCdvz5lG3Y...,Lew Lewis & The T...,1,500,neo-rockabilly
06hu6DFeD2bfdBAts...,Lower Life Forms,1,169,deep east coast h...


In [48]:
df1

ID,name_1,ARTIST_POPULARITY,FOLLOWERS,GENRE
6GkSLJj5CGLIckLIb...,Late Night Fights,0,184,minneapolis punk
1N0VwQy5PRJymrRvl...,Sub Par All Star,0,491,antiviral pop
4Ch4BIKKwMJsiu1oV...,Nirvana Meditatio...,0,56,8d
5h2iINTOvhghVcpIz...,Red Hot Rebellion,0,394,action rock
7dE2MLL2SaI6MujpU...,Nirvana Tribute Band,0,230,tribute
2cdjmWomWpKyQ0Hqg...,Wyli P,0,7479,memphis hip hop
5g1ztx52qgchXwiRZ...,Caul,1,688,dark ambient
3NjmnKYEuN92YZiWV...,Chicago Blues All...,1,730,chicago blues
3hqLYpiTCdvz5lG3Y...,Lew Lewis & The T...,1,500,neo-rockabilly
06hu6DFeD2bfdBAts...,Lower Life Forms,1,169,deep east coast h...


In [43]:
type(df)

pyspark.sql.dataframe.DataFrame

In [44]:
type(df)

pyspark.sql.dataframe.DataFrame

In [45]:
df.explain(True)

== Parsed Logical Plan ==
Project [id#600 AS ID#1037, name#601 AS NAME#1038, artist_popularity#602 AS ARTIST_POPULARITY#1039, followers#603 AS FOLLOWERS#1040, genre#604 AS GENRE#1041]
+- LogicalRDD [id#600, name#601, artist_popularity#602, followers#603, genre#604], false

== Analyzed Logical Plan ==
ID: string, NAME: string, ARTIST_POPULARITY: string, FOLLOWERS: string, GENRE: string
Project [id#600 AS ID#1037, name#601 AS NAME#1038, artist_popularity#602 AS ARTIST_POPULARITY#1039, followers#603 AS FOLLOWERS#1040, genre#604 AS GENRE#1041]
+- LogicalRDD [id#600, name#601, artist_popularity#602, followers#603, genre#604], false

== Optimized Logical Plan ==
Project [id#600 AS ID#1037, name#601 AS NAME#1038, artist_popularity#602 AS ARTIST_POPULARITY#1039, followers#603 AS FOLLOWERS#1040, genre#604 AS GENRE#1041]
+- LogicalRDD [id#600, name#601, artist_popularity#602, followers#603, genre#604], false

== Physical Plan ==
*(1) Project [id#600 AS ID#1037, name#601 AS NAME#1038, artist_popu

In [47]:
df.rdd.getNumPartitions()

8

In [64]:
df_repartitioned = df.repartition(4)

In [71]:
df


ID,NAME,ARTIST_POPULARITY,FOLLOWERS,GENRE
6GkSLJj5CGLIckLIb...,Late Night Fights,0,184,minneapolis punk
1N0VwQy5PRJymrRvl...,Sub Par All Star,0,491,antiviral pop
4Ch4BIKKwMJsiu1oV...,Nirvana Meditatio...,0,56,8d
5h2iINTOvhghVcpIz...,Red Hot Rebellion,0,394,action rock
7dE2MLL2SaI6MujpU...,Nirvana Tribute Band,0,230,tribute
2cdjmWomWpKyQ0Hqg...,Wyli P,0,7479,memphis hip hop
5g1ztx52qgchXwiRZ...,Caul,1,688,dark ambient
3NjmnKYEuN92YZiWV...,Chicago Blues All...,1,730,chicago blues
3hqLYpiTCdvz5lG3Y...,Lew Lewis & The T...,1,500,neo-rockabilly
06hu6DFeD2bfdBAts...,Lower Life Forms,1,169,deep east coast h...


In [66]:
cores = spark.sparkContext.defaultParallelism

In [74]:
df.filter(col("NAME").contains('Sub')).show(10)

+--------------------+--------------------+-----------------+---------+-------------+
|                  ID|                NAME|ARTIST_POPULARITY|FOLLOWERS|        GENRE|
+--------------------+--------------------+-----------------+---------+-------------+
|1N0VwQy5PRJymrRvl...|    Sub Par All Star|                0|      491|antiviral pop|
|23rCpgXIJSrfDTLFX...|    Subhasish Biswas|                0|      441|         null|
|7Jn1Ryt0eNcsUCvrE...|                 Sub|                0|       40|         null|
|1diNjB59jGUi6J2ZO...|    Belinda Subraman|                0|       21|         null|
|1XG6XIRAkjOJl0K0b...|       Subzero Da Yg|                0|        7|         null|
|7Gi0KoSSJ0ayuOVP5...|       Subpar Snatch|                0|       97|  austin rock|
|14rkUH3L4ITLnabAK...|D. Sruthilaya Sub...|                1|       10|         null|
|4Bs9QDZQiv5rMjg0S...|      Javier Subatin|                2|      148|         null|
|5un018zl7VOOkAHm2...|      Subodh Goswami|           