In [1]:
from pyspark.sql.functions import *
from pyspark.sql import *
import pyspark
from pyspark.sql.types import *

In [2]:
spark = SparkSession.builder.appName("SQL_Options").\
config("spark.driver.memory","4g").\
config("spark.executor.memory","4g").getOrCreate()

In [3]:
path = "./googleplaystore.csv"

In [4]:
dataFrame = spark.read.csv(path,inferSchema=True,header=True)

In [5]:
dataFrame.printSchema()

root
 |-- App: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Rating: string (nullable = true)
 |-- Reviews: string (nullable = true)
 |-- Size: string (nullable = true)
 |-- Installs: string (nullable = true)
 |-- Type: string (nullable = true)
 |-- Price: string (nullable = true)
 |-- Content Rating: string (nullable = true)
 |-- Genres: string (nullable = true)
 |-- Last Updated: string (nullable = true)
 |-- Current Ver: string (nullable = true)
 |-- Android Ver: string (nullable = true)



In [6]:
dataFrame.limit(5).toPandas()

Unnamed: 0,App,Category,Rating,Reviews,Size,Installs,Type,Price,Content Rating,Genres,Last Updated,Current Ver,Android Ver
0,Photo Editor & Candy Camera & Grid & ScrapBook,ART_AND_DESIGN,4.1,159,19M,"10,000+",Free,0,Everyone,Art & Design,"January 7, 2018",1.0.0,4.0.3 and up
1,Coloring book moana,ART_AND_DESIGN,3.9,967,14M,"500,000+",Free,0,Everyone,Art & Design;Pretend Play,"January 15, 2018",2.0.0,4.0.3 and up
2,"U Launcher Lite – FREE Live Cool Themes, Hide ...",ART_AND_DESIGN,4.7,87510,8.7M,"5,000,000+",Free,0,Everyone,Art & Design,"August 1, 2018",1.2.4,4.0.3 and up
3,Sketch - Draw & Paint,ART_AND_DESIGN,4.5,215644,25M,"50,000,000+",Free,0,Teen,Art & Design,"June 8, 2018",Varies with device,4.2 and up
4,Pixel Draw - Number Art Coloring Book,ART_AND_DESIGN,4.3,967,2.8M,"100,000+",Free,0,Everyone,Art & Design;Creativity,"June 20, 2018",1.1,4.4 and up


In [7]:
dataFrameSchemaChanged = dataFrame.withColumn("Rating", dataFrame["Rating"].cast(FloatType())).\
withColumn("Reviews", dataFrame["Reviews"].cast(IntegerType())).\
withColumn("Price", dataFrame["Price"].cast(IntegerType()))

In [8]:
dataFrameSchemaChanged.printSchema()

root
 |-- App: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Rating: float (nullable = true)
 |-- Reviews: integer (nullable = true)
 |-- Size: string (nullable = true)
 |-- Installs: string (nullable = true)
 |-- Type: string (nullable = true)
 |-- Price: integer (nullable = true)
 |-- Content Rating: string (nullable = true)
 |-- Genres: string (nullable = true)
 |-- Last Updated: string (nullable = true)
 |-- Current Ver: string (nullable = true)
 |-- Android Ver: string (nullable = true)



In [9]:
dataFrameSchemaChanged.createOrReplaceTempView("TempView")

In [10]:
df1 = spark.sql("select * from TempView where Rating > 4.1")

In [11]:
df1.select("App","Rating").where(df1.Category == "COMICS").where(df1.Rating > 4.5).show(5,False)

+--------------------------------------------------------------------+------+
|App                                                                 |Rating|
+--------------------------------------------------------------------+------+
|Manga Master - Best manga & comic reader                            |4.6   |
|GANMA! - All original stories free of charge for all original comics|4.7   |
|Röhrich Werner Soundboard                                           |4.7   |
|Unicorn Pokez - Color By Number                                     |4.8   |
|Manga - read Thai translation                                       |4.6   |
+--------------------------------------------------------------------+------+
only showing top 5 rows



In [12]:
df2 = spark.sql("select Category, count(*) as cnt from TempView group by Category")

In [59]:
dataFrame.select("Rating","Category").groupBy("Category").agg(count("Rating").alias("Sum")).\
orderBy(col("Sum").desc()).show(40,truncate=False)

+-------------------+----+
|Category           |Sum |
+-------------------+----+
|FAMILY             |1972|
|GAME               |1144|
|TOOLS              |843 |
|MEDICAL            |463 |
|BUSINESS           |460 |
|PRODUCTIVITY       |424 |
|PERSONALIZATION    |392 |
|COMMUNICATION      |387 |
|SPORTS             |384 |
|LIFESTYLE          |382 |
|FINANCE            |366 |
|HEALTH_AND_FITNESS |340 |
|PHOTOGRAPHY        |335 |
|SOCIAL             |295 |
|NEWS_AND_MAGAZINES |283 |
|SHOPPING           |260 |
|TRAVEL_AND_LOCAL   |258 |
|DATING             |234 |
|BOOKS_AND_REFERENCE|231 |
|VIDEO_PLAYERS      |175 |
|EDUCATION          |156 |
|ENTERTAINMENT      |149 |
|MAPS_AND_NAVIGATION|136 |
|FOOD_AND_DRINK     |127 |
|HOUSE_AND_HOME     |88  |
|AUTO_AND_VEHICLES  |85  |
|LIBRARIES_AND_DEMO |85  |
|WEATHER            |82  |
|ART_AND_DESIGN     |65  |
|EVENTS             |64  |
|PARENTING          |60  |
|COMICS             |60  |
|BEAUTY             |53  |
|Face               |1   |
|

In [27]:
dataFrame.select("Category").distinct().show(40)                                        

+-------------------+
|           Category|
+-------------------+
|             EVENTS|
|             COMICS|
|             SPORTS|
|            WEATHER|
|      VIDEO_PLAYERS|
|  AUTO_AND_VEHICLES|
|          PARENTING|
|      ENTERTAINMENT|
|    PERSONALIZATION|
| HEALTH_AND_FITNESS|
|   TRAVEL_AND_LOCAL|
|BOOKS_AND_REFERENCE|
|     FOOD_AND_DRINK|
|        PHOTOGRAPHY|
|           BUSINESS|
|             FAMILY|
|           SHOPPING|
|     HOUSE_AND_HOME|
|               GAME|
|               Face|
|                1.9|
|          EDUCATION|
|       PRODUCTIVITY|
|              TOOLS|
|          LIFESTYLE|
| NEWS_AND_MAGAZINES|
|            MEDICAL|
|      COMMUNICATION|
|MAPS_AND_NAVIGATION|
|             DATING|
|       traffic jams|
| LIBRARIES_AND_DEMO|
|             BEAUTY|
|             SOCIAL|
|            FINANCE|
|     ART_AND_DESIGN|
+-------------------+



In [37]:
dataFrame.select("App","Rating").where((col("Category")=="COMICS")&(col("Rating")>4.5)).show()

+---------------------+------+
|                  App|Rating|
+---------------------+------+
| Manga Master - Be...|   4.6|
| GANMA! - All orig...|   4.7|
| Röhrich Werner So...|   4.7|
| Unicorn Pokez - C...|   4.8|
| Manga - read Thai...|   4.6|
| Dragon Ball Wallp...|   4.7|
| Children's cartoo...|   4.6|
|【Ranobbe complete...|   NaN|
|       Faustop Sounds|   4.7|
| Hojiboy Tojiboyev...|   5.0|
| Best Wallpapers B...|   4.7|
| Laftel - Watching...|   4.6|
|            WebComics|   4.8|
| Superheroes, Marv...|   5.0|
|       Pepsi Cards DC|   NaN|
+---------------------+------+



In [62]:
dataFrame.select("App").where(col("App").contains("dating")).show(5,False)

+--------------------------------------------------+
|App                                               |
+--------------------------------------------------+
|Meet, chat & date. Free dating app - Chocolate app|
|Friend Find: free chat + flirt dating app         |
|Spine- The dating app                             |
|Princess Closet : Otome games free dating sim     |
|happn – Local dating app                          |
+--------------------------------------------------+



In [69]:
dataFrame.select("Genres").distinct().show(50,False)

+-----------------------------------+
|Genres                             |
+-----------------------------------+
|Video Players & Editors;Creativity |
|Adventure;Action & Adventure       |
|Education                          |
|Trivia                             |
|Art & Design;Action & Adventure    |
|Auto & Vehicles                    |
|Travel & Local;Action & Adventure  |
|Simulation;Action & Adventure      |
|Education;Pretend Play             |
|Tools;Education                    |
|Entertainment                      |
|Education;Education                |
|Entertainment;Music & Video        |
|Parenting;Brain Games              |
|Simulation;Pretend Play            |
|Arcade;Action & Adventure          |
|Educational;Creativity             |
|Arcade;Pretend Play                |
|Casual;Music & Video               |
|Adventure                          |
|Arcade                             |
|Books & Reference;Creativity       |
|Education;Brain Games              |
|Entertainme

In [75]:
dataFrame.select("Genres").groupBy("Genres").agg(count("Genres").alias("CNT")).sort(col("CNT").desc()).show(50,truncate=False)

+---------------------------+---+
|Genres                     |CNT|
+---------------------------+---+
|Tools                      |842|
|Entertainment              |623|
|Education                  |549|
|Medical                    |463|
|Business                   |460|
|Productivity               |424|
|Sports                     |398|
|Personalization            |392|
|Communication              |387|
|Lifestyle                  |381|
|Finance                    |366|
|Action                     |365|
|Health & Fitness           |340|
|Photography                |335|
|Social                     |295|
|News & Magazines           |283|
|Shopping                   |260|
|Travel & Local             |257|
|Dating                     |234|
|Books & Reference          |231|
|Arcade                     |220|
|Simulation                 |200|
|Casual                     |193|
|Video Players & Editors    |173|
|Puzzle                     |140|
|Maps & Navigation          |136|
|Food & Drink 

In [77]:
dataFrame.select("App").where((col("Genres")=='Tools')&(col("Reviews")>100)).count()

535

In [None]:
dataFrame.write.partitionBy