In [50]:
import pyspark, os, sys
from pyspark.sql import *
from pyspark import SparkConf,SparkContext
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql import types

spark=SparkSession.builder.getOrCreate()

df=spark.read.format("csv").option("header",True).option("sep",",").option("inferSchema",True).load("googleplaystore.csv")
print("Before data cleaning")
df.show(1)     #To check first record
df.printSchema()

#To perform data cleaning , we will drop few columns which are not that relevant
print("After data cleaning")
df=df.drop("Reviews", "Content Rating", "Last Updated", "Android Ver")

df.show(1)

#If you want to change the datatypes for the columns based on their values, you can try as below

#Below we are removing sepcial charaters from values of 'Installs' column and we are changing datatype of 'Installs' column from StringType() to IntegerType()
#Also we are removing $ symbols from values of 'Price' column and we are changing datatype of 'Price' from StringType() to IntegerType()
df=df.withColumn("Installs",regexp_replace(col("Installs"),"^[0-9]",""))\
.withColumn("Installs",col("Installs").cast(IntegerType()))\
.withColumn("Price",regexp_replace(col("Price"),"[$]",""))\
.withColumn("Price",col("Price").cast(IntegerType()))

df.show()

df.printSchema()



Before data cleaning
+--------------------+--------------+------+-------+----+--------+----+-----+--------------+------------+---------------+-----------+------------+
|                 App|      Category|Rating|Reviews|Size|Installs|Type|Price|Content Rating|      Genres|   Last Updated|Current Ver| Android Ver|
+--------------------+--------------+------+-------+----+--------+----+-----+--------------+------------+---------------+-----------+------------+
|Photo Editor & Ca...|ART_AND_DESIGN|   4.1|    159| 19M| 10,000+|Free|    0|      Everyone|Art & Design|January 7, 2018|      1.0.0|4.0.3 and up|
+--------------------+--------------+------+-------+----+--------+----+-----+--------------+------------+---------------+-----------+------------+
only showing top 1 row

root
 |-- App: string (nullable = true)
 |-- Category: string (nullable = true)
 |-- Rating: string (nullable = true)
 |-- Reviews: string (nullable = true)
 |-- Size: string (nullable = true)
 |-- Installs: string (null

In [67]:
df=spark.read.format("csv").option("header",True).option("sep",",").option("inferSchema",True).load("googleplaystore.csv")

df=df.withColumn("Installs",col("Installs").cast(IntegerType()))
#to perform data operations using SQL, we need to create a view as below
df.createOrReplaceTempView("apps")

#to get top 10 Reviews for apps
#res=spark.sql("select App, sum(Reviews) AS sum_reviews from apps group by App order by sum_reviews desc limit 10")

#to get top 10 installs for apps
#res=spark.sql("select App, sum(Installs) as sum_installs from apps group by App order by sum_installs desc limit 10")

#to find category wise distribution of installed apps
#res=spark.sql("select Category, sum(Installs) as sum_installs from apps group by Category order by sum_installs desc")

#to find top paid apps
res=spark.sql("select App, sum(Price) as sum_price from apps where Type='Paid' group by App order by sum_price desc;")
res.show()

+--------------------+---------+
|                 App|sum_price|
+--------------------+---------+
|      I am Rich Plus|     NULL|
|Trine 2: Complete...|     NULL|
|Whoowasit? - Best...|     NULL|
|           AF-STROKE|     NULL|
|     Sokoban Land DX|     NULL|
|Servidor Privado ...|     NULL|
| Campervan.Guide Pro|     NULL|
| XCOM®: Enemy Within|     NULL|
|SweetLand — Famil...|     NULL|
|Medical ID - In C...|     NULL|
|Language Therapy:...|     NULL|
|The World Ends Wi...|     NULL|
|          iHunter BC|     NULL|
|Flipped Out! - Po...|     NULL|
|Dr. Panda Restaur...|     NULL|
|Al'Quran Bahasa I...|     NULL|
|         Eu Sou Rico|     NULL|
|         bpresso PRO|     NULL|
|CN Superstar Socc...|     NULL|
|            Dz kayas|     NULL|
+--------------------+---------+
only showing top 20 rows

