In [1]:
import os
import sys
import glob
os.environ['SPARK_HOME'] = 'C:\spark-3.1.2-bin-hadoop3.2'
os.environ['JAVA_HOME'] = 'C:\Program Files\Java\jdk1.8.0_201'
os.environ['HADOOP_HOME'] = 'C:\spark-3.1.2-bin-hadoop3.2'
spark_python = os.path.join(os.environ.get('SPARK_HOME',None),'python')
py4j = glob.glob(os.path.join(spark_python,'lib','py4j-*.zip'))[0]
sys.path[:0]=[spark_python,py4j]
os.environ['PYTHONPATH']=py4j
import findspark
findspark.init()
findspark.find()

'C:\\spark-3.1.2-bin-hadoop3.2'

In [2]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Python Basic Examples").getOrCreate()

In [3]:
df = spark.read.csv('F:/main course/Artificial_Neural_Networks/AppleStore.csv',inferSchema=True,header=True)

In [4]:
df.printSchema()
df.show(truncate=False)

root
 |-- size_bytes: long (nullable = true)
 |-- price: double (nullable = true)
 |-- rating_count_tot: integer (nullable = true)
 |-- rating_count_ver: integer (nullable = true)
 |-- user_rating_ver: double (nullable = true)
 |-- cont_rating: integer (nullable = true)
 |-- prime_genre: string (nullable = true)
 |-- sup_devices.num: integer (nullable = true)
 |-- ipadSc_urls.num: integer (nullable = true)
 |-- lang.num: integer (nullable = true)
 |-- vpp_lic: integer (nullable = true)
 |-- user_rating: double (nullable = true)

+----------+-----+----------------+----------------+---------------+-----------+-----------------+---------------+---------------+--------+-------+-----------+
|size_bytes|price|rating_count_tot|rating_count_ver|user_rating_ver|cont_rating|prime_genre      |sup_devices.num|ipadSc_urls.num|lang.num|vpp_lic|user_rating|
+----------+-----+----------------+----------------+---------------+-----------+-----------------+---------------+---------------+--------+------

In [5]:
df = df.select(['size_bytes','price','prime_genre','user_rating'])

In [6]:
df = df.limit(20)

In [17]:
# Cast operation 
from pyspark.sql.functions import col
from pyspark.sql.types import StringType
df_transformed = df.withColumn("price_new",col("price").cast(StringType()))

In [18]:
# Creating a UDF to left pad the string with '0s' in a column
from pyspark.sql.functions import udf
def leftPad(string):
    string = str(string).rjust(10,'0')
    return string

convertUDF = udf(lambda z: leftPad(z),StringType())

# Creating a UDF to change the values to Upper Case
convertUDF2 = udf(lambda z: str(z).upper(),StringType())

df_transformed = df_transformed.withColumn("price_new",convertUDF(col("price_new")).alias("price_new"))
df_transformed = df_transformed.withColumn("prime_genre",convertUDF2(col("prime_genre")).alias("prime_genre"))

In [19]:
# Substring and trim
from pyspark.sql.functions import substring, trim
df_transformed = df_transformed.withColumn("price_new",substring('price_new',6,4))
df_transformed = df_transformed.withColumn('prime_genre', trim('prime_genre'))

In [20]:
df_transformed.show()

+----------+-----+-----------------+-----------+---------+
|size_bytes|price|      prime_genre|user_rating|price_new|
+----------+-----+-----------------+-----------+---------+
| 100788224| 3.99|            GAMES|        4.0|     03.9|
| 158578688|  0.0|     PRODUCTIVITY|        4.0|     000.|
| 100524032|  0.0|          WEATHER|        3.5|     000.|
| 128512000|  0.0|         SHOPPING|        4.0|     000.|
|  92774400|  0.0|        REFERENCE|        4.5|     000.|
|  10485713| 0.99|            GAMES|        4.0|     00.9|
| 227795968|  0.0|          FINANCE|        4.0|     000.|
| 130242560|  0.0|            MUSIC|        4.0|     000.|
|  49250304| 9.99|        UTILITIES|        4.5|     09.9|
|  70023168| 3.99|            GAMES|        4.0|     03.9|
|  49618944| 4.99|            GAMES|        4.5|     04.9|
| 227547136| 7.99|            GAMES|        3.5|     07.9|
| 179979264|  0.0|        UTILITIES|        3.5|     000.|
| 160925696|  0.0|          FINANCE|        3.5|     000

# Analysis and Operations

In [23]:
# Count Distinct

from pyspark.sql.functions import countDistinct
df_distinct_count = df_transformed.select(countDistinct("prime_genre").alias("prime_genre_count"))
df_distinct_count.show()

+-----------------+
|prime_genre_count|
+-----------------+
|               10|
+-----------------+



In [26]:
# frequency, cummulative frequency and cumulative percentage

from pyspark.sql.functions import count
frequencies = df_transformed.groupBy('prime_genre').agg(
    count('prime_genre').alias('frequency')
).selectExpr(
    '*','100*Frequency / sum(Frequency) over() Percent'
).selectExpr('*',
            'sum(Frequency) over(order by Frequency desc) cumulative_frequency',
            'sum(Percent) over(order by Frequency desc) cumulative_Percent'
)
frequencies.show()

+-----------------+---------+-------+--------------------+------------------+
|      prime_genre|frequency|Percent|cumulative_frequency|cumulative_Percent|
+-----------------+---------+-------+--------------------+------------------+
|            GAMES|        7|   35.0|                   7|              35.0|
|        UTILITIES|        2|   10.0|                  15|              75.0|
|            MUSIC|        2|   10.0|                  15|              75.0|
|           TRAVEL|        2|   10.0|                  15|              75.0|
|          FINANCE|        2|   10.0|                  15|              75.0|
|          WEATHER|        1|    5.0|                  20|             100.0|
|SOCIAL NETWORKING|        1|    5.0|                  20|             100.0|
|        REFERENCE|        1|    5.0|                  20|             100.0|
|         SHOPPING|        1|    5.0|                  20|             100.0|
|     PRODUCTIVITY|        1|    5.0|                  20|      

In [27]:
# Create a dataframe showing count of null and not null values from a given column

from pyspark.sql.functions import isnan
null_values = df_transformed.filter(df_transformed.price.contains('None') | \
                                   df_transformed.price.contains('NULL') | \
                                   (col("price") == '') | \
                                   isnan(df_transformed.price) | \
                                   df_transformed.price.isNull()
                                   ).count()

Not_null_values = df_transformed.count() - null_values
data = [(null_values, Not_null_values)]
df_new = spark.createDataFrame(data,["Null_value","Not_Nul_values"])
df_new.show()

+----------+--------------+
|Null_value|Not_Nul_values|
+----------+--------------+
|         0|            20|
+----------+--------------+



In [28]:
# create two dataframes , one will have all the unique values and the other will have all the values which are dropped from the main dataframe
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number
w2 = Window.partitionBy('prime_genre').orderBy(col("user_rating").desc())
df_duplicate = df_transformed.withColumn("row",row_number().over(w2)).filter(col("row")>1).drop("row")
df_unique = df_transformed.withColumn("row",row_number().over(w2)).filter(col("row")==1).drop("row")
df_duplicate.show()
df_unique.show()

+----------+-----+-----------+-----------+---------+
|size_bytes|price|prime_genre|user_rating|price_new|
+----------+-----+-----------+-----------+---------+
| 179979264|  0.0|  UTILITIES|        3.5|     000.|
|  55153664| 4.99|      GAMES|        4.5|     04.9|
| 100788224| 3.99|      GAMES|        4.0|     03.9|
|  10485713| 0.99|      GAMES|        4.0|     00.9|
|  70023168| 3.99|      GAMES|        4.0|     03.9|
|  10735026| 2.99|      GAMES|        4.0|     02.9|
| 227547136| 7.99|      GAMES|        3.5|     07.9|
| 147093504|  0.0|      MUSIC|        4.0|     000.|
| 167407616|  0.0|     TRAVEL|        4.0|     000.|
| 160925696|  0.0|    FINANCE|        3.5|     000.|
+----------+-----+-----------+-----------+---------+

+----------+-----+-----------------+-----------+---------+
|size_bytes|price|      prime_genre|user_rating|price_new|
+----------+-----+-----------------+-----------+---------+
| 100524032|  0.0|          WEATHER|        3.5|     000.|
|  49250304| 9.99|   