In [0]:
from pyspark.sql import SparkSession

In [0]:
# Obtain the Spark session for this notebook (an already-running session is
# reused by getOrCreate), registered under the app name 'data_processing'.
spark = (
    SparkSession.builder
    .appName('data_processing')
    .getOrCreate()
)

In [0]:
# Location of the input CSV, kept in one named constant so the notebook can be
# pointed at a different file by editing a single line.
DATA_PATH = 'dbfs:/FileStore/shared_uploads/harsh.sce21@sot.pdpu.ac.in/file.csv'

# header=True takes the column names from the first row of the file;
# inferSchema=True lets Spark derive each column's type from the data.
df = spark.read.csv(DATA_PATH, header=True, inferSchema=True)

In [0]:
# List the column names of the loaded DataFrame.
df.schema.names

Out[33]: ['ratings', 'age', 'experience', 'family', 'mobile']

In [0]:
# Preview the data: first 20 rows, long cell values truncated (the defaults).
df.show(n=20, truncate=True)

+-------+---+----------+-------+------------+
|ratings|age|experience| family|      mobile|
+-------+---+----------+-------+------------+
|    4.5| 30|         5|married|      iPhone|
|    3.8| 25|         3| single|     Android|
|    5.0| 35|         8|married|     Samsung|
|    4.2| 28|         6| single|Google Pixel|
|    4.7| 32|         7|married|      iPhone|
|    3.5| 22|         2| single|     OnePlus|
|    4.0| 27|         4|married|Google Pixel|
|    4.8| 33|         9| single|      iPhone|
|    3.9| 26|         5|married|     Samsung|
|    4.4| 29|         6| single|     Android|
|    4.6| 31|         8|married|     OnePlus|
|    3.7| 24|         3| single|Google Pixel|
|    4.9| 34|         7|married|      iPhone|
|    3.6| 23|         4| single|     Samsung|
|    4.3| 28|         6|married|     Android|
|    4.1| 30|         5| single|     OnePlus|
|    3.8| 25|         3|married|Google Pixel|
|    4.5| 32|         7| single|      iPhone|
|    4.2| 27|         4|married|  

In [0]:
# Number of columns in the DataFrame.
len(df.schema.fields)

Out[35]: 5

In [0]:
# Number of rows in the DataFrame.
df.count()

Out[36]: 30

In [0]:
# Print the DataFrame's shape as "<column count> <row count>".
print(f"{len(df.columns)} {df.count()}")

5 30


In [0]:
# Print each column's name, inferred type and nullability as a tree.
df.printSchema()

root
 |-- ratings: double (nullable = true)
 |-- age: integer (nullable = true)
 |-- experience: integer (nullable = true)
 |-- family: string (nullable = true)
 |-- mobile: string (nullable = true)



In [0]:
# Project only the age and mobile columns and display the first 4 rows.
df.select(['age', 'mobile']).show(n=4)

+---+------------+
|age|      mobile|
+---+------------+
| 30|      iPhone|
| 25|     Android|
| 35|     Samsung|
| 28|Google Pixel|
+---+------------+
only showing top 4 rows



In [0]:
# Summary statistics (count, mean, stddev, min, max) for every column;
# mean and stddev are null for the string columns.
df.describe().show()

+-------+------------------+------------------+-----------------+-------+-------+
|summary|           ratings|               age|       experience| family| mobile|
+-------+------------------+------------------+-----------------+-------+-------+
|  count|                30|                30|               30|     30|     30|
|   mean|               4.2|28.533333333333335|5.466666666666667|   null|   null|
| stddev|0.4510524092187837|3.8122609214655463|1.995397001952819|   null|   null|
|    min|               3.4|                22|                2|married|Android|
|    max|               5.0|                35|                9| single| iPhone|
+-------+------------------+------------------+-----------------+-------+-------+



In [0]:
from pyspark.sql.types import StringType, DoubleType, IntegerType

In [0]:
# Derive a new column holding each person's age ten years from now.
# withColumn returns a new DataFrame; df itself is left unchanged.
df.withColumn("age_after_10_years", df.age + 10).show(n=10, truncate=False)

+-------+---+----------+-------+------------+------------------+
|ratings|age|experience|family |mobile      |age_after_10_years|
+-------+---+----------+-------+------------+------------------+
|4.5    |30 |5         |married|iPhone      |40                |
|3.8    |25 |3         |single |Android     |35                |
|5.0    |35 |8         |married|Samsung     |45                |
|4.2    |28 |6         |single |Google Pixel|38                |
|4.7    |32 |7         |married|iPhone      |42                |
|3.5    |22 |2         |single |OnePlus     |32                |
|4.0    |27 |4         |married|Google Pixel|37                |
|4.8    |33 |9         |single |iPhone      |43                |
|3.9    |26 |5         |married|Samsung     |36                |
|4.4    |29 |6         |single |Android     |39                |
+-------+---+----------+-------+------------+------------------+
only showing top 10 rows



In [0]:
# Add a copy of the integer age column cast to double precision.
df.withColumn("age_double", df.age.cast(DoubleType())).show(n=10, truncate=False)

+-------+---+----------+-------+------------+----------+
|ratings|age|experience|family |mobile      |age_double|
+-------+---+----------+-------+------------+----------+
|4.5    |30 |5         |married|iPhone      |30.0      |
|3.8    |25 |3         |single |Android     |25.0      |
|5.0    |35 |8         |married|Samsung     |35.0      |
|4.2    |28 |6         |single |Google Pixel|28.0      |
|4.7    |32 |7         |married|iPhone      |32.0      |
|3.5    |22 |2         |single |OnePlus     |22.0      |
|4.0    |27 |4         |married|Google Pixel|27.0      |
|4.8    |33 |9         |single |iPhone      |33.0      |
|3.9    |26 |5         |married|Samsung     |26.0      |
|4.4    |29 |6         |single |Android     |29.0      |
+-------+---+----------+-------+------------+----------+
only showing top 10 rows



In [0]:
# Keep only the rows whose mobile is exactly 'iPhone'.
df.where(df.mobile == 'iPhone').show()

+-------+---+----------+-------+------+
|ratings|age|experience| family|mobile|
+-------+---+----------+-------+------+
|    4.5| 30|         5|married|iPhone|
|    4.7| 32|         7|married|iPhone|
|    4.8| 33|         9| single|iPhone|
|    4.9| 34|         7|married|iPhone|
|    4.5| 32|         7| single|iPhone|
|    4.0| 26|         5| single|iPhone|
|    4.4| 28|         6|married|iPhone|
+-------+---+----------+-------+------+



In [0]:
# For iPhone users only, show just the age and family columns.
df.where(df.mobile == 'iPhone').select(['age', 'family']).show()

+---+-------+
|age| family|
+---+-------+
| 30|married|
| 32|married|
| 33| single|
| 34|married|
| 32| single|
| 26| single|
| 28|married|
+---+-------+



In [0]:
# Apply two conditions by chaining filters: iPhone users, then those with
# more than 5 years of experience.
df.where(df.mobile == 'iPhone').where(df.experience > 5).show()

+-------+---+----------+-------+------+
|ratings|age|experience| family|mobile|
+-------+---+----------+-------+------+
|    4.7| 32|         7|married|iPhone|
|    4.8| 33|         9| single|iPhone|
|    4.9| 34|         7|married|iPhone|
|    4.5| 32|         7| single|iPhone|
|    4.4| 28|         6|married|iPhone|
+-------+---+----------+-------+------+



In [0]:
# Apply both conditions in a single filter() call by joining the two Column
# predicates with &. Each comparison is wrapped in its own parentheses because
# & binds more tightly than == and > in Python.
df.filter((df['mobile'] == 'iPhone') & (df['experience'] > 5)).show()

In [0]:
# List each mobile value that occurs in the data exactly once.
df.select(df.mobile).distinct().show()

+------------+
|      mobile|
+------------+
|      iPhone|
|     Samsung|
|     OnePlus|
|     Android|
|Google Pixel|
+------------+



In [0]:
# Number of rows for each mobile value.
df.groupby(df.mobile).count().show(n=5, truncate=False)

+------------+-----+
|mobile      |count|
+------------+-----+
|iPhone      |7    |
|Samsung     |6    |
|OnePlus     |5    |
|Android     |6    |
|Google Pixel|6    |
+------------+-----+



In [0]:
# Number of rows for each mobile value, largest group first.
df.groupby('mobile').count().sort('count', ascending=False).show(n=5, truncate=False)

+------------+-----+
|mobile      |count|
+------------+-----+
|iPhone      |7    |
|Samsung     |6    |
|Android     |6    |
|Google Pixel|6    |
|OnePlus     |5    |
+------------+-----+



In [0]:
# Per-mobile totals of every numeric column (ratings, age, experience).
df.groupby('mobile').sum().show(n=5, truncate=False)

+------------+------------------+--------+---------------+
|mobile      |sum(ratings)      |sum(age)|sum(experience)|
+------------+------------------+--------+---------------+
|iPhone      |31.799999999999997|215     |46             |
|Samsung     |24.199999999999996|165     |28             |
|OnePlus     |20.499999999999996|138     |26             |
|Android     |24.8              |171     |34             |
|Google Pixel|24.7              |167     |30             |
+------------+------------------+--------+---------------+



In [0]:
# Per-mobile averages of every numeric column (ratings, age, experience).
df.groupby('mobile').avg().show(n=5, truncate=False)

+------------+-----------------+------------------+-----------------+
|mobile      |avg(ratings)     |avg(age)          |avg(experience)  |
+------------+-----------------+------------------+-----------------+
|iPhone      |4.542857142857143|30.714285714285715|6.571428571428571|
|Samsung     |4.033333333333332|27.5              |4.666666666666667|
|OnePlus     |4.1              |27.6              |5.2              |
|Android     |4.133333333333334|28.5              |5.666666666666667|
|Google Pixel|4.116666666666666|27.833333333333332|5.0              |
+------------+-----------------+------------------+-----------------+



In [0]:
# Per-mobile minimum of every numeric column (ratings, age, experience).
df.groupby('mobile').min().show(n=5, truncate=False)

+------------+------------+--------+---------------+
|mobile      |min(ratings)|min(age)|min(experience)|
+------------+------------+--------+---------------+
|iPhone      |4.0         |26      |5              |
|Samsung     |3.4         |23      |2              |
|OnePlus     |3.5         |22      |2              |
|Android     |3.6         |25      |3              |
|Google Pixel|3.7         |24      |3              |
+------------+------------+--------+---------------+



In [0]:
# Per-mobile maximum of every numeric column (ratings, age, experience).
df.groupby('mobile').max().show(n=5, truncate=False)

+------------+------------+--------+---------------+
|mobile      |max(ratings)|max(age)|max(experience)|
+------------+------------+--------+---------------+
|iPhone      |4.9         |34      |9              |
|Samsung     |5.0         |35      |8              |
|OnePlus     |4.6         |33      |8              |
|Android     |4.8         |35      |9              |
|Google Pixel|4.7         |32      |7              |
+------------+------------+--------+---------------+



In [0]:
# Total years of experience for each mobile value.
df.groupby('mobile').sum('experience').show(n=5, truncate=False)

+------------+---------------+
|mobile      |sum(experience)|
+------------+---------------+
|iPhone      |46             |
|Samsung     |28             |
|OnePlus     |26             |
|Android     |34             |
|Google Pixel|30             |
+------------+---------------+

