# PySpark Basics

In [2]:
# Start a Spark session named 'pyspark', or reuse the one that is already running
from pyspark.sql import SparkSession
spark = (
    SparkSession.builder
    .appName('pyspark')
    .getOrCreate()
)

In [3]:
# Read the CSV with the reader's default options (no header argument is passed);
# the preview that follows shows the header line coming back as the first data row.
df=spark.read.csv('conversion_data.csv')

In [4]:
# Preview the first 5 rows: columns are auto-named _c0.._c5 and the header line appears as data
df.show(5)

+-------+---+--------+------+-------------------+---------+
|    _c0|_c1|     _c2|   _c3|                _c4|      _c5|
+-------+---+--------+------+-------------------+---------+
|country|age|new_user|source|total_pages_visited|converted|
|     UK| 25|       1|   Ads|                  1|        0|
|     US| 23|       1|   Seo|                  5|        0|
|     US| 28|       1|   Seo|                  4|        0|
|  China| 39|       1|   Seo|                  5|        0|
+-------+---+--------+------+-------------------+---------+
only showing top 5 rows



In [5]:
# Re-read the same file, this time with header=True so the first line supplies the column names
df=spark.read.csv('conversion_data.csv',header=True)

In [6]:
# Preview the first 5 rows again; the columns now carry the names taken from the header line
df.show(5)

+-------+---+--------+------+-------------------+---------+
|country|age|new_user|source|total_pages_visited|converted|
+-------+---+--------+------+-------------------+---------+
|     UK| 25|       1|   Ads|                  1|        0|
|     US| 23|       1|   Seo|                  5|        0|
|     US| 28|       1|   Seo|                  4|        0|
|  China| 39|       1|   Seo|                  5|        0|
|     US| 30|       1|   Seo|                  6|        0|
+-------+---+--------+------+-------------------+---------+
only showing top 5 rows



In [7]:
# Print each column's name, type and nullability; every column is reported as string here
df.printSchema()

root
 |-- country: string (nullable = true)
 |-- age: string (nullable = true)
 |-- new_user: string (nullable = true)
 |-- source: string (nullable = true)
 |-- total_pages_visited: string (nullable = true)
 |-- converted: string (nullable = true)



In [10]:
# Summary statistics (count, mean, stddev, min, max) for every column, not only the numeric ones.
# NOTE(review): printSchema() reports all columns as strings, and min/max for age come out as
# '111' / '79', which looks like string ordering rather than numeric ordering — confirm before
# relying on those two rows.
df.describe().show()

+-------+-------+------------------+-------------------+------+-------------------+-------------------+
|summary|country|               age|           new_user|source|total_pages_visited|          converted|
+-------+-------+------------------+-------------------+------+-------------------+-------------------+
|  count| 316200|            316200|             316200|316200|             316200|             316200|
|   mean|   null|30.569857685009488| 0.6854648956356736|  null|  4.872966476913346|0.03225806451612903|
| stddev|   null| 8.271801801807728|0.46433119036384723|  null|  3.341103757948214|0.17668497535763514|
|    min|  China|               111|                  0|   Ads|                  1|                  0|
|    max|     US|                79|                  1|   Seo|                  9|                  1|
+-------+-------+------------------+-------------------+------+-------------------+-------------------+



In [11]:
# Indexing the DataFrame by column name returns a Column object, not the column's data
df['country']

Column<b'country'>

In [12]:
# Check the type of the object returned by indexing the DataFrame with a column name
type(df['country'])

pyspark.sql.column.Column

In [16]:
# To see a column's contents, select it and show the first 5 rows
df.select('country').show(5)

+-------+
|country|
+-------+
|     UK|
|     US|
|     US|
|  China|
|     US|
+-------+
only showing top 5 rows



In [17]:
# Access several columns at once and display the first 5 rows
df.select('country', 'source').show(n=5)

+-------+------+
|country|source|
+-------+------+
|     UK|   Ads|
|     US|   Seo|
|     US|   Seo|
|  China|   Seo|
|     US|   Seo|
+-------+------+
only showing top 5 rows



### Add or remove a column

#### Using a UDF (user-defined function)

In [21]:
from pyspark.sql.types import StringType
from pyspark.sql.functions import udf

def country_udf(country):
    """Translate a raw country value into its replacement label.

    'UK', 'US', 'China' and 'Germany' are swapped for fixed labels; any other
    value is handed back unchanged.
    """
    # NOTE(review): 'China' -> 'Asia' and the spelling 'Deustche' are kept
    # exactly as originally written — confirm they are intended.
    labels = {
        'UK': 'Britain',
        'US': 'USA',
        'China': 'Asia',
        'Germany': 'Deustche',
    }
    return labels.get(country, country)
        
# Wrap the plain Python function as a Spark UDF whose return type is string
spark_udf = udf(country_udf, returnType=StringType())

# Apply the UDF to the country column and attach the result as "country_new"
df = df.withColumn("country_new", spark_udf(df["country"]))

In [22]:
# Preview 10 rows, now including the country_new column
df.show(10)

+-------+---+--------+------+-------------------+---------+-----------+
|country|age|new_user|source|total_pages_visited|converted|country_new|
+-------+---+--------+------+-------------------+---------+-----------+
|     UK| 25|       1|   Ads|                  1|        0|    Britain|
|     US| 23|       1|   Seo|                  5|        0|        USA|
|     US| 28|       1|   Seo|                  4|        0|        USA|
|  China| 39|       1|   Seo|                  5|        0|       Asia|
|     US| 30|       1|   Seo|                  6|        0|        USA|
|     US| 31|       0|   Seo|                  1|        0|        USA|
|  China| 27|       1|   Seo|                  4|        0|       Asia|
|     US| 23|       0|   Ads|                  4|        0|        USA|
|     UK| 29|       0|Direct|                  4|        0|    Britain|
|     US| 25|       0|   Ads|                  2|        0|        USA|
+-------+---+--------+------+-------------------+---------+-----------+

#### Without using a UDF

In [26]:
# Derive a new_age column holding age + 2.
# NOTE(review): the preview shows values such as 27.0, presumably because age was
# read as a string and is implicitly converted for the arithmetic — confirm.
df = df.withColumn('new_age', df.age + 2)

In [27]:
# Preview 10 rows, now including the new_age column
df.show(10)

+-------+---+--------+------+-------------------+---------+-----------+-------+
|country|age|new_user|source|total_pages_visited|converted|country_new|new_age|
+-------+---+--------+------+-------------------+---------+-----------+-------+
|     UK| 25|       1|   Ads|                  1|        0|    Britain|   27.0|
|     US| 23|       1|   Seo|                  5|        0|        USA|   25.0|
|     US| 28|       1|   Seo|                  4|        0|        USA|   30.0|
|  China| 39|       1|   Seo|                  5|        0|       Asia|   41.0|
|     US| 30|       1|   Seo|                  6|        0|        USA|   32.0|
|     US| 31|       0|   Seo|                  1|        0|        USA|   33.0|
|  China| 27|       1|   Seo|                  4|        0|       Asia|   29.0|
|     US| 23|       0|   Ads|                  4|        0|        USA|   25.0|
|     UK| 29|       0|Direct|                  4|        0|    Britain|   31.0|
|     US| 25|       0|   Ads|           