In [33]:
from pyspark.sql import SparkSession
import os
os.environ['PYARROW_IGNORE_TIMEZONE'] = '1'

import pyspark.pandas as ps
import pandas as pd

In [34]:
session = SparkSession.builder\
    .appName('pandas')\
    .config('spark.sql.ansi.enabled', 'false')\
    .config('PYARROW_IGNORE_TIMEZONE', '1')\
    .getOrCreate()

### Creating dataframe using pandas-on-spark

In [45]:
ps_df = ps.DataFrame({
    'name': ['Ruba', 'Ramanan', 'Thayalan', 'Others'],
    'age': [56, 57, 28, 22],
    'address': ['PPT', 'PPT', 'PPT', 'PPT'],
    'salary': [500000, 300000, 400000, 800000]
})

## adding 1 for age column
ps_df['age'] = ps_df['age'] + 1

ps_df.head()

Unnamed: 0,name,age,address,salary
0,Ruba,57,PPT,500000
1,Ramanan,58,PPT,300000
2,Thayalan,29,PPT,400000
3,Others,23,PPT,800000


In [46]:
ps_df.describe()

                                                                                

Unnamed: 0,age,salary
count,4.0,4.0
mean,41.75,500000.0
std,18.35529,216024.689947
min,23.0,300000.0
25%,23.0,300000.0
50%,29.0,400000.0
75%,57.0,500000.0
max,58.0,800000.0


In [37]:
ps_df.age.mean()

41.75

In [38]:
ps_df['salary_after_increment'] = ps_df['salary'] * 1.1
ps_df.head()

Unnamed: 0,Name,Age,address,salary,salary_after_increment
0,Ruba,57,PPT,500000,550000.0
1,Ramanan,58,PPT,300000,330000.0
2,Thayalan,29,PPT,400000,440000.0
3,Others,23,PPT,800000,880000.0


In [39]:
# elder than 30

filtered_df = ps_df[ps_df.age>30]
filtered_df.head()

Unnamed: 0,Name,Age,address,salary,salary_after_increment
0,Ruba,57,PPT,500000,550000.0
1,Ramanan,58,PPT,300000,330000.0


#### Do transformation for element-wise operations

In [47]:
ps_df['age_add'] = ps_df['age'].transform(lambda x: x+10)
ps_df.head()

                                                                                

Unnamed: 0,name,age,address,salary,age_add
0,Ruba,57,PPT,500000,67
1,Ramanan,58,PPT,300000,68
2,Thayalan,29,PPT,400000,39
3,Others,23,PPT,800000,33


In [48]:
def categorize_salary(salary):
    if salary >=500000:
        return "High"
    if salary > 300000:
        return "Medium"
    return "Low"

#### Do apply function for column

In [49]:
ps_df['salary_cat'] = ps_df.salary.apply(categorize_salary)
ps_df.head()

                                                                                

Unnamed: 0,name,age,address,salary,age_add,salary_cat
0,Ruba,57,PPT,500000,67,High
1,Ramanan,58,PPT,300000,68,Low
2,Thayalan,29,PPT,400000,39,Medium
3,Others,23,PPT,800000,33,High


In [53]:
def cat_salary(row):
    return categorize_salary(row['salary'])

#### Do apply function for column

In [54]:
ps_df['salary_catry'] = ps_df.apply(cat_salary, axis=1)
ps_df.head()



ValueError: Cannot combine the series or dataframe because it comes from a different dataframe. In order to allow this operation, enable 'compute.ops_on_diff_frames' option.

#### Convert to pandas-on-spark datafrom to spark dataframe

In [29]:
spark_df = filtered_df.to_spark()
spark_df.show()



+-------+---+-------+------+----------------------+
|   Name|Age|address|salary|salary_after_increment|
+-------+---+-------+------+----------------------+
|   Ruba| 57|    PPT|500000|              550000.0|
|Ramanan| 58|    PPT|300000|              330000.0|
+-------+---+-------+------+----------------------+



#### Convert back to psark dataframe to pandas-on-spark dataframe

In [30]:
df_c = ps.DataFrame(spark_df)
df_c.head()

Unnamed: 0,Name,Age,address,salary,salary_after_increment
0,Ruba,57,PPT,500000,550000.0
1,Ramanan,58,PPT,300000,330000.0


In [31]:
session.stop()