In [1]:
import pyspark
from pyspark.sql import Row
from pyspark.sql import SparkSession
import pandas as pd

In [2]:
# Record the installed PySpark version alongside the notebook output.
print(pyspark.__version__)

3.3.2


In [3]:
# Create the Spark session (or reuse one that is already running) for this notebook.
spark = (
    SparkSession.builder
    .appName("example2")
    .getOrCreate()
)

In [4]:
# Load the CSV into a Spark DataFrame: header=True takes the column names from
# the first row, inferSchema=True lets Spark infer the column types from the data.
# NOTE(review): this is an absolute, machine-specific path; the notebook will not
# run on another machine without editing it.
path = r"C:\Users\Admin\Downloads\result.csv"
df = spark.read.csv(path, header=True, inferSchema=True)
# pandas-style item assignment does not work on a Spark DataFrame, which is why
# the next cell uses withColumn instead of: df['x'] = df['FolderName']

In [5]:
# Add column 'x' as a copy of 'FolderName' (withColumn returns a new DataFrame).
df = df.withColumn("x", df.FolderName)

In [6]:
# Print the first rows of df to check that column 'x' mirrors 'FolderName'.
df.show()

+--------------+--------+--------------+
|    FolderName|RULE_OCR|             x|
+--------------+--------+--------------+
|BIDV0047188303|       0|BIDV0047188303|
|BIDV0059278463|       0|BIDV0059278463|
|BIDV0064071491|       0|BIDV0064071491|
|BIDV0118393062|       1|BIDV0118393062|
|BIDV0148809380|       0|BIDV0148809380|
|BIDV0149952476|       0|BIDV0149952476|
|BIDV0151606912|       0|BIDV0151606912|
|BIDV0169855511|       0|BIDV0169855511|
|BIDV0174373890|       0|BIDV0174373890|
|BIDV0175874531|       0|BIDV0175874531|
|BIDV0176336562|       0|BIDV0176336562|
|BIDV0201400044|       1|BIDV0201400044|
|BIDV0258028258|       0|BIDV0258028258|
|BIDV0277818578|       0|BIDV0277818578|
|BIDV0283350889|       0|BIDV0283350889|
|BIDV0297175289|       1|BIDV0297175289|
|BIDV0389631253|       0|BIDV0389631253|
|BIDV0399860941|       0|BIDV0399860941|
|BIDV0416341762|       0|BIDV0416341762|
|BIDV0432482374|       0|BIDV0432482374|
+--------------+--------+--------------+
only showing top 20 rows

In [47]:
# Keep only the rows whose 'x' equals the given folder name and print them;
# the column is referenced through the DataFrame itself (df['x']).
df.filter(df['x'] == 'BIDV0047188303').show()

+--------------+--------+--------------+
|    FolderName|RULE_OCR|             x|
+--------------+--------+--------------+
|BIDV0047188303|       0|BIDV0047188303|
+--------------+--------+--------------+



In [46]:
from pyspark.sql.functions import col
# Same filter as the df['x'] version, written with col('x'): the column is
# referenced by name only, without going through the DataFrame variable.
df.filter(col('x') == 'BIDV0047188303').show()

+--------------+--------+--------------+
|    FolderName|RULE_OCR|             x|
+--------------+--------+--------------+
|BIDV0047188303|       0|BIDV0047188303|
+--------------+--------+--------------+



In [56]:
def fun_keep_vnd_currency(df, cus_id_col, currency_col, currency_value):
    '''Return the rows of ``df`` whose currency equals ``currency_value``.

    As a side effect, prints the number of distinct customers that have no
    transaction at all in ``currency_value`` (they only transacted in other
    currencies). Computing that number triggers a Spark job.

    Args:
        df: Spark DataFrame of transactions.
        cus_id_col: Name of the customer-id column.
        currency_col: Name of the currency column.
        currency_value: Currency to keep, e.g. ``'VND'``.

    Returns:
        A DataFrame containing only the rows where ``currency_col`` equals
        ``currency_value``.
    '''

    # Rows transacted in the requested currency.
    tbl = df.filter(df[currency_col] == currency_value)
    # Customers with at least one transaction in the requested currency.
    # This must come from the filtered `tbl`: taking it from `df` lists every
    # customer, which makes the anti-join below empty and the count always 0.
    lst = tbl.select(cus_id_col).distinct()
    # Rows of customers that are NOT in the list above. Joining by column name
    # avoids comparing a column with itself, since `lst` is derived from `df`.
    tbl1 = df.join(lst, on=cus_id_col, how="left_anti")
    # No persist(): tbl1 is used for a single count, so caching it would only
    # hold executor memory that nothing ever releases.
    print('Number of customers made transactions in others currencies: ', tbl1.select(cus_id_col).distinct().count())

    return tbl

In [7]:
# Work on a two-column projection of df.
df1 = df.select('FolderName', 'x')

In [10]:
# NOTE(review): the result is not assigned, so df1 is unchanged and the cell only
# echoes the schema of the (lazy) result.
# fillna(0) with a numeric value only fills numeric columns; both columns here are
# strings (see the echoed schema), so no nulls are actually replaced.
df1.fillna(0).orderBy('x')

DataFrame[FolderName: string, x: string]

In [23]:
# Dropping both columns yields a DataFrame with no columns; df1 is not reassigned.
df1.drop('FolderName', 'x')

DataFrame[]

In [12]:
from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

def func(x):
    """Flag values that are at most one character/element long.

    Args:
        x: Any sized value (a string when used as a UDF on column 'x'),
            or None for a SQL NULL.

    Returns:
        0 when ``len(x) > 1``, 1 when ``len(x) <= 1``, and None when ``x`` is
        None so that NULL inputs stay NULL in the output column.
    """
    if x is None:
        # Spark hands SQL NULL to a Python UDF as None; len(None) would raise
        # TypeError and fail the whole job.
        return None
    return 0 if len(x) > 1 else 1

# Register `func` as a PySpark UDF that returns an integer
udf_aaa = udf(func, IntegerType())

# Apply the UDF to column 'x' and store the result in column 'lenX'
df1 = df1.withColumn('lenX', udf_aaa(df1['x']))

In [13]:
# Print the first rows of df1 to inspect the new 'lenX' column.
df1.show()

+--------------+--------------+----+
|    FolderName|             x|lenX|
+--------------+--------------+----+
|BIDV0047188303|BIDV0047188303|   0|
|BIDV0059278463|BIDV0059278463|   0|
|BIDV0064071491|BIDV0064071491|   0|
|BIDV0118393062|BIDV0118393062|   0|
|BIDV0148809380|BIDV0148809380|   0|
|BIDV0149952476|BIDV0149952476|   0|
|BIDV0151606912|BIDV0151606912|   0|
|BIDV0169855511|BIDV0169855511|   0|
|BIDV0174373890|BIDV0174373890|   0|
|BIDV0175874531|BIDV0175874531|   0|
|BIDV0176336562|BIDV0176336562|   0|
|BIDV0201400044|BIDV0201400044|   0|
|BIDV0258028258|BIDV0258028258|   0|
|BIDV0277818578|BIDV0277818578|   0|
|BIDV0283350889|BIDV0283350889|   0|
|BIDV0297175289|BIDV0297175289|   0|
|BIDV0389631253|BIDV0389631253|   0|
|BIDV0399860941|BIDV0399860941|   0|
|BIDV0416341762|BIDV0416341762|   0|
|BIDV0432482374|BIDV0432482374|   0|
+--------------+--------------+----+
only showing top 20 rows



In [32]:
from pyspark.sql.functions import col, when
# Keep 'x' when it is one of the known marital statuses, otherwise label it 'OTHER'.
df1 = df1.withColumn(
    'MARITAL_GROUP',
    when(col('x').isin('MARRIED', 'SINGLE'), col('x')).otherwise('OTHER'),
)

In [49]:
# Print the first rows of df1 to inspect the new 'MARITAL_GROUP' column.
df1.show()

+--------------+--------------+-------------+
|    FolderName|             x|MARITAL_GROUP|
+--------------+--------------+-------------+
|BIDV0047188303|BIDV0047188303|        OTHER|
|BIDV0059278463|BIDV0059278463|        OTHER|
|BIDV0064071491|BIDV0064071491|        OTHER|
|BIDV0118393062|BIDV0118393062|        OTHER|
|BIDV0148809380|BIDV0148809380|        OTHER|
|BIDV0149952476|BIDV0149952476|        OTHER|
|BIDV0151606912|BIDV0151606912|        OTHER|
|BIDV0169855511|BIDV0169855511|        OTHER|
|BIDV0174373890|BIDV0174373890|        OTHER|
|BIDV0175874531|BIDV0175874531|        OTHER|
|BIDV0176336562|BIDV0176336562|        OTHER|
|BIDV0201400044|BIDV0201400044|        OTHER|
|BIDV0258028258|BIDV0258028258|        OTHER|
|BIDV0277818578|BIDV0277818578|        OTHER|
|BIDV0283350889|BIDV0283350889|        OTHER|
|BIDV0297175289|BIDV0297175289|        OTHER|
|BIDV0389631253|BIDV0389631253|        OTHER|
|BIDV0399860941|BIDV0399860941|        OTHER|
|BIDV0416341762|BIDV0416341762|   