In [90]:
import locale
import numpy as npa
import pandas as pd

import pytz
from datetime import datetime

In [91]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *

# Create (or reuse) the Spark session for this notebook under the
# application name 'ThExam'.
spark = (
    SparkSession.builder
    .appName('ThExam')
    .getOrCreate()
)

In [92]:
# pytz timezone for Asia/Bangkok; passed to datetime.now() to stamp the run time.
local_timezone = pytz.timezone('Asia/Bangkok')

In [93]:
# Capture the run time in the local timezone and pull out its date parts;
# year/month/day are later used to build the output path for failed checks.
execute_time = datetime.now(local_timezone)
year, month, day = execute_time.year, execute_time.month, execute_time.day

print(execute_time)
print("year:{} | month={} | day={}".format(year, month, day))

2024-03-02 20:57:12.291099+07:00
year:2024 | month=3 | day=2


In [94]:
# Table name; becomes the `table=` segment of the output path for failed checks.
TABLE = 'dailycheckins'

In [95]:
# Explicit schema for the check-ins CSV, declared as (column name, Spark type)
# pairs. Every column is nullable.
schema = StructType([
    StructField(name, dtype, True)
    for name, dtype in (
        ("user", StringType()),
        ("timestamp", TimestampType()),
        ("hours", FloatType()),
        ("project", StringType()),
    )
])

In [96]:
# Load the cleaned daily check-ins CSV files using the explicit schema;
# the first line of each file is treated as a header.
path = "../data/clean_dailycheckins/"
df = spark.read.csv(path, schema=schema, header=True)

In [97]:
# Print the loaded DataFrame's column names, types and nullability.
df.printSchema()

root
 |-- user: string (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- hours: float (nullable = true)
 |-- project: string (nullable = true)



In [98]:
# Preview up to 10 rows from a ~10% sample (no replacement, fixed seed),
# without truncating cell values.
df.sample(withReplacement=False, fraction=0.1, seed=0).show(n=10, truncate=False)

+--------+-------------------+-----+-----------+
|user    |timestamp          |hours|project    |
+--------+-------------------+-----+-----------+
|ned     |2019-09-26 00:00:00|4.0  |bizdev     |
|jaime   |2018-12-20 07:00:00|1.5  |project-00 |
|catelyn |2018-11-26 21:47:36|0.08 |security   |
|daenerys|2018-11-26 19:55:30|8.0  |blogideas  |
|jaime   |2018-11-26 19:11:58|0.5  |opsandadmin|
|ned     |2018-11-26 19:08:09|1.5  |transit    |
|jon     |2018-11-26 18:49:04|2.5  |project-25 |
|sansa   |2018-11-26 18:48:33|2.5  |project-51 |
|bran    |2018-11-26 07:00:00|4.37 |project-31 |
|catelyn |2018-11-23 20:12:16|1.1  |engineering|
+--------+-------------------+-----+-----------+
only showing top 10 rows



In [99]:
# Summary statistics (count / mean / stddev / min / max), converted to pandas for display.
df.describe().toPandas()

Unnamed: 0,summary,user,hours,project
0,count,20495,20500.0,20500
1,mean,,2.071274634053794,
2,stddev,,2.004774748229288,
3,min,arya,0.0,airflow
4,max,ygritte,30.0,workshops


In [100]:
def find_missing(df):
    """Count the null values in every column of a Spark DataFrame.

    Args:
        df: Spark DataFrame to inspect.

    Returns:
        dict mapping each column name (in ``df.columns`` order) to the number
        of rows in which that column is null. Empty dict when ``df`` has no
        columns.
    """
    if not df.columns:
        return {}

    # `col`, `count` and `when` come from the notebook's
    # `from pyspark.sql.functions import *`.
    # when() without otherwise() yields null for non-matching rows and count()
    # skips nulls, so each aggregate counts exactly the rows where the column
    # is null. A single select computes all counts in one pass over the data
    # instead of launching one count job per column.
    null_counts = df.select(
        [count(when(col(name).isNull(), 1)).alias(name) for name in df.columns]
    ).first()

    return {name: null_counts[name] for name in df.columns}

In [101]:
# Null count per column of the check-ins DataFrame, keyed by column name.
missing_dict = find_missing(df)

In [113]:
# Validation state, filled in by the loop over missing_dict:
# - has_failed_test is set to True when any column has a non-zero null count;
# - missing_data maps such a column name to a DataFrame of the rows where it is null.
has_failed_test = False
missing_data = {}

In [115]:
# For every column with at least one null, mark the validation as failed and
# keep a DataFrame of the rows where that column is null.
for key, value in missing_dict.items():
    if value != 0:
        has_failed_test = True
        # isNull() alone selects the missing rows: eqNullSafe(None) is the same
        # test, and isin([None]) never evaluates to true, so OR-ing them in
        # added nothing. Keys are already column-name strings, so no str() cast.
        missing_data[key] = df.filter(col(key).isNull())

In [116]:
# Columns that failed the null check, each mapped to a DataFrame of the offending rows.
missing_data

{'user': DataFrame[user: string, timestamp: timestamp, hours: float, project: string]}

In [117]:
# TODO (not implemented in this cell): update status, data and log of error file to Dynamo
# TODO (not implemented in this cell): send a notification via SNS or an API

# Persist the offending rows of each failing column as CSV (with header) under
# a Hive-style partition path. Each column gets its own `column=` subdirectory:
# writing every column to the same directory with mode('overwrite') would leave
# only the last column's rows on disk.
for key in missing_data:
    output_path = "../state=test/table={}/year={}/month={}/day={}/column={}/".format(
        TABLE, year, month, day, key
    )
    (
        missing_data[key]
        .write
        .option("header", "true")
        .mode('overwrite')
        .csv(output_path)
    )

In [118]:
# Display the validation flag: True when at least one column had a non-zero null count.
has_failed_test

True

In [123]:
# Preview up to 10 of the rows whose `user` is null, drawn from a 50% sample
# (no replacement, fixed seed), without truncating cell values.
missing_data['user'].sample(withReplacement=False, fraction=0.5, seed=0).show(n=10, truncate=False)

+----+-------------------+-----+----------+
|user|timestamp          |hours|project   |
+----+-------------------+-----+----------+
|null|2017-10-12 17:31:44|2.75 |project-47|
|null|2017-10-12 17:31:44|4.0  |bizdev    |
+----+-------------------+-----+----------+



In [75]:
# Spark smoke test: build a one-row DataFrame and display it.
# It is bound to `hello_df` rather than `df` so that running the notebook top
# to bottom does not replace the check-ins DataFrame with this toy frame.
data = [("Hello, World!",)]
hello_df = spark.createDataFrame(data, ["message"])

# Show the DataFrame
hello_df.show(truncate=False)

+-------------+
|message      |
+-------------+
|Hello, World!|
+-------------+

