# 1. Data Validation

In [None]:
# Basic imports

import json
from pathlib import Path
import pyspark
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from dotenv import load_dotenv
load_dotenv('.env')
import os
import plotly.graph_objects as go

In [5]:
# Build SparkSession
spark = SparkSession.builder.appName("DataPreparation").getOrCreate()

# Root folder of the input files, taken from the BASE_PATH environment variable
# (the imports cell calls load_dotenv('.env'), so it may also come from that file).
base_path = os.getenv('BASE_PATH')
if not base_path:
    # Fail fast: with a missing value every path below would be built as 'None/<file>'.
    raise RuntimeError("BASE_PATH is not set; define it in the environment or in the .env file")

## Data Ingestion

In [None]:
# Data to read - offers.csv
data_dir_offers = f'{base_path}/offers.csv.gz'
data_file_offers = data_dir_offers

! head $data_file_offers

# Data to read - sampleSubmission.csv
data_dir_sampleSubmission = f'{base_path}/sampleSubmission.csv.gz'
data_file_sampleSubmission = data_dir_sampleSubmission

! head $data_file_sampleSubmission

# Data to read - testHistory.csv
data_dir_testHistory = f'{base_path}/testHistory.csv.gz'
data_file_testHistory = data_dir_testHistory

! head $data_file_testHistory

# Data to read - trainHistory.csv
data_dir_trainHistory = f'{base_path}/trainHistory.csv.gz'
data_file_trainHistory = data_dir_trainHistory

! head $data_file_trainHistory

# Data to read - transactions.csv
data_dir_transactions = f'{base_path}/transactions.csv.gz'
data_file_transactions = data_dir_transactions

! head $data_file_transactions;

OFFERS

In [None]:
# Reading data - offers.csv
# First line is the header, fields are comma-separated, column types are inferred.
df_offers = (
    spark.read
    .options(header=True, sep=',', inferSchema=True)
    .csv(data_file_offers)
)

In [None]:
# Checking data that has been read - offers.csv
# Row count first, then the schema and the first 10 rows with untruncated values.
n_rows_offers = df_offers.count()
print(f'df_offers - number of rows: {n_rows_offers}')
df_offers.printSchema()
df_offers.show(n=10, truncate=False)

TESTHISTORY

In [None]:
# Reading data - testHistory.csv
# First line is the header, fields are comma-separated, column types are inferred.
df_testHistory = (
    spark.read
    .options(header=True, sep=',', inferSchema=True)
    .csv(data_file_testHistory)
)

In [None]:
# Checking data that has been read - testHistory.csv
# Row count first, then the schema and the first 10 rows with untruncated values.
n_rows_testHistory = df_testHistory.count()
print(f'df_testHistory - number of rows: {n_rows_testHistory}')
df_testHistory.printSchema()
df_testHistory.show(n=10, truncate=False)

TRAINHISTORY

In [None]:
# Reading data - trainHistory.csv
# First line is the header, fields are comma-separated, column types are inferred.
df_trainHistory = (
    spark.read
    .options(header=True, sep=',', inferSchema=True)
    .csv(data_file_trainHistory)
)

In [None]:
# Checking data that has been read - trainHistory.csv
# Row count first, then the schema and the first 10 rows with untruncated values.
n_rows_trainHistory = df_trainHistory.count()
print(f'df_trainHistory - number of rows: {n_rows_trainHistory}')
df_trainHistory.printSchema()
df_trainHistory.show(n=10, truncate=False)

TRANSACTIONS

In [None]:
# Reading data - transactions.csv
# Only a seeded random sample (fraction 0.001, seed 42) is kept, capped at 1,000,000 rows;
# every later cell works on this sample rather than on the full file.
df_transactions = (
    spark.read
    .options(header=True, sep=',', inferSchema=True)
    .csv(data_file_transactions)
    .sample(fraction=0.001, seed=42)
    .limit(1000000)
)

In [None]:
# Checking a sample of the transactions data
# Row count first, then the schema and the first 10 rows with untruncated values.
n_rows_transactions = df_transactions.count()
print(f'df_transactions - number of rows: {n_rows_transactions}')
df_transactions.printSchema()
df_transactions.show(n=10, truncate=False)

## Data Validation and Profiling

**Checking for duplicates and nulls**

In [None]:
# Duplicate check: for every DataFrame, compare the row count with the row count
# that would remain after dropDuplicates(). Nothing is modified.
# One loop replaces four copy-pasted print statements; the printed text is unchanged.
_frames_to_check = {
    'df_offers': df_offers,                # offers.csv
    'df_testHistory': df_testHistory,      # testHistory.csv
    'df_trainHistory': df_trainHistory,    # trainHistory.csv
    'df_transactions': df_transactions,    # transactions.csv (sampled)
}

for _frame_name, _frame in _frames_to_check.items():
    _total_rows = _frame.count()
    _distinct_rows = _frame.dropDuplicates().count()
    print(
        f'{_frame_name} - number of rows is {_total_rows}; '
        f'after dropDuplicates() applied would be {_distinct_rows}.'
    )

In [None]:
# Null check: for every DataFrame, report how many rows would remain if each row
# containing at least one null value were dropped. Nothing is modified.
# One loop replaces four copy-pasted print statements; the printed text is unchanged.
_frames_to_check = {
    'df_offers': df_offers,                # offers.csv
    'df_testHistory': df_testHistory,      # testHistory.csv
    'df_trainHistory': df_trainHistory,    # trainHistory.csv
    'df_transactions': df_transactions,    # transactions.csv (sampled)
}

for _frame_name, _frame in _frames_to_check.items():
    _rows_without_nulls = _frame.dropna(how='any').count()
    print(f"{_frame_name} - number of rows after dropna(how='any') applied would be {_rows_without_nulls}.")

**Generating HTML files with a full profiling report for each dataset**

OFFERS

In [None]:
# offers.csv
from ydata_profiling import ProfileReport

profile_title_offers = 'offers.csv'

# All three correlation measures are switched off; type inference, interactions and
# missing-value diagrams are disabled through the remaining keyword arguments.
# NOTE(review): df_offers is passed as a Spark DataFrame here, while the other datasets
# are converted with .toPandas() before profiling - confirm the difference is intended.
correlations_disabled = {
    measure: {"calculate": False}
    for measure in ("auto", "pearson", "spearman")
}

profile_report = ProfileReport(
    df_offers,
    title=profile_title_offers,
    infer_dtypes=False,
    interactions=None,
    missing_diagrams=None,
    correlations=correlations_disabled,
)

In [None]:
# offers.csv
# Write the report as 'profile-<title>.html' into the folder that holds the input file.
# data_dir_offers is the full path of offers.csv.gz, so the report name is built from its
# parent folder; appending to the path itself gave 'offers.csv.gzprofile-offers.csv.html'.
# profile_report is whichever report was built last, so run this right after the
# offers ProfileReport cell.
profile_report_file = str(Path(data_dir_offers).parent / f'profile-{profile_title_offers}.html')
profile_report.to_file(Path(profile_report_file))
profile_report_file

TESTHISTORY

In [None]:
# testHistory.csv
profile_title_testHistory = 'testHistory.csv'

# All three correlation measures are switched off; type inference, interactions and
# missing-value diagrams are disabled through the remaining keyword arguments.
correlations_disabled = {
    measure: {"calculate": False}
    for measure in ("auto", "pearson", "spearman")
}

# The Spark DataFrame is converted to pandas before it is handed to the profiler.
profile_report = ProfileReport(
    df_testHistory.toPandas(),
    title=profile_title_testHistory,
    infer_dtypes=False,
    interactions=None,
    missing_diagrams=None,
    correlations=correlations_disabled,
)

In [None]:
# testHistory.csv
# Write the report as 'profile-<title>.html' into the folder that holds the input file.
# data_dir_testHistory is the full path of testHistory.csv.gz, so the report name is built
# from its parent folder; appending to the path itself gave
# 'testHistory.csv.gzprofile-testHistory.csv.html'.
# profile_report is whichever report was built last, so run this right after the
# testHistory ProfileReport cell.
profile_report_file = str(Path(data_dir_testHistory).parent / f'profile-{profile_title_testHistory}.html')
profile_report.to_file(Path(profile_report_file))
profile_report_file

TRAINHISTORY

In [None]:
# trainHistory.csv
profile_title_trainHistory = 'trainHistory.csv'

# All three correlation measures are switched off; type inference, interactions and
# missing-value diagrams are disabled through the remaining keyword arguments.
correlations_disabled = {
    measure: {"calculate": False}
    for measure in ("auto", "pearson", "spearman")
}

# The Spark DataFrame is converted to pandas before it is handed to the profiler.
profile_report = ProfileReport(
    df_trainHistory.toPandas(),
    title=profile_title_trainHistory,
    infer_dtypes=False,
    interactions=None,
    missing_diagrams=None,
    correlations=correlations_disabled,
)

In [None]:
# trainHistory.csv
# Write the report as 'profile-<title>.html' into the folder that holds the input file.
# data_dir_trainHistory is the full path of trainHistory.csv.gz, so the report name is built
# from its parent folder; appending to the path itself gave
# 'trainHistory.csv.gzprofile-trainHistory.csv.html'.
# profile_report is whichever report was built last, so run this right after the
# trainHistory ProfileReport cell.
profile_report_file = str(Path(data_dir_trainHistory).parent / f'profile-{profile_title_trainHistory}.html')
profile_report.to_file(Path(profile_report_file))
profile_report_file

TRANSACTIONS

In [None]:
# transactions.csv
from ydata_profiling import ProfileReport

profile_title_transactions = 'transactions.csv'

# All three correlation measures are switched off; type inference, interactions and
# missing-value diagrams are disabled through the remaining keyword arguments.
correlations_disabled = {
    measure: {"calculate": False}
    for measure in ("auto", "pearson", "spearman")
}

# The (sampled) Spark DataFrame is converted to pandas before it is handed to the profiler.
profile_report = ProfileReport(
    df_transactions.toPandas(),
    title=profile_title_transactions,
    infer_dtypes=False,
    interactions=None,
    missing_diagrams=None,
    correlations=correlations_disabled,
)

In [None]:
# transactions.csv
# Write the report as 'profile-<title>.html' into the folder that holds the input file.
# data_dir_transactions is the full path of transactions.csv.gz, so the report name is built
# from its parent folder; appending to the path itself gave
# 'transactions.csv.gzprofile-transactions.csv.html'.
# profile_report is whichever report was built last, so run this right after the
# transactions ProfileReport cell.
profile_report_file = str(Path(data_dir_transactions).parent / f'profile-{profile_title_transactions}.html')
profile_report.to_file(Path(profile_report_file))
profile_report_file