# Sudoku Quality Checks

In [None]:
import sys
sys.path.append("..")
from pyspark.sql import SparkSession, Row
import numpy as np
from helpers.data_prep_and_print import print_df

## Create Spark Session

In [None]:
# Obtain the Spark session for this notebook; getOrCreate() hands back an
# already running session if there is one instead of starting a second one.
spark = (
    SparkSession.builder
    .config("spark.executor.memory", "4g")
    .config("spark.driver.memory", "2g")
    .appName("Sudoku Quality Checks")
    .getOrCreate()
)
# Keep the notebook output readable: only log messages of level ERROR.
spark.sparkContext.setLogLevel("ERROR")

## Create a DataFrame using an inferred schema

In [None]:

path = "../sudoku_data/"
# The CSV's first line holds the column names; the column types are
# inferred by Spark from the data.
df = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv(path + "sudoku_1_mio_sequenced_columns.csv")
)
# Uncomment to work on a small sample while experimenting:
#df = df.limit(10)
df.printSchema()

## All cells are filled => no Null values

In [None]:
# Discard every record that has a null in any column.
df = df.dropna()

## All numbers must be between 0 and 9 for the quizzes 

In [None]:
# One SQL predicate per quiz column (the first 81 columns), each accepting
# only values from 0 to 9; a record is kept only if all predicates hold.
query_conditions = [
    "({0} >= 0 ) and ({0} <= 9 )".format(quiz_col)
    for quiz_col in df.columns[:81]
]
df = df.filter(" and ".join(query_conditions))
#print_df (df)

## Solutions must contain numbers between 1 and 9

In [None]:
# One SQL predicate per solution column (everything after the first 81
# columns), each accepting only values from 1 to 9; a record is kept only
# if all predicates hold.
query_conditions = [
    "({0} > 0 ) and ({0} <=9 )".format(solution_col)
    for solution_col in df.columns[81:]
]
df = df.filter(" and ".join(query_conditions))

## All quiz cells != 0 must be identical to the corresponding solution cell

In [None]:
# Build the query: the columns are treated as two halves, and the i-th
# column of the first half is compared with the i-th column of the second
# half. A record is kept when every first-half value is either 0 or equal
# to its second-half counterpart.
num_cols = len(df.columns) // 2
query_conditions = [
    "({0} == 0 or {0} == {1})".format(quiz_col, solution_col)
    for quiz_col, solution_col in zip(df.columns[:num_cols], df.columns[num_cols:])
]
df = df.filter(" and ".join(query_conditions))

## Checking Solutions: Checking all Rows and Columns

In [None]:
# Checking each row
#start = 81
#import pandas as pd
#from pyspark import Series
def is_in_range_and_unique(values: list, min_val: int, max_val: int) -> bool:
    """Check that `values` has no duplicates and spans exactly min_val..max_val.

    Args:
        values: Sequence of hashable, comparable values (list, tuple, Row slice).
        min_val: Value the smallest element must be equal to.
        max_val: Value the largest element must be equal to.

    Returns:
        True if all elements are distinct, the smallest element equals
        `min_val` and the largest equals `max_val`; False otherwise,
        including when `values` is empty.
    """
    # min()/max() raise ValueError on an empty sequence; an empty group can
    # never satisfy the check, so report it as invalid instead of crashing.
    if len(values) == 0:
        return False
    # A set drops duplicates, so a size mismatch means a value is repeated.
    if len(set(values)) != len(values):
        return False
    return min(values) == min_val and max(values) == max_val

def check_unique_num_in_row_and_col(row: Row) -> bool:
    """Validate the rows and columns of the solution stored in one record.

    The solution values are read from position 81 up to the end of `row`.
    They are treated as consecutive groups of nine (the sudoku rows), and
    every ninth value starting from each of the first nine positions forms
    a sudoku column.

    Args:
        row: Record supporting len(), integer indexing and slicing.

    Returns:
        True if every sudoku row and every sudoku column passes
        is_in_range_and_unique(..., 1, 9); False as soon as one fails.
    """
    solution_offset = 81
    record_len = len(row)

    # Sudoku rows: consecutive chunks of nine values.
    rows_ok = all(
        is_in_range_and_unique(row[chunk_start:chunk_start + 9], 1, 9)
        for chunk_start in range(solution_offset, record_len, 9)
    )
    if not rows_ok:
        return False

    # Sudoku columns: every ninth value, starting at each of the first nine.
    return all(
        is_in_range_and_unique(
            [row[pos] for pos in range(first_pos, record_len, 9)], 1, 9
        )
        for first_pos in range(solution_offset, solution_offset + 9)
    )

# Keep only the records whose solution rows and columns are valid.
# The existing schema is passed to toDF() explicitly: without it Spark
# re-infers the schema from the data, which fails on an empty RDD (i.e. when
# no record passes the check) and may change the column types.
df = df.rdd.filter(check_unique_num_in_row_and_col).toDF(df.schema)
print(df.count())

## Checking Solutions: Checking all Squares

In [None]:
# check square
# there are nine 3x3 squares 
# the first square has the indexes (S1A, S1B, S1C, 
#                                   S2A, S2B, S2C,
#                                   S3A, S3B, S3C)

def get_square_definition():
    """Return the cell names of the nine 3x3 squares of a 9x9 sudoku.

    Cell names have the form "S<row><column>" with rows 1-9 and columns A-I,
    e.g. "S1A". The result is a list of nine lists with nine names each;
    squares are ordered band by band (rows 1-3 first) and, within a band,
    from columns A-C to G-I. Inside a square the names run row by row.
    """
    column_letters = ("A", "B", "C", "D", "E", "F", "G", "H", "I")
    row_numbers = tuple(range(1, 10))

    # Three groups of three letters / three numbers each.
    letter_groups = np.array_split(column_letters, 3)
    number_groups = np.array_split(row_numbers, 3)

    return [
        ["S" + str(number) + str(letter) for number in numbers for letter in letters]
        for numbers in number_groups
        for letters in letter_groups
    ]

def check_squares(row: Row) -> bool:
    """Validate all 3x3 squares of the solution stored in one record.

    Args:
        row: Record that can be indexed by the cell names returned by
            get_square_definition() (e.g. row["S1A"]).

    Returns:
        True if the values of every square pass
        is_in_range_and_unique(..., 1, 9); False as soon as one fails.
    """
    return all(
        is_in_range_and_unique([row[cell_name] for cell_name in square_cells], 1, 9)
        for square_cells in get_square_definition()
    )
    

# Keep only the records whose 3x3 squares are valid.
# The existing schema is passed to toDF() explicitly: without it Spark
# re-infers the schema from the data, which fails on an empty RDD (i.e. when
# no record passes the check) and may change the column types.
df = df.rdd.filter(check_squares).toDF(df.schema)
print(df.count())
print_df(df,10)