# Imports

In [None]:
from constants import PX_OUTPUT_FOLDER

import math
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import seaborn as sns
from scipy import stats

# Disable pandas' truncation so displayed frames show every column and row.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

# Percentiles passed to DataFrame.describe() by the summary cells further down.
DESCRIBE_PERCENTILES = [0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 0.99]

# Data Tables

In [None]:
# Names of all entries in the output folder (one result table per problem is
# presumably expected here -- verify against the producer of PX_OUTPUT_FOLDER).
tables = os.listdir(PX_OUTPUT_FOLDER)
print(len(tables))
#tables

In [None]:
# Report every trial problem that has no matching '.csv' entry in `tables`.
# The last four characters of the problem file name are replaced by '.csv'
# to build the expected table name.
problems = os.listdir('./sat_problems/trial_problems/')
missing_tables = [name for name in problems if name[:-4] + '.csv' not in tables]

for problem in missing_tables:
    print(problem)

count = len(missing_tables)
count

In [None]:
# Entries of the crafted benchmark folder with the '.lzma' text removed from
# their names; entries whose names start with '._' are skipped.
crafted_problems = [
    entry.replace('.lzma', '')
    for entry in os.listdir('./sat_problems/sc14-crafted')
    if not entry.startswith('._')
]
crafted_problems

In [None]:
# Entries of the application benchmark folder with the '.lzma' text removed
# from their names; entries whose names start with '._' are skipped.
application_problems = [
    entry.replace('.lzma', '')
    for entry in os.listdir('./sat_problems/sc14-app')
    if not entry.startswith('._')
]
application_problems

In [None]:
# Number of crafted problems, number of application problems, and their total.
n_crafted = len(crafted_problems)
n_application = len(application_problems)
n_crafted, n_application, n_crafted + n_application

# Aggregate Results

In [None]:
# Load every per-problem results table from PX_OUTPUT_FOLDER and stack them
# into a single frame, `all_data`, then add derived columns and save it.
#
# The per-file frames are collected in a list and concatenated once:
# DataFrame.append was removed in pandas 2.0, and appending inside the loop
# re-copies all previously accumulated rows on every iteration.
crafted_names = set(crafted_problems)  # constant-time membership test
loaded_tables = []

for file in os.listdir(PX_OUTPUT_FOLDER):

    table = pd.read_csv(os.path.join(PX_OUTPUT_FOLDER, file), index_col=0)
    # Replace the last four characters of the table name with '.cnf'.
    cnf_filename = file[:-4] + '.cnf'

    # The CSV index (first column) is stored as the integer trial number.
    table['trial'] = table.index.astype(int)
    table['problem'] = cnf_filename
    # Anything not listed among the crafted problems is labelled 'application'.
    table['problem_type'] = 'crafted' if cnf_filename in crafted_names else 'application'

    loaded_tables.append(table)

if not loaded_tables:
    raise ValueError(f'No result tables found in {PX_OUTPUT_FOLDER!r}')

all_data = pd.concat(loaded_tables, ignore_index=True)

# Move the identifying columns to the front.
all_data = all_data[['problem', 'trial'] + all_data.columns.drop(['trial', 'problem']).to_list()]

# Ratio of each quantity to its starred counterpart
# (presumably the value measured with preprocessing -- TODO confirm).
for col in ['n', 'm', 'k', 'e', 'c', 'n_H', 'e_H', 'q']:
    all_data[f'{col} / {col}*'] = all_data[col] / all_data[f'{col}*']

# Scores relative to m (presumably the number of clauses -- TODO confirm).
all_data['P1_UNSAT'] = all_data['m'] - all_data['P1_score']
all_data['P2_UNSAT'] = all_data['m'] - all_data['P2_score']
all_data['P1%SAT'] = all_data['P1_score'] / all_data['m']
all_data['P2%SAT'] = all_data['P2_score'] / all_data['m']
all_data['NS%SAT'] = all_data['NS_score'] / all_data['m']
all_data['NS*%SAT'] = all_data['NS_score*'] / all_data['m']
all_data['improvement'] = all_data['NS_score*'] - all_data['P1_score']
all_data['improvement%SAT'] = all_data['improvement'] / all_data['m']

all_data['m / n'] = all_data['m'] / all_data['n']

# Make sure the output folder exists so the export works on a fresh checkout.
os.makedirs('./results', exist_ok=True)
all_data.to_csv('./results/all_data.csv')

#all_data

In [None]:
# (rows, columns) of the aggregated results frame.
all_data.shape

In [None]:
# Rows where NS_score is lower than P1_score.
all_data[(all_data['NS_score'] < all_data['P1_score'])]

In [None]:
# Rows where NS_score* is lower than NS_score.
all_data[(all_data['NS_score*'] < all_data['NS_score'])]

In [None]:
# Summary statistics of the aggregated results.
all_data.describe()

In [None]:
# Summary statistics restricted to rows with n < 10000.
all_data[all_data['n'] < 10000].describe()

# Understanding rows with NaN or Inf

In [None]:
# Rows that contain NaN, +Inf or -Inf in at least one column.
# `axis` is passed by keyword because DataFrame.any takes keyword-only
# arguments from pandas 2.0 onwards; the positional form `.any(1)` fails there.
non_finite_mask = all_data.isin([np.nan, np.inf, -np.inf]).any(axis=1)
nan_rows = all_data[non_finite_mask].reset_index()

number_of_nan_rows = nan_rows.shape[0]
print(number_of_nan_rows)

nan_rows

There are 134 rows containing `NaN` or `Inf`. In 120 of these, the problem was completely solved by the local solvers. In 12 of them, the preprocessor failed because every clause in P1_Unsat was also in P2_Unsat, so there were no clauses to improve upon. The remaining two had nodes but no edges in the graph decomposed with the new method.

# Saving Results to File

In [None]:
# One row per problem, taken from trial 0, keeping only the name and size
# columns; the table is written to disk and then displayed.
is_first_trial = all_data['trial'] == 0
size_columns = ['problem', 'm', 'n', 'm / n']

problems_tested = all_data.loc[is_first_trial, size_columns].reset_index(drop=True)
problems_tested.to_csv('./results/problems_tested.csv')
problems_tested

In [None]:
# Summary statistics of the per-problem size table.
problems_tested.describe()

In [None]:
# Keep only rows in which no column is NaN, +Inf or -Inf.
# `axis` is passed by keyword because DataFrame.any takes keyword-only
# arguments from pandas 2.0 onwards; the positional form `.any(1)` fails there.
has_non_finite = all_data.isin([np.nan, np.inf, -np.inf]).any(axis=1)
graph_decomposition = all_data[~has_non_finite].reset_index(drop=True)

# Move 'problem_type' to the second column position.
column_order = graph_decomposition.columns.tolist()
column_order.remove('problem_type')
column_order.insert(1, 'problem_type')
graph_decomposition = graph_decomposition[column_order]

#graph_decomposition = graph_decomposition.sort_values(by=['problem_type', 'n'])

graph_decomposition.to_csv('./results/graph_decomposition.csv')

graph_decomposition

In [None]:
# Summary statistics of the filtered results, restricted to rows with n < 10000.
graph_decomposition[graph_decomposition['n'] < 10000].describe()

In [None]:
# Summary statistics of the filtered results at the DESCRIBE_PERCENTILES cut points.
graph_decomposition.describe(percentiles=DESCRIBE_PERCENTILES)

In [None]:
# All filtered rows ordered by ascending 'improvement', with ties ordered by 'n'.
graph_decomposition.sort_values(by=['improvement', 'n'])

# Analysis by Size

In [None]:
# Scatter of 'improvement%SAT' against n, coloured by problem type.
plt.figure(figsize=(20,10))
# x and y are passed by keyword: seaborn 0.12+ accepts only `data`
# positionally, so the positional form raises a TypeError there.
sns.scatterplot(x='n', y='improvement%SAT', hue='problem_type', data=graph_decomposition)
plt.legend(fontsize=18)
plt.xlabel('Number of Variables (n)', fontsize=24)
#plt.ylabel('Normalized Improvement (%SAT)', fontsize=24)
plt.ylabel('SAT++ / m', fontsize=24)
plt.xticks(fontsize=18)
plt.yticks(fontsize=18)

# Analysis by Type

In [None]:
# Percentile summary of the rows labelled 'application', saved and displayed.
is_application = graph_decomposition['problem_type'] == 'application'
application_rows = graph_decomposition.loc[is_application]

application_summary = application_rows.describe(percentiles=DESCRIBE_PERCENTILES)
application_summary.to_csv('./results/application_summary.csv')

application_summary

In [None]:
# Percentile summary of the rows labelled 'crafted', saved and displayed.
is_crafted = graph_decomposition['problem_type'] == 'crafted'
crafted_rows = graph_decomposition.loc[is_crafted]

crafted_summary = crafted_rows.describe(percentiles=DESCRIBE_PERCENTILES)
crafted_summary.to_csv('./results/crafted_summary.csv')

crafted_summary

# Data Visualization

In [None]:
# Histogram of the 'm / m*' ratio using 30 bins.
graph_decomposition['m / m*'].hist(bins=30)

In [None]:
# Independent copy of the columns used by the scatter-matrix plots.
plot_columns = [
    'm', 'n', 'k',
    'm*', 'n*', 'k*',
    'm / m*', 'n / n*', 'k / k*', 'q / q*',
    'iterations', 'P1%SAT',
]
plot_data = graph_decomposition[plot_columns].copy()

plot_data

In [None]:
# Natural log of every plot_data column, with each column renamed to 'log(<name>)'.
log_plot_data = pd.DataFrame(
    {f'log({name})': np.log(plot_data[name]) for name in plot_data.columns}
)

log_plot_data

In [None]:
# Pairwise scatter-plot matrix of the plot_data columns, with 20-bin histograms.
fig = pd.plotting.scatter_matrix(plot_data, figsize=(15, 15), marker='.', hist_kwds={'bins': 20}, s=60, alpha=.8)

In [None]:
# Difference between the NS / NS* %SAT scores and P1%SAT, plotted against P1%SAT.
plt.figure(figsize=(20,10))
p1_sat = graph_decomposition['P1%SAT']
m_dots = plt.scatter(p1_sat, graph_decomposition['NS%SAT'] - p1_sat, c='red')
n_dots = plt.scatter(p1_sat, graph_decomposition['NS*%SAT'] - p1_sat, c='black')
# Handles and labels are both passed by keyword: mixing positional handles
# with keyword labels makes matplotlib warn and discard the handles, and
# recent releases reject the call.
plt.legend(handles=(m_dots, n_dots), labels=['Without Preprocessing', 'With Preprocessing'], fontsize=24, markerscale=3)
plt.xlabel('P1 %SAT', fontsize=24)
plt.ylabel('New Score %SAT - P1%SAT (Improvement)', fontsize=24)
plt.xticks(fontsize=18)
plt.yticks(fontsize=18)

In [None]:
# log(q) and log(q*) plotted against the number of shared variables.
plt.figure(figsize=(20,10))
shared_variables = graph_decomposition['shared_variables']
m_dots = plt.scatter(shared_variables, np.log(graph_decomposition['q']), c='red')
n_dots = plt.scatter(shared_variables, np.log(graph_decomposition['q*']), c='black')
# Handles and labels are both passed by keyword: mixing positional handles
# with keyword labels makes matplotlib warn and discard the handles, and
# recent releases reject the call.
plt.legend(handles=(m_dots, n_dots), labels=['Without Preprocessing', 'With Preprocessing'], fontsize=24, markerscale=3)
plt.xlabel('Shared Variables', fontsize=24)
# NOTE(review): the plotted values are natural logs of q / q*, while the
# axis label reads 'Components' -- confirm whether the label should mention the log.
plt.ylabel('Components', fontsize=24)
plt.xticks(fontsize=18)
plt.yticks(fontsize=18)

In [None]:
# Pairwise scatter-plot matrix of the log-transformed columns, with 20-bin histograms.
fig = pd.plotting.scatter_matrix(log_plot_data, figsize=(15, 15), marker='.', hist_kwds={'bins': 20}, s=60, alpha=.8)

# Paper Tables

In [None]:
# Summary statistics (without the 'count' row) of the SAT-instance ratio
# columns and the iteration count, saved and displayed.
sat_reduction_columns = ['n / n*', 'm / m*', 'k / k*', 'iterations']
sat_reduction = (
    graph_decomposition[sat_reduction_columns]
    .describe()
    .drop('count')
)
sat_reduction.to_csv('./results/sat_reduction.csv')
sat_reduction

In [None]:
# Summary statistics (without the 'count' row) of the graph ratio columns,
# saved and displayed.
px_graph_columns = ['n / n*', 'e / e*', 'c / c*', 'n_H / n_H*', 'e_H / e_H*', 'q / q*']
px_graph_reduction = (
    graph_decomposition[px_graph_columns]
    .describe()
    .drop('count')
)
px_graph_reduction.to_csv('./results/px_graph_reduction.csv')
px_graph_reduction