# Sandbox for machine learning on CSP instances and their optimized QAOA schedules
## Author: Aniruddha Bapat
### Date: 07/20/2018
---
## Import necessary modules

In [89]:
import pandas as pd
from pandas.plotting import scatter_matrix
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression 
from matplotlib import interactive
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

import numpy as np

---
## Dataset import

In [43]:
# Load the per-instance data tables for the two problem families.
# The files are whitespace-delimited; `sep=r'\s+'` is the documented equivalent
# of the deprecated `delim_whitespace` flag (deprecated since pandas 2.2).
set1 = pd.read_table('Random2Sat/instanceData_QAOA1.dat', sep=r'\s+')
set2 = pd.read_table('Random3Sat/instanceData_QAOA1.dat', sep=r'\s+')

In [52]:
# Stack both datasets into one frame. `ignore_index=True` assigns fresh row
# labels 0..n-1; without it each source frame keeps its own labels, so labels
# can repeat and label-based selection (`.loc`) becomes ambiguous.
full = pd.concat([set1, set2], ignore_index=True)

---
## Convert data sets to dictionaries (might come in handy later)
Actually, might not need it after all...

In [55]:
# Column-oriented views of each frame: {column name: list of column values}.
set1dict, set2dict = (frame.to_dict('list') for frame in (set1, set2))

# Same layout for the combined frame, but with every column held as a NumPy array.
fulldict = {column: np.array(values) for column, values in full.to_dict('list').items()}

---
## Classifying datasets by their respective problem type
We mixed two datasets corresponding to random max2Sat and max3Sat instances. Each dataset consists of 1000 instances spanning a range of variable numbers and clause numbers. Our first goal is to study qualitative differences in their QAOA schedules and to classify the instances into their respective problem classes. 

In [117]:
# Derived feature: IncidenceStd divided by Nclauses, stored as a new column.
full['NormStd'] = full['IncidenceStd'].div(full['Nclauses'])

# Feature matrix (X) and target labels (y) for the classification task.
feature_columns = ['Nvars', 'Cdensity', 'beta1', 'gamma1', 'Unsat']
X = full[feature_columns]
y = full['Label']

# NOTE: despite its name, `trainratio` is passed as `test_size` below, i.e. it
# is the fraction of rows held out for testing, not the training fraction.
trainratio = 0.2
seed = 7  # fixed seed so the split is repeatable
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=trainratio, random_state=seed
)

In [118]:
#scatter_matrix(full[['NormStd', 'Cdensity', 'Unsat', 'beta1', 'gamma1']]);

In [119]:
# Candidate classifiers as (short name, estimator) pairs, all constructed with
# their library-default hyperparameters.
models = [
    ('LR', LogisticRegression()),
    ('LDA', LinearDiscriminantAnalysis()),
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier()),
    ('NB', GaussianNB()),
    ('SVM', SVC()),
]

scoring = 'accuracy'

# One 10-fold splitter shared by every model so all are scored on the same folds.
# `random_state` is deliberately not passed: KFold does not shuffle by default,
# so a seed has no effect, and scikit-learn >= 0.24 raises ValueError when
# `random_state` is set while `shuffle=False`. The folds are therefore the same
# consecutive, unshuffled folds as before.
kfold = model_selection.KFold(n_splits=10)

# Evaluate each model in turn on the training split and report
# "name: mean accuracy (std of accuracy)" across the folds.
results = []
names = []
for name, model in models:
    cv_results = model_selection.cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

LR: 0.860625 (0.012516)
LDA: 0.834375 (0.024085)
KNN: 0.808750 (0.034821)
CART: 0.811875 (0.037338)
NB: 0.822500 (0.032740)
SVM: 0.510625 (0.047108)
