In [None]:
from sklearn.model_selection import StratifiedKFold
from scipy import sparse
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score

import numpy as np
import pandas as pd
import csv
import autosklearn.classification
import re

class BinaryClassification:
    """N-gram count features for labelled comments, evaluated with auto-sklearn.

    Typical call order: ``data`` (load comments and labels), ``n_gram`` (load
    the n-gram vocabulary), ``vectorization`` (count vocabulary hits per
    comment), ``temp`` (build dense ``X``/``y``) and ``ten_fold`` (stratified
    10-fold evaluation).
    """

    # Bound on the look-ahead window used when scanning a comment for n-grams.
    N_GRAM_LENGTH = 10

    # An n-gram row needs at least this many tab-separated columns
    # (columns 1-5 are read: len, gtf, df, sdf, term).
    _NGRAM_MIN_COLUMNS = 6

    def __init__(self, folder):
        """Create an empty pipeline.

        Args:
            folder: Directory into which ``ten_fold`` writes its per-round
                ``<round>.log`` files.
        """
        self.folder = folder
        # row index -> {'type': label, 'comment': text, 'vector': {id: count}}
        self.comment_summary = dict()
        # tuple of words -> {'id', 'len', 'gtf', 'df', 'sdf', 'term'}
        self.ngram_summary = dict()

    def data(self, filename, column_comment, column_label):
        """Load comments and labels from a CSV file into ``comment_summary``.

        Entries are keyed by the DataFrame index, so loading a second file
        with overlapping index values overwrites the earlier entries.

        Args:
            filename: Path of the CSV file, read with ``pandas.read_csv``.
            column_comment: Name of the column holding the comment text.
            column_label: Name of the column holding the class label.
        """
        df = pd.read_csv(filename)
        for ind, row in df.iterrows():
            self.comment_summary[ind] = {
                'type': row[column_label],
                'comment': row[column_comment],
            }

    def n_gram(self, filename):
        """Load the n-gram vocabulary from a tab-separated file.

        Each usable row contributes one vocabulary entry keyed by the tuple of
        space-separated words in column 5. Rows with fewer than six columns
        (for example blank lines) are skipped.

        Ids are dense: a new term receives the next free id, and a term that
        is seen again keeps its id while its statistics are replaced by the
        later row. This keeps every id below ``len(self.ngram_summary)``,
        which ``temp`` relies on when sizing feature vectors.

        Args:
            filename: Path of the tab-delimited n-gram file.
        """
        # newline="" is what the csv module asks for when given a file object.
        with open(filename, newline="") as csvfile:
            reader = csv.reader(csvfile, delimiter='\t', quotechar='|')
            for row in reader:
                if len(row) < self._NGRAM_MIN_COLUMNS:
                    continue
                term = tuple(row[5].strip().split(' '))
                known = self.ngram_summary.get(term)
                if known is None:
                    n_gram_id = len(self.ngram_summary)
                else:
                    n_gram_id = known['id']
                self.ngram_summary[term] = {
                    'id': n_gram_id,
                    'len': row[1],
                    'gtf': row[2],
                    'df': row[3],
                    'sdf': row[4],
                    'term': row[5],
                }

    @staticmethod
    def _tokenize(comment, non_alnum):
        """Lower-case ``comment`` and split it on runs of non-alphanumerics.

        Every run of characters outside ``[A-Za-z0-9]`` (which includes all
        whitespace) collapses to a single space before splitting on ' ', so
        leading/trailing separators yield empty-string tokens at the ends.
        Non-string input (e.g. a NaN produced by pandas for a missing cell)
        is treated as empty text.
        """
        if not isinstance(comment, str):
            comment = ""
        return non_alnum.sub(" ", comment.lower()).split(" ")

    def vectorization(self):
        """Count vocabulary n-grams in every loaded comment.

        Stores, for each comment, a sparse ``{n_gram_id: count}`` dict under
        the ``'vector'`` key of its ``comment_summary`` entry.
        """
        non_alnum = re.compile(r"[^A-Za-z0-9]+")
        ngrams = self.ngram_summary
        for entry in self.comment_summary.values():
            counts = dict()
            entry['vector'] = counts
            tokens = self._tokenize(entry['comment'], non_alnum)
            total = len(tokens)
            for start in range(total):
                # NOTE(review): this window admits candidates of up to
                # N_GRAM_LENGTH + 1 tokens; kept as in the original, confirm
                # whether the extra token is intended.
                stop = min(start + BinaryClassification.N_GRAM_LENGTH + 1, total)
                for end in range(start, stop):
                    match = ngrams.get(tuple(tokens[start:end + 1]))
                    if match is not None:
                        n_gram_id = match['id']
                        counts[n_gram_id] = counts.get(n_gram_id, 0) + 1

    def temp(self):
        """Build the dense feature matrix and label list.

        Requires ``vectorization`` to have been run so that every comment has
        a ``'vector'`` entry.

        Returns:
            A tuple ``(X, y)`` where ``X`` is a list with one independent
            count vector (length ``len(self.ngram_summary)``) per comment and
            ``y`` is the list of the corresponding labels.
        """
        X = []
        y = []
        width = len(self.ngram_summary)
        for entry in self.comment_summary.values():
            # A fresh list per comment: sharing one list would make every row
            # of X the same object and leak counts between comments.
            vector = [0] * width
            for vector_index, count in entry['vector'].items():
                vector[vector_index] = count
            X.append(vector)
            y.append(entry['type'])

        return X, y

    def ten_fold(self, X, y):
        """Run stratified 10-fold evaluation with auto-sklearn.

        For every fold a new ``AutoSklearnClassifier`` is fitted and refitted
        on the training split, then the classification report, ROC AUC,
        confusion matrix, model summary and every misclassified sample are
        printed. The fold's ``cv_results_`` are written to
        ``<self.folder>/<round>.log``.

        Args:
            X: Sequence of equally sized feature vectors.
            y: Sequence of labels aligned with ``X``.
        """
        sss = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)
        np_X, np_y = np.asarray(X), np.asarray(y)
        print(sss)
        roc_auc_list = list()
        for runner, (train_index, test_index) in enumerate(sss.split(np_X, np_y)):
            X_train = sparse.csr_matrix(np_X[train_index])
            X_test = sparse.csr_matrix(np_X[test_index])
            y_train, y_test = np_y[train_index], np_y[test_index]
            y_test_class = np.unique(y_test)

            automl = autosklearn.classification.AutoSklearnClassifier(
                ml_memory_limit=1024*32, time_left_for_this_task = 30 * 60, metric=autosklearn.metrics.f1_weighted
            )

            automl.fit(X_train.copy(), y_train.copy())
            automl.refit(X_train.copy(), y_train.copy())
            y_hat = automl.predict(X_test)
            predict_proba = automl.predict_proba(X_test)
            # For binary targets roc_auc_score wants a 1-D score: the
            # probability of the class with the greater label (column 1).
            if predict_proba.ndim == 2 and predict_proba.shape[1] == 2:
                y_score = predict_proba[:, 1]
            else:
                y_score = predict_proba
            roc_auc = roc_auc_score(y_test, y_score, average='weighted', multi_class='ovr', labels=y_test_class)
            log = automl.cv_results_
            # The output directory is instance state, not a global.
            with open(self.folder+"/"+str(runner)+".log","w") as file:
                file.write("log for "+str(runner)+"\n")
                file.write(str(log)+"\n")

            roc_auc_list.append(roc_auc)
            print("round:",runner,"Classification report", classification_report(y_test, y_hat))
            print("round:",runner,"ROC_AUC", roc_auc)
            print("round:",runner,"Confusion matrix", confusion_matrix(y_test, y_hat))
            print("show_models",automl.show_models())
            print("sprint_statistics",automl.sprint_statistics())

            for position, (expected, predicted) in enumerate(zip(y_test, y_hat)):
                if expected != predicted:
                    print("round:",runner,"comment index:",test_index[position],"y_test:",expected,"y_hat",predicted)

        print("roc_auc_list",roc_auc_list)