In [4]:
import autosklearn.classification
import csv
import json
import math
import matplotlib.pyplot as plt
import numpy as np
import re
import time
import warnings

from collections import defaultdict, OrderedDict
from myconfig import *
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix, classification_report, f1_score, precision_score, recall_score
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.ensemble import ExtraTreesClassifier

N_GRAM_LENGTH = 10
TOTAL_TYPE = 2

def read_raw_data():
    with open(DATA_SOURCE+"/dataset.csv", 'r',encoding='latin1') as csvfile:
        global total_document
        reader = csv.DictReader(csvfile)
        for row in reader:
            if row['project'] == project_name:
                comment_summary[total_document] = dict()
                comment_summary[total_document]['type'] = int(row['oracle'])
                comment_summary[total_document]['comment'] = row['text']
                total_document += 1

def file_len(fname):
    with open(fname) as f:
        for i, l in enumerate(f):
            pass
    return i + 1   

#read data from n_gram
def read_n_gram():
    n_gram_id = 0
    total_n_gram = file_len(DATA_SOURCE+"/importance_features/"+project_name+'_'+oracle+'_n_gram_filter')
    with open(DATA_SOURCE+"/importance_features/"+project_name+'_'+oracle+"_n_gram_filter") as csvfile:
        reader = csv.reader(csvfile, delimiter='\t', quotechar='|')
        for row in reader:
            words = row[5].strip()
            term = tuple(row[5].strip().split(' '))
            if term not in summary:
                summary[term] = dict()
            summary[term] = {'id':n_gram_id,'len':row[1],'gtf':row[2],'df':row[3],'sdf':row[4], 'term':row[5]}
            weight1 = total_document / int(row[4])
            summary[term]['score'] = math.log10(weight1) * int(row[2])
            score[n_gram_id] =  math.log10(weight1) * int(row[2])
            n_grams[n_gram_id] = term
            n_gram_id += 1

def top_score_vector():
    percent = int(len(score) * 100 / 100)
    top_vector.extend(sorted(score,key=score.get,reverse=True)[:percent])
#     print("top_vector",len(top_vector))

def n_gram_split():
    for comment_index in comment_summary:
        comment = comment_summary[comment_index]['comment']
        comment_summary[comment_index]['vector'] = dict()
        comment_post_process = re.sub("\s+"," ",re.sub(r"[^A-Za-z0-9]+"," ",comment.replace("\t"," ").replace("\r\n"," ").lower())).split(" ")
        for i in range(len(comment_post_process)):
            for j in range(i,min(i+N_GRAM_LENGTH+1,len(comment_post_process))):
                if(tuple(comment_post_process[i:j+1]) in summary):
                    if(summary[tuple(comment_post_process[i:j+1])]['id'] in top_vector):
                        if summary[tuple(comment_post_process[i:j+1])]['id'] in comment_summary[comment_index]['vector']:
                            comment_summary[comment_index]['vector'][summary[tuple(comment_post_process[i:j+1])]['id']] += 1
                        else: 
                            comment_summary[comment_index]['vector'][summary[tuple(comment_post_process[i:j+1])]['id']] = 1

def vector_idf():
    for i in comment_summary:
         for v in comment_summary[i]['vector']:
            comment_summary[i]['vector'][v] *= score[v]

for project_name in ["stackoverflow","jira","appreviews"]:
    for oracle in ["1","0","-1"]:
        if project_name == "jira" and oracle == "0":
            continue
        summary = dict()
        total_n_gram = 0
        total_document = 0
        score = dict()
        comment_summary = dict()
        top_vector = list()
        n_grams = dict()
        read_raw_data()
        read_n_gram()
        top_score_vector()
        n_gram_split()

        X = []
        y = []
        for comment_index in comment_summary:
            vector = [0] * len(top_vector)
            for vector_index in comment_summary[comment_index]['vector']:
                vector[top_vector.index(vector_index)] = comment_summary[comment_index]['vector'][vector_index]
            X.append(vector)
            y.append(comment_summary[comment_index]['type'])
            
        np_X = np.asarray(X)
        np_y = np.asarray(y)

        # forest = ExtraTreesClassifier(n_estimators=250, random_state=0)
        forest = RandomForestClassifier(n_estimators=10,random_state=1)

        forest.fit(np_X, np_y)
        importances = forest.feature_importances_
        std = np.std([tree.feature_importances_ for tree in forest.estimators_],
                     axis=0)
        indices = np.argsort(importances)[::-1]

        # Print the feature ranking
#         print("Feature ranking:")

        print_until = 1000
        runner = 0
        for f in range(np_X.shape[1]):
            if runner >= print_until:
                break
            symbol = "+"
            if oracle == "0":
                symbol = "N"
            elif oracle == "-1":
                symbol = "-"
            print("%s\t%s\t%d\t%s" % (project_name,symbol,f + 1,  n_grams[indices[f]]))
            runner += 1

stackoverflow	+	1	('onto',)
stackoverflow	+	2	('option',)
stackoverflow	+	3	('have',)
stackoverflow	+	4	('way', 'to')
stackoverflow	+	5	('than',)
stackoverflow	+	6	('bit', 'of')
stackoverflow	+	7	('design',)
stackoverflow	+	8	('earlier',)
stackoverflow	+	9	('still',)
stackoverflow	+	10	('help', 'you')
stackoverflow	+	11	('library',)
stackoverflow	+	12	('part',)
stackoverflow	+	13	('optimized',)
stackoverflow	+	14	('less',)
stackoverflow	+	15	('run', 'time')
stackoverflow	+	16	('classes',)
stackoverflow	+	17	('memory',)
stackoverflow	+	18	('run',)
stackoverflow	+	19	('like',)
stackoverflow	+	20	('impact',)
stackoverflow	+	21	('looking', 'for')
stackoverflow	+	22	('better',)
stackoverflow	+	23	('put',)
stackoverflow	+	24	('databases', 'are')
stackoverflow	+	25	('are',)
stackoverflow	+	26	('jsf',)
stackoverflow	+	27	('working',)
stackoverflow	+	28	('code', 'fragment', 'to')
stackoverflow	+	29	('java', 'is')
stackoverflow	+	30	('json',)
stackoverflow	+	31	('now',)
stackoverflow	+	32	('only

stackoverflow	N	1	('received',)
stackoverflow	N	2	('everyone',)
stackoverflow	N	3	('the', 'customer')
stackoverflow	N	4	('local',)
stackoverflow	N	5	('selenium',)
stackoverflow	N	6	('the', 'console', 'tells', 'me')
stackoverflow	N	7	('20',)
stackoverflow	N	8	('region', 's', 'wall', 'clock', 'time')
stackoverflow	N	9	('technical',)
stackoverflow	N	10	('hc',)
stackoverflow	N	11	('against',)
stackoverflow	N	12	('appointments',)
stackoverflow	N	13	('ideally',)
stackoverflow	N	14	('object', 'in')
stackoverflow	N	16	('renderer',)
stackoverflow	N	17	('of', 'objects')
stackoverflow	N	18	('host',)
stackoverflow	N	19	('direct',)
stackoverflow	N	20	('provided',)
stackoverflow	N	21	('decorating',)
stackoverflow	N	22	('newsletter',)
stackoverflow	N	23	('sends',)
stackoverflow	N	24	('graph',)
stackoverflow	N	25	('println',)
stackoverflow	N	26	('when', 'you', 'get')
stackoverflow	N	27	('templates',)
stackoverflow	N	28	('matching',)
stackoverflow	N	29	('had',)
stackoverflow	N	30	('book', 'time')
stack

stackoverflow	-	1	('proxy',)
stackoverflow	-	2	('wrote',)
stackoverflow	-	3	('the', 'file', 'name')
stackoverflow	-	4	('run',)
stackoverflow	-	5	('drl',)
stackoverflow	-	6	('possible',)
stackoverflow	-	7	('correct',)
stackoverflow	-	8	('ran',)
stackoverflow	-	9	('translucent',)
stackoverflow	-	10	('rather',)
stackoverflow	-	11	('saving',)
stackoverflow	-	12	('jpa', 'hibernate')
stackoverflow	-	13	('guess',)
stackoverflow	-	14	('now',)
stackoverflow	-	15	('part',)
stackoverflow	-	16	('tool',)
stackoverflow	-	17	('cglib',)
stackoverflow	-	18	('may', 'not')
stackoverflow	-	19	('solution',)
stackoverflow	-	20	('n',)
stackoverflow	-	21	('sort',)
stackoverflow	-	22	('must', 'be')
stackoverflow	-	23	('except',)
stackoverflow	-	24	('possible', 'to')
stackoverflow	-	25	('wrong',)
stackoverflow	-	26	('token', 'and')
stackoverflow	-	27	('may',)
stackoverflow	-	28	('processing',)
stackoverflow	-	29	('me',)
stackoverflow	-	30	('showing',)
stackoverflow	-	31	('it', 'did', 'n', 't')
stackoverflow	-	3

jira	-	1	('already',)
jira	-	2	('part',)
jira	-	3	('some',)
jira	-	4	('years',)
jira	-	5	('soon',)
jira	-	6	('grumble',)
jira	-	7	('him',)
jira	-	8	('find',)
jira	-	9	('8',)
jira	-	10	('brain',)
jira	-	11	('c2', '2')
jira	-	12	('report',)
jira	-	13	('a', 'bug')
jira	-	14	('s', 'not')
jira	-	15	('getchannel',)
jira	-	16	('cocoon', 'servlet', 'service')
jira	-	17	('issues',)
jira	-	18	('unlikely',)
jira	-	19	('vikram',)
jira	-	20	('lib',)
jira	-	21	('committed',)
jira	-	22	('filechannel',)
jira	-	23	('references', 'to')
jira	-	24	('related',)
jira	-	25	('second',)
jira	-	26	('id',)
jira	-	27	('place',)
jira	-	28	('org',)
jira	-	29	('job',)
jira	-	30	('app', 'developers')
jira	-	31	('right',)
jira	-	32	('n', 't')
jira	-	33	('last', 'patch')
jira	-	34	('over',)
jira	-	35	('go', 'to')
jira	-	36	('any',)
jira	-	37	('problems',)
jira	-	38	('nag',)
jira	-	39	('tests',)
jira	-	40	('more',)
jira	-	41	('expect',)
jira	-	42	('ignore',)
jira	-	43	('look',)
jira	-	44	('uima',)
jira	-	45	('who',)
jir

appreviews	+	57	('thing', 'missing')
appreviews	+	58	('nice', 'and')
appreviews	+	59	('bright',)
appreviews	+	60	('bubble',)
appreviews	+	61	('note', '10', '1')
appreviews	+	62	('overall',)
appreviews	+	63	('car',)
appreviews	+	64	('has', 'everything')
appreviews	+	65	('daily',)
appreviews	+	66	('really', 'like', 'this', 'app')
appreviews	+	67	('there', 'is')
appreviews	+	68	('button',)
appreviews	+	69	('buy', 'it')
appreviews	+	70	('selection', 'of')
appreviews	+	71	('try', 'it')
appreviews	+	72	('tools',)
appreviews	+	73	('making', 'it')
appreviews	+	74	('emoticons',)
appreviews	+	75	('work', 'on')
appreviews	+	76	('if', 'it')
appreviews	+	77	('such', 'a', 'feature')
appreviews	+	78	('because',)
appreviews	+	79	('at',)
appreviews	+	80	('dis',)
appreviews	+	81	('providers',)
appreviews	+	82	('cool', 'this')
appreviews	+	83	('it', 'to')
appreviews	+	84	('says',)
appreviews	+	85	('many',)
appreviews	+	86	('detection',)
appreviews	+	87	('deals',)
appreviews	+	88	('issue',)
appreviews	+	8

appreviews	-	1	('livejournal',)
appreviews	-	2	('color',)
appreviews	-	3	('period',)
appreviews	-	4	('fill', 'the', 'screen')
appreviews	-	5	('okay', 'but')
appreviews	-	6	('option',)
appreviews	-	7	('nav',)
appreviews	-	8	('right',)
appreviews	-	9	('comics',)
appreviews	-	10	('etc',)
appreviews	-	11	('picture',)
appreviews	-	12	('airplane', 'mode')
appreviews	-	13	('alone',)
appreviews	-	14	('ads',)
appreviews	-	15	('using',)
appreviews	-	16	('things',)
appreviews	-	17	('impossible', 'to', 'use')
appreviews	-	18	('rate',)
appreviews	-	19	('should', 'be')
appreviews	-	20	('pages',)
appreviews	-	21	('responding',)
appreviews	-	22	('some', 'features')
appreviews	-	23	('poor',)
appreviews	-	24	('sufficient',)
appreviews	-	25	('very',)
appreviews	-	26	('size',)
appreviews	-	27	('lack',)
appreviews	-	28	('more', 'than')
appreviews	-	29	('literally',)
appreviews	-	30	('eats',)
appreviews	-	31	('fits',)
appreviews	-	32	('game', 'constantly', 'freezes')
appreviews	-	33	('many',)
appreviews	-	3