In [1]:
from pyspark import SparkContext, SparkConf #spark
from pyspark.sql import SQLContext # for data frame
import re #for escape string 


In [2]:
import graphlab as gl #for graph, sframe

In [3]:
conf = SparkConf().setAppName("error_machine_part_3")
sc = SparkContext(conf=conf)

In [4]:
def Count_Lines(df_path):
    '''
    counting lines of the file
    '''
    rdd_hdfs = sc.textFile(df_path)
    count_lines = rdd_hdfs.count()
    return count_lines

In [5]:
def Word_Count_Each_Text(df_path):
    '''
    [(u'', 483648),
     (u'virtual', 11546),
     (u'jerry', 11365),
     (u'Jan', 11364),
     (u'machine', 11364),
     (u'NetworkManager', 4390),
     (u'info', 4327)]
    '''
    rdd_hdfs = sc.textFile(df_path)
    count_lines = rdd_hdfs.count()
    # spliting by special characters and number
    rdd_split_map = rdd_hdfs.flatMap(lambda x: re.split(r'[`\- =~!@#$%^&*()_+\[\]{};\'\\:"|<,./<>?0-9]', x),1)
    #ready to count
    rdd_count_map = rdd_split_map.map(lambda x: (x, 1))
    #mapping each key and counting it
    res_reduce = rdd_count_map.reduceByKey(lambda a, b: a+b )
    # desc sorting by counting number
    res_reduce = res_reduce.sortBy(lambda c: c[1], False)
    return res_reduce

In [6]:
def Pick_Top_Text(rdd_wc_reduce, count_lines):
    '''
    picking string which always show and counting bigger than file lines  (deviation value 2)
    '''
    pick_top_string_map = rdd_wc_reduce.filter(lambda x: x[1] >= count_lines-2)
    pick_top_string_map.collect()
    return pick_top_string_map.collect()

In [7]:
def Map_Filter_Top_Texts(line, list_tops):
    '''
    filter all the top text 
    if len(text) > len(file lines):
    then delete text
    '''
    res_str = ""
    for text in line:
        flag = 0
        if str(text) == "":
            continue;
        for top in list_tops:
            #res_str +='----' +top[0]
            if top[0] == str(text):

                flag = 1
                break;
        if(flag == 1):
            continue;
        else:
            res_str +="," + str(text)
    return res_str[1:]

In [8]:
def Gen_Relative_Table(col_rdd):
    '''
    0 1 2
    A B C
    
    relative[A] = [B]
    relative[B] = [A,C]
    relative[C] = [B]
    '''
    dist_text = {}
    for str_line in col_rdd:
        line = str_line.split(',')
        for i in range(0,len(line)):
            now = line[i]
            if now not in dist_text:
                dist_text[now] = []
            if i == 0 and i != len(line) -1:
                dist_text[now].append( line[i+1] )
            elif i == len(line) -1 :
                dist_text[now].append( line[i-1] )
            else:
                dist_text[now].append( line[i+1] )
                dist_text[now].append( line[i-1] )

    for text in dist_text:
        set_dist = set(dist_text[text])
        dist_text[text] = list(set_dist)
    return dist_text

In [9]:

def Count_Each_Line_Score_zero(col_rdd, relative_table):
    pass
    '''
    for each text
        query relative table if not found this keyword ->score = 0
        if found this keyword and find it's relative world -> score += 1
    line score = text score / (line length - 1)

    list_res = []
    list_zero_line_number = []
    list_zero_line_content = []
    number = 0
    for str_line in col_rdd:
        number += 1
        line = str_line.split(',')
        score_line = 0.0
        len_line = len(line)
        if len_line > 1:
            for i in range(0,len_line):
                now = line[i]
                if now not in relative_table:
                    score_line += 0
                elif i != (len_line -1):
                    next = line[i+1]
                    for relative in relative_table[now]:
                        if next == relative:
                            score_line += 1
            score_line = score_line/(len_line-1)
        list_res.append(score_line)
        if score_line == 0.0:
            list_zero_line_number.append(number)
            list_zero_line_content.append(str_line)
    return (list_res,list_zero_line_number,list_zero_line_content)
    '''
            
        

In [10]:
def Count_Each_Line_Score(col_rdd, relative_table):
    '''
    for each text
        len(text) in a line < 1 => score = 0
        query relative table if not found this keyword =>score = 0
        if finding this keyword and finding it's relative world => score += 1
    line score = text score / (line length - 1)
    
    relative[A] = [B]
    relative[B] = [A,C]
    relative[C] = [B]
    
    ex: line 1 = ABDCA 
        => 1+1+0+0
    '''
    list_score = []
    list_line_number = []
    list_error_content = []
    number = 0
    for str_line in col_rdd:
        number += 1
        line = str_line.split(',')
        score_line = 0.0
        len_line = len(line)
        error_text = ""
        if len_line > 1:
            for i in range(0,len_line):
                tag_find = 0
                now = line[i]
                if now not in relative_table:
                    score_line += 0
                    error_text += ","
                    error_text += now
                elif i != (len_line -1):
                    next = line[i+1]
                    for relative in relative_table[now]:
                        if next == relative:
                            score_line += 1
                            tag_find = 1
                            break;
                    if tag_find == 0:
                        error_text += ","
                        error_text += now
            score_line = score_line/(len_line-1)
        else:
            error_text += ","
            error_text += now
            
        list_score.append(score_line)
        list_line_number.append(number)
        list_error_content.append(error_text[1:])       
    return (list_line_number, list_score, list_error_content)

In [11]:
def Delete_Top_Text_for_Each_Line(df_path, top_collect):
    '''
    filter all top text for each line
    '''
    rdd_hdfs = sc.textFile(df_path)
    # spliting by special characters
    rdd_sp_map = rdd_hdfs.map(lambda x: re.split(r'[`\- =~!@#$%^&*()_+\[\]{};\'\\:"|<,./<>?0-9]', x),1)
    #filter all top text for each line
    rdd_fil_map = rdd_sp_map.map(lambda x:Map_Filter_Top_Texts(x, top_collect) )
    return rdd_fil_map

In [12]:
def Gen_Relative_Table_by_Filter_Map(rdd_fil_map):
    '''
    before this function, all text line must to filter by texts of top rank
    then generate a relation table
    '''
    col_rdd_fil = rdd_fil_map.collect()
    dist_relative_table = Gen_Relative_Table(col_rdd_fil)
    return dist_relative_table

In [13]:
def Query_Score_Range_Lines(score_lines, list_error_content, int_start_score, int_end_score):
    '''
    according the range of score to generate list
    '''
    len_lines = len(score_lines)
    list_content = []
    list_score = []
    list_number = []
    for i in range(0,len_lines):
        score = score_lines[i]
        if score >= int_start_score and score <= int_end_score:
            list_content.append(list_error_content [i])
            list_score.append(score)
            list_number.append(i+1)
    return (list_number,list_score,list_content)

In [14]:
def Main_Normal_Case(input_path):
    count_lines = Count_Lines(input_path)
    red_wordcount = Word_Count_Each_Text(input_path)
    col_red_top = Pick_Top_Text(red_wordcount, count_lines)
    rdd_map_filiter_top = Delete_Top_Text_for_Each_Line(input_path, col_red_top)
    dist_table = Gen_Relative_Table_by_Filter_Map(rdd_map_filiter_top)
    return dist_table

In [15]:
def Main_Compare_Case(input_path, normal_path):
    count_lines = Count_Lines(input_path)
    red_wordcount = Word_Count_Each_Text(input_path)
    col_red_top = Pick_Top_Text(red_wordcount, count_lines)
    rdd_map_filiter_top = Delete_Top_Text_for_Each_Line(input_path, col_red_top)
    dist_main_relative_table = Main_Normal_Case(normal_path)
    number, list_score,list_error_content = Count_Each_Line_Score(rdd_map_filiter_top.collect(), dist_main_relative_table)
    return number, list_score, list_error_content

In [16]:
inputPath_normal = '/user/spark/input/log_normal'

In [17]:
inputPath_compare = '/user/spark/input/log_error'

In [18]:
list_number, list_score, list_error_content = Main_Compare_Case(inputPath_compare, inputPath_normal)

In [19]:
len_list = len(list_score)
for i in range(0,len_list):
    print list_number[i],list_score[i],list_error_content[i]


1 1.0 
2 1.0 
3 1.0 
4 1.0 
5 1.0 
6 1.0 
7 1.0 
8 1.0 
9 1.0 
10 1.0 
11 1.0 
12 1.0 
13 1.0 
14 1.0 
15 1.0 
16 1.0 
17 1.0 
18 1.0 
19 1.0 
20 1.0 
21 1.0 
22 1.0 
23 1.0 
24 1.0 
25 0.3 CMD,command,v,debian,sa,null,debian,sa
26 1.0 
27 1.0 
28 1.0 
29 1.0 
30 1.0 
31 1.0 
32 1.0 
33 1.0 
34 1.0 
35 1.0 
36 1.0 
37 1.0 
38 1.0 
39 1.0 
40 1.0 
41 1.0 
42 1.0 
43 1.0 
44 1.0 
45 0.3 CMD,command,v,debian,sa,null,debian,sa
46 1.0 
47 1.0 
48 1.0 
49 1.0 
50 1.0 
51 0.3 CMD,command,v,debian,sa,null,debian,sa
52 1.0 
53 1.0 
54 1.0 
55 1.0 
56 1.0 
57 1.0 
58 1.0 
59 1.0 
60 1.0 
61 1.0 
62 1.0 
63 1.0 
64 1.0 
65 1.0 
66 1.0 
67 1.0 
68 1.0 
69 1.0 
70 1.0 
71 1.0 
72 1.0 
73 1.0 
74 0.3 CMD,command,v,debian,sa,null,debian,sa
75 1.0 
76 1.0 
77 1.0 
78 1.0 
79 1.0 
80 1.0 
81 1.0 
82 1.0 
83 1.0 
84 1.0 
85 1.0 
86 1.0 
87 1.0 
88 1.0 
89 1.0 
90 1.0 
91 1.0 
92 1.0 
93 1.0 
94 1.0 
95 0.3 CMD,command,v,debian,sa,null,debian,sa
96 1.0 
97 1.0 
98 1.0 
99 1.0 
100 1.0 
101 1.0 
102 1.0 


In [20]:
list_number_q,list_score_q,list_content_q = Query_Score_Range_Lines(list_score, list_error_content, 0, 0.2)

In [21]:
len_list = len(list_content_q)
for i in range(0,len_list):
    print list_number_q[i],round(list_score_q[i]*100)/100,list_content_q[i]

456 0.0 kernel,btrfs,xor
457 0.0 kernel,xor
459 0.0 kernel,raid,pq
460 0.0 kernel,raid,pq
461 0.0 kernel,btrfs,xor,raid,pq,ufs,qnx
462 0.0 kernel,ufs,ufs,qnx
463 0.0 kernel,qnx,hfsplus
464 0.0 kernel,hfsplus,hfs
465 0.0 kernel,hfs,minix
466 0.0 kernel,minix
467 0.0 kernel,ntfs,msdos
468 0.0 kernel,ntfs,hfsplus,hfs
469 0.0 kernel,jfs
470 0.0 kernel,minix
471 0.0 kernel,msdos
472 0.0 kernel,ntfs
473 0.0 kernel,jfs,xfs
474 0.0 kernel,msdos,jfs,xfs
475 0.0 kernel,xfs,libcrc
476 0.2 kernel,libcrc,c,libcrc
487 0.0 kernel,vhost,vhost
488 0.0 kernel,macvtap
489 0.0 kernel
490 0.0 kernel
491 0.0 kernel
492 0.0 kernel
526 0.0 kernel,coretemp
534 0.0 kernel,snd
535 0.0 kernel,kvm
536 0.0 kernel
537 0.0 kernel
541 0.0 kernel,irqbypass
545 0.0 kernel,nfit
553 0.0 kernel
559 0.0 kernel
560 0.0 kernel
564 0.0 kernel
565 0.0 kernel
566 0.0 kernel
567 0.0 kernel
568 0.0 kernel
569 0.0 kernel
570 0.0 kernel
571 0.0 kernel
573 0.0 kernel
574 0.0 kernel
578 0.0 kernel
579 0.0 kernel
580 0.0 kernel
581 0.0

In [22]:
sf_error = gl.SFrame({'no.': list_number_q, 'score': list_score_q, 'error_text': list_content_q})

This non-commercial license of GraphLab Create for academic use is assigned to sayyes566@gmail.com and will expire on March 01, 2018.


[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: /tmp/graphlab_server_1490084597.log


In [23]:
sf_error.head

<bound method SFrame.head of Columns:
	error_text	str
	no.	int
	score	float

Rows: 159

Data:
+-------------------------------+-----+-------+
|           error_text          | no. | score |
+-------------------------------+-----+-------+
|        kernel,btrfs,xor       | 456 |  0.0  |
|           kernel,xor          | 457 |  0.0  |
|         kernel,raid,pq        | 459 |  0.0  |
|         kernel,raid,pq        | 460 |  0.0  |
| kernel,btrfs,xor,raid,pq,u... | 461 |  0.0  |
|       kernel,ufs,ufs,qnx      | 462 |  0.0  |
|       kernel,qnx,hfsplus      | 463 |  0.0  |
|       kernel,hfsplus,hfs      | 464 |  0.0  |
|        kernel,hfs,minix       | 465 |  0.0  |
|          kernel,minix         | 466 |  0.0  |
+-------------------------------+-----+-------+
[159 rows x 3 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.>

In [24]:
sf_error.show()

Canvas is accessible via web browser at the URL: http://localhost:35781/index.html
Opening Canvas in default web browser.


In [25]:
gl.canvas.set_target('ipynb')

In [26]:
sf_error['error_text'].show(view="Categorical")

In [27]:
sf_error['score'].show(view="Categorical")

In [28]:
sf_error.show(view = "Bar Chart", x="error_text", y = "score")

In [29]:
sf_error.show(view = "Scatter Plot", x="no.", y = "score")