In [7]:
import numpy as np
from sklearn.linear_model import LogisticRegression
import sklearn.utils

class LR(object):

    def __init__(self, penalty='l2', C=100, tol=0.01, class_weight=None, max_iter=100):
        """ The Invariants Mining model for anomaly detection
        Attributes
        ----------
            classifier: object, the classifier for anomaly detection
        """
        self.classifier = LogisticRegression(penalty=penalty, C=C, tol=tol, class_weight=class_weight,
                                             max_iter=max_iter)

    def fit(self, X, y):
        """
        Arguments
        ---------
            X: ndarray, the event count matrix of shape num_instances-by-num_events
        """
        print('====== Model summary ======')
        self.classifier.fit(X, y)

    def predict(self, X):
        """ Predict anomalies with mined invariants
        Arguments
        ---------
            X: the input event count matrix
        Returns
        -------
            y_pred: ndarray, the predicted label vector of shape (num_instances,)
        """
        y_pred = self.classifier.predict(X)
        return y_pred

    def evaluate(self, X, y_true):
        print('====== Evaluation summary ======')
        y_pred = self.predict(X)
        precision, recall, f1 = metrics(y_pred, y_true)
        print('Precision: {:.3f}, recall: {:.3f}, F1-measure: {:.3f}\n'.format(precision, recall, f1))
        return precision, recall, f1

In [8]:

import numpy as np
from sklearn import svm
import sklearn.utils 

class SVM(object):

    def __init__(self, penalty='l1', tol=0.1, C=1, dual=False, class_weight=None, 
                 max_iter=100):
        """ The Invariants Mining model for anomaly detection
        Arguments
        ---------
        See SVM API: https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html
        
        Attributes
        ----------
            classifier: object, the classifier for anomaly detection
        """
        self.classifier = svm.LinearSVC(penalty=penalty, tol=tol, C=C, dual=dual, 
                                        class_weight=class_weight, max_iter=max_iter)

    def fit(self, X, y):
        """
        Arguments
        ---------
            X: ndarray, the event count matrix of shape num_instances-by-num_events
        """
        print('====== Model summary ======')
        self.classifier.fit(X, y)

    def predict(self, X):
        """ Predict anomalies with mined invariants
        Arguments
        ---------
            X: the input event count matrix
        Returns
        -------
            y_pred: ndarray, the predicted label vector of shape (num_instances,)
        """
        
        y_pred = self.classifier.predict(X)
        return y_pred

    def evaluate(self, X, y_true):
        print('====== Evaluation summary ======')
        y_pred = self.predict(X)
        precision, recall, f1 = metrics(y_pred, y_true)
        print('Precision: {:.3f}, recall: {:.3f}, F1-measure: {:.3f}\n'.format(precision, recall, f1))
        return precision, recall, f1

In [9]:
import numpy as np
import sklearn.utils

class PCA(object):

    def __init__(self, n_components=0.95, threshold=None, c_alpha=3.2905):
        """ The PCA model for anomaly detection
        Attributes
        ----------
            proj_C: The projection matrix for projecting feature vector to abnormal space
            n_components: float/int, number of principal compnents or the variance ratio they cover
            threshold: float, the anomaly detection threshold. When setting to None, the threshold 
                is automatically caculated using Q-statistics
            c_alpha: float, the c_alpha parameter for caculating anomaly detection threshold using 
                Q-statistics. The following is lookup table for c_alpha:
                c_alpha = 1.7507; # alpha = 0.08
                c_alpha = 1.9600; # alpha = 0.05
                c_alpha = 2.5758; # alpha = 0.01
                c_alpha = 2.807; # alpha = 0.005
                c_alpha = 2.9677;  # alpha = 0.003
                c_alpha = 3.2905;  # alpha = 0.001
                c_alpha = 3.4808;  # alpha = 0.0005
                c_alpha = 3.8906;  # alpha = 0.0001
                c_alpha = 4.4172;  # alpha = 0.00001
        """

        self.proj_C = None
        self.components = None
        self.n_components = n_components
        self.threshold = threshold
        self.c_alpha = c_alpha


    def fit(self, X):
        """
        Auguments
        ---------
            X: ndarray, the event count matrix of shape num_instances-by-num_events
        """

        print('====== Model summary ======')
        num_instances, num_events = X.shape
        X_cov = np.dot(X.T, X) / float(num_instances)
        U, sigma, V = np.linalg.svd(X_cov)
        n_components = self.n_components
        if n_components < 1:
            total_variance = np.sum(sigma)
            variance = 0
            for i in range(num_events):
                variance += sigma[i]
                if variance / total_variance >= n_components:
                    break
            n_components = i + 1

        P = U[:, :n_components]
        I = np.identity(num_events, int)
        self.components = P
        self.proj_C = I - np.dot(P, P.T)
        print('n_components: {}'.format(n_components))
        print('Project matrix shape: {}-by-{}'.format(self.proj_C.shape[0], self.proj_C.shape[1]))

        if not self.threshold:
            # Calculate threshold using Q-statistic. Information can be found at:
            # http://conferences.sigcomm.org/sigcomm/2004/papers/p405-lakhina111.pdf
            phi = np.zeros(3)
            for i in range(3):
                for j in range(n_components, num_events):
                    phi[i] += np.power(sigma[j], i + 1)
            h0 = 1.0 - 2 * phi[0] * phi[2] / (3.0 * phi[1] * phi[1])
            self.threshold = phi[0] * np.power(self.c_alpha * np.sqrt(2 * phi[1] * h0 * h0) / phi[0]
                                               + 1.0 + phi[1] * h0 * (h0 - 1) / (phi[0] * phi[0]), 
                                               1.0 / h0)
        print('SPE threshold: {}\n'.format(self.threshold))

    def predict(self, X):
        assert self.proj_C is not None, 'PCA model needs to be trained before prediction.'
        y_pred = np.zeros(X.shape[0])
        for i in range(X.shape[0]):
            y_a = np.dot(self.proj_C, X[i, :])
            SPE = np.dot(y_a, y_a)
            if SPE > self.threshold:
                y_pred[i] = 1
        return y_pred

    def evaluate(self, X, y_true):
        print('====== Evaluation summary ======')
        y_pred = self.predict(X)
        precision, recall, f1 = metrics(y_pred, y_true)
        print('Precision: {:.3f}, recall: {:.3f}, F1-measure: {:.3f}\n'.format(precision, recall, f1))
        return precision, recall, f1

In [10]:
"""
The implementation of the decision tree model for anomaly detection.
Authors: 
    LogPAI Team
Reference: 
    [1] Mike Chen, Alice X. Zheng, Jim Lloyd, Michael I. Jordan, Eric Brewer. 
        Failure Diagnosis Using Decision Trees. IEEE International Conference 
        on Autonomic Computing (ICAC), 2004.
"""

import numpy as np
from sklearn import tree
import sklearn.utils

class DecisionTree(object):

    def __init__(self, criterion='gini', max_depth=None, max_features=None, class_weight=None):
        """ The Invariants Mining model for anomaly detection
        Arguments
        ---------
        See DecisionTreeClassifier API: https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html
        Attributes
        ----------
            classifier: object, the classifier for anomaly detection
        """
        self.classifier = tree.DecisionTreeClassifier(criterion=criterion, max_depth=max_depth,
                          max_features=max_features, class_weight=class_weight)

    def fit(self, X, y):
        """
        Arguments
        ---------
            X: ndarray, the event count matrix of shape num_instances-by-num_events
        """
        print('====== Model summary ======')
        self.classifier.fit(X, y)

    def predict(self, X):
        """ Predict anomalies with mined invariants
        Arguments
        ---------
            X: the input event count matrix
        Returns
        -------
            y_pred: ndarray, the predicted label vector of shape (num_instances,)
        """
        
        y_pred = self.classifier.predict(X)
        return y_pred

    def evaluate(self, X, y_true):
        print('====== Evaluation summary ======')
        y_pred = self.predict(X)
        precision, recall, f1 = metrics(y_pred, y_true)
        print('Precision: {:.3f}, recall: {:.3f}, F1-measure: {:.3f}\n'.format(precision, recall, f1))
        return precision, recall, f1

In [None]:
#https://towardsdatascience.com/random-forest-in-python-24d0893d51c0

# Use numpy to convert to arrays
import numpy as np
# Labels are the values we want to predict
labels = np.array(features['actual'])
# Remove the labels from the features
# axis 1 refers to the columns
features= features.drop('actual', axis = 1)
# Saving feature names for later use
feature_list = list(features.columns)
# Convert to numpy array
features = np.array(features)

# Using Skicit-learn to split data into training and testing sets
from sklearn.model_selection import train_test_split
# Split the data into training and testing sets
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = 0.25, random_state = 42)

print('Training Features Shape:', train_features.shape)
print('Training Labels Shape:', train_labels.shape)
print('Testing Features Shape:', test_features.shape)
print('Testing Labels Shape:', test_labels.shape)
Training Features Shape: (261, 14)
Training Labels Shape: (261,)
Testing Features Shape: (87, 14)
Testing Labels Shape: (87,)
    
    
# The baseline predictions are the historical averages
baseline_preds = test_features[:, feature_list.index('average')]
# Baseline errors, and display average baseline error
baseline_errors = abs(baseline_preds - test_labels)
print('Average baseline error: ', round(np.mean(baseline_errors), 2))
Average baseline error:  5.06 degrees.
    
# Import the model we are using
from sklearn.ensemble import RandomForestRegressor
# Instantiate model with 1000 decision trees
rf = RandomForestRegressor(n_estimators = 1000, random_state = 42)
# Train the model on training data
rf.fit(train_features, train_labels);

# Use the forest's predict method on the test data
predictions = rf.predict(test_features)
# Calculate the absolute errors
errors = abs(predictions - test_labels)
# Print out the mean absolute error (mae)
print('Mean Absolute Error:', round(np.mean(errors), 2), 'degrees.')
Mean Absolute Error: 3.83 degrees.
    
# Calculate mean absolute percentage error (MAPE)
mape = 100 * (errors / test_labels)
# Calculate and display accuracy
accuracy = 100 - np.mean(mape)
print('Accuracy:', round(accuracy, 2), '%.')
Accuracy: 93.99 %.
    
    
# Import tools needed for visualization
from sklearn.tree import export_graphviz
import pydot
# Pull out one tree from the forest
tree = rf.estimators_[5]
# Import tools needed for visualization
from sklearn.tree import export_graphviz
import pydot
# Pull out one tree from the forest
tree = rf.estimators_[5]
# Export the image to a dot file
export_graphviz(tree, out_file = 'tree.dot', feature_names = feature_list, rounded = True, precision = 1)
# Use dot file to create a graph
(graph, ) = pydot.graph_from_dot_file('tree.dot')
# Write graph to a png file
graph.write_png('tree.png')


# Limit depth of tree to 3 levels
rf_small = RandomForestRegressor(n_estimators=10, max_depth = 3)
rf_small.fit(train_features, train_labels)
# Extract the small tree
tree_small = rf_small.estimators_[5]
# Save the tree as a png image
export_graphviz(tree_small, out_file = 'small_tree.dot', feature_names = feature_list, rounded = True, precision = 1)
(graph, ) = pydot.graph_from_dot_file('small_tree.dot')
graph.write_png('small_tree.png');

# Get numerical feature importances
importances = list(rf.feature_importances_)
# List of tuples with variable and importance
feature_importances = [(feature, round(importance, 2)) for feature, importance in zip(feature_list, importances)]
# Sort the feature importances by most important first
feature_importances = sorted(feature_importances, key = lambda x: x[1], reverse = True)
# Print out the feature and importances 
[print('Variable: {:20} Importance: {}'.format(*pair)) for pair in feature_importances];


# New random forest with only the two most important variables
rf_most_important = RandomForestRegressor(n_estimators= 1000, random_state=42)
# Extract the two most important features
important_indices = [feature_list.index('temp_1'), feature_list.index('average')]
train_important = train_features[:, important_indices]
test_important = test_features[:, important_indices]
# Train the random forest
rf_most_important.fit(train_important, train_labels)
# Make predictions and determine the error
predictions = rf_most_important.predict(test_important)
errors = abs(predictions - test_labels)
# Display the performance metrics
print('Mean Absolute Error:', round(np.mean(errors), 2), 'degrees.')
mape = np.mean(100 * (errors / test_labels))
accuracy = 100 - mape
print('Accuracy:', round(accuracy, 2), '%.')
Mean Absolute Error: 3.9 degrees.
Accuracy: 93.8 %
    
# Import matplotlib for plotting and use magic command for Jupyter Notebooks
import matplotlib.pyplot as plt
%matplotlib inline
# Set the style
plt.style.use('fivethirtyeight')
# list of x locations for plotting
x_values = list(range(len(importances)))
# Make a bar chart
plt.bar(x_values, importances, orientation = 'vertical')
# Tick labels for x axis
plt.xticks(x_values, feature_list, rotation='vertical')
# Axis labels and title
plt.ylabel('Importance'); plt.xlabel('Variable'); plt.title('Variable Importances');


# Use datetime for creating date objects for plotting
import datetime
# Dates of training values
months = features[:, feature_list.index('month')]
days = features[:, feature_list.index('day')]
years = features[:, feature_list.index('year')]
# List and then convert to datetime object
dates = [str(int(year)) + '-' + str(int(month)) + '-' + str(int(day)) for year, month, day in zip(years, months, days)]
dates = [datetime.datetime.strptime(date, '%Y-%m-%d') for date in dates]
# Dataframe with true values and dates
true_data = pd.DataFrame(data = {'date': dates, 'actual': labels})
# Dates of predictions
months = test_features[:, feature_list.index('month')]
days = test_features[:, feature_list.index('day')]
years = test_features[:, feature_list.index('year')]
# Column of dates
test_dates = [str(int(year)) + '-' + str(int(month)) + '-' + str(int(day)) for year, month, day in zip(years, months, days)]
# Convert to datetime objects
test_dates = [datetime.datetime.strptime(date, '%Y-%m-%d') for date in test_dates]
# Dataframe with predictions and dates
predictions_data = pd.DataFrame(data = {'date': test_dates, 'prediction': predictions})
# Plot the actual values
plt.plot(true_data['date'], true_data['actual'], 'b-', label = 'actual')
# Plot the predicted values
plt.plot(predictions_data['date'], predictions_data['prediction'], 'ro', label = 'prediction')
plt.xticks(rotation = '60'); 
plt.legend()
# Graph labels
plt.xlabel('Date'); plt.ylabel('Maximum Temperature (F)'); plt.title('Actual and Predicted Values');



# Make the data accessible for plotting
true_data['temp_1'] = features[:, feature_list.index('temp_1')]
true_data['average'] = features[:, feature_list.index('average')]
true_data['friend'] = features[:, feature_list.index('friend')]
# Plot all the data as lines
plt.plot(true_data['date'], true_data['actual'], 'b-', label  = 'actual', alpha = 1.0)
plt.plot(true_data['date'], true_data['temp_1'], 'y-', label  = 'temp_1', alpha = 1.0)
plt.plot(true_data['date'], true_data['average'], 'k-', label = 'average', alpha = 0.8)
plt.plot(true_data['date'], true_data['friend'], 'r-', label = 'friend', alpha = 0.3)
# Formatting plot
plt.legend(); plt.xticks(rotation = '60');
# Lables and title
plt.xlabel('Date'); plt.ylabel('Maximum Temperature (F)'); plt.title('Actual Max Temp and Variables');

### Read Processed Yarn Log

In [24]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

In [55]:
processedfile=r'C:\Users\sreddy\OneDrive - MerckGroup\New folder\process_logs\application_1580556634479_40389_processed.csv'
header_list = ["level", "message", "lable"]
df=pd.read_csv(processedfile,delimiter="|",names=header_list)
df.shape

(215, 3)

In [56]:
nodupdf=df.drop_duplicates(subset={"level","message","lable"}, keep='first', inplace=False)
nodupdf.shape
errordf=nodupdf[nodupdf['level']=='error']

In [57]:
errordf

Unnamed: 0,level,message,lable
123,error,importprocessstep sqoop import abort! custo...,


In [58]:
#nodupErrorDf=errordf.drop_duplicates(subset={"message"}, keep='first', inplace=False)
#nodupErrorDf

In [59]:
#for i in nodupErrorDf["message"]:
#    print(len(i))

In [60]:
recomDbfile=r'C:\Users\sreddy\OneDrive - MerckGroup\New folder\process_logs\rdb.csv'
header_list = ["message", "solution"]
rdbDF=pd.read_csv(recomDbfile,delimiter="|",names=header_list)
rdbDF.shape

(9, 2)

In [31]:
#vectorizer = TfidfVectorizer()
#tfidf = vectorizer.fit_transform(nodupErrorDf.iloc[0],rdbDF.iloc[0])

In [32]:
#((tfidf * tfidf.T).A)[0,1]

In [33]:
#tfidf1 = vectorizer.fit_transform(nodupErrorDf.iloc[0],rdbDF.iloc[])
#((tfidf1 * tfidf1.T).A)[0,1]
#tfidf1=vectorizer.fit_transform(['applicationmaster user class threw exception invocationtargetexception caused by numberformatexception for input string'],['applicationmaster user class threw exception invocationtargetexception caused by numberformatexception for input string'])
#((tfidf1 * tfidf1.T).A)[0,1]
#(tfidf1 * tfidf1.T).A
#dir(tfidf1)

In [34]:
#Tfidf_scores = []
#score = cosine_similarity(errordf.iloc[0],rdbDF.iloc[0])
#Tfidf_scores.append(score)
#score

### Levenshtein similarity

In [14]:
'''from Levenshtein import ratio
def getApproximateAnswer2():
    max_score = 0
    answer = ""
    prediction = ""
    recomDbfile=r'C:\Users\sreddy\OneDrive - MerckGroup\New folder\process_logs\rdb.csv'
    header_list = ["message", "solution"]
    rdbDF=pd.read_csv(recomDbfile,delimiter="|",names=header_list)
    for i,q in rdbDF.iterrows():
        score = ratio(str(nodupErrorDf.iloc[0]["message"]),str(q["message"]))
        if score >= 0.9: # I'm sure, stop here
            return score,q["solution"], q["message"]
        elif score > max_score: # I'm unsure, continue
            max_score = score
            answer = q["solution"]
            prediction = q["solution"]
        if max_score > 0.3: # threshold is lowered
            return answer, max_score, prediction
        return "Sorry, I didn't get you.", max_score, prediction

ans=getApproximateAnswer2()
ans'''

(1.0,
 ' Inserting string in place of number.Input file data is in wrong format.Please correct and reprocess',
 'applicationmaster user class threw exception invocationtargetexception caused by numberformatexception for input string')

In [61]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import nltk
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import cross_val_score
from sklearn.metrics.pairwise import euclidean_distances

In [62]:
vectorizer = TfidfVectorizer(binary=False,
                     max_df=0.95, 
                     min_df=0.15,
                     ngram_range = (1,10),use_idf = False, norm = None)

In [63]:
rdbMessage_vectors = vectorizer.fit_transform(rdbDF['message'])
#rdbMessage_vectors = vectorizer.fit_transform(rdbDF['message'])
print(rdbMessage_vectors.shape)
print(vectorizer.get_feature_names())

(9, 41)
['applicationmaster', 'applicationmaster user', 'applicationmaster user class', 'applicationmaster user class threw', 'applicationmaster user class threw exception', 'applicationmaster user class threw exception invocationtargetexception', 'applicationmaster user class threw exception invocationtargetexception caused', 'applicationmaster user class threw exception invocationtargetexception caused by', 'by', 'caused', 'caused by', 'class', 'class threw', 'class threw exception', 'class threw exception invocationtargetexception', 'class threw exception invocationtargetexception caused', 'class threw exception invocationtargetexception caused by', 'entity', 'error', 'exception', 'exception invocationtargetexception', 'exception invocationtargetexception caused', 'exception invocationtargetexception caused by', 'for', 'invocationtargetexception', 'invocationtargetexception caused', 'invocationtargetexception caused by', 'not', 'the', 'threw', 'threw exception', 'threw exception inv

In [64]:
results=[]
q_vector = vectorizer.transform([errordf.iloc[0]["message"]])
results.append(cosine_similarity(q_vector, rdbMessage_vectors.toarray()))
results

[array([[0., 0., 0., 0., 0., 0., 0., 0., 0.]])]

In [65]:
flat_score = [item for sublist in results for item in sublist]

In [67]:
np.array(flat_score)

array([[0., 0., 0., 0., 0., 0., 0., 0., 0.]])

In [68]:
is_all_zero = np.all((np.array(flat_score) == 0))
if is_all_zero:
    print("No Similarity Found")

No Similarity Found


In [None]:
import os
os.popen('cp '+ processedfile +' '+ source.txt destination.txt') 

In [49]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

In [50]:
processedfile=r'C:\Users\sreddy\OneDrive - MerckGroup\New folder\process_logs\1application_1568810042014_190726_processed.csv'
header_list = ["level", "message", "lable"]
df=pd.read_csv(processedfile,delimiter="|",names=header_list)
df.shape

(293, 3)

In [51]:
nodupdf=df.drop_duplicates(subset={"level","message","lable"}, keep='first', inplace=False)
nodupdf.shape
errordf=nodupdf[nodupdf['level']=='error']

In [17]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sreddy\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


True

In [18]:
from nltk.corpus import wordnet as wn
from nltk.corpus import wordnet_ic



dog=wn.synsets('dog', pos=wn.NOUN)[0] #get the first noun synonym of the word "dog"

cat=wn.synsets('cat', pos=wn.NOUN)[0]

rose=wn.synsets('rose', pos=wn.NOUN)[0]

flower=wn.synsets('flower', pos=wn.NOUN)[0]



#brown_ic = wordnet_ic.ic('ic-brown.dat') #load the brown corpus to compute the IC



#rose.res_similarity(flower, brown_ic)

#rose.res_similarity(dog, brown_ic)

#cat.res_similarity(dog, brown_ic)

In [23]:
from nltk.corpus import wordnet_ic
nltk.download('wordnet_ic')
brown_ic = wordnet_ic.ic('ic-brown.dat')

[nltk_data] Downloading package wordnet_ic to
[nltk_data]     C:\Users\sreddy\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet_ic.zip.


In [27]:
rose.res_similarity(flower, brown_ic)
rose.res_similarity(dog, brown_ic)
cat.res_similarity(dog, brown_ic)

7.911666509036577

In [52]:
import nltk 
from nltk.corpus import stopwords
import re
set(stopwords.words('english'))

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'only',
 'or',
 'other',
 'our',
 'ours',
 'ourselves',
 'out',
 'over',
 'own',
 'r

In [53]:
def word_extraction(sentence):
    ignore = ['a', "the", "is"]
    words = re.sub("[^\w]", " ",  sentence).split()
    cleaned_text = [w.lower() for w in words if w not in ignore]
    return cleaned_text

In [54]:
def tokenize(sentences):    
    words = []    
    for sentence in sentences:
        w = word_extraction(sentence)
        words.extend(w)            
        words = sorted(list(set(words)))    
        return words

In [55]:
def generate_bow(allsentences):
    vocab = tokenize(allsentences)
    print("Word List for Document \n{0} \n".format(vocab));

In [56]:
allsentences = ["Joe waited for the train train", "The train was late", "Mary and Samantha took the bus","I looked for Mary and Samantha at the bus station","Mary and Samantha arrived at the bus station early but waited until noon for the bus"]

In [59]:
import numpy
vocab = tokenize(allsentences)
for sentence in allsentences:
    words = word_extraction(sentence)
    bag_vector = numpy.zeros(len(vocab))
    for w in words:
        for i,word in enumerate(vocab):
            if word == w:
                bag_vector[i] += 1
                print("{0}\n{1}\n".format(sentence,numpy.array(bag_vector)))

Joe waited for the train train
[0. 1. 0. 0.]

Joe waited for the train train
[0. 1. 0. 1.]

Joe waited for the train train
[1. 1. 0. 1.]

Joe waited for the train train
[1. 1. 1. 1.]

Joe waited for the train train
[1. 1. 2. 1.]

The train was late
[0. 0. 1. 0.]

I looked for Mary and Samantha at the bus station
[1. 0. 0. 0.]

Mary and Samantha arrived at the bus station early but waited until noon for the bus
[0. 0. 0. 1.]

Mary and Samantha arrived at the bus station early but waited until noon for the bus
[1. 0. 0. 1.]



In [60]:
import nltk 
import re 
import numpy as np 
  
# execute the text here as : 
# text = """ # place text here  """ 
dataset = nltk.sent_tokenize(text) 
for i in range(len(dataset)): 
    dataset[i] = dataset[i].lower() 
    dataset[i] = re.sub(r'\W', ' ', dataset[i]) 
    dataset[i] = re.sub(r'\s+', ' ', dataset[i])

NameError: name 'text' is not defined

In [64]:
# Creating the Bag of Words model 
#dataset="We declare a dictionary to hold our bag of words"

word2count = {} 
for data in dataset: 
    words = nltk.word_tokenize(data) 
    for word in words: 
        if word not in word2count.keys(): 
            word2count[word] = 1
        else: 
            word2count[word] += 1

LookupError: 
**********************************************************************
  Resource [93mpunkt[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt/english.pickle[0m

  Searched in:
    - 'C:\\Users\\sreddy/nltk_data'
    - 'C:\\installed\\anaconda\\nltk_data'
    - 'C:\\installed\\anaconda\\share\\nltk_data'
    - 'C:\\installed\\anaconda\\lib\\nltk_data'
    - 'C:\\Users\\sreddy\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
    - ''
**********************************************************************


In [62]:
import heapq 
freq_words = heapq.nlargest(100, word2count, key=word2count.get)

In [68]:
import nltk  
import numpy as np  
import random  
import string

import bs4 as bs  
import urllib.request  
import re
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sreddy\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [70]:
raw_html = urllib.request.urlopen('https://en.wikipedia.org/wiki/Natural_language_processing')  
raw_html = raw_html.read()

article_html = bs.BeautifulSoup(raw_html, 'lxml')

article_paragraphs = article_html.find_all('p')

article_text = ''

for para in article_paragraphs:  
    article_text += para.text

In [71]:
corpus = nltk.sent_tokenize(article_text)

In [72]:
for i in range(len(corpus )):
    corpus [i] = corpus [i].lower()
    corpus [i] = re.sub(r'\W',' ',corpus [i])
    corpus [i] = re.sub(r'\s+',' ',corpus [i])

In [73]:
print(len(corpus))

58


In [74]:
print(corpus[30])

in the 2010s representation learning and deep neural network style machine learning methods became widespread in natural language processing due in part to a flurry of results showing that such techniques 4 5 can achieve state of the art results in many natural language tasks for example in language modeling 6 parsing 7 8 and many others 


In [75]:
wordfreq = {}
for sentence in corpus:
    tokens = nltk.word_tokenize(sentence)
    for token in tokens:
        if token not in wordfreq.keys():
            wordfreq[token] = 1
        else:
            wordfreq[token] += 1

In [76]:
import heapq
most_freq = heapq.nlargest(200, wordfreq, key=wordfreq.get)

In [77]:
sentence_vectors = []
for sentence in corpus:
    sentence_tokens = nltk.word_tokenize(sentence)
    sent_vec = []
    for token in most_freq:
        if token in sentence_tokens:
            sent_vec.append(1)
        else:
            sent_vec.append(0)
    sentence_vectors.append(sent_vec)

In [78]:
sentence_vectors = np.asarray(sentence_vectors)

In [79]:
sentence_vectors

array([[1, 1, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [1, 1, 0, ..., 0, 0, 0],
       ...,
       [1, 0, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0]])

#### Debug Recimendation Code

In [2]:
import os
import csv
processedFilePth = r"C:\Users\sreddy\OneDrive - MerckGroup\New folder\data\processted_logs"
processedFileName = "application_1568810042014_190726.csv"

fullprocessedFileName=os.path.join(processedFilePth,processedFileName)

In [99]:
def read_logfile(fullLogFileName):
#    filePath=filePath
#    fileName=fileName
#    fullLogFileName=os.path.join(filePath,fileName)
    fullLogFileName=fullLogFileName
    header_list = ["level", "message"]
    log_df=pd.read_csv(fullLogFileName,delimiter="|")
    return log_df

In [102]:
def dedupError(inputDf):
    nodupdf=inputDf.drop_duplicates(subset={0,1}, keep='first', inplace=False)
    errordf=nodupdf[nodupdf[1]=='error']
    return errordf

In [101]:

#fullprocessedFileName=os.path.join(processedFilePth,processedFileName)

logDf=read_logfile(fullprocessedFileName)
kbDf=read_kb(DBFullPath)
#dedupLogDf=dedupError(logDf)    
vectorizer = TfidfVectorizer(binary=False,max_df=0.95,min_df=0.15,
                             ngram_range = (1,10),use_idf = False, norm = None)

kbVecDf = vectorizer.fit_transform(kbDf['message'])
#logVecDf = vectorizer.transform([dedupLogDf.iloc[0]["message"]])    
#similarityMatrix=findSimilarity(logVecDf,kbVecDf)

In [108]:
#dedupLogDf=dedupError(logDf)
#nodupdf=logDf.drop_duplicates(subset={0,1}, keep='first', inplace=False)
logDf.iloc[2]

info                                                                                                                           info
 coarsegrainedexecutorbackend: registered signal handlers for [term, hup, int]     securitymanager: securitymanager: authenticat...
                                                                                                                                   
 .1                                                                                                                                
Name: 2, dtype: object

In [3]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from processYarnLog import preprocessLog
from readYarnLog import read_yarnlog
from recomendationEngin1 import read_logfile,read_kb,dedupError,findSimilarity,isSimilarityFound,recomendSolution
DBFullPath=r'C:\Users\sreddy\OneDrive - MerckGroup\New folder\process_logs\rdb.csv'
toBeAnalyzed=r'C:\Users\sreddy\OneDrive - MerckGroup\New folder\tobeanalyzed'
logDf=read_logfile(fullprocessedFileName)
kbDf=read_kb(DBFullPath)
dedupLogDf=dedupError(logDf)    
vectorizer = TfidfVectorizer(binary=False,max_df=0.95,min_df=0.15,ngram_range = (1,10),use_idf = False, norm = None)

In [4]:
kbVecDf = vectorizer.fit_transform(kbDf['message'])

In [5]:
import pandas as pd

fullLogFileName=fullprocessedFileName
header_list = ["level", "message"]
log_df=pd.read_csv(fullprocessedFileName,delimiter="|")

In [10]:
log_df.columns =["level", "message","d1","d2"]

In [11]:
log_df[["level", "message"]]

Unnamed: 0,level,message
0,info,securitymanager: changing view acls to:TOK_SUID
1,info,securitymanager: changing modify acls to:TOK_...
2,info,securitymanager: securitymanager: authenticat...
3,info,securitymanager: changing view acls to:TOK_SUID
4,info,securitymanager: changing modify acls to:TOK_...
5,info,securitymanager: securitymanager: authenticat...
6,info,slf0jlogger: slf0jlogger started
7,info,remoting: starting remoting
8,info,remoting: remoting started; listening on addr...
9,info,utils: successfully started service 'sparkexe...


In [82]:
header_list = ["0","level", "message"]
data = pd.read_csv(fullprocessedFileName,delimiter="|",usecols=[0,1]) 

In [83]:
data


Unnamed: 0,info,"coarsegrainedexecutorbackend: registered signal handlers for [term, hup, int]"
0,info,securitymanager: changing view acls to:TOK_SUID
1,info,securitymanager: changing modify acls to:TOK_...
2,info,securitymanager: securitymanager: authenticat...
3,info,securitymanager: changing view acls to:TOK_SUID
4,info,securitymanager: changing modify acls to:TOK_...
5,info,securitymanager: securitymanager: authenticat...
6,info,slf0jlogger: slf0jlogger started
7,info,remoting: starting remoting
8,info,remoting: remoting started; listening on addr...
9,info,utils: successfully started service 'sparkexe...
