In [1]:
import datetime
import pickle
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel,cosine_similarity
stops = set(stopwords.words("english"))

In [15]:
def get_data():
    raw = [
        {"Title":"Python",
            "Body": "Python is interpreter languange"
            },
            {"Title":"Trump",
          "Body":"Mr. Trump became president after winning the political election. Though he lost the support of some republican friends, Trump is friends with President Putin"
        },
        {"Title":"Python",
            "Body": "Python is interpreter as well as scripting languange"
            },
        {"Title":"Tatya",
            "Body": "President Trump says Putin had no political interference is the election outcome. He says it was a witchhunt by political parties. He claimed President Putin is a friend who had nothing to do with the election"
        },
        {"Title":"Java",
            "Body": "Java is compiler language"
        },
        {"Title":"C",
            "Body": "C is compiler Languange"
        },

          ]
    df = pd.DataFrame(raw)
    df['Id'] =[i for i in range(len(raw))]

    # tickets_corpus = df.T.to_dict().values()
    tickets_corpus = df.to_dict('records')
    return tickets_corpus

In [16]:
word_lem = WordNetLemmatizer()
def clean_document(document_of_words):
    document_of_words = word_tokenize(document_of_words)
    # remove stop words
    document_of_words = [w.lower() for w in document_of_words if not w in stops]
    
    # stem each word
    # stemmed_words = [word_lem.lemmatize(word) for word in document_of_words]
     
    return ' '.join(document_of_words)

tickets_corpus = get_data()
for ticket_dict in tickets_corpus:
    word = clean_document((str(ticket_dict['Title'])+" "+str(ticket_dict['Body'])))
    print(word)

python python interpreter languange
trump mr. trump became president winning political election . though lost support republican friends , trump friends president putin
python python interpreter well scripting languange
tatya president trump says putin political interference election outcome . he says witchhunt political parties . he claimed president putin friend nothing election
java java compiler language
c c compiler languange


In [17]:
# NOTE : Currently only picks title and summary as relevant data from a ticket.
def extract_clean_documents_from_corpus(tickets_corpus):
    print("Extracting and Cleaning documents...")
    final_corpus = []
    list_of_docs = []
    i = 0
    for ticket_dict in tickets_corpus:
        # print(ticket_dict)
        document_of_words = (str(ticket_dict['Title'])+" "+str(ticket_dict['Body']))
        #print(document_of_words)
        doc_cleaned_text = clean_document(document_of_words)
        # print(doc_cleaned_text)
        list_of_docs.append(doc_cleaned_text)
        final_corpus.append({'que_id':ticket_dict['Id'], 'words':doc_cleaned_text, 'index':i})
        i+=1
    return list_of_docs,final_corpus


In [18]:
def save_tfidf_model(tickets_corpus, output_file_name_without_extn="stack_model1"):
    tfidf_model = TfidfVectorizer()
    list_of_docs,training_ticket_corpus = extract_clean_documents_from_corpus(tickets_corpus)
    tfidf_trainingset = tfidf_model.fit_transform(list_of_docs)
    trained_model_and_data_dict = {'model':tfidf_model, 'trained_data':tfidf_trainingset, 'corpus':training_ticket_corpus}
    model_name_with_path = output_file_name_without_extn+".pickle"
    pickle.dump(trained_model_and_data_dict, open(model_name_with_path, "wb"))
    
    return model_name_with_path

In [19]:
def load_model(model_file_path):
    with open(model_file_path, 'rb') as pickled_file:
        loaded_model_data = pickle.load(pickled_file)
    return loaded_model_data['model'],loaded_model_data['trained_data'],loaded_model_data['corpus']

In [24]:
def find_similar_tickets(num_of_related_tickets_to_return, input_tickets_corpus, model_file_path):
    model, trained_data_vector, trained_data_corpus = load_model(model_file_path)
#     print("trained corpus",len(trained_data_corpus))
    related_tickets_data = []
    for ticket in input_tickets_corpus:
        ticket_data = ticket['Title']
        if ticket['Body'] is not None:
            ticket_data += ticket['Body']
        
        # Clean the unwanted data
        ticket_data = clean_document(ticket_data)
        
        # transform tickets data into vector
        test_data_vector = model.transform([ticket_data])
        
        # Find the similarity using Cosine Kernel
        cosine_similarities = cosine_similarity(test_data_vector, trained_data_vector).flatten()
        related_ticket_indices = cosine_similarities.argsort()[:-num_of_related_tickets_to_return-1:-1]
#         print(len(related_ticket_indices))
        
        
        related_tickets_dict = {}
        related_tickets_dict['que_id'] = ticket['Id']
        related_tickets_dict['related_tickets'] = [trained_data_corpus[i]['que_id'] for i in related_ticket_indices]
        related_tickets_data.append(related_tickets_dict)
    return related_tickets_data

In [23]:
# def get_testing_data():
#     raw = [
#         {"Title":"Python",
#             "Body": "Python is interpreter languange"
#         },
#         {"Title":"Trump",
#           "Body":"Mr. Trump became president after winning the political election. Though he lost the support of some republican friends, Trump is friends with President Putin"
#         },
#         {"Title":"Python",
#             "Body": "Python is interpreter as well as scripting languange"
#             },
#         {"Title":"Tatya",
#             "Body": "President Trump says Putin had no political interference is the election outcome. He says it was a witchhunt by political parties. He claimed President Putin is a friend who had nothing to do with the election"
#         },
#         {"Title":"Java",
#             "Body": "Java is compiler language"
#         },
#         {"Title":"C",
#             "Body": "C is Languange"
#         },
#           ]
#     df = pd.DataFrame(raw)
#     df['Id'] =[i for i in range(len(raw))]

#     # tickets_corpus = df.T.to_dict().values()
#     df.head()
#     new_tickets_corpus = df.to_dict('records')
#     return new_tickets_corpus
# # print(new_tickets_corpus)
# # find top N related tickets
# related_tickets_data = find_similar_tickets(2, new_tickets_corpus, "stack_model.pickle")
# print(related_tickets_data)

In [21]:
# get data
tickets_corpus = get_data()
# Train model
model_name_with_path = save_tfidf_model(tickets_corpus)
print(model_name_with_path)

Extracting and Cleaning documents...
stack_model1.pickle


In [25]:
related_tickets_data = find_similar_tickets(2, tickets_corpus, model_name_with_path)
print(related_tickets_data)

[{'que_id': 0, 'related_tickets': [0, 2]}, {'que_id': 1, 'related_tickets': [1, 3]}, {'que_id': 2, 'related_tickets': [2, 0]}, {'que_id': 3, 'related_tickets': [3, 1]}, {'que_id': 4, 'related_tickets': [4, 5]}, {'que_id': 5, 'related_tickets': [5, 4]}]
