In [1]:
import numpy as np
import pandas as pd
import re
import time
from datasketch import MinHash, MinHashLSHForest

In [2]:
# Load the Jira stories CSV into a DataFrame.
# NOTE(review): absolute, machine-specific path -- the notebook only runs where
# this exact file location exists; consider a configurable data directory.
jira_stories=pd.read_csv('D:/dataIssue/Jira_FinalDs2.csv')
# Bare expression so the notebook renders the frame as the cell output.
jira_stories

Unnamed: 0.1,Unnamed: 0,Heading_Phrase,Other_Phrase,Clean_Description,storypoints
0,3,application,auto,servicing application keeps auto refreshing wh...,3.0
1,17,ui review,sprint,data team transaction import after uploading t...,3.0
2,18,transactions,is received transfer group asset data,transfer in transactions when data for an asse...,3.0
3,19,Transaction Type,transfer,transaction type transfer in transcode id buck...,3.0
4,20,Purchase Transactions,interest accruals acquisition date asset,purchase transactions interest accruals when c...,5.0
5,38,purchase transactions,data import is received asset,purchase transactions when asset is received t...,3.0
6,39,to view the event details,asset list user events,functional specifications user should be able ...,5.0
7,40,get populated on the date,redemption date gets paid asset,redemption date should get populated on the da...,3.0
8,41,boarding Assets,data import,whenever data set is uploaded by the data team...,5.0
9,44,liens data,data team loans,user will have an option to upload the data by...,5.0


In [3]:
# Remove the 'Unnamed: 0' column. The frame is rebound instead of mutated in
# place, and a missing column is ignored, so re-running this cell is a no-op
# rather than a KeyError.
jira_stories = jira_stories.drop(columns=['Unnamed: 0'], errors='ignore')

In [4]:
# Display the frame again to inspect it after the column drop.
jira_stories

Unnamed: 0,Heading_Phrase,Other_Phrase,Clean_Description,storypoints
0,application,auto,servicing application keeps auto refreshing wh...,3.0
1,ui review,sprint,data team transaction import after uploading t...,3.0
2,transactions,is received transfer group asset data,transfer in transactions when data for an asse...,3.0
3,Transaction Type,transfer,transaction type transfer in transcode id buck...,3.0
4,Purchase Transactions,interest accruals acquisition date asset,purchase transactions interest accruals when c...,5.0
5,purchase transactions,data import is received asset,purchase transactions when asset is received t...,3.0
6,to view the event details,asset list user events,functional specifications user should be able ...,5.0
7,get populated on the date,redemption date gets paid asset,redemption date should get populated on the da...,3.0
8,boarding Assets,data import,whenever data set is uploaded by the data team...,5.0
9,liens data,data team loans,user will have an option to upload the data by...,5.0


In [5]:
def preprocess(text):
    """Normalize raw text into a list of lowercase word tokens.

    Every character that is neither a word character nor whitespace is
    removed, the remainder is lowercased and split on runs of whitespace.

    Parameters
    ----------
    text : str or None
        Raw text. Missing values (``None`` or a float NaN, which is what
        pandas produces when string columns containing gaps are concatenated)
        are treated as empty text.

    Returns
    -------
    list of str
        Tokens in order of appearance; an empty list for empty or missing
        input.
    """
    # A missing cell arrives as None/NaN rather than str; re.sub would raise
    # TypeError on it, so map it to "no tokens" instead of crashing.
    if text is None or (isinstance(text, float) and text != text):
        return []
    stripped = re.sub(r'[^\w\s]', '', text)
    return stripped.lower().split()

In [6]:
# Number of MinHash permutations; handed to get_forest, which uses it as
# num_perm for every signature and for the forest itself.
permutations = 128

In [7]:
def get_forest(data, perms, text_column='text'):
    """Build and index a MinHash LSH Forest over one text column of ``data``.

    Each entry of ``data[text_column]`` is tokenized with ``preprocess``,
    turned into a MinHash signature, and added to the forest under its
    0-based position in the column (not its index label), so query results
    are positions suitable for ``.iloc``.

    Parameters
    ----------
    data : mapping of column name -> iterable of text (e.g. a DataFrame)
        Source of the documents to index.
    perms : int
        Number of permutations used for every MinHash and for the forest.
    text_column : str, optional
        Name of the column holding the text to index. Defaults to ``'text'``,
        the column the original implementation hard-coded.

    Returns
    -------
    MinHashLSHForest
        The populated forest, already indexed and ready to be queried.
    """
    # perf_counter is monotonic, so the reported duration cannot be skewed by
    # wall-clock adjustments the way time.time() differences can.
    start_time = time.perf_counter()

    forest = MinHashLSHForest(num_perm=perms)

    # Single pass: build each signature and insert it immediately, without
    # holding a separate list of every signature.
    for position, text in enumerate(data[text_column]):
        signature = MinHash(num_perm=perms)
        for token in preprocess(text):
            signature.update(token.encode('utf8'))
        forest.add(position, signature)

    # Keys added to the forest are not searchable until index() is called.
    forest.index()

    print('It took %s seconds to build forest.' %(time.perf_counter()-start_time))

    return forest

In [8]:
def predict(text, database, perms, num_results, forest):
    """Look up the story points of the indexed rows most similar to ``text``.

    The query text is tokenized with ``preprocess`` and hashed into a MinHash
    with ``perms`` permutations, then ``forest`` is asked for ``num_results``
    keys. Those keys are used as row positions into ``database``.

    Returns the ``'storypoints'`` values of the matched rows, in the order the
    forest returned them, or ``None`` when the forest returns no keys (in
    which case no timing line is printed).
    """
    began = time.time()

    # Build the query signature the same way the indexed signatures are built.
    signature = MinHash(num_perm=perms)
    for token in preprocess(text):
        signature.update(token.encode('utf8'))

    matches = forest.query(signature, num_results)
    positions = np.array(matches)
    if not len(positions):
        # Nothing came back from the forest for this query.
        return None

    # Forest keys are row positions, hence positional (iloc) selection.
    story_points = database.iloc[positions]['storypoints']

    print('It took %s seconds to query forest.' %(time.time()-began))

    return story_points

In [9]:
# Read the CSV again into a separate frame that serves as the search database.
# NOTE(review): same hard-coded absolute path as the earlier load; this frame
# still contains every column from the file.
db = pd.read_csv('D:/dataIssue/Jira_FinalDs2.csv')

In [10]:
# Cast story points to strings in one vectorized step (instead of converting
# element by element) so the column can be concatenated with the text columns.
db['storypoints'] = db['storypoints'].astype(str)

In [11]:
# Spot-check the first row's story points value after the string cast.
db.loc[0, 'storypoints']

'3.0'

In [12]:
# Assemble the text that gets indexed. A NaN in any source column would make
# the whole concatenated value NaN and crash tokenization, so missing cells are
# replaced with empty strings (and everything coerced to str) first.
# NOTE(review): 'storypoints' is part of the indexed text while queries carry
# no story points -- confirm that including it is intended.
text_parts = db[['Heading_Phrase', 'Other_Phrase', 'Clean_Description', 'storypoints']].fillna('').astype(str)
db['text'] = (
    text_parts['Heading_Phrase'] + ' '
    + text_parts['Other_Phrase'] + ' '
    + text_parts['Clean_Description'] + ' '
    + text_parts['storypoints']
)
forest = get_forest(db, permutations)

It took 2.9665560722351074 seconds to build forest.


In [13]:
# Number of results to request from the forest for each query.
num_recommendations = 2

In [14]:
# Free-text story description to look up in the forest.
text = ''' ui review sprint data team transaction import after uploading the file it not showing in uploaded history table data team data import error link should be clickable published status should come according to action status'''
# Pass the shared `permutations` setting rather than a literal 128: the query
# signature should use the same number of permutations as the forest, and a
# duplicated literal would silently drift if the setting is changed.
result = predict(text, db, permutations, num_recommendations, forest)
print('\n Top Recommendation(s) is(are) \n', result)

It took 0.010993719100952148 seconds to query forest.

 Top Recommendation(s) is(are) 
 1     3.0
19    5.0
Name: storypoints, dtype: object


In [15]:
# Show the returned story points as the cell output.
result

1     3.0
19    5.0
Name: storypoints, dtype: object