In [3]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
import pandas as pd
import numpy as np

In [4]:
import os
from nltk.parse import stanford
os.environ['STANFORD_PARSER'] = '../stanford-parser-full-2018-10-17/stanford-parser.jar'
os.environ['STANFORD_MODELS'] = '../stanford-parser-full-2018-10-17/stanford-parser-3.9.2-models.jar'

parser = stanford.StanfordParser(model_path="../stanford-parser-full-2018-10-17/englishPCFG.ser")
sentences = parser.raw_parse_sents(("How many runs were scored by RR Pant between kings and daredevils on 8/4/2018?",))
print(sentences)

# GUI
for line in sentences:
    for sentence in line:
        print(sentence)

Please use [91mnltk.parse.corenlp.CoreNLPParser[0m instead.
  


<list_iterator object at 0x7f46f094afd0>
(ROOT
  (SBARQ
    (WHNP (WHADJP (WRB How) (JJ many)) (NNS runs))
    (SQ
      (VBD were)
      (VP
        (VBN scored)
        (PP
          (IN by)
          (NP
            (NP (NNP RR) (NNP Pant))
            (PP
              (IN between)
              (NP (NNS kings) (CC and) (NNS daredevils)))))
        (PP (IN on) (NP (CD 8/4/2018)))))
    (. ?)))


In [2]:
import spacy
from spacy import displacy
nlp = spacy.load("en_core_web_sm")
text = ("How many runs were scored by RR Pant between Kings XI and Daredevils on 8/4/2018?")
doc = nlp(text)

In [3]:
# Analyze syntax
print("Noun phrases:", [chunk.text for chunk in doc.noun_chunks])
print("Verbs:", [token.lemma_ for token in doc if token.pos_ == "VERB"])

Noun phrases: ['How many runs', 'RR Pant', 'Kings XI', 'Daredevils']
Verbs: ['be', 'score']


In [4]:
# Find named entities, phrases and concepts
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

RR Pant 29 36 ORG
Kings XI 45 53 PERSON


In [5]:
for token in doc:
    print(token.text, token.lemma_, token.pos_, token.tag_, token.dep_, token.shape_, token.is_alpha, token.is_stop)

How how ADV WRB advmod Xxx True True
many many ADJ JJ amod xxxx True True
runs run NOUN NNS nsubjpass xxxx True False
were be VERB VBD auxpass xxxx True True
scored score VERB VBN ROOT xxxx True False
by by ADP IN agent xx True True
RR RR PROPN NNP compound XX True False
Pant Pant PROPN NNP pobj Xxxx True False
between between ADP IN prep xxxx True True
Kings king NOUN NNS compound Xxxxx True False
XI XI PROPN NNP pobj XX True False
and and CCONJ CC cc xxx True True
Daredevils Daredevils PROPN NNP conj Xxxxx True False
on on ADP IN prep xx True True
8/4/2018 8/4/2018 NUM CD pobj d/d/dddd False False
? ? PUNCT . punct ? False False


In [6]:
print("Noun chunks")
for chunk in doc.noun_chunks:
    print(chunk.text, chunk.root.text, chunk.root.dep_, chunk.root.head.text)

Noun chunks
How many runs runs nsubjpass scored
RR Pant Pant pobj by
Kings XI XI pobj between
Daredevils Daredevils conj XI


In [7]:
for token in doc:
    print(token.text, token.dep_, token.head.text, token.head.pos_,
            [child for child in token.children])

How advmod many ADJ []
many amod runs NOUN [How]
runs nsubjpass scored VERB [many]
were auxpass scored VERB []
scored ROOT scored VERB [runs, were, by, on, ?]
by agent scored VERB [Pant]
RR compound Pant PROPN []
Pant pobj by ADP [RR, between]
between prep Pant PROPN [XI]
Kings compound XI PROPN []
XI pobj between ADP [Kings, and, Daredevils]
and cc XI PROPN []
Daredevils conj XI PROPN []
on prep scored VERB [8/4/2018]
8/4/2018 pobj on ADP []
? punct scored VERB []


In [8]:
displacy.render(doc, style='dep')

In [9]:
displacy.render(doc, style='ent')

In [10]:
# tempr = set()
# import re
# for x in batsmen_r:
#     batsman = (re.split("[^a-zA-Z\s]", x)[0]).strip()
#     tempr.add('{"label": "PLAYER", "pattern":"' + batsman + '"},')
# for x in bowlers:
#     tempr.add('{"label": "PLAYER", "pattern":"' + x + '"},')
# for x in tempr:
#     print(x)

NameError: name 'batsmen_r' is not defined

In [14]:
from spacy.lang.en import English
from spacy.pipeline import EntityRuler

# nlp = English()
nlp = spacy.load("en_core_web_sm")
ruler = EntityRuler(nlp, overwrite_ents=True).from_disk("/home/trip3r/Research/cricket-scoreboard-scrape/data/patterns.jsonl")
# ruler.add_patterns(patterns)
nlp.add_pipe(ruler)

# doc = nlp(u"Apple is opening its first big office in San Francisco.")

# doc = nlp("How many runs were scored by RR Pant between Kings XI and Daredevils on 8/4/2018?")
# print([(ent.text, ent.label_) for ent in doc.ents])

In [15]:
with open('../data/queslist.txt', 'r') as f:
    x = f.readlines()
    for t in range(0,15000,500):
        tx = x[t].strip()
        doc = nlp(tx)
#         print(tx)
#         print([(ent.text, ent.label_) for ent in doc.ents])
        displacy.render(doc, style='ent')

In [16]:
displacy.render(doc, style='dep')

In [6]:
# A Dynamic Programming based Python program for edit 
# distance problem 
def editDistDP(str1, str2):
    m = len(str1)
    n = len(str2)
    # Create a table to store results of subproblems 
    dp = [[0 for x in range(n+1)] for x in range(m+1)] 
  
    # Fill d[][] in bottom up manner 
    for i in range(m+1): 
        for j in range(n+1): 
  
            # If first string is empty, only option is to 
            # insert all characters of second string 
            if i == 0: 
                dp[i][j] = j    # Min. operations = j 
  
            # If second string is empty, only option is to 
            # remove all characters of second string 
            elif j == 0: 
                dp[i][j] = i    # Min. operations = i 
  
            # If last characters are same, ignore last char 
            # and recur for remaining string 
            elif str1[i-1] == str2[j-1]: 
                dp[i][j] = dp[i-1][j-1] 
  
            # If last character are different, consider all 
            # possibilities and find minimum 
            else: 
                dp[i][j] = 1 + min(dp[i][j-1],        # Insert 
                                   dp[i-1][j],        # Remove 
                                   dp[i-1][j-1])    # Replace 
  
    return dp[m][n] 

def minEditDist(noun, entities):
    return min([editDistDP(noun, x) for x in entities])

def minEditEnt(noun, entities, dist):
    for x in entities:
        if editDistDP(noun, x) == dist:
            return x

In [7]:
# name = "Hyderabad"
# entityLists = [teams, grounds, batsmen, bowlers]
# temp = [minEditDist(name, entList) for entList in entityLists]
# temp1 = entityLists[np.argmin(temp)]
# minEditEnt(name, temp1, np.min(temp))
# # print(entityLists.index(min(temp)))

In [55]:
def preprocess(query):
    st = nltk.word_tokenize(query)
    st = nltk.pos_tag(st)
    return st

def ExtractPhrases(tree, phrase):
    myPhrases = []
    if (tree.label() == phrase):
        myPhrases.append( tree.copy(True))
    for child in tree:
        if (type(child) is nltk.tree.Tree):
            list_of_phrases = ExtractPhrases(child, phrase)
            if (len(list_of_phrases) > 0):
                myPhrases.extend(list_of_phrases)
    return myPhrases

pattern = """
        NP: {<JJ>*<NN>*<NNS>*<NNP>*<NNPS>*}
        Date: {<CD>}
            """
teams = ['Mum Indians', 'Kings XI', 'KKR', 'Sunrisers', 'Super Kings', 'Royals', 'RCB', 'Daredevils']
grounds = ['Mumbai', 'Mohali', 'Kolkata', 'Hyderabad (Deccan)', 'Chennai', 'Jaipur', 'Bengaluru', 'Pune', 'Delhi', 'Indore']
batsmen_r = ['KL Rahul †', 'CH Gayle', 'Yuvraj Singh', 'KK Nair', 'AR Patel', 'MP Stoinis', 'MA Agarwal', 'R Ashwin (c)', 'SA Yadav', 'E Lewis', 'Ishan Kishan †', 'HH Pandya', 'RG Sharma (c)', 'KH Pandya', 'PP Shaw', 'GJ Maxwell', 'SS Iyer (c)', 'RR Pant †', 'V Shankar', 'Abhishek Sharma', 'KA Pollard', 'BCJ Cutting', 'M Markande', 'JJ Bumrah', 'Mustafizur Rahman', 'C Munro', 'LE Plunkett', 'DJM Short', 'JC Buttler †', 'SV Samson', 'BA Stokes', 'RA Tripathi', 'K Gowtham', 'JC Archer', 'SR Watson', 'AT Rayudu', 'SK Raina', 'MS Dhoni (c) †', 'SW Billings', 'DJ Bravo', 'RA Jadeja', 'AM Rahane (c)', 'H Klaasen', 'STR Binny', 'S Gopal', 'JD Unadkat', 'B Laughlin', 'AJ Finch', 'AJ Tye', 'MM Sharma', 'Mujeeb Ur Rahman', 'Q de Kock †', 'BB McCullum', 'V Kohli (c)', 'AB de Villiers', 'SN Khan', 'Mandeep Singh', 'CR Woakes', 'Washington Sundar', 'PJ Sangwan', 'WP Saha †', 'S Dhawan', 'KS Williamson (c)', 'MK Pandey', 'Shakib Al Hasan', 'DJ Hooda', 'YK Pathan', 'Rashid Khan', 'S Kaul', 'Sandeep Sharma', 'B Stanlake', 'CA Lynn', 'SP Narine', 'RV Uthappa', 'N Rana', 'KD Karthik (c) †', 'AD Russell', 'Shubman Gill', 'TK Curran', 'PP Chawla', 'Shivam Mavi', 'G Gambhir (c)', 'JJ Roy', 'SS Iyer', 'R Tewatia', 'CH Morris', 'Mohammed Shami', 'S Nadeem', 'TA Boult', 'PA Patel †', 'MM Ali', 'C de Grandhomme', 'TG Southee', 'AD Hales', 'CJ Anderson', 'P Negi', 'UT Yadav', 'Mohammed Siraj', 'AS Rajpoot', 'JPR Scantlebury-Searles', 'H Klaasen †', 'SP Goswami †', 'CR Brathwaite', 'B Kumar', 'RK Singh', 'R Vinay Kumar', 'NV Ojha', 'DT Christian', 'MJ McClenaghan', 'F du Plessis', 'Basil Thampi', 'MK Lomror', 'MK Tiwary', 'DA Miller', 'BB Sran', 'A Mishra', 'M Vohra', 'JP Duminy', 'Mohammad Nabi', 'HV Patel', 'IS Sodhi', 'Anureet Singh', 'DS Kulkarni', 'M Ashwin', 'DR Shorey', 'M Vijay', 'KV Sharma', 'MG Johnson', 'DL Chahar', 'AD Nath', 'Harbhajan Singh', 'SN Thakur', 'YS Chahal', 'A Dananjaya', 'Kuldeep Yadav', 'M Prasidh Krishna', 'RK Bhui', 'P Chopra', 'KM Jadhav', 'MA Wood', 'Imran Tahir']
batsmen = []
for x in batsmen_r:
    batsmen.append(' '.join(x.split(' ')[:2]))
bowlers = ['R Ashwin', 'MM Sharma', 'Mujeeb Ur Rahman', 'AR Patel', 'AJ Tye', 'MP Stoinis', 'TA Boult', 'Mohammed Shami', 'A Mishra', 'CH Morris', 'DT Christian', 'R Tewatia', 'N Rana', 'M Prasidh Krishna', 'MG Johnson', 'SP Narine', 'PP Chawla', 'Kuldeep Yadav', 'AD Russell', 'MJ McClenaghan', 'JJ Bumrah', 'HH Pandya', 'KH Pandya', 'M Markande', 'BCJ Cutting', 'Sandeep Sharma', 'B Stanlake', 'Rashid Khan', 'S Kaul', 'Shakib Al Hasan', 'PJ Sangwan', 'Mustafizur Rahman', 'JP Duminy', 'TG Southee', 'UT Yadav', 'Mohammed Siraj', 'YS Chahal', 'Washington Sundar', 'C de Grandhomme', 'Shivam Mavi', 'B Kumar', 'KK Ahmed', 'CR Brathwaite', 'S Nadeem', 'V Shankar', 'TK Curran', 'BB Sran', 'JPR Scantlebury-Searles', 'Basil Thampi', 'MM Ali', 'DS Kulkarni', 'JD Unadkat', 'K Gowtham', 'B Laughlin', 'S Gopal', 'BA Stokes', 'DJM Short', 'DL Chahar', 'L Ngidi', 'Harbhajan Singh', 'SN Thakur', 'DJ Bravo', 'RA Jadeja', 'AS Rajpoot', 'JC Archer', 'Yuvraj Singh', 'SR Watson', 'MA Wood', 'Imran Tahir', 'Anureet Singh', 'Avesh Khan', 'LE Plunkett', 'GJ Maxwell', 'IS Sodhi', 'S Lamichhane', 'CJ Dala', 'HV Patel', 'M Ashwin', 'DJ Hooda', 'KV Sharma', 'MK Tiwary', 'Mohammad Nabi', 'KM Asif', 'MK Lomror', 'YK Pathan', 'CJ Jordan', 'CR Woakes', 'K Khejroliya', 'A Dananjaya', 'DJ Willey', 'CJ Anderson', 'R Vinay Kumar', 'STR Binny', 'P Negi', 'Ankit Sharma']

In [9]:
# Find winner
def type1(team1, team2, flag, ground='', date=''):
    df = pd.read_csv('../data/match-data.csv')
    x = df
    ff_filter = []
    
    if ground != '' and date != '':
        ff_filter = ((x['Ground'] == ground) & (x['Match Date'] == date))
    elif ground != '':
        ff_filter = (x['Ground'] == ground)
    elif date != '':
        ff_filter = (x['Match Date'] == date)
    x = x[ff_filter]
    team_filter = ((x['Team 1'] == team1) & (x['Team 2'] == team2)) | ((x['Team 1'] == team2) & (x['Team 2'] == team1))
    x = x[team_filter]
    res = ""
    if flag == 1:
        for i in x['Winner'].values:
            if res != "":
                res += ", "
            res += i
    else:
        if team1 == str(x['Winner'].values[0]):
            res = team2
        else:
            res = team1
#     print('Here?')
#     print(res)
    res += " won by margin of " + str(x['Margin'].values[0])
    return res

In [10]:
# given date and team names return the ground.
def type2(team1, team2, ground = '', date=''):
    df = pd.read_csv('../data/match-data.csv')
    x = df
    ff_filter = []
    if date != '':
        ff_filter = (df['Match Date'] == date)
    if ground != '':
        ff_filter = (df['Ground'] == ground)
    x = x[ff_filter]
#     print(x)
    team_filter = ((x['Team 1'] == team1) & (x['Team 2'] == team2)) | ((x['Team 1'] == team2) & (x['Team 2'] == team1))
    x = x[team_filter]
    res = ""
    if ground == '':
        for i in x['Ground'].values:
            if res != "":
                res += ', '
            res += i
            
    if date == '':
        for i in x['Match Date'].values:
            if res != "":
                res += ', '
            res += i
#     res = x
    return res

In [11]:
def type3(team1, team2, batsman, ground='', date=''):
    df = pd.read_csv('../data/match-data.csv')
    x = df
    ff_filter = []
    if ground != '' and date != '':
        ff_filter = ((df['Ground'] == ground) & (df['Match Date'] == date))
    elif ground != '':
        ff_filter = (df['Ground'] == ground)
    elif date != '':
        ff_filter = (df['Match Date'] == date)
    x = x[ff_filter]
    team_filter = ((x['Team 1'] == team1) & (x['Team 2'] == team2)) | ((x['Team 1'] == team2) & (x['Team 2'] == team1))
    x = x[team_filter]
    return x.index.values[0]

In [12]:
def type4(team1, team2, ground, bowler):
    df = pd.read_csv('../data/match-data.csv')
    x = df
    ground_filter = (df['Ground'] == ground)
    x = x[ground_filter]
    team_filter = ((x['Team 1'] == team1) & (x['Team 2'] == team2)) | ((x['Team 1'] == team2) & (x['Team 2'] == team1))
    x = x[team_filter]
    return x.index.values[0]

In [13]:
def NLInput(query):
    tokens = nltk.word_tokenize(query)
    pp_tokens = preprocess(query)
    return tokens, pp_tokens

def quesProc(pp_sent, pattern):
    custParser = nltk.RegexpParser(pattern)
    tree = custParser.parse(pp_sent)
    return tree

def extractCols(tree):
    lst = ExtractPhrases(tree, 'NP')
    date = ExtractPhrases(tree, 'Date')
    if date != []:
        date = date[0][0][0]
    noun_phrase = []
    for l in lst:
        temp = ""
        for i in range(len(l)):
            temp += l[i][0]
            temp += " "
        temp = temp.strip()
        noun_phrase.append(temp)
    return noun_phrase, date

def queryTable(sent, nouns, date, ):
    no_of_teams = 0
    team1 = ""
    team2 = ""
    ground = ""
    batsman = ""
    bowler = ""
    player = ""
    entityLists = [teams, grounds, batsmen, bowlers]
    for name in nouns:
        if name in teams:
            if team1 == '':
                team1 = name
                no_of_teams+=1
            else:
                team2 = name
                no_of_teams+=1
        elif name in grounds:
            ground = name
        elif name in batsmen and name in bowlers:
            player = name
        elif name in batsmen:
            batsman = name
        elif name in bowlers:
            bowler = name
    res = ""
    formatted_date = ""
#     print(date)
    if date != []:
#         print(date)
        temp = date.split('/')
        if (int(temp[1]) == 4) or (int(temp[1]) == "04"):
            formatted_date += "Apr "
        else:
            formatted_date += "May "
        formatted_date += str(temp[0])
        formatted_date += ", "
        formatted_date += temp[2]

    if no_of_teams == 2 and (formatted_date != "" or ground != "") and ("Where" in sent or "When" in sent or "Which" in sent):
        res = type2(team1, team2, ground, formatted_date)
        
    elif no_of_teams == 2 and (ground != "" or formatted_date != "") and ("What" in sent or "How" in sent):
        x = ""
        if batsman != "" and ("by" in sent or "of" in sent):
            ind = type3(team1, team2, batsman, ground, formatted_date)
            df = pd.read_csv('../data/scoreboard-match-' + str(ind + 1) + '-batting.csv')
            x = df 
            batting_filter = (df['BATSMEN'].str.contains(batsman))
            x = x[batting_filter]  
            
        elif bowler != "" and ("bowl" in sent or "against" in sent):
            ind = type3(team1, team2, bowler, ground, formatted_date)
            df = pd.read_csv('../data/scoreboard-match-' + str(ind + 1) + '-bowling.csv')
            x = df
            bowling_filter = (df['Bowling'].str.contains(bowler))
            x = x[bowling_filter]
        if ("runs" in sent or "score" in sent):
            res = x['R'].values[0]
        elif ('no' in sent and 'balls' in sent):
            res = x['NB'].values[0]
        elif ("wicket" in sent):
            res = x['W'].values[0]
        elif ("overs" in sent):
            res = x['O'].values[0]
        elif ('wide' in sent):
            res = x['WD'].values[0]
        elif ("fours" in sent):
            res = x['4s'].values[0]
        elif ("sixes" in sent):
            res = x['6s'].values[0]
        elif ('rate' in sent):
            res = x['SR'].values[0]
        elif ('balls' in sent):
            res = x['B'].values[0]
            
    else:
        for x in sent:
            if (x == "winner") or (x == "won"):
                res = type1(team1, team2, 1, ground, formatted_date)
                break
            elif (x == "lost") or (x == "loser"):
                res = type1(team1, team2, 2, ground, formatted_date)
                break
    return res

In [16]:
def Hub(questions):
    for NLquery in questions:
        sent, pp_sent = NLInput(NLquery)
#         print(pp_sent)
        tree = quesProc(pp_sent, pattern)
        print(tree)
        nouns, date = extractCols(tree)
#         print(nouns, date)
        res = queryTable(sent, nouns, date)
        print("Q:",NLquery)
        print("A:",res)
        print()

In [28]:
sent, pp_sent = NLInput("For which player his rebounce is two and points is three")
tree = quesProc(pp_sent, pattern)
nltk.draw.tree.draw_trees(tree)

In [18]:
questions = [
    # Type 1
    "Who was the winner between Mum Indians and Super Kings at Mumbai?",
    "Who was the winner between Kings XI and Daredevils on 8/4/2018?",
    # Type 2
    "Where was the match between KKR & RCB held on 8/04/2018?",
    "When was the match between RCB & Kings XI at Bengaluru?",
    # Type 3
    "How many runs were scored by RR Pant between Kings XI and Daredevils on 8/4/2018?",
    "How many fours were scored by RR Pant between Kings XI and Daredevils on 8/4/2018?",
    "How many sixes were scored by RR Pant between Kings XI and Daredevils on 8/4/2018?",
    "How many balls were played by RR Pant between Kings XI and Daredevils on 8/4/2018?",
    "What was the strike rate of RR Pant between Kings XI and Daredevils on 8/4/2018?",
    # Type 4
    "How many runs were scored against R Tewatia between Kings XI and Daredevils on 8/4/2018?",
    "How many fours were scored against R Tewatia between Kings XI and Daredevils on 8/4/2018?",
    "How many sixes were scored against R Tewatia between Kings XI and Daredevils on 8/4/2018?",
    "How many overs did R Tewatia bowl between Kings XI and Daredevils on 8/4/2018?",
    "How many no balls did R Tewatia bowl between Kings XI and Daredevils on 8/4/2018?",
    "How many wide balls did R Tewatia bowl between Kings XI and Daredevils on 8/4/2018?",
    "For which plyaer his rebounce is two and points is three",
]
Hub(questions)

(S
  Who/WP
  was/VBD
  the/DT
  (NP winner/NN)
  between/IN
  (NP Mum/NNP Indians/NNPS)
  and/CC
  (NP Super/NNP Kings/NNP)
  at/IN
  (NP Mumbai/NNP)
  ?/.)
Q: Who was the winner between Mum Indians and Super Kings at Mumbai?
A: Super Kings won by margin of 1 wicket

(S
  Who/WP
  was/VBD
  the/DT
  (NP winner/NN)
  between/IN
  (NP Kings/NNP XI/NNP)
  and/CC
  (NP Daredevils/NNP)
  on/IN
  (Date 8/4/2018/CD)
  ?/.)
Q: Who was the winner between Kings XI and Daredevils on 8/4/2018?
A: Kings XI won by margin of 6 wickets

(S
  Where/WRB
  was/VBD
  the/DT
  (NP match/NN)
  between/IN
  (NP KKR/NNP)
  &/CC
  (NP RCB/NNP)
  held/VBD
  on/IN
  (Date 8/04/2018/CD)
  ?/.)
Q: Where was the match between KKR & RCB held on 8/04/2018?
A: Kolkata

(S
  When/WRB
  was/VBD
  the/DT
  (NP match/NN)
  between/IN
  (NP RCB/NNP)
  &/CC
  (NP Kings/NNP XI/NNP)
  at/IN
  (NP Bengaluru/NNP)
  ?/.)
Q: When was the match between RCB & Kings XI at Bengaluru?
A: Apr 13, 2018

(S
  How/WRB
  (NP many/JJ runs/

TypeError: string indices must be integers