In [18]:
import pandas as pd

# Load JSON from a file or string
df = pd.read_csv('ind_vs_eng-2_test.json.csv')  # or use json.loads(string)
print(df[['content',]])

                                                content
0     OUT! IN THE AIR AND GONE! All over for England...
1     Short and on off, Bashir makes room and chops ...
2     Beaten! Slower through the air, fuller and out...
3     Pitched way outside leg, and much slower throu...
4     Much slower and the length is pulled back, on ...
...                                                 ...
2386  FOUR! India are up and running and so is Yasha...
2387  The in-between length now, bowled wide outside...
2388  Drags the length back and brings the line towa...
2389  Much fuller now, just around off, darting acro...
2390  Seam-up from Woakes, hitting the good length a...

[2391 rows x 1 columns]


In [19]:
ball_types = [
    "short", "shorter", "short of length", "bouncer",
    "full", "fuller", "full toss", "overpitched",
    "yorker", "good length", "back of a length"
]

shot_types = [
    "pull", "pulled", "drive", "drives", "cover drive", "cut", "cuts",
    "sweep", "swept", "flick", "flicked", "hook", "hooked",
    "loft", "lofted", "reverse sweep", "ramp"
]

regions = [
    "deep square leg", "fine leg", "third man", "long on", "long off",
    "cover", "extra cover", "point", "backward point", "mid wicket",
    "mid on", "mid off", "behind square", "cow corner"
]


In [22]:
import pandas as pd
import nltk
import re
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    tokens = word_tokenize(str(text).lower())
    lemmas = [lemmatizer.lemmatize(token) for token in tokens]
    return ' '.join(lemmas)


In [23]:
def match_phrases(text, phrases):
    matches = []
    sorted_phrases = sorted(phrases, key=len, reverse=True)  # match longer phrases first
    for p in sorted_phrases:
        if re.search(r"\b" + re.escape(p) + r"\b", text.lower()):
            matches.append(p)
    return matches


In [25]:
df['lemmatized'] = df['content'].apply(lemmatize_text)

# Extract features
df['ball_type'] = df['lemmatized'].apply(lambda x: match_phrases(x, ball_types))
df['shot_type'] = df['lemmatized'].apply(lambda x: match_phrases(x, shot_types))
df['region'] = df['lemmatized'].apply(lambda x: match_phrases(x, regions))

In [26]:
df.to_csv('commentary_analysis.csv', index=False)

In [28]:
df[['ball_type','shot_type','region','content',]]

Unnamed: 0,ball_type,shot_type,region,content
0,[],[],[cover],OUT! IN THE AIR AND GONE! All over for England...
1,[short],[],"[backward point, point]","Short and on off, Bashir makes room and chops ..."
2,[fuller],[sweep],[],"Beaten! Slower through the air, fuller and out..."
3,[],[],[],"Pitched way outside leg, and much slower throu..."
4,[],[pulled],[],"Much slower and the length is pulled back, on ..."
...,...,...,...,...
2386,[],[],[fine leg],FOUR! India are up and running and so is Yasha...
2387,[],[],[],"The in-between length now, bowled wide outside..."
2388,[],[],[],Drags the length back and brings the line towa...
2389,[fuller],[],[],"Much fuller now, just around off, darting acro..."


In [29]:
empty_rows = df[
    (df['ball_type'].apply(lambda x: len(x) == 0)) &
    (df['shot_type'].apply(lambda x: len(x) == 0)) &
    (df['region'].apply(lambda x: len(x) == 0))
]

In [31]:
new_csv = empty_rows[['content']]

In [32]:
new_csv.to_csv('comm_ana.csv')

In [33]:
shot_type_dict = {
    "pull": ["pull", "pulled"],
    "drive": ["drive", "drives", "driven"],
    "cut": ["cut", "cuts", "cutting"],
    "sweep": ["sweep", "sweeps", "swept"],
    "flick": ["flick", "flicked", "whip", "whips", "whipped", "nudge", "nudges", "nudged", "tuck", "tucks"],
    "loft": ["loft", "lofts", "lofted", "heaves", "hits in the air", "over the top"]
}

In [41]:
ball_types = [
    "short", "short of a length", "back of a length",
    "full", "fuller", "yorker", "good length", "length ball", "full toss"
]

regions = [
    "third man", "square leg", "deep square leg", "backward square leg",
    "cover", "extra cover", "point", "mid wicket", "long on", "long off"
]
def match_by_dict(text, phrase_dict):
    matches = []
    text = text.lower()
    for key, values in phrase_dict.items():
        for variant in values:
            if re.search(r"\b" + re.escape(variant) + r"\b", text):
                matches.append(key)
                break  # avoid duplicate keys
    return matches



In [42]:
empty_rows['shot_type'] = empty_rows['lemmatized'].apply(lambda x: match_by_dict(x, shot_type_dict))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  empty_rows['shot_type'] = empty_rows['lemmatized'].apply(lambda x: match_by_dict(x, shot_type_dict))


In [38]:
empty_rows

Unnamed: 0,id,match_id,content,bowler,batsman,over,ball,lemmatized,ball_type,shot_type,region
3,98150000000,128748,"Pitched way outside leg, and much slower throu...",Ravindra Jadeja,Shoaib Bashir,67,4,"pitched way outside leg , and much slower thro...",[],[],[]
6,98060000000,128748,"Just over the stumps! Turning away, landing ar...",Ravindra Jadeja,Shoaib Bashir,67,1,"just over the stump ! turning away , landing a...",[],[],[]
7,98030000000,128748,Spots Carse backing away and Akash bowls it we...,Akash Deep,Brydon Carse,66,6,spot carse backing away and akash bowl it well...,[],[],[]
9,97990000000,128748,"NO BALL AND FOUR! On a nagging length, outside...",Akash Deep,Brydon Carse,66,5,"no ball and four ! on a nagging length , outsi...",[],[],[]
17,97760000000,128748,Lobbed to first slip and taken! Given out but ...,Ravindra Jadeja,Shoaib Bashir,65,3,lobbed to first slip and taken ! given out but...,[],[],[]
...,...,...,...,...,...,...,...,...,...,...,...
2374,830000000,128748,"Pitched up, in the channel and seaming away, R...",Chris Woakes,KL Rahul,2,5,"pitched up , in the channel and seaming away ,...",[],[],[]
2375,800000000,128748,Hits the hard length and gets the ball to nip ...,Chris Woakes,KL Rahul,2,4,hit the hard length and get the ball to nip aw...,[],[],[]
2383,580000000,128748,Much better from Carse! This is pitched up on ...,Brydon Carse,KL Rahul,1,2,much better from carse ! this is pitched up on...,[],[],[]
2387,450000000,128748,"The in-between length now, bowled wide outside...",Chris Woakes,Yashasvi Jaiswal,0,4,"the in-between length now , bowled wide outsid...",[],[],[]
