In [5]:
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
import nltk


In [6]:
df = pd.read_csv('ab.csv')
df.head(4)

Unnamed: 0,No.,Score,Against,Position,Innings,Venue,Ground,Date,Result
0,1,109,England,1,3,"SuperSport Park, Centurion",Home,21-Jan-05,Drawn
1,2,178,West Indies,2,2,"Kensington Oval, Bridgetown",Away,21-Apr-05,Won
2,3,114,West Indies,1,1,"Antigua Recreation Ground, St John's",Away,29-Apr-05,Drawn
3,4,146,West Indies,1,1,"National Cricket Stadium, St George's",Away,10-Apr-07,Won


In [7]:
# tokenization

df = pd.DataFrame(df)

def tokenize_text(text):
    tokens = word_tokenize(text.lower())  # Tokenize and convert to lowercase
    return tokens


In [8]:
nltk.download('punkt')
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [9]:
df['tokenized_text'] = df['Against'].apply(tokenize_text)
print(df)

    No.  Score       Against  Position  Innings  \
0     1    109       England         1        3   
1     2    178   West Indies         2        2   
2     3    114   West Indies         1        1   
3     4    146   West Indies         1        1   
4     5    107      Zimbabwe         4        1   
5     6  103*       Pakistan         4        1   
6     7   103*   West Indies         6        2   
7     8  217*          India         6        2   
8     9    174       England         6        2   
9    10  106*      Australia         5        4   
10   11   104*     Australia         5        2   
11   12    163     Australia         5        2   
12   13    121       England         3        1   
13   14   114*         India         5        2   
14   15  102*          India         4        1   
15   16    102   West Indies         4        1   
16   17   135*   West Indies         5        1   
17   18   101*      Zimbabwe         3        2   
18   19    109      Zimbabwe   

In [10]:
# normalization

import unicodedata

def tokenize_and_normalize(text):
    text = text.lower()
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')

    tokens = word_tokenize(text)
    return tokens


df['normalized_tokenized_text'] = df['Against'].apply(tokenize_and_normalize)

print(df)

    No.  Score       Against  Position  Innings  \
0     1    109       England         1        3   
1     2    178   West Indies         2        2   
2     3    114   West Indies         1        1   
3     4    146   West Indies         1        1   
4     5    107      Zimbabwe         4        1   
5     6  103*       Pakistan         4        1   
6     7   103*   West Indies         6        2   
7     8  217*          India         6        2   
8     9    174       England         6        2   
9    10  106*      Australia         5        4   
10   11   104*     Australia         5        2   
11   12    163     Australia         5        2   
12   13    121       England         3        1   
13   14   114*         India         5        2   
14   15  102*          India         4        1   
15   16    102   West Indies         4        1   
16   17   135*   West Indies         5        1   
17   18   101*      Zimbabwe         3        2   
18   19    109      Zimbabwe   

In [11]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [12]:
# stopword removal

from nltk.corpus import stopwords

def tokenize_normalize_remove_stopwords(text):
    text = text.lower()
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    tokens = word_tokenize(text)


    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words]

    return filtered_tokens

df['processed_text'] = df['Against'].apply(tokenize_normalize_remove_stopwords)

print(df)

    No.  Score       Against  Position  Innings  \
0     1    109       England         1        3   
1     2    178   West Indies         2        2   
2     3    114   West Indies         1        1   
3     4    146   West Indies         1        1   
4     5    107      Zimbabwe         4        1   
5     6  103*       Pakistan         4        1   
6     7   103*   West Indies         6        2   
7     8  217*          India         6        2   
8     9    174       England         6        2   
9    10  106*      Australia         5        4   
10   11   104*     Australia         5        2   
11   12    163     Australia         5        2   
12   13    121       England         3        1   
13   14   114*         India         5        2   
14   15  102*          India         4        1   
15   16    102   West Indies         4        1   
16   17   135*   West Indies         5        1   
17   18   101*      Zimbabwe         3        2   
18   19    109      Zimbabwe   

In [13]:
# stemming and lemmatization

from nltk.stem import PorterStemmer, WordNetLemmatizer
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [14]:
def process_text(text):
    text = text.lower()
    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    tokens = word_tokenize(text)

    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word not in stop_words]

    # Stemming
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(word) for word in filtered_tokens]

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in filtered_tokens]

    return stemmed_tokens, lemmatized_tokens

df['stemmed_text'], df['lemmatized_text'] = zip(*df['Against'].apply(process_text))

print(df)

    No.  Score       Against  Position  Innings  \
0     1    109       England         1        3   
1     2    178   West Indies         2        2   
2     3    114   West Indies         1        1   
3     4    146   West Indies         1        1   
4     5    107      Zimbabwe         4        1   
5     6  103*       Pakistan         4        1   
6     7   103*   West Indies         6        2   
7     8  217*          India         6        2   
8     9    174       England         6        2   
9    10  106*      Australia         5        4   
10   11   104*     Australia         5        2   
11   12    163     Australia         5        2   
12   13    121       England         3        1   
13   14   114*         India         5        2   
14   15  102*          India         4        1   
15   16    102   West Indies         4        1   
16   17   135*   West Indies         5        1   
17   18   101*      Zimbabwe         3        2   
18   19    109      Zimbabwe   

In [17]:
# word embeddings
!pip install gensim nltk



In [23]:
 !pip install sumy nltk


Collecting sumy
  Downloading sumy-0.11.0-py2.py3-none-any.whl (97 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/97.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━[0m [32m92.2/97.3 kB[0m [31m2.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m97.3/97.3 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
Collecting docopt<0.7,>=0.6.1 (from sumy)
  Downloading docopt-0.6.2.tar.gz (25 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting breadability>=0.1.20 (from sumy)
  Downloading breadability-0.1.20.tar.gz (32 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pycountry>=18.2.23 (from sumy)
  Downloading pycountry-23.12.11-py3-none-any.whl (6.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.2/6.2 MB[0m [31m21.3 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: breadability, 

In [25]:
import pandas as pd
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.text_rank import TextRankSummarizer
import unicodedata


In [27]:
df = pd.DataFrame(df)

In [28]:
def normalize_text(text):

    text = text.lower()

    text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')

    return text


df['normalized_text'] = df['Against'].apply(normalize_text)

In [29]:
def summarize_text(text):
    parser = PlaintextParser.from_string(text, Tokenizer('english'))
    summarizer = TextRankSummarizer()
    summary = summarizer(parser.document, sentences_count=2)  # Change sentences_count as needed
    return ' '.join(str(sentence) for sentence in summary)

df['summary'] = df['normalized_text'].apply(summarize_text)

print(df[['Against', 'summary']])

         Against      summary
0        England      england
1    West Indies  west indies
2    West Indies  west indies
3    West Indies  west indies
4       Zimbabwe     zimbabwe
5       Pakistan     pakistan
6    West Indies  west indies
7          India        india
8        England      england
9      Australia    australia
10     Australia    australia
11     Australia    australia
12       England      england
13         India        india
14         India        india
15   West Indies  west indies
16   West Indies  west indies
17      Zimbabwe     zimbabwe
18      Zimbabwe     zimbabwe
19      Pakistan     pakistan
20         India        india
21   West Indies  west indies
22   Netherlands  netherlands
23     Sri Lanka    sri lanka
24     Sri Lanka    sri lanka
25   New Zealand  new zealand
26     Australia    australia
27      Pakistan     pakistan
28      Pakistan     pakistan
29      Pakistan     pakistan
30      Pakistan     pakistan
31      Pakistan     pakistan
32        