# Importing Libraries

In [None]:
import os
import unicodedata
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import re
import spacy
import nltk
import string
from spacy.lang.en.stop_words import STOP_WORDS

# Settings configuration

In [None]:
# Lift pandas' display limits so DataFrames are shown in full.
for _display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(_display_option, None)

# Input folders, resolved relative to the current working directory.
comm_filepath = os.path.abspath(
    os.path.join(os.getcwd(), "../../../data/CommExtract/"))
pitch_score_filepath = os.path.abspath(
    os.path.join(os.getcwd(), "../../../config/pitchAI/score/"))

# Initializing parameters

In [None]:
# Name of the commentary extract to analyse. The leading backslash acts as the
# path separator because the name is appended directly to comm_filepath.
comm_file = '\\comm_69561.txt'
# spaCy English pipeline; the cells below use it for POS tags, lemmas and
# stop-word flags.
nlp = spacy.load('en_core_web_sm')

# Load the commentary & scoring files

In [None]:
def remove_special_characters(text):
    """Return *text* NFKD-normalised, with control-class characters removed.

    Every character whose Unicode general category starts with ``C`` is
    dropped (this includes newlines and tabs), then leading and trailing
    whitespace is stripped.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    kept_chars = filter(
        lambda ch: not unicodedata.category(ch).startswith('C'), decomposed)
    return ''.join(kept_chars).strip()

# Load the commentary file, cleaning each line as it is read.
# comm_file may carry a Windows-style leading separator; it is stripped so that
# os.path.join builds a path that is valid on every platform.
with open(os.path.join(comm_filepath, comm_file.lstrip('\\/')), 'r') as f:
    comm0 = [remove_special_characters(line) for line in f]


def _read_score_sheet(file_name, text_columns=('Word',)):
    """Read one scoring workbook and lower-case the given text columns.

    Cells that are not strings (for example NaN or numeric placeholders) are
    left untouched.
    """
    sheet = pd.read_excel(os.path.join(pitch_score_filepath, file_name))
    for column in text_columns:
        sheet[column] = sheet[column].apply(
            lambda x: x.lower() if isinstance(x, str) else x)
    return sheet


df_RS = _read_score_sheet('relevance_score.xlsx')
df_KS = _read_score_sheet('keyword_score.xlsx', text_columns=('Word', 'Helper'))
df_Comp = _read_score_sheet('comparative_score.xlsx')
df_Neg = _read_score_sheet('negative_score.xlsx')
# Empty result table, filled in by the keyword-scoring cells further down.
df_Score = pd.DataFrame({'Keyword': [], 'Score': [], 'Bat': [], 'Pace': [], 'Spin': []})

df_RS.head()

# Filter 1 : Extract only pre-match commentary

In [None]:
# Lines consisting solely of one of these ball numbers are treated as markers.
ball_num = ['0.1', '0.2', '0.3', '0.4', '0.5', '0.6', '1.1','1.2','1.3','1.4','1.5','1.6']

# Index of the LAST marker line, found by scanning from the end and stopping at
# the first hit. -1 when there is no marker at all, so the slice below keeps
# the whole file instead of silently dropping its first line.
# NOTE(review): presumably the extract is newest-first, so everything after the
# last marker is the pre-match text - confirm against the extractor.
max_ind = next(
    (i for i in range(len(comm0) - 1, -1, -1) if comm0[i] in ball_num),
    -1,
)
print(max_ind)
comm = comm0[max_ind+1:]

# Filter 2 : Filter out lines that contain no alphabetic characters

In [None]:
# Keep only the lines that contain at least one alphabetic character.
text_comm = list(filter(lambda entry: any(map(str.isalpha, entry)), comm))

print(f'the original comm file contains - {len(comm)} lines of text')
print(f'after filtering for alpha-numeric texts, the file contains - {len(text_comm)} lines of text')

# Function : POS Tagging

In [None]:
def func_pos_tag(arr):
    """Count the coarse POS labels of every text in *arr*.

    Each text is passed through the module-level spaCy pipeline ``nlp``.
    Returns a list, parallel to *arr*, of ``{pos_label: count}`` dictionaries.
    """
    tallies = []
    for doc in map(nlp, arr):
        tally = {}
        for token in doc:
            tally[token.pos_] = tally.get(token.pos_, 0) + 1
        tallies.append(tally)
    return tallies

# Filter 3 : Filter out squad, playing XI etc.

In [None]:
# POS-tag every remaining line, then keep only the lines in which proper nouns
# plus punctuation make up at most 60% of the tokens.
pos_tag = func_pos_tag(text_comm)

text_comm2 = []
pos_tag2 = []

for text, counts in zip(text_comm, pos_tag):
    name_like_tokens = counts.get('PROPN', 0) + counts.get('PUNCT', 0)
    if name_like_tokens / sum(counts.values()) <= 0.6:
        text_comm2.append(text)
        #pos_tag2.append(pos_tag[i])
#text_comm2

# Filter 4 : Filter relevant texts based on average length

In [None]:
def func_find_threshold(num_words, sample_size_percent=0.4, cutoff_ratio=0.7):
    """Derive a minimum word count from the longest text lines.

    The average length of the top ``sample_size_percent`` share of lines is
    computed; every count that reaches ``cutoff_ratio`` times that average is
    considered "above threshold", and the smallest of those counts is returned.

    Args:
        num_words: word count of each text line (non-negative numbers).
        sample_size_percent: share (0-1) of the longest lines that is averaged.
        cutoff_ratio: fraction of that average a line must reach to be kept.

    Returns:
        The smallest word count that passes the cut-off, or 0 when
        ``num_words`` is empty (so that no line is filtered out).
    """
    if not num_words:
        return 0

    sorted_num_words = sorted(num_words, reverse=True)

    # Always average at least one record; for very short inputs the share
    # rounds down to zero, which used to end in a division by zero.
    num_records = max(1, int(len(sorted_num_words) * sample_size_percent))
    avg_topn = sum(sorted_num_words[:num_records]) / num_records

    above_threshold = [num for num in sorted_num_words if num >= avg_topn * cutoff_ratio]
    average_above_threshold = sum(above_threshold) / len(above_threshold)

    print("Average of the top", int(sample_size_percent * 100) , "% records:", avg_topn, "no. of words/textline")
    print("Word counts of textlines", int(sample_size_percent * 100), "% or above the avg:", above_threshold)
    print("Word count average of the modified file now is :", average_above_threshold, " words/textline")

    return min(above_threshold)

# Word count per line; lines shorter than the derived threshold are discarded.
num_words = [len(line.split()) for line in text_comm2]
threshold = func_find_threshold(num_words)
text_comm3 = [line for line, count in zip(text_comm2, num_words) if count >= threshold]
#text_comm3

## Tokenize paragraphs into sentences

In [None]:
# One list of sentences per retained paragraph.
line_comm = [nltk.tokenize.sent_tokenize(paragraph) for paragraph in text_comm3]

## Perform Lemmatization

In [None]:
# Lemmatised, stop-word-free copy of line_comm.
# Every sentence keeps its slot, as an empty string when nothing survives the
# filtering, so that lemmatized_comm[i][j] always describes line_comm[i][j].
# Later cells address both structures with the same (i, j) pair; skipping empty
# sentences would shift every following sentence of that paragraph.
lemmatized_comm = []
for sentences in line_comm:
    lemmatized_sentences = []
    for sentence in sentences:
        doc = nlp(sentence)
        stripped_lemmas = (
            token.lemma_.lower().strip(string.punctuation)
            for token in doc
            if not token.is_stop
        )
        # Lemmas consisting only of punctuation strip down to '' and are dropped.
        lemmatized_sentences.append(" ".join(word for word in stripped_lemmas if word))
    lemmatized_comm.append(lemmatized_sentences)
#lemmatized_comm

## Extracting relevant text - Relevance Scoring

In [None]:
def calculate_score(text):
    """Score *text* against the 'AddScore' rows of ``df_RS``.

    Every run of 1, 2 and 3 consecutive words is compared, as an unordered set,
    with the words of each ``df_RS`` row whose ``Sequence`` is 'Unigram',
    'Bigram' or 'Trigram'. A match adds the row's ``Score``, multiplied by
    1, 1.5 or 2 respectively.

    Args:
        text: whitespace-separated (already lemmatised) words.

    Returns:
        ``(total_score, matched_words)`` where ``matched_words`` is the list of
        word sets that matched: unigrams first, then bigrams, then trigrams.
    """
    words = text.split()
    matched_words = []
    scores = []

    # The table is filtered once per n-gram size, not once per word.
    add_rows = df_RS[df_RS['Usage'] == 'AddScore']
    # (Sequence label in df_RS, n-gram size, score multiplier)
    ngram_specs = (('Unigram', 1, 1), ('Bigram', 2, 1.5), ('Trigram', 3, 2))

    for sequence, size, multiplier in ngram_specs:
        rows = add_rows[add_rows['Sequence'] == sequence]
        # Non-string cells (e.g. NaN) cannot match any word and are skipped.
        entries = [
            (set(entry.split()), entry_score)
            for entry, entry_score in zip(rows['Word'], rows['Score'])
            if isinstance(entry, str)
        ]
        for start in range(len(words) - size + 1):
            ngram = set(words[start:start + size])
            for entry_words, entry_score in entries:
                if ngram == entry_words:
                    scores.append(entry_score * multiplier)
                    matched_words.append(ngram)

    return sum(scores), matched_words

# Per-paragraph relevance statistics, all parallel to lemmatized_comm:
#   score_list          total score of the paragraph
#   matched_words_list  every matched word set of the paragraph
#   avg_score_list      total score divided by the paragraph's word count
#   score_list_2d       score of each individual sentence
score_list = []
matched_words_list = []
avg_score_list = []
score_list_2d = []

for para in lemmatized_comm:
    para_score = 0
    para_matched_words = []
    para_word_count = 0
    para_score_list = []

    for line in para:
        score, matched_words = calculate_score(line)
        para_score += score
        para_word_count += len(line.split())
        para_matched_words.extend(matched_words)
        para_score_list.append(score)

    score_list.append(para_score)
    matched_words_list.append(para_matched_words)
    # A paragraph without any scorable word gets an average of 0 instead of
    # raising ZeroDivisionError.
    avg_score_list.append(round((para_score / para_word_count), 3) if para_word_count else 0.0)
    score_list_2d.append(para_score_list)

print("Total Scores:", score_list, sep = '\n', end = '\n\n')
print("Matched Words:", matched_words_list, sep = '\n', end = '\n\n')
print("Average Scores:", avg_score_list, sep = '\n', end = '\n\n')
print("2D Scores:", score_list_2d, sep = '\n', end = '\n\n')

In [None]:
# Keep the scoring sentences of every paragraph whose average score reaches 35%
# of the best paragraph, split further into clauses.
line_comm2 = []
# default=0 keeps the cell runnable when no paragraph survived earlier filters.
max_avg = max(avg_score_list, default=0)
cutoff = max_avg * 0.35
print("Cutoff - ", cutoff)

# Splits on the whitespace that follows '.', ',', '!' or '?'.
clause_break = re.compile(r'(?<=[.,!?])\s')

for i, avg_score in enumerate(avg_score_list):
    if avg_score >= cutoff:
        for j, sentence_score in enumerate(score_list_2d[i]):
            if sentence_score != 0:
                line_comm2.extend(clause_break.split(line_comm[i][j]))
                #line_comm2.append(line_comm[i][j])
# for i in line_comm2:
#     print(i)
line_comm2

# Key-Word Scoring

### 1. Lemmatized data with negative and comparatives

In [None]:
lemmatized_comm2 = []
comp_list = list(df_Comp['Word'])
neg_list = list(df_Neg['Word'])
# Stop words are dropped unless their lemma is a listed negative or comparative.
_kept_stop_lemmas = {entry for entry in neg_list + comp_list if isinstance(entry, str)}
for sentence in line_comm2:
# for sentence in ["There is not a lot of dew in this surface to be expected in today's game.",
#                  "Grass is nowhere to be seen. Pitch is flat, Huge total to be expected here today",
#                 "This is a high scoring venue"]:
    doc = nlp(sentence)
    lemmatized_tokens = []
    for token in doc:
        lemma = token.lemma_.lower()
        if token.is_stop and lemma not in _kept_stop_lemmas:
            continue
        lemma = lemma.strip(string.punctuation)
        if lemma:
            lemmatized_tokens.append(lemma)

    # Sentences with no surviving token are left out entirely.
    if lemmatized_tokens:
        lemmatized_comm2.append(" ".join(lemmatized_tokens))
lemmatized_comm2

In [None]:
# De-duplicated vocabularies used by the keyword-scoring cells.
KS_word_list, KS_helper_list = (
    list(df_KS[column].unique()) for column in ('Word', 'Helper'))
comp_list = list(df_Comp['Word'].unique())
neg_list = list(df_Neg['Word'].unique())

# Column name -> dtype of the (initially empty) keyword score table.
dtype_dict = dict(
    keyword='object',
    score='float64',
    bat='float64',
    pace='float64',
    spin='float64',
    negative_flag='bool',
)
col_list = list(dtype_dict)
df_KWScore = pd.DataFrame(columns = col_list).astype(dtype_dict)

In [None]:
# For every keyword occurrence, collect the contiguous window of words around
# it that consists of one helper word, comparatives and negatives.
# Result: a list of [keyword, window_words] pairs.
keyword_chunks = []
for line in lemmatized_comm2:
    #print("the line is - ", line)
    
    line_words = line.split()
    for i, word in enumerate(line_words):
        if (word in KS_word_list):
            # NOTE(review): keyword, comp_ind_list, neg_ind_list and
            # neg_in_keyword are assigned but never read within this cell.
            keyword = [word]
            comp_ind_list = []
            neg_ind_list = []
            # Shared by both scans below: at most one helper word is absorbed
            # per keyword, on whichever side it is found first.
            helper_found = False
            left_index, right_index = i,i
            left_index_found, right_index_found = False, False
            neg_in_keyword = False
            
            # Grow the window to the left until a word is reached that is
            # neither a (first) helper of this keyword, a comparative nor a
            # negative.
            j = i-1
            while (j>=0 and not left_index_found):
                if ((df_KS['Word'] == word) & (df_KS['Helper'] == line_words[j])).any() and not helper_found:
                    helper_found = True
                    left_index -= 1
                elif (line_words[j] in comp_list):
                    left_index -= 1
                elif (line_words[j] in neg_list):
                    left_index -= 1
                    neg_in_keyword = not neg_in_keyword
                else:
                    left_index_found = True
                j -= 1

            # Same expansion to the right of the keyword.
            j = i+1
            while (j < len(line_words) and not right_index_found):
                if (((df_KS['Word'] == word) & (df_KS['Helper'] == line_words[j])).any() and not helper_found):
                    helper_found = True
                    right_index += 1
                elif (line_words[j] in comp_list):
                    right_index += 1
                elif (line_words[j] in neg_list):
                    right_index += 1
                    neg_in_keyword = not neg_in_keyword
                else:
                    right_index_found = True
                j += 1
            # The slice is inclusive of both window ends.
            keyword_chunks.append([word, line_words[left_index:right_index+1]])
keyword_chunks

In [None]:
def calculate_category_score(score, index):
    """Spread *score* over the Bat / Pace / Spin categories of one ``df_KS`` row.

    The row is selected positionally (``df_KS.iloc[index]``). Each of its
    ``TypeBenifit1``, ``TypeBenifit2``, ``TypeDeficit1`` and ``TypeDeficit2``
    cells may hold a comma-separated list of category names; a listed category
    receives ``score`` times the column weight (1, 0.7, -1, -0.7). When a
    category is listed in several columns, the last column in that order wins.

    Returns:
        ``(bat, pace, spin)``; a category that is never listed stays NaN.
    """
    row = df_KS.iloc[index]
    scores = dict.fromkeys(('Bat', 'Pace', 'Spin'), float('nan'))

    weighted_columns = (
        ('TypeBenifit1', 1),
        ('TypeBenifit2', 0.7),
        ('TypeDeficit1', -1),
        ('TypeDeficit2', -0.7),
    )
    for col_name, weight in weighted_columns:
        cell = row[col_name]
        if pd.isna(cell):
            continue
        for category in (part.strip() for part in cell.split(',')):
            if category in scores:
                scores[category] = weight * score

    return scores['Bat'], scores['Pace'], scores['Spin']

In [None]:
def calculate_keyword_score(keyword = None, arr = None):
    """Score one keyword together with the words found around it.

    Steps:
      1. Pick the ``df_KS`` row for ``keyword``: the row whose ``Helper`` is a
         word of the chunk (the last such word wins) or, failing that, the row
         whose ``Helper`` is -1.
      2. Apply the comparatives of the chunk from ``df_Comp``: an 'Add' entry
         sets the additive bonus (the last one wins), 'Mul' entries multiply
         into a common factor.
      3. Every negative word of the chunk toggles negation; a negated keyword
         scores 0.
      4. Split the total over Bat / Pace / Spin via ``calculate_category_score``.

    Args:
        keyword: the keyword to score.
        arr: the words of the chunk (keyword included); ``None`` means empty.

    Returns:
        ``(total_score, [bat, pace, spin])``. When ``df_KS`` holds no usable
        row for the keyword, ``(0.0, [nan, nan, nan])`` is returned so that
        callers which skip zero scores simply ignore the chunk.
    """
    chunk = [] if arr is None else arr
    nan = float('nan')

    keyword_rows = df_KS[df_KS['Word'] == keyword]

    # Step 1 : find the helper and the df_KS row that provides the base score
    helper = -1
    score_rows = None
    for word in chunk:
        if keyword == word or word not in KS_helper_list:
            continue
        candidate = keyword_rows[keyword_rows['Helper'] == word]
        # Only helper rows carrying a non-zero score count as a match.
        if candidate['Score'].any():
            helper = word
            score_rows = candidate
    if score_rows is None:
        score_rows = keyword_rows[keyword_rows['Helper'] == -1]
    if score_rows.empty:
        return 0.0, [nan, nan, nan]

    # .iloc[0] instead of float(Series): the latter is deprecated and fails
    # outright when more than one row matches.
    KW_Score = float(score_rows['Score'].iloc[0])
    # NOTE(review): this is an index label, while calculate_category_score
    # looks the row up by position - identical only for a default RangeIndex.
    KS_ind = score_rows.index[0]

    # Step 2 : find all comparatives
    Comp_Score = 0.0
    Comp_Score_Mul = 1.0
    for word in chunk:
        if (keyword == word) or (helper == word) or word not in comp_list:
            continue
        comp_rows = df_Comp[df_Comp['Word'] == word]
        add_rows = comp_rows[comp_rows['Type'] == 'Add']
        if not add_rows.empty:
            Comp_Score = float(add_rows['Score'].iloc[0])
        mul_rows = comp_rows[comp_rows['Type'] == 'Mul']
        if not mul_rows.empty:
            Comp_Score_Mul *= float(mul_rows['Score'].iloc[0])

    # Step 3 : find all negatives (an even number of them cancels out)
    neg_state = False
    for word in chunk:
        if (keyword == word) or (helper == word):
            continue
        if word in neg_list:
            neg_state = not neg_state

    # Step 4 : find total score
    Total_Score = 0 if neg_state else (KW_Score + Comp_Score) * Comp_Score_Mul
    bat_score, pace_score, spin_score = calculate_category_score(Total_Score, KS_ind)
    diff_score = [bat_score, pace_score, spin_score]

    return Total_Score, diff_score

In [None]:
df_Score = pd.DataFrame({'Keyword': [], 'Score': [], 'Bat': [], 'Pace': [], 'Spin': []})

def custom_sort_key(row):
    """Return the absolute Score, Bat, Pace and Spin values of *row* as a tuple."""
    return tuple(abs(row[column]) for column in ('Score', 'Bat', 'Pace', 'Spin'))

# Score every keyword chunk; chunks that score exactly 0 are not recorded.
for obj in keyword_chunks:
    keyword = ' '.join(obj[1])
    #print(obj[0],obj[1])
    score, diff_score = calculate_keyword_score(obj[0], obj[1])
    #print(x, '|', score, '|', diff_score)
    if score != 0.0:
        df_Score.loc[len(df_Score)] = [keyword, score, diff_score[0], diff_score[1], diff_score[2]]


# Within each group of identical keyword text, keep only the rows that hold the
# top dense rank of the (|Score|, |Bat|, |Pace|, |Spin|) tuple.
df_Score['Rank'] = df_Score.groupby('Keyword', group_keys=False).apply(
    lambda x: x.apply(custom_sort_key, axis=1).rank(method='dense', ascending=False))
df_Score.drop(df_Score[df_Score['Rank'] != 1].index, inplace=True)
df_Score.drop(columns=['Rank'], inplace=True)

# Number of non-NaN values and their mean, per category (Bat, Pace, Spin).
Score_count = [df_Score[category].count() for category in ('Bat', 'Pace', 'Spin')]
Score_mean = [df_Score[category].mean() for category in ('Bat', 'Pace', 'Spin')]

# Rating = mean * 5 / 3, rounded to two decimals. An earlier variant divided by
# the per-category count instead of the fixed 3.
Actual_Score_rating = [
    float("{:.2f}".format(float(mean * 5 / 3))) for mean in Score_mean]

# Categories backed by three or fewer scores are reported as NaN. np.nan is
# used because no bare name `nan` exists at module level.
Score_rating = [
    rating if count > 3 else np.nan
    for rating, count in zip(Actual_Score_rating, Score_count)
]

print(Score_rating, sep = '\n')
print(Actual_Score_rating, sep = '\n')
df_Score.head(100)

In [None]:
# Translate every category rating into a verbal description.
descriptions = []
# (lower bound, upper bound, label); both bounds are inclusive and the first
# matching row wins, so a value on a boundary takes the earlier label.
ranges = [
    (-np.inf, -6, "Nightmare"),
    (-6, -4, "Challenging"),
    (-4, -2, "Slightly disadvantageous"),
    (-2, 2, "Balanced"),
    (2, 4, "Slightly favourable"),
    (4, 6, "Highly favourable"),
    (6, np.inf, "Paradise")]

for score in Score_rating:
    if np.isnan(score):
        descriptions.append("Insufficient Data")
        continue
    label = next(
        (desc for lower, upper, desc in ranges if lower <= score <= upper),
        "Unknown",
    )
    descriptions.append(label)
print(descriptions)

In [None]:
df_Score.describe()

In [None]:
# Alternative banding of the rating scale, as (lower, upper, label) rows.
# NOTE(review): running this cell rebinds `ranges`; confirm which of the two
# tables the description lookup is meant to use.
ranges = [
    (-np.inf, -3.75, "Nightmare"),
    (-3.75, -2.5, "Challenging"),
    (-2.5, -1.25, "Slightly disadvantageous"),
    (-1.25, 1.25, "Balanced"),
    (1.25, 2.5, "Slightly favourable"),
    (2.5, 3.75, "Highly favourable"),
    (3.75, np.inf, "Paradise"),
    # NaN compares False against everything, so a `lower <= score <= upper`
    # lookup can never select this row.
    (np.nan, np.nan, "Insufficient data")]

In [None]:
# Print the POS label (and its spaCy explanation) of every token of every
# retained sentence.
# NOTE(review): final_comm is only created in a later cell of this notebook, so
# this cell fails with NameError when the notebook is run top to bottom.
for line in final_comm:
    doc = nlp(line)
    for token in doc:
        print([token, token.pos_, spacy.explain(token.pos_)])
    print(end = '\n')
    
# Scratch list of word pairs, kept as a bare string expression.
"""
little dry
want win
give chance
need play

"""

In [None]:
word = "bounce"

text1 = """It is a little dry as well."""
text2 = """Winning gives us a chance"""

doc = nlp(text1)
for token in doc:
    print([token, token.pos_, spacy.explain(token.pos_)])
    
# doc = nlp(text2)
# for token in doc:
#     if token.text == word:
#         print([token, token.pos_])

In [None]:
doc = nlp("fewer")
for token in doc:
    print(token.lemma_.lower())

for token in doc:
    print(token.is_stop)

In [None]:
STOP_WORDS

In [None]:
text = "This one though looks like a beautiful batting wicket."
doc = nlp(text)

for token in doc:
    print(token.lemma_)

In [None]:
doc = nlp("batting")
for token in doc:
    print(token.lemma_.lower())

In [None]:
line_comm2

In [None]:
# Print the verbs and auxiliaries of every selected sentence, skipping
#   * base-form verbs (tag "VB"),
#   * verbs directly preceded by an auxiliary,
#   * verbs/auxiliaries preceded by an adverb that follows an auxiliary.
for line in line_comm2:
    doc = nlp(line)
    #doc = nlp("He ought to improve his footwork if he wants to become a better batsman")

    print(doc,end = '\n')
    #doc = nlp("The pitch will behave differently run in second innings")
    for i, token in enumerate(doc):
        # Exact comparison: `in ("VB")` was a substring test against the
        # string "VB", not a tuple membership test.
        if token.tag_ == "VB":
            continue
        # i >= 1 / i >= 2 keep doc[i-1] and doc[i-2] from wrapping around to
        # the end of the sentence for its first tokens.
        elif i >= 1 and token.pos_ == "VERB" and doc[i-1].pos_ == "AUX":
            continue
        elif i >= 2 and token.pos_ in ("VERB", "AUX") and doc[i-1].pos_ == "ADV" and doc[i-2].pos_ == "AUX":
            continue
        elif token.pos_ in ("VERB", "AUX"):
            print([token, token.pos_, spacy.glossary.GLOSSARY[token.tag_], token.tag_])
        
                      
# get list of all modal and auxiliary verbs from nltk and get the desired tense

In [None]:
# Collect, for every paragraph whose average score reaches 35% of the best
# one, the paragraph itself plus its sentences with a non-zero score.
text_comm4 = []
lemmatized_comm_para_filtered = []
lemmatized_comm_line_filtered = []
final_comm = []
max_avg = max(avg_score_list)
for i, paragraph_avg in enumerate(avg_score_list):
    if not paragraph_avg >= 0.35 * max_avg:
        continue
    text_comm4.append(text_comm3[i])
    lemmatized_comm_para_filtered.append(lemmatized_comm[i])

    for j, sentence_score in enumerate(score_list_2d[i]):
        if sentence_score != 0:
            lemmatized_comm_line_filtered.append(lemmatized_comm[i][j])
            final_comm.append(line_comm[i][j])

In [None]:
max_avg * 0.5

In [None]:
text_comm4

In [None]:
# Distribution of the total and of the average paragraph scores.
for _score_series in (score_list, avg_score_list):
    plt.figure(figsize = (5,2.5))
    sns.histplot(_score_series, kde = True)
    plt.show()

In [None]:
import unicodedata

def remove_special_characters(text):
    """Return *text* NFKD-normalised, with control-class characters removed.

    Characters whose Unicode general category starts with ``C`` are dropped
    (newlines and tabs included); the result is stripped of leading and
    trailing whitespace.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    visible = [ch for ch in decomposed if not unicodedata.category(ch).startswith('C')]
    return ''.join(visible).strip()


# Re-read a commentary file into `comm`, cleaning every line.
# NOTE(review): FILENAME is not assigned anywhere in the visible code, so this
# cell raises NameError unless it is defined elsewhere - confirm the intended
# path.
comm = []
with open(FILENAME, 'r') as f:
    for line in f:
        cleaned_line = remove_special_characters(line)
        comm.append(cleaned_line)

# Now 'comm' contains the lines with all special characters removed


In [None]:
text = """
68m on one side and 63m on the other square boundary, the straight boundary is at 78m. Spinners would be licking their lips in the dressing room. It's dry and will definitely turn. There are patches which will offer plenty to the quality\xa0spinners in both sides. It's dry and should become slow as the match progresses. 170 could be a match-winning total. Bat first, runs on the board in a final will matter, reckons
"""
unicodedata.normalize('NFKD', text)

In [None]:
spacy.explain("AUX")

In [None]:
spacy.POS

In [None]:
plt.figure(figsize=(10, 5))

# Calculate the number of words in each text line
num_words = [len(text.split()) for text in text_comm]

# Plotting the distribution of the number of words
sns.histplot(num_words, kde=True)
plt.xlabel('Number of Words')
plt.ylabel('Frequency')
plt.title('Original Distribution of Number of Words in each text line')
plt.show()

plt.figure(figsize=(10, 5))

# Calculate the average number of alphabetical characters in each word
#avg_alphabet_chars = [sum(len(word) for word in text.split() if word.isalpha()) / len(text.split()) for text in filter_comm]
avg_alphabet_chars = [sum(len(re.findall('[a-zA-Z]', word)) for word in textline.split()) / len(textline.split()) for textline in text_comm2]

# Plotting the distribution of the average number of alphabetical characters
sns.histplot(avg_alphabet_chars, kde=True)
plt.xlabel('Average Number of Alphabetical Characters')
plt.ylabel('Frequency')
plt.title('Distribution of Average Number of Alphabetical Characters in Words')
plt.show()

In [None]:
sorted_num_words = sorted(num_words, reverse=True)

num_records = int(len(num_words) * 0.5)
#num_records #56

avg_topn = sum(sorted_num_words[:num_records]) / num_records
avg_topn

In [None]:
avg_alphabet_chars

In [None]:
text_comm2[-2]

In [None]:
plt.figure(figsize=(10, 5))

# Calculate the number of words in each text line
num_words = [len(text.split()) for text in text_comm]

# Plotting the distribution of the number of words
sns.histplot(num_words, kde=True)
plt.xlabel('Number of Words')
plt.ylabel('Frequency')
plt.title('Original Distribution of Number of Words in each text line')
plt.show()

In [None]:
plt.figure(figsize=(15, 5))

# Calculate the number of words in each text line
num_words = [len(text.split()) for text in text_comm]

# Plotting the distribution of the number of words
sns.histplot(num_words, kde=True)

# Calculate the cumulative distribution function (CDF)
values, base = np.histogram(num_words, bins='auto')
cumulative = np.cumsum(values) / sum(values)

# Find the index where the CDF crosses the specified threshold
x = 0.1
idx = np.argmax(cumulative >= x)

# Mark the threshold line on the histogram
plt.axvline(x=base[idx + 1], color='red', linestyle='--', label=f'{x * 100}% Area')

plt.xlabel('Number of Words')
plt.ylabel('Frequency')
plt.title('Original Distribution of Number of Words in each text line')
plt.legend()
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

plt.figure(figsize=(10, 5))

# Calculate the number of words in each text line
num_words = [len(text.split()) for text in text_comm]

# Plotting the distribution of the number of words
sns.histplot(num_words, kde=True)

# Cumulative distribution function (CDF).
# With density=True and cumulative=True, plt.hist already returns the
# normalised running total (last bin == 1), so it is used as is. Multiplying it
# by the bin widths again would distort it.
values, bins, _ = plt.hist(num_words, bins='auto', density=True, cumulative=True)
cumulative = values

# Find the first bin whose cumulative probability reaches the threshold
x = 0.7
idx = np.argmax(cumulative >= x)

# NOTE(review): bins[idx] is the LEFT edge of that bin; the cumulative value is
# only reached at its right edge (bins[idx + 1]) - confirm which one is wanted.
threshold_value = bins[idx]

plt.xlabel('Number of Words')
plt.ylabel('Frequency')
plt.title('Original Distribution of Number of Words in each text line')
plt.show()

print(f"The value that creates {x * 100}% of the area is: {threshold_value}")


In [None]:
import numpy as np

# Define your array of numbers
data = np.array([0,1, 2, 3, 4, 5, 6, 7, 8, 9, 10,11,12,13,14,15,16,17,18,19,20])

# Create the histogram
hist, bins = np.histogram(data, bins='auto', density=True)

# Compute the cumulative distribution function (CDF)
cdf = np.cumsum(hist * np.diff(bins))

# Share of the area under the histogram to look for
target_area = 0.3
# First bin whose cumulative area reaches the target
index = np.searchsorted(cdf, target_area)

# Get the corresponding value from the bins array
value = bins[index]

# The message is derived from target_area so that it can no longer disagree
# with the value actually used (it used to claim 70% while 0.3 was searched).
print(f"The value where the area under the histogram curve becomes {target_area:.0%} is {value}.")


In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Data array
data_array = np.arange(20, -1, -1)

# Calculate the area of the original triangle
original_triangle_area = 0.5 * 20 * 20

# User-inputted area
#area = float(input("Enter the area (as a fraction of the original triangle area): "))
area = 0.8

# Calculate the target area for the small triangle
target_area = area * original_triangle_area

# Calculate the cumulative sum of the data array
cumulative_sum = np.cumsum(data_array)

# Find the index where the cumulative sum crosses the target area
index = np.where(cumulative_sum >= target_area)[0][0]

# Calculate the remaining area needed
remaining_area = target_area - cumulative_sum[index-1]

# Calculate the fraction of remaining area in relation to the data point difference
fraction = remaining_area / data_array[index]

# Calculate the value of x
x = index + fraction

# Plotting the triangle
plt.plot(data_array, 'b', linewidth=2)
plt.fill_between(range(index+1), data_array[:index+1], color='blue', alpha=0.5)
plt.xlabel("Index")
plt.ylabel("Value")
plt.title("Triangle")
plt.grid(True)
plt.show()

# Print the value of x
print("The value of x is:", x)


In [None]:
arr = [len(text.split()) for text in text_comm]
area = 0.8

# Creating the histogram and KDE plot using seaborn
sns.histplot(arr, kde=True, stat='density', color='blue')
plt.xlabel("Value")
plt.ylabel("Density")
plt.title("Histogram and KDE")
plt.grid(True)

# Calculate the cumulative distribution function (CDF)
n, bins = np.histogram(arr, bins='auto', density=True)
cdf = np.cumsum(n * np.diff(bins))

x = np.interp(area, cdf, bins[:-1])

plt.axvline(x=x, color='red', linestyle='--', label=f'x = {x:.2f}')

print("The value of x is:", x)

plt.show()


In [None]:
comm2

In [None]:
text_comm

# POS analysis and data cleaning

In [None]:
# import spacy

# nlp = spacy.load('en_core_web_sm')

# pos_tags = []
# text = "After 70 league games spread over a period of a littleover 7 weeks, we finally have four survivors contesting to take home the prized IPL trophy. Will MI be 6th time champs? Can CSK grab a 5th title? Will Gujarat Titans continue their winning spree and defend their title or will Lucknow Super Giantsbecome a first time champions? Lot to play for as IPL 2023 reaches the home stretch with today's Qualifier 1. The Titans cruised through the league phase once again and were comfortably the best team on display, they just didn't confirm their place in the playoffs, but they did so emphatically. On the other hand, the Super Kings had to wait until their final game of the league, but they do have the knack of winning big moments, especially when playing at home. The winner of tonight's game will go through to Sunday's final while the loser will wait for tomorrow's winner in QF 2 on 26th in Ahmedabad. CSK not just have home advantage tonight, but they're the better rested team, having played their last league match on Saturday. On the other hand, Titans's last match spilled into the wee hours of yesterday and they'd to travel as well. Would that be a telling factor today?"

# doc = nlp(text)
# tags = [(token.text, token.pos_) for token in doc]
# tags

In [None]:
import spacy

nlp = spacy.load('en_core_web_sm')

# One {pos_label: count} dictionary per line of text_comm2.
pos_counts = []

for doc in map(nlp, text_comm2):
    tally = {}
    for token in doc:
        tally[token.pos_] = tally.get(token.pos_, 0) + 1
    pos_counts.append(tally)
# pos_counts

In [None]:
# Extract unique POS tags
unique_pos_tags = sorted(set([pos for pos_count in pos_counts for pos in pos_count.keys()]))

# Create an array to store the counts for each POS tag
pos_counts_array = np.zeros((len(pos_counts), len(unique_pos_tags)))

# Fill in the array with the counts
for i, pos_count in enumerate(pos_counts):
    for j, pos_tag in enumerate(unique_pos_tags):
        pos_counts_array[i, j] = pos_count.get(pos_tag, 0)

# Plotting the stacked bar chart
plt.figure(figsize=(12, 8))
plt.bar(range(len(pos_counts)), pos_counts_array.T[0], label=unique_pos_tags[0])

for i in range(1, len(unique_pos_tags)):
    plt.bar(range(len(pos_counts)), pos_counts_array.T[i], bottom=np.sum(pos_counts_array.T[:i], axis=0),
            label=unique_pos_tags[i])

plt.xlabel('Line of Text')
plt.ylabel('Count')
plt.title('POS Tag Distribution for Each Line of Text')
plt.xticks(range(len(pos_counts)), range(1, len(pos_counts) + 1))
#plt.xticks(range(60,73), range(61,74))
plt.legend(loc='upper right')
plt.show()

In [None]:
text_comm3 = []

for i, text in enumerate(text_comm2):
    pos_count = pos_counts[i]
    noun_count = pos_count.get('PROPN', 0)
    punctuation_count = pos_count.get('PUNCT', 0)
    total_count = sum(pos_count.values())

    if (noun_count + punctuation_count) / total_count <= 0.6:
        text_comm3.append(text)
text_comm3

# Calculating word frequency

In [None]:
nlp = spacy.load('en_core_web_sm')

# Per paragraph: (word, frequency) pairs of its purely alphabetic,
# non-stop-word tokens, in order of first appearance.
word_counts = []
for paragraph in text_comm3:
    word_frequencies = {}
    for token in nlp(paragraph):
        word = token.text.lower()
        if not word.isalpha() or word in STOP_WORDS:
            continue
        word_frequencies[word] = word_frequencies.get(word, 0) + 1

    #print('received values for line' + str(i))
    word_counts.append(list(word_frequencies.items()))
word_counts

In [None]:
text_comm3[-11]

In [None]:
text_comm2[60:65]

In [None]:
num_words = [len(text.split()) for text in text_comm]
num_words

In [None]:
import numpy as np

def calculate_value_for_area(data, area_percentage):
    """Return the value below which roughly *area_percentage* % of *data* lies.

    The data is sorted and the fractional rank ``len(data) * area_percentage /
    100`` is located; the result is linearly interpolated between the two
    neighbouring sorted values.

    Args:
        data: non-empty sequence of numbers.
        area_percentage: percentage in the range 0-100.

    Returns:
        The smallest value when the rank is below 1, the largest value when it
        reaches ``len(data)``, otherwise the interpolated value.
    """
    sorted_data = np.sort(data)
    total_points = len(sorted_data)

    # Keep the rank fractional: truncating it first made the interpolation
    # weight always zero.
    rank = (total_points * area_percentage) / 100
    rank_floor = int(np.floor(rank))

    if rank_floor <= 0:
        return sorted_data[0]
    if rank_floor >= total_points:
        return sorted_data[-1]

    rank_decimal = rank - rank_floor
    lower_value = sorted_data[rank_floor - 1]
    return lower_value + rank_decimal * (sorted_data[rank_floor] - lower_value)

In [None]:
area = 99.999999999999999999999999999999999999999
value = calculate_value_for_area(num_words, area)
print(f"The value for {area}% area below the curve is: {value}")

In [None]:
text_comm3