In [1]:
import numpy as np
import pandas as pd
import textwrap

import nltk
from nltk import word_tokenize
from nltk.tokenize.treebank import TreebankWordDetokenizer

In [2]:
# Fetch the 'punkt' tokenizer data that word_tokenize relies on (reported as already up to date when present)
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Saeed\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Loading DataSet

In [3]:
# Load the BBC articles; the path is relative, so the CSV must sit in the working directory
df = pd.read_csv('bbc_text_cls.csv')

In [4]:
df.head()

Unnamed: 0,text,labels
0,Ad sales boost Time Warner profit\n\nQuarterly...,business
1,Dollar gains on Greenspan speech\n\nThe dollar...,business
2,Yukos unit buyer faces loan claim\n\nThe owner...,business
3,High fuel prices hit BA's profits\n\nBritish A...,business
4,Pernod takeover talk lifts Domecq\n\nShares in...,business


In [5]:
# Collect the distinct category names found in the 'labels' column
labels = set(df['labels'])

labels

{'business', 'entertainment', 'politics', 'sport', 'tech'}

# Pick a Label Whose Data We Want To Train From

In [6]:
# Category whose articles feed the model; should be one of the values in `labels`
label = 'business'

In [7]:
# Keep only the article bodies of the chosen category (a Series that retains df's original index)
texts = df[df['labels'] == label]['text']

texts.head()

0    Ad sales boost Time Warner profit\n\nQuarterly...
1    Dollar gains on Greenspan speech\n\nThe dollar...
2    Yukos unit buyer faces loan claim\n\nThe owner...
3    High fuel prices hit BA's profits\n\nBritish A...
4    Pernod takeover talk lifts Domecq\n\nShares in...
Name: text, dtype: object

# Count Middle Words for Each (Previous Word, Next Word) Pair

In [8]:
# Middle-word counts keyed by the surrounding context:
#   (previous_word, next_word) -> {middle_word: number of occurrences}
# Despite the name, the values are raw counts at this stage.
transition_probs = {}

for article in texts:
    # Each line is tokenized on its own, so no three-word window spans a line break
    for raw_line in article.split('\n'):
        words = word_tokenize(raw_line)

        # Slide a three-token window (left, middle, right) across the line;
        # lines with fewer than three tokens contribute nothing
        for left, middle, right in zip(words, words[1:], words[2:]):
            middle_counts = transition_probs.setdefault((left, right), {})
            middle_counts[middle] = middle_counts.get(middle, 0) + 1

In [9]:
transition_probs

{('Ad', 'boost'): {'sales': 1},
 ('sales', 'Time'): {'boost': 1},
 ('boost', 'Warner'): {'Time': 1},
 ('Time', 'profit'): {'Warner': 1},
 ('Quarterly', 'at'): {'profits': 1},
 ('profits', 'US'): {'at': 1},
 ('at', 'media'): {'US': 1},
 ('US', 'giant'): {'media': 1,
  'telecoms': 1,
  'banking': 2,
  'foods': 1,
  'retail': 1,
  'oil': 2,
  'mortgage': 1,
  'agrochemical': 1},
 ('media', 'TimeWarner'): {'giant': 1},
 ('giant', 'jumped'): {'TimeWarner': 1},
 ('TimeWarner', '76'): {'jumped': 1},
 ('jumped', '%'): {'76': 1,
  '1.8': 1,
  '11': 1,
  '6': 1,
  '10.7': 1,
  '7': 1,
  '22': 1},
 ('76', 'to'): {'%': 1},
 ('%', '$'): {'to': 17, 'at': 5},
 ('to', '1.13bn'): {'$': 1},
 ('$', '('): {'1.13bn': 1,
  '900m': 1,
  '280bn': 3,
  '86m': 1,
  '20bn': 2,
  '671.7bn': 1,
  '27.5bn': 4,
  '32bn': 1,
  '4bn': 1,
  '2bn': 3,
  '11bn': 5,
  '850m': 1,
  '300bn': 1,
  '186m': 1,
  '3.25bn': 1,
  '6bn': 3,
  '546m': 1,
  '16.6bn': 1,
  '11.5bn': 1,
  '3bn': 1,
  '102.6bn': 1,
  '427bn': 2,
  '676

# Normalize the Counts into Probabilities

In [10]:
# Iterate through the items in the 'transition_probs' dictionary
# Turn every context's counts into a distribution that sums to 1, updating in place
for middle_counts in transition_probs.values():
    context_total = sum(middle_counts.values())

    # Only existing keys are reassigned, so the dict can safely be modified while iterating
    for candidate in middle_counts:
        middle_counts[candidate] = middle_counts[candidate] / context_total

In [None]:
transition_probs

In [12]:
texts.iloc[0].split('\n')

['Ad sales boost Time Warner profit',
 '',
 'Quarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (£600m) for the three months to December, from $639m year-earlier.',
 '',
 'The firm, which is now one of the biggest investors in Google, benefited from sales of high-speed internet connections and higher advert sales. TimeWarner said fourth quarter sales rose 2% to $11.1bn from $10.9bn. Its profits were buoyed by one-off gains which offset a profit dip at Warner Bros, and less users for AOL.',
 '',
 "Time Warner said on Friday that it now owns 8% of search-engine Google. But its own internet business, AOL, had has mixed fortunes. It lost 464,000 subscribers in the fourth quarter profits were lower than in the preceding three quarters. However, the company said AOL's underlying profit before exceptional items rose 8% on the back of stronger internet advertising revenues. It hopes to increase subscribers by offering the online service free to TimeWarner internet customers a

# High Level Spinning, Spinning Article Paragraphs

In [13]:
def spin_document(document):
    """
    Spin a whole document one line at a time.

    Args:
    document (str): Text whose lines are separated by newline characters.

    Returns:
    str: The document with every non-empty line replaced by the result of
    spin_line; empty lines are kept as they are, so the line count is unchanged.
    """
    # Empty lines pass straight through; only lines with content are spun
    return '\n'.join(
        spin_line(text_line) if text_line else text_line
        for text_line in document.split('\n')
    )

# Building Detokenizer

In [14]:
# Shared detokenizer instance; spin_line uses it to join its token list back into a string
detokenizer = TreebankWordDetokenizer()

In [15]:
texts.iloc[0].split('\n')[2]

'Quarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (£600m) for the three months to December, from $639m year-earlier.'

In [16]:
# Sanity check: tokenizing and then detokenizing should give back the sentence shown in the previous cell
detokenizer.detokenize(word_tokenize(texts.iloc[0].split('\n')[2]))

'Quarterly profits at US media giant TimeWarner jumped 76% to $1.13bn (£600m) for the three months to December, from $639m year-earlier.'

In [17]:
def sample_word(probabilities):
    """
    Sample a word from a dictionary of probabilities.

    Args:
    probabilities (dict): A dictionary where keys are words and values are the
    corresponding probabilities. The values are expected to sum to 1, up to
    floating-point rounding.

    Returns:
    str: A randomly sampled word based on the provided probabilities.

    Raises:
    ValueError: If the dictionary is empty, or if its values sum to noticeably
    less than 1 and the random draw lands beyond their total.
    """
    if not probabilities:
        raise ValueError("Cannot sample a word from an empty probability dictionary")

    # Draw a uniform number in [0, 1) and walk the cumulative distribution
    random_prob = np.random.random()
    cumulative_prob = 0

    for word, prob in probabilities.items():
        cumulative_prob += prob

        # Strict comparison: a word with probability 0 can never be selected
        if random_prob < cumulative_prob:
            return word

    # Summing floats can leave the total a hair below 1 (ten 0.1s add up to
    # 0.9999999999999999), so a draw very close to 1 can fall through the loop.
    # In that case the draw belongs to the last word.
    if np.isclose(cumulative_prob, 1.0):
        return word

    # An explicit exception, unlike an assert, is not stripped under `python -O`
    raise ValueError(
        "Probabilities sum to %r instead of 1; cannot sample a word" % (cumulative_prob,)
    )

In [18]:
def spin_line(input_line, spin_probability=0.3):
    """
    Spin a single line using the distributions in 'transition_probs'.

    The line is tokenized and scanned with a three-token window. For a window
    (word_0, word_1, word_2) whose context (word_0, word_2) has more than one
    known middle word, an alternative middle word is sampled with probability
    'spin_probability'. The original word_1 is kept and the sampled alternative
    is inserted right after it, wrapped in angle brackets, so the output shows
    both the original word and the suggestion.

    Args:
    input_line (str): Input line as a string.
    spin_probability (float): Chance of proposing an alternative at an eligible
    position. Defaults to 0.3.

    Returns:
    str: The detokenized line with '<alternative>' markers inserted. If the
    line yields no tokens (for example, whitespace only), it is returned unchanged.
    """
    # Tokenize the input line into words
    tokens = word_tokenize(input_line)

    # Nothing to spin; also avoids indexing into an empty token list below
    if not tokens:
        return input_line

    index = 0
    spun_words = [tokens[0]]

    # Invariant: spun_words already contains every token up to and including tokens[index]
    while index < (len(tokens) - 2):
        word_0 = tokens[index]
        word_1 = tokens[index + 1]
        word_2 = tokens[index + 2]
        key = (word_0, word_2)

        # A context never seen while counting has no alternatives to offer
        probability_distribution = transition_probs.get(key, {})

        # The random draw only happens when there is more than one candidate
        if len(probability_distribution) > 1 and np.random.random() < spin_probability:
            middle_word = sample_word(probability_distribution)

            # Keep the original middle word, then the bracketed suggestion, then the right-hand word
            spun_words.append(word_1)
            spun_words.append('<' + middle_word + '>')
            spun_words.append(word_2)

            # word_2 has been emitted as well, so skip past it
            index += 2
        else:
            # No suggestion at this position: emit the original word and move on by one
            spun_words.append(word_1)
            index += 1

    # Stopping at the second-to-last token means the final token has not been emitted yet
    if index == len(tokens) - 2:
        spun_words.append(tokens[-1])

    return detokenizer.detokenize(spun_words)

In [19]:
# Seed NumPy's global random generator so the document choice and the spin below are repeatable
np.random.seed(1234)

In [20]:
# Choose a random index from the 'texts' array
# Draw one article position at random from the selected category
random_index = np.random.choice(len(texts))

# Fetch that article by position and run it through the spinner
selected_doc = texts.iloc[random_index]
spun_doc = spin_document(selected_doc)

In [21]:
# Wrap the long lines for display; replace_whitespace=False leaves the document's own
# newlines in place, so the blank lines between paragraphs are still visible in the output
print(textwrap.fill(
    spun_doc, replace_whitespace=False, fix_sentence_endings=True
))

Bombardier chief to leave company

Shares in train and plane-making
giant Bombardier have fallen to <to> a 10-year low following <against>
the departure <hands> of its chief executive and two members of the
<key> board.

Paul Tellier, who <which> was also Bombardier's
president <epicentre>, left the company amid an ongoing <£80m>
restructuring . Laurent Beaudoin, part of the family that controls the
Montreal-based firm, will take on <over> the role of CEO under a newly
created management structure . Analysts said <believe> the
resignations seem to have stemmed from a boardroom dispute . Under Mr
Tellier's tenure at the company <subsidy>, which began in January
<July> 2003, plans <according> to cut the worldwide workforce of
75,000 by almost <signing> a third <movement> by 2006 were announced .
The firm's snowmobile <auto> division and defence services unit were
also sold and Bombardier started the development <future> of a new
aircraft seating 110 to 135 passengers.

Mr Tellier had ind