In [60]:
import pandas as pd
import os
import sys
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

In [61]:

# Folder holding the .txt documents. For each file, the first line is taken
# as the label and the second line as the text.
folder_path = r"D:\Innomatics\text"

# Raw [filename, content] pairs, one per .txt file in the folder
all_data = []

for filename in os.listdir(folder_path):
    if not filename.endswith('.txt'):
        continue
    file_path = os.path.join(folder_path, filename)

    # NOTE(review): no encoding is passed, so the platform default is used —
    # confirm it matches how the files were saved.
    with open(file_path, 'r') as file:
        content = file.read().strip()
    all_data.append([filename, content])

# Split each document once: line 1 is the label, line 2 is the text.
records = []
for filename, content in all_data:
    lines = content.split("\n")
    if len(lines) < 2:
        # Fail with the offending file name instead of a bare IndexError.
        raise ValueError(f"{filename}: expected a label line followed by a text line")
    records.append({"label": lines[0], "text": lines[1]})

# Build the frame directly with the two columns that are kept, so no
# throwaway columns have to be dropped in place afterwards.
df = pd.DataFrame(records, columns=["label", "text"])

# Show the DataFrame
df

Unnamed: 0,label,text
0,MajorClaim,we should attach more importance to cooperatio...
1,Premise,Take Olympic games which is a form of competit...
2,Premise,The high technology and new ideas applied into...
3,Premise,pollutions are not just caused by the burning ...
4,Premise,the improvements of work efficiency also attri...
...,...,...
6084,MajorClaim,addressing pollution and traffic issues only b...
6085,Premise,whether it can work out for alleviating traffi...
6086,Premise,price control institution has been used in ple...
6087,Claim,it seems not easy to increase petrol price ins...


In [62]:
# Preview the first rows of the label/text frame
df.head()

Unnamed: 0,label,text
0,MajorClaim,we should attach more importance to cooperatio...
1,Premise,Take Olympic games which is a form of competit...
2,Premise,The high technology and new ideas applied into...
3,Premise,pollutions are not just caused by the burning ...
4,Premise,the improvements of work efficiency also attri...


In [63]:
# Preview the last rows of the label/text frame
df.tail()

Unnamed: 0,label,text
6084,MajorClaim,addressing pollution and traffic issues only b...
6085,Premise,whether it can work out for alleviating traffi...
6086,Premise,price control institution has been used in ple...
6087,Claim,it seems not easy to increase petrol price ins...
6088,Premise,governments have a macro-economic perspective ...


### Count of 'Premise' label :

In [64]:
# Count rows whose label is exactly "Premise".
# A boolean sum yields 0 when the label is absent, where indexing
# value_counts() would raise a KeyError.
(df["label"] == "Premise").sum()

3832

In [65]:
# Download every NLTK resource this notebook uses, not only the tokenizer model:
#   punkt / punkt_tab -> nltk.word_tokenize
#                        (newer NLTK releases look for punkt_tab — TODO confirm for the installed version)
#   stopwords         -> stopwords.words('english')
#   wordnet           -> WordNetLemmatizer
# Without the last two, the cleaning step fails with a LookupError on a fresh environment.
for resource in ("punkt", "punkt_tab", "stopwords", "wordnet"):
    nltk.download(resource)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\prabh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

###  Maximum number of character level tokens in a document in raw 'text' column

In [66]:
# Character-level tokenizer
def charc_tokenize(text):
    """Return the characters of ``text`` as a list, one element per character."""
    return [character for character in text]

# Character-level token count for every document in the raw 'text' column
df['charc_token_count'] = [len(charc_tokenize(document)) for document in df['text']]

# Largest character-level token count across all documents
maxim_char_tokens = df['charc_token_count'].max()

print(f"Maximum number of character level tokens in a document is: {maxim_char_tokens}")

Maximum number of character level tokens in a document is: 344


### Maximum number of word-level tokens in a document in the raw 'text' column

In [67]:
def word_tokenize_text(text):
    """Split ``text`` on runs of whitespace and return the resulting tokens.

    Punctuation stays attached to the neighbouring word because only
    whitespace separates tokens.
    """
    whitespace_tokens = text.split()
    return whitespace_tokens

# Whitespace-token count for every document in the raw 'text' column
df['word_token_count'] = [len(word_tokenize_text(document)) for document in df['text']]

# Largest word-level token count across all documents
max_word_tokens = df['word_token_count'].max()
print(f"The maximum number of word-level tokens in a document is: {max_word_tokens}")

The maximum number of word-level tokens in a document is: 67


## Applying Text cleaning
### Steps to follow :
 - Convert to lower
 - word tokenize
 - stop word removal
 - lemmatization

In [68]:
# Anything that is not an ASCII letter or "." is replaced by a space (digits included).
_CLEAN_REGEX = re.compile("[^a-zA-Z.]")

# Lazily filled cache so the stop-word set and the lemmatizer are built once
# rather than once per document.
_CLEAN_RESOURCES = {}


def _clean_resources():
    """Return the (stop_words, lemmatizer) pair, creating it on first use."""
    if not _CLEAN_RESOURCES:
        _CLEAN_RESOURCES["stop_words"] = frozenset(stopwords.words('english'))
        _CLEAN_RESOURCES["lemmatizer"] = WordNetLemmatizer()
    return _CLEAN_RESOURCES["stop_words"], _CLEAN_RESOURCES["lemmatizer"]


def clean(doc):
    """Normalise one document and return it as a space-joined string of lemmas.

    Steps, in order:
      1. replace every character other than ASCII letters and "." with a space
      2. convert to lowercase
      3. tokenize with ``nltk.word_tokenize``
      4. drop English stop words
      5. lemmatize each remaining token with ``WordNetLemmatizer``

    Parameters
    ----------
    doc : str
        Raw text of a single document.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces.
    """
    doc = _CLEAN_REGEX.sub(" ", doc)
    doc = doc.lower()

    tokens = nltk.word_tokenize(doc)

    # Resolved at call time, so defining this cell does not require the corpora yet.
    stop_words, lemmatizer = _clean_resources()
    lemmas = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]

    return " ".join(lemmas)

In [69]:
# Run the cleaning pipeline over every raw document
df['clean_text'] = df['text'].apply(clean)

df.head()

Unnamed: 0,label,text,charc_token_count,word_token_count,clean_text
0,MajorClaim,we should attach more importance to cooperatio...,72,10,attach importance cooperation primary education
1,Premise,Take Olympic games which is a form of competit...,297,58,take olympic game form competition instance ha...
2,Premise,The high technology and new ideas applied into...,154,26,high technology new idea applied practice may ...
3,Premise,pollutions are not just caused by the burning ...,137,20,pollution caused burning oil chemical pollutan...
4,Premise,the improvements of work efficiency also attri...,127,19,improvement work efficiency also attribute spe...


### After applying text cleaning (i.e. the text pre-processing steps listed above), what is the maximum number of word-level tokens in the 'clean_text' column?

In [70]:
# Whitespace-token count for every cleaned document
df['clean_txt_wrd_tkn'] = [len(word_tokenize_text(cleaned)) for cleaned in df['clean_text']]

# Largest word-level token count after cleaning
max_clean_txt_wrd_tkn = df['clean_txt_wrd_tkn'].max()

print(f"The maximum number of word-level tokens in a document is: {max_clean_txt_wrd_tkn}")

The maximum number of word-level tokens in a document is: 31


In [71]:
# Preview the frame with the raw/clean text and the token-count columns
df.head()

Unnamed: 0,label,text,charc_token_count,word_token_count,clean_text,clean_txt_wrd_tkn
0,MajorClaim,we should attach more importance to cooperatio...,72,10,attach importance cooperation primary education,5
1,Premise,Take Olympic games which is a form of competit...,297,58,take olympic game form competition instance ha...,25
2,Premise,The high technology and new ideas applied into...,154,26,high technology new idea applied practice may ...,15
3,Premise,pollutions are not just caused by the burning ...,137,20,pollution caused burning oil chemical pollutan...,13
4,Premise,the improvements of work efficiency also attri...,127,19,improvement work efficiency also attribute spe...,13


In [72]:
# Inspect the cleaned text stored under index label 0
df["clean_text"][0]

'attach importance cooperation primary education'

In [73]:
# Word-token count of the cleaned text under index label 0
# (the count is recomputed for every row before the single value is picked)
df['clean_text'].apply(lambda x: len(word_tokenize_text(x)))[0]

5

In [74]:
pip install --upgrade scikit-learn




### Applying vectorization using the Bag of Words technique (with default parameters) on the raw text column:

In [75]:

# import feature extraction methods from sklearn
from sklearn.feature_extraction.text import CountVectorizer

# Instantiate a vectorizer with default parameters
bow_vect = CountVectorizer()

# Learn the vocabulary and build the document-term matrix.
# NOTE(review): this fits on 'clean_text', while the section heading mentions
# the raw text column — confirm which one is intended.
text_dtm = bow_vect.fit_transform(df['clean_text'])

# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# (available since 1.0) replaces it. It returns an ndarray, so convert it back
# to a plain list of str.
vocab = bow_vect.get_feature_names_out().tolist()
one_gram_vocab = [word for word in vocab if len(word.split()) == 1]
num_one_gram_words = len(one_gram_vocab)
total_unique_words = len(vocab)

print(f"Shape of text_dtm (# of docs, # of unique vocabulary): {text_dtm.shape}")
print(f"Total number of unique words in the vocabulary (all n-grams): {total_unique_words}")
print(f"Number of 1-gram vocabulary words: {num_one_gram_words}")
print(f"Vocab: {vocab}")

Shape of text_dtm (# of docs, # of unique vocabulary): (6089, 5973)
Total number of unique words in the vocabulary (all n-grams): 5973
Number of 1-gram vocabulary words: 5973


# Apply Bag of Words vectorization (with the following parameters) on the raw text column.
####   
        1. token_pattern=None
        2. tokenizer=callable 
         - use nltk word tokenizer
        3. ngram_range=(1, n)
        4. lowercase=False
        5. preprocessor=callable 
         - A valid token should only contain alphanumeric and "."
         - Convert to lower
         - word tokenize
         - stop word removal
         - lemmatization
        6. stop_words=None



In [76]:
def tokenizer(doc):
    """Tokenize ``doc`` with NLTK's word tokenizer and return the token list."""
    tokens = nltk.word_tokenize(doc)
    return tokens

In [77]:

def get_vocabulary_size(text, n):
    """Return the number of distinct n-grams of size ``n`` found in ``text``.

    Parameters
    ----------
    text : iterable of str
        Raw documents; each one is passed through ``clean`` before being
        tokenized by ``tokenizer``.
    n : int
        N-gram size. It is used as both bounds of ``ngram_range``, so only
        n-grams of exactly this length are counted.

    Returns
    -------
    int
        Size of the fitted vocabulary.
    """
    # NOTE(review): the parameter list in the notebook text mentions
    # ngram_range=(1, n); this function uses (n, n) — confirm which is intended.
    bow_vect = CountVectorizer(token_pattern=None,
                               tokenizer=tokenizer,
                               ngram_range=(n, n),
                               lowercase=False,
                               preprocessor=clean,
                               stop_words=None)

    # Only the learned vocabulary is needed, so fit without keeping the
    # document-term matrix.
    bow_vect.fit(text)
    return len(bow_vect.vocabulary_)

# Raw documents fed to the vectorizer for every n-gram size below
raw_documents = df["text"]

# 1-grams
unigram_vocab_size = get_vocabulary_size(raw_documents, n=1)
print(f"Number of unique unigrams: {unigram_vocab_size}")

# 2-grams
bigram_vocab_size = get_vocabulary_size(raw_documents, n=2)
print(f"Number of unique bigrams: {bigram_vocab_size}")

# 3-grams
trigram_vocab_size = get_vocabulary_size(raw_documents, n=3)
print(f"Number of unique trigrams: {trigram_vocab_size}")



Number of unique unigrams: 5982
Number of unique bigrams: 36041
Number of unique trigrams: 37839
