In [14]:
import os
import csv
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

def read_text_file(file_path):
    """
    Load a UTF-8 encoded text file and hand back everything in it.

    Args:
        file_path (str): Location of the file to load.

    Returns:
        str: The complete text stored in the file.
    """
    # The context manager closes the handle before the result is returned.
    with open(file_path, mode='r', encoding='utf-8') as handle:
        contents = handle.read()
    return contents

def create_csv_from_text_files(folder_path, csv_file_path):
    """
    Create a CSV file from text files in a folder.

    Every entry in ``folder_path`` whose name ends with ``.txt`` and has the
    form ``<article_id>_<category>.txt`` becomes one row with the columns
    ``article_id``, ``text`` and ``category``. Entries are visited in sorted
    name order so that repeated runs produce the same CSV. A file that cannot
    be processed (unexpected name, unreadable content, ...) is reported on
    stdout and skipped; the remaining files are still written.

    Args:
        folder_path (str): Path to the folder containing text files.
        csv_file_path (str): Path to the CSV file to be created.
    """
    with open(csv_file_path, 'w', newline='', encoding='utf-8') as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(['article_id', 'text', 'category'])
        # os.listdir returns entries in arbitrary order; sorting makes the
        # row order reproducible across runs and platforms.
        for file_name in sorted(os.listdir(folder_path)):
            if not file_name.endswith('.txt'):
                continue
            try:
                # Drop the '.txt' extension before splitting the name.
                stem = os.path.splitext(file_name)[0]
                # Exactly one underscore is expected; any other count raises
                # ValueError here, which is reported below and the file skipped.
                article_id, category = stem.split('_')
                text = read_text_file(os.path.join(folder_path, file_name))
                csv_writer.writerow([article_id, text, category])
            except Exception as e:
                # Deliberately best-effort: one bad file must not abort the export.
                print(f"Error processing file '{file_name}': {e}")

if __name__ == "__main__":
    # Step 1: gather the raw article files into a single structured CSV
    csv_file_path = 'bbc_articles.csv'
    folder_path = 'BBC_articles'
    create_csv_from_text_files(folder_path, csv_file_path)
    print("CSV file created successfully.")

    # Step 2: load that CSV back into a DataFrame for the following steps
    df = pd.read_csv(filepath_or_buffer=csv_file_path)


CSV file created successfully.


In [15]:
# Step 3: Preprocess text data and perform TF-IDF vectorization
# Download NLTK resources.
# Recent NLTK releases make word_tokenize load its models from 'punkt_tab'
# instead of 'punkt'; requesting both keeps the notebook runnable on a fresh
# environment regardless of the installed NLTK version.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# Function to preprocess text
# Cache for the English stopword set; filled on first use.
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return the NLTK English stopwords as a frozenset, loading them only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def preprocess_text(text):
    """
    Preprocess the text data.

    The text is lowercased, every character that is not an ASCII letter or
    whitespace is deleted, the result is tokenized with ``word_tokenize``,
    English stopwords are dropped, and the remaining tokens are joined with
    single spaces.

    Args:
        text (str): Input text data. Any non-string value (for example the
            NaN that pandas produces for an empty CSV field) is treated as
            empty text.

    Returns:
        str: Preprocessed text data; ``''`` for non-string input.
    """
    # Missing values arrive as float NaN / None and have no .lower().
    if not isinstance(text, str):
        return ''
    # Convert text to lowercase
    text = text.lower()
    # Remove punctuation and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Tokenize the text
    tokens = word_tokenize(text)
    # Remove stopwords; the set is fetched once instead of reloading the
    # stopword list for every single token.
    stop_words = _english_stopwords()
    tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(tokens)

# Clean every article and keep the result next to the raw text
df['processed_text'] = df['text'].apply(preprocess_text)

# Fit the TF-IDF model on the cleaned articles (vocabulary capped at 1000 features)
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
tfidf_features = tfidf_vectorizer.fit_transform(df['processed_text'])

# Encode the category names as integer labels
label_encoder = LabelEncoder()
df['category_label'] = label_encoder.fit_transform(df['category'])

# Step 4: Put the dense TF-IDF matrix and the labels side by side
df_features = pd.DataFrame(
    tfidf_features.toarray(),
    columns=tfidf_vectorizer.get_feature_names_out(),
)
df_final = pd.concat(
    [df_features, df['category_label']],
    axis=1,
)

# Step 5: Persist the numerical dataset, without the index column
df_final.to_csv('vectorized_dataset.csv', index=False)
print("New CSV file with numerical features and labels created successfully.")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sanja_cnt0knj\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sanja_cnt0knj\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


New CSV file with numerical features and labels created successfully.
