In [None]:
!pip uninstall nltk -y
!pip install nltk



Found existing installation: nltk 3.9.1
Uninstalling nltk-3.9.1:
  Successfully uninstalled nltk-3.9.1
Collecting nltk
  Using cached nltk-3.9.1-py3-none-any.whl.metadata (2.9 kB)
Using cached nltk-3.9.1-py3-none-any.whl (1.5 MB)
Installing collected packages: nltk
Successfully installed nltk-3.9.1


In [None]:
import os
import pandas as pd
import re
import string
import nltk
from nltk.tokenize import simple


nltk.download('vader_lexicon')
from nltk.sentiment import SentimentIntensityAnalyzer


# Input documents for this cell: absolute paths to the extracted text files.
# Paths that do not exist are reported and skipped by the processing loop below.
extracted_files = [
    "/content/Leroy sane_extracted.txt",
    "/content/Marco asensio_extracted.txt",
    "/content/Oussmane dembele_extracted.txt",
]

# Text normalisation helper
def clean_text(text):
    """Return a normalised copy of ``text``.

    Steps, in order: lowercase, delete every run of digits, delete every
    character listed in ``string.punctuation``, then collapse whitespace runs
    to a single space and trim both ends.
    """
    lowered = text.lower()
    without_digits = re.sub(r'\d+', '', lowered)

    # Translation table whose third argument lists the characters to delete.
    punctuation_table = str.maketrans('', '', string.punctuation)
    without_punctuation = without_digits.translate(punctuation_table)

    collapsed = re.sub(r'\s+', ' ', without_punctuation)
    return collapsed.strip()

# Whitespace tokenizer built on str.split
def tokenize_text(text):
    """Split ``text`` on runs of whitespace and return the pieces as a list."""
    words = text.split()
    return words

# Build the VADER analyzer once; every call to sentiment_scores reuses it
sia = SentimentIntensityAnalyzer()


# Thin wrapper around the shared analyzer
def sentiment_scores(text):
    """Return whatever ``sia.polarity_scores`` produces for ``text``, unchanged."""
    scores = sia.polarity_scores(text)
    return scores

# Process each extracted file individually: read it, clean and tokenize the text,
# score its sentiment, and write a one-row CSV per input file.
for file_name in extracted_files:
    # Report and skip missing inputs instead of aborting the whole batch.
    if not os.path.exists(file_name):
        print(f"File not found: {file_name}")
        continue

    print(f"\nProcessing file: {file_name}")

    # Read the file content
    with open(file_name, 'r', encoding='utf-8') as file:
        content = file.read()

    # Clean and tokenize the text
    cleaned = clean_text(content)
    tokens = tokenize_text(cleaned)

    # Score the RAW text, not the cleaned one: VADER is rule-based and uses
    # capitalisation, punctuation such as "!!!" and emoticons as intensity cues,
    # all of which clean_text() strips out.
    sentiment = sentiment_scores(content)

    # Prepare a single-row DataFrame to store the processed data
    df_processed = pd.DataFrame({
        "FileName": [file_name],
        "OriginalText": [content],
        "CleanedText": [cleaned],
        "Tokens": [tokens],
        "Sentiment": [sentiment]
    })

    # Define the output CSV file name (always written under /content).
    # splitext drops only the trailing extension; str.replace('.txt', '') would
    # also remove a '.txt' that happens to appear in the middle of the name.
    stem = os.path.splitext(os.path.basename(file_name))[0]
    output_file = f"/content/processed_{stem}.csv"

    # Save the DataFrame to CSV
    df_processed.to_csv(output_file, index=False)
    print(f"Processed data saved to '{output_file}'")



Processing file: /content/Leroy sane_extracted.txt
Processed data saved to '/content/processed_Leroy sane_extracted.csv'

Processing file: /content/Marco asensio_extracted.txt
Processed data saved to '/content/processed_Marco asensio_extracted.csv'

Processing file: /content/Oussmane dembele_extracted.txt
Processed data saved to '/content/processed_Oussmane dembele_extracted.csv'


[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


In [7]:
import os
import pandas as pd
import re
import string
import spacy
from nltk.sentiment import SentimentIntensityAnalyzer
import nltk
nltk.download('vader_lexicon')

# Load the spaCy pipeline that preprocess_text() uses for tokenisation,
# stop-word / punctuation flags and lemmas.
# NOTE(review): no visible cell installs the "en_core_web_sm" package; this line
# assumes it is already available in the kernel - confirm for fresh runtimes.
nlp = spacy.load("en_core_web_sm")

# Define a text cleaning and preprocessing function
def preprocess_text(text):
    """Normalise ``text`` and run it through the spaCy pipeline ``nlp``.

    Args:
        text: Raw document text.

    Returns:
        A ``(lemma_text, tokens)`` tuple. ``lemma_text`` is the space-joined
        lemmas of the tokens that are not stop words; ``tokens`` is the list of
        token texts taken from the normalised (lowercased, stripped) input.
        Punctuation tokens are excluded from both.
    """
    # Normalize text: lowercase, remove numbers, expand 's, remove ASCII
    # punctuation, collapse whitespace.
    text = text.lower()
    text = re.sub(r'\d+', '', text)
    # NOTE(review): this also rewrites possessives ("sane's" -> "sane is"), so an
    # "is" that was never in the source can appear in the returned token list.
    # Left unchanged because either reading of 's is lossy - confirm the intent.
    text = re.sub(r"\'s", " is", text)  # Example contraction
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = re.sub(r'\s+', ' ', text).strip()

    # Lemmatization, removing stopwords, and collecting the token texts
    doc = nlp(text)
    lemmas = []
    original_tokens = []
    for token in doc:
        # string.punctuation is ASCII-only, so typographic quotes and dashes survive
        # the translate() above. Drop them for BOTH outputs; previously they were
        # removed from the token list but still leaked into the lemma string.
        if token.is_punct:
            continue
        original_tokens.append(token.text)
        # '-PRON-' is the pronoun lemma placeholder used by spaCy v2.
        if not token.is_stop and token.lemma_ != '-PRON-':
            lemmas.append(token.lemma_)

    return " ".join(lemmas), original_tokens

# Shared VADER analyzer for this cell, created once and reused per document
sia = SentimentIntensityAnalyzer()


# Convenience wrapper so callers do not touch the analyzer directly
def sentiment_scores(text):
    """Score ``text`` with the shared analyzer and pass its result straight back."""
    result = sia.polarity_scores(text)
    return result

# List of extracted text file names (absolute paths).
# Entries that do not exist on disk are reported and skipped by the processing
# loop below rather than raising.
extracted_files = [
    "/content/Leroy sane_extracted.txt",
    "/content/Marco asensio_extracted.txt",
    "/content/Oussmane dembele_extracted.txt",
    "/content/Untitled Folder/extracted_603010077-Neymar-Jr (1).txt",
    "/content/Untitled Folder/extracted_Young_Kings_Marcus_Rashford_and_Theopolitical_Char (1).txt",
    "/content/Untitled Folder/extracted_pdf.raphinha (1).txt"
]

# Process each extracted file individually: read it, lemmatize / tokenize the text,
# score its sentiment, and write a one-row CSV per input file.
for file_name in extracted_files:
    # Report and skip missing inputs instead of aborting the whole batch.
    if not os.path.exists(file_name):
        print(f"File not found: {file_name}")
        continue

    print(f"\nProcessing file: {file_name}")

    # Read the file content
    with open(file_name, 'r', encoding='utf-8') as file:
        content = file.read()

    # Clean, lemmatize, remove stopwords, and tokenize
    preprocessed_text, tokens = preprocess_text(content)

    # Score the RAW text rather than the lemma string. The preprocessed text has
    # stop words removed, and spaCy's English stop list contains negations such as
    # "not", "no" and "never"; without them VADER cannot apply its negation rules
    # ("not good" would be scored as "good"). Case and punctuation, which VADER
    # also uses as intensity cues, are likewise gone after preprocessing.
    sentiment = sentiment_scores(content)

    # Prepare a single-row DataFrame to store the processed data
    df_processed = pd.DataFrame({
        "FileName": [file_name],
        "OriginalText": [content],
        "PreprocessedText": [preprocessed_text],
        "Tokens": [tokens],
        "Sentiment": [sentiment]
    })

    # Define the output CSV file name (always written under /content).
    # splitext drops only the trailing extension; str.replace('.txt', '') would
    # also remove a '.txt' that happens to appear in the middle of the name.
    stem = os.path.splitext(os.path.basename(file_name))[0]
    output_file = f"/content/processed_Lemmatization_{stem}.csv"

    # Save the DataFrame to CSV
    df_processed.to_csv(output_file, index=False)
    print(f"Processed data saved to '{output_file}'")





[nltk_data] Downloading package vader_lexicon to /root/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!



Processing file: /content/Leroy sane_extracted.txt
Processed data saved to '/content/processed_Lemmatization_Leroy sane_extracted.csv'

Processing file: /content/Marco asensio_extracted.txt
Processed data saved to '/content/processed_Lemmatization_Marco asensio_extracted.csv'

Processing file: /content/Oussmane dembele_extracted.txt
Processed data saved to '/content/processed_Lemmatization_Oussmane dembele_extracted.csv'

Processing file: /content/Untitled Folder/extracted_603010077-Neymar-Jr (1).txt
Processed data saved to '/content/processed_Lemmatization_extracted_603010077-Neymar-Jr (1).csv'

Processing file: /content/Untitled Folder/extracted_Young_Kings_Marcus_Rashford_and_Theopolitical_Char (1).txt
Processed data saved to '/content/processed_Lemmatization_extracted_Young_Kings_Marcus_Rashford_and_Theopolitical_Char (1).csv'

Processing file: /content/Untitled Folder/extracted_pdf.raphinha (1).txt
Processed data saved to '/content/processed_Lemmatization_extracted_pdf.raphinha (