In [14]:
def extract(text):
    """Split a labelled sample into its text and its boolean label.

    The label is expected after the last comma, e.g. ``"some text, True"``.

    Args:
        text: One sample, optionally ending in ``, true`` or ``, false``
            (any letter case; surrounding whitespace is ignored).

    Returns:
        tuple[str, bool]: The stripped text and its label. When the sample
        does not end in a recognisable label, the whole stripped text is
        returned together with ``False``.
    """
    head, sep, tail = text.rpartition(',')
    label = tail.strip().lower()
    # Only treat the trailing segment as a label when it really is one;
    # otherwise the last comma belongs to the sentence itself and splitting
    # on it would silently throw away the end of the sample.
    if sep and label in ('true', 'false'):
        return head.strip(), label == 'true'
    return text.strip(), False

def process_file(filepath, output_filepath):
    """Collapse blank-line-separated paragraphs into one labelled line each.

    Consecutive non-blank lines of the input are joined with single spaces
    into one sample, the sample is split into text and label by ``extract``,
    and every sample is written to the output as ``"<text>, <label>"``.

    Args:
        filepath: Path of the raw text file to read.
        output_filepath: Path of the file to write, one sample per line.

    Both files are opened as UTF-8 so the result does not depend on the
    platform's locale encoding.
    NOTE(review): assumes the raw input is UTF-8 encoded — confirm.
    """
    results = []    # (extracted_string, boolean_value) tuples, in input order
    paragraph = []  # stripped lines of the sample currently being collected

    def flush():
        # Close the current sample, if any, and start a new one.
        if paragraph:
            results.append(extract(" ".join(paragraph)))
            paragraph.clear()

    with open(filepath, 'r', encoding='utf-8') as infile:
        for raw_line in infile:
            stripped = raw_line.strip()
            if stripped:
                paragraph.append(stripped)
            else:
                # A blank (or whitespace-only) line ends the sample.
                flush()
    # The last sample is not necessarily followed by a blank line.
    flush()

    with open(output_filepath, 'w', encoding='utf-8') as outfile:
        for text, label in results:
            outfile.write(f"{text}, {label}\n")

# Run the paragraph-merging step: the raw training data goes in, and one
# "text, label" line per sample comes out in output_file.txt.
input_file_path = 'llama2_sarcasm_training_data.txt'
output_file_path = 'output_file.txt'
process_file(input_file_path, output_file_path)



In [15]:
import re

def clean_text(text):
    """Remove links, @mentions and #hashtags from a piece of text.

    Punctuation that directly follows a link (``,`` ``.`` ``!`` ``?`` ``;``
    ``:``) is kept. This matters for lines of the form ``"<text>, <label>"``:
    a link sitting right before the separating comma must not take the comma
    with it, otherwise the label can no longer be split off.

    Args:
        text: The text to clean.

    Returns:
        str: The text with the matched tokens deleted. Surrounding whitespace
        is left untouched, so removed tokens may leave double spaces behind.
    """
    # A link ends before any run of trailing punctuation that is itself
    # followed by whitespace or the end of the text.
    link_end = r'(?=[,.!?;:]*(?:\s|$))'
    # Remove URLs
    text = re.sub(r'http\S+?' + link_end, '', text)
    # Remove URLs that start with www
    text = re.sub(r'www\S+?' + link_end, '', text)
    # Remove @mentions
    text = re.sub(r'@\w+', '', text)
    # Remove #hashtags
    text = re.sub(r'#\w+', '', text)
    return text

def process_tweets(input_filepath, output_filepath):
    """Write a copy of a text file with every line passed through ``clean_text``.

    Args:
        input_filepath: Path of the text file to read.
        output_filepath: Path of the cleaned copy to write.

    Both files are opened as UTF-8 so that non-ASCII characters survive
    regardless of the platform's locale encoding.
    """
    with open(input_filepath, 'r', encoding='utf-8') as infile, \
            open(output_filepath, 'w', encoding='utf-8') as outfile:
        # Stream line by line; line endings are preserved because the
        # patterns used by clean_text never match a newline.
        for line in infile:
            outfile.write(clean_text(line))
# Run the tweet-cleaning step on the merged samples.
# NOTE(review): 'path_to_your_output_file.txt' looks like a placeholder name — confirm.
input_file_path = 'output_file.txt'
output_file_path = 'path_to_your_output_file.txt'
process_tweets(input_file_path, output_file_path)


In [17]:
import csv


 # Prepare data for CSV
def process_txt_file(file_path, output_path='output.csv'):
    """Convert ``<text>, True|False`` lines into a two-column CSV file.

    Each non-blank input line is split on its last ``", "``. The text becomes
    the first column and the label becomes ``"yes"`` (for ``True``) or
    ``"no"`` (for anything else) in the second column.

    Args:
        file_path: Path of the UTF-8 text file to read.
        output_path: Path of the CSV file to write. Defaults to
            ``'output.csv'``.

    Raises:
        ValueError: If a non-blank line contains no ``", "`` separator. The
            message names the file and line number.
    """
    rows = []
    with open(file_path, 'r', encoding='utf-8') as infile:
        for line_number, entry in enumerate(infile, start=1):
            entry = entry.strip()
            if not entry:
                # Blank lines carry no sample; skip them instead of failing.
                continue
            text, sep, label = entry.rpartition(", ")
            if not sep:
                raise ValueError(
                    f"{file_path}:{line_number}: expected '<text>, <label>', got {entry!r}"
                )
            rows.append((text, "yes" if label == "True" else "no"))

    # newline='' lets the csv module control line endings itself.
    with open(output_path, 'w', newline='', encoding='utf-8') as outfile:
        writer = csv.writer(outfile)
        writer.writerow(['Text', 'True/False'])
        writer.writerows(rows)

    print(f"Data has been written to {output_path}.")

# Convert the merged "text, label" lines of output_file.txt into a CSV file.
process_txt_file('output_file.txt')


Data has been written to output.csv.


In [18]:
import csv
import re

def clean_text(text):
    """Delete URLs, hashtags and @mentions, then trim the result.

    Args:
        text: The text to clean.

    Returns:
        str: The text without ``http://`` / ``https://`` links, ``#hashtags``
        and ``@mentions``, with leading and trailing whitespace stripped.
    """
    # Applied in this order: links first, then hashtags, then mentions.
    removal_patterns = (
        r'http[s]?://\S+',
        r'#\w+',
        r'@\w+',
    )
    for pattern in removal_patterns:
        text = re.sub(pattern, '', text)
    return text.strip()

def process_csv(input_file, output_file):
    """Clean the first column of a CSV file and write the result to a new file.

    The first row is treated as the header and copied unchanged. For every
    following row the first column is passed through ``clean_text``; all
    remaining columns are copied as they are.

    Args:
        input_file: Path of the UTF-8 CSV file to read.
        output_file: Path of the cleaned CSV file to write.
    """
    # newline='' leaves line-ending handling to the csv module, so quoted
    # fields that contain line breaks are read back intact.
    with open(input_file, 'r', newline='', encoding='utf-8') as infile:
        reader = csv.reader(infile)
        header = next(reader, None)  # None when the input has no rows at all
        # csv.reader yields an empty list for a blank line; skip those.
        rows = [row for row in reader if row]

    # Clean text in the CSV data
    cleaned_rows = [[clean_text(row[0]), *row[1:]] for row in rows]

    # Write cleaned data to a new CSV file
    with open(output_file, 'w', newline='', encoding='utf-8') as outfile:
        writer = csv.writer(outfile)
        if header is not None:
            writer.writerow(header)
        writer.writerows(cleaned_rows)

    print("Cleaned data has been written to", output_file)

# Clean the text column of output.csv and store the result in cleaned_output.csv.
process_csv('output.csv', 'cleaned_output.csv')


Cleaned data has been written to cleaned_output.csv


In [2]:
import re
import emoji

# Runs of characters that are handed to emoji.demojize(). Compiled once at
# definition time instead of on every call.
_EMOJI_RUN_RE = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # Emoticons
    "\U0001F300-\U0001F5FF"  # Miscellaneous Symbols and Pictographs
    "\U0001F680-\U0001F6FF"  # Transport and Map Symbols
    "\U0001F700-\U0001F77F"  # Alchemical Symbols
    "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
    "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
    "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
    "\U0001FA00-\U0001FA6F"  # Chess Symbols
    "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
    "\U00002702-\U000027B0"  # Dingbats
    # Very broad catch-all: besides symbols it also spans most of the BMP
    # above U+24C2 (including CJK). Presumably harmless here because
    # demojize is expected to leave non-emoji characters as they are —
    # TODO confirm.
    "\U000024C2-\U0001F251"
    # Zero-width joiner: keeps multi-part emoji (family, profession, flag
    # variants, ...) in ONE run so they are described as a single emoji
    # instead of being cut into their components.
    "\u200d"
    "]+",
    flags=re.UNICODE,
)


def replace_emojis_with_descriptions(text):
    """Replace emojis in ``text`` with their textual descriptions.

    Every run of characters matched by ``_EMOJI_RUN_RE`` is passed to
    ``emoji.demojize`` and replaced by its return value.

    Args:
        text: The text to process.

    Returns:
        str: The text with the matched runs replaced. Text that contains no
        matching character is returned unchanged.
    """
    def _describe(match):
        try:
            return emoji.demojize(match.group(0))
        except KeyError:
            return ''  # If emoji not found, replace with an empty string

    return _EMOJI_RUN_RE.sub(_describe, text)

def process_file(input_file_path, output_file_path):
    """Write a copy of a UTF-8 text file with emojis replaced by descriptions.

    Args:
        input_file_path: Path of the text file to read.
        output_file_path: Path of the converted copy to write.
    """
    with open(input_file_path, 'r', encoding='utf-8') as source:
        raw_lines = list(source)

    # Convert everything before the output file is opened, so the output is
    # only touched once all lines have been processed.
    converted = list(map(replace_emojis_with_descriptions, raw_lines))

    with open(output_file_path, 'w', encoding='utf-8') as sink:
        sink.write(''.join(converted))

# Run the emoji-replacement step: Tweet.txt in, cleaned.txt out.
input_path = 'Tweet.txt'
output_path = 'cleaned.txt'
process_file(input_path, output_path)

In [5]:
# Common English contractions and their expansions.
_CONTRACTIONS = {
    "ain't": "am not",
    "aren't": "are not",
    "can't": "cannot",
    "could've": "could have",
    "couldn't": "could not",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'll": "he will",
    "he's": "he is",
    "how'd": "how did",
    "how'll": "how will",
    "how's": "how is",
    "I'd": "I would",
    "I'll": "I will",
    "I'm": "I am",
    "I've": "I have",
    "isn't": "is not",
    "it'd": "it would",
    "it'll": "it will",
    "it's": "it is",
    "let's": "let us",
    "mustn't": "must not",
    "shan't": "shall not",
    "she'd": "she would",
    "she'll": "she will",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "that's": "that is",
    "there's": "there is",
    "they'd": "they would",
    "they'll": "they will",
    "they're": "they are",
    "they've": "they have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'll": "we will",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "where'd": "where did",
    "where's": "where is",
    "who'll": "who will",
    "who's": "who is",
    "won't": "will not",
    "would've": "would have",
    "wouldn't": "would not",
    "you'd": "you would",
    "you'll": "you will",
    "you're": "you are",
    "you've": "you have"
}

# Lookup keyed by the lower-cased contraction so matching can ignore case.
_CONTRACTIONS_LOWER = {key.lower(): value for key, value in _CONTRACTIONS.items()}

# One alternation over all contractions, built once at definition time.
#  - the word boundaries keep "it's" from matching inside "bit's";
#  - each apostrophe also accepts the typographic form (U+2019), which is
#    what many keyboards and phones produce;
#  - longest alternatives first, so a shorter key can never shadow a longer one.
_CONTRACTIONS_RE = re.compile(
    r"\b(?:%s)\b" % "|".join(
        "['\u2019]".join(re.escape(part) for part in key.split("'"))
        for key in sorted(_CONTRACTIONS_LOWER, key=len, reverse=True)
    ),
    flags=re.IGNORECASE,
)


def expand_contractions(text):
    """Replace common English contractions in ``text`` with their long form.

    Matching is case-insensitive and limited to whole words. A contraction
    that starts with an upper-case letter yields an expansion that starts
    with an upper-case letter ("Don't" becomes "Do not").

    Args:
        text: The text to process.

    Returns:
        str: The text with every known contraction expanded.
    """
    def _expand(match):
        found = match.group(0)
        key = found.lower().replace("\u2019", "'")
        expansion = _CONTRACTIONS_LOWER.get(key)
        if expansion is None:
            # Case-insensitive matching can accept a few non-ASCII look-alike
            # letters whose lower-case form is not a key; leave those as is.
            return found
        if found[0].isupper():
            expansion = expansion[0].upper() + expansion[1:]
        return expansion

    return _CONTRACTIONS_RE.sub(_expand, text)

# Read cleaned.txt. It is written as UTF-8, so it is read back as UTF-8
# instead of relying on the platform's locale encoding.
with open('cleaned.txt', 'r', encoding='utf-8') as infile:
    lines = infile.readlines()

# Apply expand_contractions() function to each line
expanded_lines = [expand_contractions(line) for line in lines]

# Write the expanded content to final.txt
with open('final.txt', 'w', encoding='utf-8') as outfile:
    outfile.writelines(expanded_lines)


In [7]:
import re

# Define a function to remove special characters
def remove_special_characters(text):
    """Drop every character that is not a word character, whitespace or one of , . !

    Args:
        text: The text to filter.

    Returns:
        str: The text with all other characters deleted.
    """
    disallowed = re.compile(r'[^\w\s,.!\n]')
    return disallowed.sub('', text)

# Read final.txt as UTF-8 so the result does not depend on the platform's
# locale encoding.
with open('final.txt', 'r', encoding='utf-8') as infile:
    lines = infile.readlines()

# Apply remove_special_characters() function to each line
cleaned_lines = [remove_special_characters(line) for line in lines]

# Write the cleaned content to final_cleaned.txt
with open('final_cleaned.txt', 'w', encoding='utf-8') as outfile:
    outfile.writelines(cleaned_lines)
