In [19]:
import pandas as pd
import numpy as np
import emoji
import re

# Read the responsesLlama.csv dataset into a DataFrame.
responses_csv_path = "../../datasets/GeneratedDatasets/responsesLlama.csv"
llama3csv = pd.read_csv(responses_csv_path)

In [20]:
# Drop every row whose "Generated Response" starts with "I cannot" or "I can't".
# na=False makes a missing response count as "not a refusal": without it
# str.startswith yields NaN for such rows and the `~` negation raises TypeError.
generated = llama3csv["Generated Response"]
is_refusal = (
    generated.str.startswith("I cannot", na=False)
    | generated.str.startswith("I can't", na=False)
)
# .copy() detaches the filtered frame from the one it was sliced from, so
# later column assignments do not trigger SettingWithCopyWarning.
llama3csv = llama3csv[~is_refusal].copy()


In [21]:
# Strip a leading "Here ...:" preamble from a generated response.
def cutBeforeColon(text):
    """Return the stripped text after the first colon when ``text`` starts with "Here".

    Non-string values, strings that do not start with "Here", and strings
    without any colon are returned unchanged.
    """
    if not isinstance(text, str) or not text.startswith("Here"):
        return text
    _, colon, remainder = text.partition(':')
    # An empty separator means there was no colon to cut at.
    return remainder.strip() if colon else text


In [22]:
# Run cutBeforeColon over every value of the "Generated Response" column
# and write the result back into the frame.
generated_column = 'Generated Response'
data = llama3csv[generated_column].apply(cutBeforeColon)
llama3csv[generated_column] = data


In [23]:
#convert emojis to text
import emoji

def replace_emojis_with_text(text):
    """Replace emojis in ``text`` with a plain-text description.

    Steps, in order:
      1. remove the variation selector U+FE0F and the skin-tone modifiers
         U+1F3FB..U+1F3FF,
      2. pass the text through ``emoji.demojize``,
      3. turn every colon into a space and delete every underscore. This
         applies to the whole string, not only to demojized emoji codes.

    Non-string values (e.g. NaN or None from a missing cell) are returned
    unchanged instead of raising a TypeError.
    """
    if not isinstance(text, str):
        return text
    # Strip variation selector VS16 and skin-tone modifiers before conversion.
    text = re.sub(r'[\U0001F3FB-\U0001F3FF\uFE0F]', '', text)
    # Use emoji.demojize to convert the emojis to text.
    text = emoji.demojize(text)
    # Plain str.replace is enough for single literal characters.
    text = text.replace(':', ' ')
    text = text.replace('_', '')
    return text

# Convert emojis in BOTH text columns so generated responses and original
# tweets are normalised the same way. Running the conversion a second time on
# "Generated Response" would change nothing, because the first pass already
# removed the colons and underscores.
llama3csv["Generated Response"] = llama3csv["Generated Response"].apply(replace_emojis_with_text)
llama3csv["Original Tweet"] = llama3csv["Original Tweet"].apply(replace_emojis_with_text)


In [24]:
# Remove all double and single quotes from both text columns.
# Both substitutions go through the .str accessor: Series.replace("'", "")
# only replaces cells whose ENTIRE value equals "'", so it never strips
# quotes embedded in longer text.
for column in ("Generated Response", "Original Tweet"):
    llama3csv[column] = (
        llama3csv[column]
        .str.replace('"', "", regex=False)
        .str.replace("'", "", regex=False)
    )

In [25]:
# Strip URLs from both text columns: first everything starting with "http",
# then everything starting with "www", each up to the next whitespace.
for column in ("Generated Response", "Original Tweet"):
    without_http = llama3csv[column].str.replace(r'http\S+', '', regex=True)
    llama3csv[column] = without_http.str.replace(r'www\S+', '', regex=True)

In [26]:
# Delete user mentions: an "@" followed by any run of non-whitespace characters.
for column in ("Generated Response", "Original Tweet"):
    llama3csv[column] = llama3csv[column].str.replace(r'@\S+', '', regex=True)

In [27]:
# Replace each newline character with a single space in both text columns.
for column in ("Generated Response", "Original Tweet"):
    llama3csv[column] = llama3csv[column].str.replace(r'\n', ' ', regex=True)

In [28]:
# Replace each tab character with a single space in both text columns.
for column in ("Generated Response", "Original Tweet"):
    llama3csv[column] = llama3csv[column].str.replace(r'\t', ' ', regex=True)

In [29]:
# Delete every character that is neither a word character nor whitespace.
# Note: "_" counts as a word character for \w, so underscores are kept here.
for column in ("Generated Response", "Original Tweet"):
    llama3csv[column] = llama3csv[column].str.replace(r'[^\w\s]', '', regex=True)

In [30]:
# Delete every run of digits from both text columns.
for column in ("Generated Response", "Original Tweet"):
    llama3csv[column] = llama3csv[column].str.replace(r'\d+', '', regex=True)

In [31]:
# Lower-case both text columns.
for column in ("Generated Response", "Original Tweet"):
    llama3csv[column] = llama3csv[column].str.lower()

In [32]:
# Collapse every run of whitespace into one single space in both text columns.
for column in ("Generated Response", "Original Tweet"):
    llama3csv[column] = llama3csv[column].str.replace(r'\s+', ' ', regex=True)

In [33]:
#remove stopwords
from nltk.corpus import stopwords
import nltk
# Make sure the NLTK stop-word corpus is available locally.
nltk.download('stopwords')
stop = stopwords.words('english')
# Membership tests against a set are O(1) per token instead of a scan over
# the whole list; `stop` itself stays a list for any later use.
stop_set = set(stop)


def remove_stopwords(text):
    """Return ``text`` without the whitespace-separated tokens that are in ``stop_set``.

    The remaining tokens are re-joined with single spaces.
    """
    # NOTE(review): punctuation has already been stripped at this point, so
    # any stop-word entries containing an apostrophe (presumably e.g. "don't")
    # can no longer match. Confirm whether that is intended.
    return ' '.join(word for word in text.split() if word not in stop_set)


llama3csv["Generated Response"] = llama3csv["Generated Response"].apply(remove_stopwords)
llama3csv["Original Tweet"] = llama3csv["Original Tweet"].apply(remove_stopwords)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/romanoelfken/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [34]:
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from nltk.tag import pos_tag

# Download necessary NLTK data files
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
# word_tokenize (called in lemmatize_text) relies on the Punkt tokenizer
# models; without this download a fresh environment fails with LookupError.
# NOTE(review): recent NLTK releases may expect the resources under the names
# 'punkt_tab' / 'averaged_perceptron_tagger_eng'. Confirm against the
# installed version.
nltk.download('punkt')

# One shared lemmatizer instance, reused for every token.
lemmatizer = WordNetLemmatizer()

def get_wordnet_pos(tag):
    """Map a POS tag to a WordNet part-of-speech constant by its first letter.

    'J' -> adjective, 'V' -> verb, 'R' -> adverb; every other tag
    (including those starting with 'N') maps to noun.
    """
    pos_by_initial = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'R': wordnet.ADV,
    }
    # tag[:1] is '' for an empty tag, which falls through to the noun default.
    return pos_by_initial.get(tag[:1], wordnet.NOUN)

def lemmatize_text(text):
    """Tokenize ``text``, POS-tag the tokens and return the lemmas joined by spaces.

    Each token is lemmatized with the WordNet part of speech that
    get_wordnet_pos derives from its tag.
    """
    lemmas = []
    for word, pos in pos_tag(word_tokenize(text)):
        lemmas.append(lemmatizer.lemmatize(word, get_wordnet_pos(pos)))
    return ' '.join(lemmas)

# Lemmatize both text columns.
for column in ("Generated Response", "Original Tweet"):
    llama3csv[column] = llama3csv[column].apply(lemmatize_text)



[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/romanoelfken/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/romanoelfken/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [35]:
# Label generated responses with 1 (artificial) and original tweets with 0,
# then stack both into a single two-column frame ('text', 'label').
gr = llama3csv["Generated Response"].copy()
ot = llama3csv["Original Tweet"].copy()

gr_df = pd.DataFrame({'text': gr, 'label': 1})
ot_df = pd.DataFrame({'text': ot, 'label': 0})

# ignore_index=True gives the stacked frame a fresh 0..n-1 index.
combined_df = pd.concat([gr_df, ot_df], ignore_index=True, sort=False)

# Shuffle all rows. The fixed random_state makes the row order, and therefore
# the file written afterwards, identical on every run.
llama3csv = combined_df.sample(frac=1, random_state=42).reset_index(drop=True)

In [36]:
# Write the preprocessed frame to disk without the index column.
preprocessed_csv_path = "../StorePreprocessed/Llama3csv.csv"
llama3csv.to_csv(preprocessed_csv_path, index=False)