### Importing the pickle dataframe files

In [1]:
import pandas as pd
import re

ai_df = pd.read_pickle("AI_Generated_data.pkl")
human_df = pd.read_pickle("Human_Written_data.pkl")

### Cleaning the Text

In [2]:
# Function to clean text
def clean_text(text):
    # Removing non-standard characters, keeping letters, numbers, basic punctuation and spaces
    text = re.sub(r'[^a-zA-Z0-9,.!? ]+', '', text)
    text = re.sub(r'(\d+)', r' \1 ', text)
    # Making large spaces to a single space
    text = re.sub(r'\s+', ' ', text)
    # Trimming any leading and trailing spaces
    text = text.strip()
    return text

# Cleaning text in AI generated format
ai_df['ai_text'] = ai_df['ai_text'].apply(clean_text)
human_df['text'] = human_df['text'].apply(clean_text)

In [3]:
ai_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1045 entries, 0 to 1044
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   id       1045 non-null   int64 
 1   ai_text  1045 non-null   object
 2   label    1045 non-null   object
dtypes: int64(1), object(2)
memory usage: 24.6+ KB


In [4]:
human_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1045 entries, 0 to 1060
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   url     1045 non-null   object
 1   title   1045 non-null   object
 2   text    1045 non-null   object
 3   label   1045 non-null   object
 4   id      1045 non-null   int64 
dtypes: int64(1), object(4)
memory usage: 49.0+ KB


In [5]:
ai_df

Unnamed: 0,id,ai_text,label
0,1,"In the aftermath of the Nakaba, the Palestinia...",AI-written
1,2,The Rafal crossing is a major source of humani...,AI-written
2,3,Hezbollah has also said that it has launched r...,AI-written
3,4,"A number of people were injured, including a w...",AI-written
4,5,"Nadeem Anjarwalla, the regiona of Nigerias cap...",AI-written
...,...,...,...
1040,1041,The film is an exploration of how the human mi...,AI-written
1041,1042,The film is a gripping and moving portrait of ...,AI-written
1042,1043,,AI-written
1043,1044,Director Frank Darabont has made a prison fabl...,AI-written


In [6]:
human_df

Unnamed: 0,url,title,text,label,id
0,https://www.aljazeera.com/opinions/2024/3/27/b...,Beit Daras and Gaza: An intergenerational tale...,"On this day 76 years ago, my ancestral village...",Human-written,1
1,https://www.aljazeera.com/news/2024/3/27/israe...,"Israel’s war on Gaza: List of key events, day 173","Heres how things stand on Wednesday,March 27 ,...",Human-written,2
2,https://www.aljazeera.com/news/2024/3/27/hezbo...,Hezbollah launches rocket barrage after Israel...,Hezbollah has said it launched dozens of rocke...,Human-written,3
3,https://www.aljazeera.com/features/2024/3/26/s...,South Sudan on the brink after oil exports der...,Violence and insecurity could worsen in South ...,Human-written,4
4,https://www.aljazeera.com/news/2024/3/25/binan...,Binance executive detained in Nigeria in crypt...,An executive of cryptocurrency exchange Binanc...,Human-written,5
...,...,...,...,...,...
1056,https://www.imdb.com/title/tt0111161/reviews,Finding Beauty in the Shadows,It is a cinematic masterpiece that delves into...,Human-written,1041
1057,https://www.imdb.com/title/tt0111161/reviews,The Shawshank Journey,It takes audiences on an unforgettable cinemat...,Human-written,1042
1058,https://www.imdb.com/title/tt0111161/reviews,A Timeless Tale of Hope and Resilience,It is a timeless tale of hope and resilience t...,Human-written,1043
1059,https://www.imdb.com/title/tt0111161/reviews,Freeman gives it depth,Andy Dufresne Tim Robbins is a banker convicte...,Human-written,1044


### Capping the max. words per text to the max. AI-Written text word count (to keep the inputs balanced)

In [7]:
max_ai_text_length = ai_df['ai_text'].str.len().max()
max_ai_text_word_count = ai_df['ai_text'].apply(lambda x: len(x.split())).max()

print(f"Maximum AI-Written text length (characters): {max_ai_text_length}")
print(f"Maximum AI-Written text word count: {max_ai_text_word_count}")

Maximum AI-Written text length (characters): 5117
Maximum AI-Written text word count: 862


In [8]:
# Function to shorten texts to a maximum number of words
def shorten_text_to_max_words(text, max_words):
    words = text.split()
    return ' '.join(words[:max_words])

# Shortening the texts in human_df to a rounded number of the maximum word count found in AI-Written texts - 900 words
human_df['text'] = human_df['text'].apply(lambda x: shorten_text_to_max_words(x, 900))

In [9]:
ai_df_renamed = ai_df.rename(columns={'ai_text': 'text'})
human_df_renamed = human_df[['text', 'label']]  # Selecting only the 'text' and 'label' columns

merged_df = pd.concat([ai_df_renamed[['text', 'label']], human_df_renamed], ignore_index=True)

In [10]:
merged_df

Unnamed: 0,text,label
0,"In the aftermath of the Nakaba, the Palestinia...",AI-written
1,The Rafal crossing is a major source of humani...,AI-written
2,Hezbollah has also said that it has launched r...,AI-written
3,"A number of people were injured, including a w...",AI-written
4,"Nadeem Anjarwalla, the regiona of Nigerias cap...",AI-written
...,...,...
2085,It is a cinematic masterpiece that delves into...,Human-written
2086,It takes audiences on an unforgettable cinemat...,Human-written
2087,It is a timeless tale of hope and resilience t...,Human-written
2088,Andy Dufresne Tim Robbins is a banker convicte...,Human-written


### Checking for any missing values and duplicates

In [11]:
duplicate_rows = merged_df.duplicated().sum()
print(f"Number of duplicate rows: {duplicate_rows}")

Number of duplicate rows: 25


In [12]:
merged_df = merged_df.drop_duplicates() 

In [13]:
duplicate_rows = merged_df.duplicated().sum()
print(f"Number of duplicate rows: {duplicate_rows}")

Number of duplicate rows: 0


In [14]:
values_missing = merged_df.isnull().sum()
print(values_missing)

text     0
label    0
dtype: int64


In [15]:
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2065 entries, 0 to 2089
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    2065 non-null   object
 1   label   2065 non-null   object
dtypes: object(2)
memory usage: 48.4+ KB


### Exporting the Final Dataframe

In [16]:
merged_df.to_pickle("Final_Dataset.pkl")