<a href="https://colab.research.google.com/github/shreel143/TweekfakeLLM/blob/main/TweepfakeAndLLMs_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Importing dataset from Kaggle

In [None]:
from google.colab import files
files.upload()

In [2]:
!pip install kaggle
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json



In [7]:
!kaggle datasets download -d mtesconi/twitter-deep-fake-text

Downloading twitter-deep-fake-text.zip to /content
  0% 0.00/345k [00:00<?, ?B/s]
100% 345k/345k [00:00<00:00, 77.1MB/s]


In [8]:
!unzip twitter-deep-fake-text.zip

Archive:  twitter-deep-fake-text.zip
  inflating: test.csv                
  inflating: train.csv               
  inflating: validation.csv          


## Data Pre-Processing:




In [9]:
import pandas as pd

In [10]:
# Load datasets
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
validation_df = pd.read_csv('validation.csv')

In [13]:
# concatenate the datasets into a single DataFrame
combined_df = pd.concat([train_df, test_df, validation_df]).reset_index(drop=True)

In [21]:
print("CONCATENATED DATASET INFO:")
print(combined_df.info())

CONCATENATED DATASET INFO:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25572 entries, 0 to 25571
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   user_id       25572 non-null  object
 1   status_id     25572 non-null  object
 2   screen_name   25572 non-null  object
 3   account.type  25572 non-null  object
 4   class_type    25572 non-null  object
dtypes: object(5)
memory usage: 999.0+ KB
None


In [15]:
print(combined_df)

                   user_id            status_id screen_name account.type  \
0      1110407881030017024  1208265880146046976       bot#9          bot   
1               3171109449  1091463908118941696    human#17        human   
2      1110686081341632512  1199055191028293633      bot#23          bot   
3      1110307772783124480  1214698264701722626       bot#1          bot   
4       979586167405363200  1209229478934695937      bot#11          bot   
...                    ...                  ...         ...          ...   
25567   705113652471439361   714523361305608192      bot#16          bot   
25568            262794965   935057601103933441     human#8        human   
25569            343587159  1158520796039405569     human#1        human   
25570  1110407881030017024  1210364706457677824       bot#9          bot   
25571  1213988022728810496  1219363974657052675       bot#8          bot   

      class_type  
0         others  
1          human  
2         others  
3         o

In [16]:
# Filter for AI-generated texts
ai_texts = combined_df[combined_df['account.type'] == 'bot'].sample(n=1000, random_state=1)

# Filter for human-generated texts
human_texts = combined_df[combined_df['account.type'] == 'human'].sample(n=1000, random_state=1)

# Combine the filtered subsets
filtered_df = pd.concat([ai_texts, human_texts]).reset_index(drop=True)

In [18]:
print("FILTERED DATASET INFO:")
print(filtered_df.info())
print(filtered_df)

FILTERED DATASET INFO:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   user_id       2000 non-null   object
 1   status_id     2000 non-null   object
 2   screen_name   2000 non-null   object
 3   account.type  2000 non-null   object
 4   class_type    2000 non-null   object
dtypes: object(5)
memory usage: 78.2+ KB
None
                  user_id            status_id screen_name account.type  \
0      979586167405363200  1183560338110681088      bot#11          bot   
1     1174980053668524033  1205648849483845632       bot#4          bot   
2      901463089622781952   942333693074464770      bot#10          bot   
3      901463089622781952  1070175417405853696      bot#10          bot   
4      979586167405363200  1141462951041220608      bot#11          bot   
...                   ...                  ...         ...          ...   
1995  

In [19]:
# Converting the filtered dataset into csv file
filtered_df.to_csv('filtered_dataset.csv', index=False)

In [20]:
# Downloading the filtered dataset containing 1k enteries of each type
from google.colab import files
files.download('filtered_dataset.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Data Cleaning And Preperation:

In [22]:
# Displaying unique values in the 'account.type' column
unique_values = filtered_df['account.type'].unique()
print(unique_values)

['bot' 'human']


In [23]:
# Defining a mapping dictionary that covers all variations found in the unique values
label_mapping = {
    'human': 0,
    'bot': 1,
}

In [24]:
# Applying the mapping to the 'account.type' column
filtered_df['account.type'] = filtered_df['account.type'].map(label_mapping)

In [33]:
print("FILTERED DATASET INFO:")
print(filtered_df.info())
print(filtered_df)

FILTERED DATASET INFO:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   user_id       2000 non-null   object
 1   status_id     2000 non-null   object
 2   screen_name   2000 non-null   object
 3   account.type  2000 non-null   int64 
 4   class_type    2000 non-null   object
dtypes: int64(1), object(4)
memory usage: 78.2+ KB
None
                  user_id            status_id screen_name  account.type  \
0      979586167405363200  1183560338110681088      bot#11             1   
1     1174980053668524033  1205648849483845632       bot#4             1   
2      901463089622781952   942333693074464770      bot#10             1   
3      901463089622781952  1070175417405853696      bot#10             1   
4      979586167405363200  1141462951041220608      bot#11             1   
...                   ...                  ...         ...       

In [44]:
# Downloading this new filtered dataset with mapping
filtered_df.to_csv('filtered_dataset.csv', index=False)



In [None]:
from google.colab import files
files.download('filtered_dataset.csv')

## Splitting the dataset into train, test and validation

In [45]:
# Splitting the data into train, validation, and test sets

from sklearn.model_selection import train_test_split

In [49]:
X = filtered_df.drop(columns=['account.type'])  # Features (input)
y = filtered_df['account.type']  # Labels (output), already encoded as 0 and 1

# Spliting data into training and remaining data (70% training, 30% for val and test)
X_train, X_remaining, y_train, y_remaining = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)

# Spliting remaining data into validation and test sets (50% each of remaining data)
X_val, X_test, y_val, y_test = train_test_split(X_remaining, y_remaining, test_size=0.5, stratify=y_remaining, random_state=42)


In [50]:
print("FEATURES INFO (X):")
print(X)
print(X.info())

FEATURES INFO (X):
                  user_id            status_id screen_name class_type
0      979586167405363200  1183560338110681088      bot#11        rnn
1     1174980053668524033  1205648849483845632       bot#4       gpt2
2      901463089622781952   942333693074464770      bot#10     others
3      901463089622781952  1070175417405853696      bot#10     others
4      979586167405363200  1141462951041220608      bot#11        rnn
...                   ...                  ...         ...        ...
1995           3171109449  1037895376819372034    human#17      human
1996   721352073925627907  1212463519922151425     human#7      human
1997            343587159  1155018440727339009     human#1      human
1998   721352073925627907  1271835676154253313     human#7      human
1999             25073877  1206335971974959107    human#13      human

[2000 rows x 4 columns]
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 4 columns):
 #   Colum

In [51]:
print("LABELS INFO (y):")
print(y)
print(y.info())

LABELS INFO (y):
0       1
1       1
2       1
3       1
4       1
       ..
1995    0
1996    0
1997    0
1998    0
1999    0
Name: account.type, Length: 2000, dtype: int64
<class 'pandas.core.series.Series'>
RangeIndex: 2000 entries, 0 to 1999
Series name: account.type
Non-Null Count  Dtype
--------------  -----
2000 non-null   int64
dtypes: int64(1)
memory usage: 15.8 KB
None


In [53]:
# Combining features and labels into DataFrames
train_df = pd.concat([X_train, y_train], axis=1)
val_df = pd.concat([X_val, y_val], axis=1)
test_df = pd.concat([X_test, y_test], axis=1)

In [56]:
print("TRAIN SPLIT:")
print(train_df)

TRAIN SPLIT:
                  user_id            status_id screen_name class_type  \
1593             15088390  1080874603608629248    human#10      human   
785   1110686081341632512  1150102628056981504      bot#23     others   
729   1197716977982177280  1201000163218804736      bot#17       gpt2   
403   1197916267975335939  1206504620710879232      bot#12        rnn   
890   1110407881030017024  1213429905486073856       bot#9     others   
...                   ...                  ...         ...        ...   
962   1110407881030017024  1195295412510326784       bot#9     others   
1452             18839785  1173916332880007170    human#11      human   
719   1218019760669020160  1221160823239987202      bot#13       gpt2   
51     979586167405363200  1167283084233863169      bot#11        rnn   
311   1110686081341632512  1153364116553097219      bot#23     others   

      account.type  
1593             0  
785              1  
729              1  
403              1  
890  

In [57]:
print("TEST SPLIT:")
print(train_df)

TEST SPLIT:
                  user_id            status_id screen_name class_type  \
1593             15088390  1080874603608629248    human#10      human   
785   1110686081341632512  1150102628056981504      bot#23     others   
729   1197716977982177280  1201000163218804736      bot#17       gpt2   
403   1197916267975335939  1206504620710879232      bot#12        rnn   
890   1110407881030017024  1213429905486073856       bot#9     others   
...                   ...                  ...         ...        ...   
962   1110407881030017024  1195295412510326784       bot#9     others   
1452             18839785  1173916332880007170    human#11      human   
719   1218019760669020160  1221160823239987202      bot#13       gpt2   
51     979586167405363200  1167283084233863169      bot#11        rnn   
311   1110686081341632512  1153364116553097219      bot#23     others   

      account.type  
1593             0  
785              1  
729              1  
403              1  
890   

In [58]:
print("VALIDATION SPLIT:")
print(val_df)

VALIDATION SPLIT:
                  user_id            status_id screen_name class_type  \
897   1110407881030017024  1175590571194253314       bot#9     others   
765   1148250906166726656  1199866315504308224       bot#2       gpt2   
521   1174980053668524033  1206879709826617344       bot#4       gpt2   
421   1174980053668524033  1228924470611652608       bot#4       gpt2   
937    979586167405363200  1182261782322139136      bot#11        rnn   
...                   ...                  ...         ...        ...   
987   1174980053668524033  1225310825122074624       bot#4       gpt2   
1240             15088390  1022500549084602373    human#10      human   
502    901463089622781952   941004935193169920      bot#10     others   
1803  1033142532136415232  1157009060681736192     human#2      human   
395   1110407881030017024  1200474537223000064       bot#9     others   

      account.type  
897              1  
765              1  
521              1  
421              1  


In [59]:
# Save to CSV files
train_df.to_csv('/content/final_train.csv', index=False)
val_df.to_csv('/content/final_validation.csv', index=False)
test_df.to_csv('/content/final_test.csv', index=False)

In [60]:
# Downloading the CSV files
!zip -r /content/final_data_splits.zip /content/final_train.csv /content/final_validation.csv /content/final_test.csv


  adding: content/final_train.csv (deflated 73%)
  adding: content/final_validation.csv (deflated 70%)
  adding: content/final_test.csv (deflated 70%)


In [61]:
from google.colab import files
files.download('/content/final_data_splits.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>