In [2]:
from google.colab import files


uploaded = files.upload()


Saving bot_detection_data.csv to bot_detection_data.csv


Data Loading and Preprocessing

In [20]:
import pandas as pd
import re
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from nltk.corpus import stopwords

nltk.download('punkt')
nltk.download('stopwords')

def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)
    return text

# load the data
df = pd.read_csv('/content/bot_detection_data.csv')
df['cleaned_tweet'] = df['Tweet'].apply(clean_text)



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [21]:
df.head()

Unnamed: 0,User ID,Username,Tweet,Retweet Count,Mention Count,Follower Count,Verified,Bot Label,Location,Created At,Hashtags,cleaned_tweet
0,132131,flong,Station activity person against natural majori...,85,1,2353,False,1,Adkinston,2020-05-11 15:29:50,,station activity person against natural majori...
1,289683,hinesstephanie,Authority research natural life material staff...,55,5,9617,True,0,Sanderston,2022-11-26 05:18:10,both live,authority research natural life material staff...
2,779715,roberttran,Manage whose quickly especially foot none to g...,6,2,4363,True,0,Harrisonfurt,2022-08-08 03:16:54,phone ahead,manage whose quickly especially foot none to g...
3,696168,pmason,Just cover eight opportunity strong policy which.,54,5,2242,True,1,Martinezberg,2021-08-14 22:27:05,ever quickly new I,just cover eight opportunity strong policy which
4,704441,noah87,Animal sign six data good or.,26,3,8438,False,1,Camachoville,2020-04-13 21:24:21,foreign mention,animal sign six data good or


In [22]:
#adding our own data in missing hastag rows
#Replaces missing values in the 'Hashtags' column with 'No Hashtag'.
missing_values_mask=df['Hashtags'].isnull()
#print(missing_values_mask)
df.loc[missing_values_mask,'Hashtags']='No Hashtag'#add No Hashtag at missing cell
print(df['Hashtags'])

0                             No Hashtag
1                              both live
2                            phone ahead
3                     ever quickly new I
4                        foreign mention
                      ...               
49995    teach quality ten education any
49996             add walk among believe
49997            onto admit artist first
49998                               star
49999                               home
Name: Hashtags, Length: 50000, dtype: object


In [23]:
#Converts the 'Created At' column to datetime format.

df['Created At']=pd.to_datetime(df['Created At'], format='%Y-%m-%d %H:%M')
print(df['Created At'].dtypes)

datetime64[ns]


In [24]:
# Displays summary statistics for numerical columns in the DataFrame.
df.describe()

Unnamed: 0,User ID,Retweet Count,Mention Count,Follower Count,Bot Label
count,50000.0,50000.0,50000.0,50000.0,50000.0
mean,548890.68054,50.0056,2.51376,4988.60238,0.50036
std,259756.681425,29.18116,1.708563,2878.742898,0.500005
min,100025.0,0.0,0.0,0.0,0.0
25%,323524.25,25.0,1.0,2487.75,0.0
50%,548147.0,50.0,3.0,4991.5,1.0
75%,772983.0,75.0,4.0,7471.0,1.0
max,999995.0,100.0,5.0,10000.0,1.0


In [25]:
# Assuming you have a DataFrame named 'data' with multiple columns

# List of columns to drop
columns_to_drop = ['User ID', 'Retweet Count', 'Mention Count','Follower Count','Verified','Bot Label','Hashtags']

# Drop the specified columns from the DataFrame
data_dropped = df.drop(columns=columns_to_drop)

# Get value counts for each remaining column
column_counts = {}
for column in data_dropped.columns:
    column_counts[column] = data_dropped[column].value_counts()

# Print the value counts for each column
for column, counts in column_counts.items():
    print(f"Value counts for {column}:")
    print(counts)
    print()

Value counts for Username:
ksmith             21
usmith             16
msmith             16
vmiller            15
ismith             13
                   ..
jessica57           1
ggraham             1
john93              1
gallowaymichael     1
daniel29            1
Name: Username, Length: 40566, dtype: int64

Value counts for Tweet:
Station activity person against natural majority none few size expect six marriage.        1
Institution second billion over song either arm.                                           1
However plan meeting certain dinner card produce wear whether give hour something.         1
Total least today until clear nearly economy book single with successful.                  1
Full likely beautiful example partner process top catch control natural lead push help.    1
                                                                                          ..
News society threat positive someone accept stand pressure life so describe pretty.        1
Station son

In [26]:
#identify inconsistent formatting for textual(categorical too) columns

print(df['Hashtags'].value_counts())

No Hashtag                            8341
area                                    21
big                                     20
treat                                   19
ground                                  18
                                      ... 
president conference field process       1
market live mouth sit wide               1
your five                                1
serious not Democrat                     1
onto admit artist first                  1
Name: Hashtags, Length: 34248, dtype: int64


Exploratory Data Analysis

In [27]:
# Filter data based on bot label
bot_data = df[df['Bot Label'] == 1]
non_bot_data = df[df['Bot Label'] == 0]

In [28]:
# Calculate statistics for followers, retweets, and mentions
bot_followers_stats = bot_data['Follower Count'].describe()
non_bot_followers_stats = non_bot_data['Follower Count'].describe()

bot_retweet_stats = bot_data['Retweet Count'].describe()
non_bot_retweet_stats = non_bot_data['Retweet Count'].describe()

bot_mention_stats = bot_data['Mention Count'].describe()
non_bot_mention_stats = non_bot_data['Mention Count'].describe()

#Display Statistics
print("Follower Count Statistics:")
print("Bot Accounts:")
print(bot_followers_stats)
print("\nNon-Bot Accounts:")
print(non_bot_followers_stats)

print("\nRetweet Count Statistics:")
print("Bot Accounts:")
print(bot_retweet_stats)
print("\nNon-Bot Accounts:")
print(non_bot_retweet_stats)

print("\nMention Count Statistics:")
print("Bot Accounts:")
print(bot_mention_stats)
print("\nNon-Bot Accounts:")
print(non_bot_mention_stats)

Follower Count Statistics:
Bot Accounts:
count    25018.000000
mean      4991.944280
std       2876.289818
min          0.000000
25%       2497.000000
50%       4978.000000
75%       7468.000000
max      10000.000000
Name: Follower Count, dtype: float64

Non-Bot Accounts:
count    24982.000000
mean      4985.255664
std       2881.251104
min          0.000000
25%       2480.250000
50%       5007.500000
75%       7472.000000
max      10000.000000
Name: Follower Count, dtype: float64

Retweet Count Statistics:
Bot Accounts:
count    25018.000000
mean        50.042050
std         29.171048
min          0.000000
25%         25.000000
50%         50.000000
75%         75.000000
max        100.000000
Name: Retweet Count, dtype: float64

Non-Bot Accounts:
count    24982.000000
mean        49.969098
std         29.191822
min          0.000000
25%         25.000000
50%         50.000000
75%         75.000000
max        100.000000
Name: Retweet Count, dtype: float64

Mention Count Statistics:
Bot

Label Encoding

In [29]:
#label encoding of boolean data
from sklearn.preprocessing import LabelEncoder
#Uses label encoding to convert boolean data in the 'Verified' column to numerical format.
label_encoder=LabelEncoder()
df['Verified']=label_encoder.fit_transform(df['Verified'])
df

Unnamed: 0,User ID,Username,Tweet,Retweet Count,Mention Count,Follower Count,Verified,Bot Label,Location,Created At,Hashtags,cleaned_tweet
0,132131,flong,Station activity person against natural majori...,85,1,2353,0,1,Adkinston,2020-05-11 15:29:50,No Hashtag,station activity person against natural majori...
1,289683,hinesstephanie,Authority research natural life material staff...,55,5,9617,1,0,Sanderston,2022-11-26 05:18:10,both live,authority research natural life material staff...
2,779715,roberttran,Manage whose quickly especially foot none to g...,6,2,4363,1,0,Harrisonfurt,2022-08-08 03:16:54,phone ahead,manage whose quickly especially foot none to g...
3,696168,pmason,Just cover eight opportunity strong policy which.,54,5,2242,1,1,Martinezberg,2021-08-14 22:27:05,ever quickly new I,just cover eight opportunity strong policy which
4,704441,noah87,Animal sign six data good or.,26,3,8438,0,1,Camachoville,2020-04-13 21:24:21,foreign mention,animal sign six data good or
...,...,...,...,...,...,...,...,...,...,...,...,...
49995,491196,uberg,Want but put card direction know miss former h...,64,0,9911,1,1,Lake Kimberlyburgh,2023-04-20 11:06:26,teach quality ten education any,want but put card direction know miss former half
49996,739297,jessicamunoz,Provide whole maybe agree church respond most ...,18,5,9900,0,1,Greenbury,2022-10-18 03:57:35,add walk among believe,provide whole maybe agree church respond most ...
49997,674475,lynncunningham,Bring different everyone international capital...,43,3,6313,1,1,Deborahfort,2020-07-08 03:54:08,onto admit artist first,bring different everyone international capital...
49998,167081,richardthompson,Than about single generation itself seek sell ...,45,1,6343,0,0,Stephenside,2022-03-22 12:13:44,star,than about single generation itself seek sell ...


In [30]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   User ID         50000 non-null  int64         
 1   Username        50000 non-null  object        
 2   Tweet           50000 non-null  object        
 3   Retweet Count   50000 non-null  int64         
 4   Mention Count   50000 non-null  int64         
 5   Follower Count  50000 non-null  int64         
 6   Verified        50000 non-null  int64         
 7   Bot Label       50000 non-null  int64         
 8   Location        50000 non-null  object        
 9   Created At      50000 non-null  datetime64[ns]
 10  Hashtags        50000 non-null  object        
 11  cleaned_tweet   50000 non-null  object        
dtypes: datetime64[ns](1), int64(6), object(5)
memory usage: 4.6+ MB


Text Vectorization using TF-IDF

In [31]:
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
# Combine all text columns into a single Series
text_data = df['Tweet'] + ' ' + df['Username'] + ' ' + df['Hashtags']+' '+df['Location']

# Text vectorization using TF-IDF (sparse representation)
vectorizer = TfidfVectorizer()
text_sparse = vectorizer.fit_transform(text_data)

In [32]:
# Combine text features with additional features

additional_features = df[['Retweet Count', 'Verified','Mention Count','Follower Count','Created At']]

In [33]:
additional_features['Created At'] = additional_features['Created At'].astype(int)  # Convert to Unix timestamp
additional_features = additional_features.astype('float64')  # Convert to float64
text_sparse = text_sparse.astype('float64')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  additional_features['Created At'] = additional_features['Created At'].astype(int)  # Convert to Unix timestamp


In [34]:
combined_sparse = hstack((text_sparse, additional_features))

Random Forest Classifier

In [35]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(combined_sparse, df['Bot Label'], test_size=0.2, random_state=42)

# Create an instance of the Random Forest classifier
rf_classifier = RandomForestClassifier()

# Train the Random Forest classifier
rf_classifier.fit(X_train, y_train)

In [36]:
# Make predictions on the testing data
y_pred = rf_classifier.predict(X_test)

# Evaluate the model
from sklearn.metrics import accuracy_score, classification_report

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Classification Report:\n{report}")

Accuracy: 0.4938
Classification Report:
              precision    recall  f1-score   support

           0       0.49      0.53      0.51      4968
           1       0.50      0.46      0.48      5032

    accuracy                           0.49     10000
   macro avg       0.49      0.49      0.49     10000
weighted avg       0.49      0.49      0.49     10000



Prediction for New Data

In [37]:
# Example: Predict labels for new data
from datetime import datetime

# New data (example)
new_data = pd.DataFrame({

    'Tweet': ['Just cover eight opportunity strong policy which.'],

    'Username': ['pmason'],

    'Hashtags': ['neever quickly new Iw'],

    'Retweet Count': [54],

    'Verified': [1],

    'Location' : ['Martinezberg'],

    'Mention Count' : [5],

    'Follower Count' : [2242],

    'Created At' : ['14-08-2021  22:27:00']
})




# Concatenate text features

new_text_data = new_data['Tweet'] + ' ' + new_data['Username'] + ' ' + new_data['Hashtags']+' '+new_data['Location']




# Perform feature vectorization on new text features

new_text_sparse = vectorizer.transform(new_text_data)




# Combine new text features with additional features

new_additional_features = new_data[['Retweet Count', 'Verified','Mention Count','Follower Count','Created At']]

new_additional_features['Created At'] = pd.to_datetime(new_additional_features['Created At'])
new_additional_features['Created At'] = new_additional_features['Created At'].apply(lambda x: int(datetime.timestamp(x)))

# new_additional_features = new_additional_features.astype('float64')


# new_additional_features['Created At'] = new_additional_features['Created At'].astype(int)  # Convert to Unix timestamp
new_additional_features = new_additional_features.astype('float64')

new_combined_sparse = hstack((new_text_sparse, new_additional_features))




# Predict labels for the new data

new_predictions = rf_classifier.predict(new_combined_sparse)


# Print the predictions

for i, prediction in enumerate(new_predictions):

    if prediction == 1:

        print(f"Data point {i+1}: Bot")

    else:

        print(f"Data point {i+1}: Not Bot")

Data point 1: Bot


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_additional_features['Created At'] = pd.to_datetime(new_additional_features['Created At'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_additional_features['Created At'] = new_additional_features['Created At'].apply(lambda x: int(datetime.timestamp(x)))
