In [87]:
# importing the libraries
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
plt.style.use('fivethirtyeight')

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.metrics import classification_report, roc_auc_score
from transformers import BertTokenizer, BertModel
import torch
from tqdm import tqdm


In [11]:
#reading the fake data from csv
df = pd.read_csv("fake.csv")

In [12]:
#homogenising the data
df['Is_Private'] = df['Is_Private'].astype(str).str.lower().map({'true': 1, 'false': 0, 'yes': 1, 'no': 0}).fillna(0).astype(int)
df['Real'] = df['Real'].astype(str).str.lower().map({'true': 1, 'false': 0, 'yes': 1, 'no': 0}).fillna(0).astype(int)

In [13]:
df

Unnamed: 0,Username,Full_Name,Bio,Followers,Following,numof_Posts,Is_Private,Real
0,mastercoolthebest51,Master Cool,Building my empire one post at a time.,59,4120,6,1,0
1,cool69854,Cool,"Catch flights, not feelings. fakepromo.com",87,4344,18,0,0
2,masterluv283,Master,"Exploring life, one selfie at a time.",64,2236,19,0,0
3,ultrathebest72,Ultra,Just another internet wanderer. linktr.ee/fake,27,4942,6,1,0
4,queen12384,Queen,"Here for a good time, not a long time.",43,4520,13,0,0
...,...,...,...,...,...,...,...,...
227,brodriguez589,Kelli Mitchell,"Dream big, act louder 🎬✨",1318,39,21,0,0
228,wmurphy275,Alexandra Sutton,"Luxury living, even if it’s rented 😎🏝️",689,69,64,0,0
229,hochristopher605,Sarah Mills,"Volume up, worries down 🎵🔊",799,682,56,0,0
230,thompsondarius317,Chris Richardson,100% genuine fake follower 😅📉,2023,38,70,0,0


In [17]:
#creating a copy for manipulation
df_nlp=df.copy()
df_nlp

Unnamed: 0,Username,Full_Name,Bio,Followers,Following,numof_Posts,Is_Private,Real
0,mastercoolthebest51,Master Cool,Building my empire one post at a time.,59,4120,6,1,0
1,cool69854,Cool,"Catch flights, not feelings. fakepromo.com",87,4344,18,0,0
2,masterluv283,Master,"Exploring life, one selfie at a time.",64,2236,19,0,0
3,ultrathebest72,Ultra,Just another internet wanderer. linktr.ee/fake,27,4942,6,1,0
4,queen12384,Queen,"Here for a good time, not a long time.",43,4520,13,0,0
...,...,...,...,...,...,...,...,...
227,brodriguez589,Kelli Mitchell,"Dream big, act louder 🎬✨",1318,39,21,0,0
228,wmurphy275,Alexandra Sutton,"Luxury living, even if it’s rented 😎🏝️",689,69,64,0,0
229,hochristopher605,Sarah Mills,"Volume up, worries down 🎵🔊",799,682,56,0,0
230,thompsondarius317,Chris Richardson,100% genuine fake follower 😅📉,2023,38,70,0,0


In [23]:
#dropping not useful columns
df_nlp.drop('Followers',axis=1,inplace=True)
df_nlp.drop('Following',axis=1,inplace=True)
df_nlp.drop('numof_Posts',axis=1,inplace=True)
df_nlp.drop('Is_Private',axis=1,inplace=True)

In [24]:
df_nlp

Unnamed: 0,Username,Full_Name,Bio,Real
0,mastercoolthebest51,Master Cool,Building my empire one post at a time.,0
1,cool69854,Cool,"Catch flights, not feelings. fakepromo.com",0
2,masterluv283,Master,"Exploring life, one selfie at a time.",0
3,ultrathebest72,Ultra,Just another internet wanderer. linktr.ee/fake,0
4,queen12384,Queen,"Here for a good time, not a long time.",0
...,...,...,...,...
227,brodriguez589,Kelli Mitchell,"Dream big, act louder 🎬✨",0
228,wmurphy275,Alexandra Sutton,"Luxury living, even if it’s rented 😎🏝️",0
229,hochristopher605,Sarah Mills,"Volume up, worries down 🎵🔊",0
230,thompsondarius317,Chris Richardson,100% genuine fake follower 😅📉,0


In [33]:
#doing the same procedure for real data 
df2= pd.read_csv("real.csv")

In [34]:
df2['Is_Private'] = df2['Is_Private'].astype(str).str.lower().map({'true': 1, 'false': 0, 'yes': 1, 'no': 0}).fillna(0).astype(int)


In [35]:
df2_nlp= df2.copy()

In [36]:
df2_nlp.drop('Followers',axis=1,inplace=True)
df2_nlp.drop('Following',axis=1,inplace=True)
df2_nlp.drop('numof_Posts',axis=1,inplace=True)
df2_nlp.drop('Is_Private',axis=1,inplace=True)

In [37]:
df2_nlp

Unnamed: 0,Username,Full_Name,Bio,Real
0,yowai_mew,totoro,it is what it is *throws up blood*,1.0
1,sam_ambastha,Samiksha Ambastha,-We gon' be alright,1.0
2,alisha_waghmare_,Alishawaghmare,Nagpur | Pune📍,1.0
3,shm.wii._.k.ri,Shambhawi,,1.0
4,yashvi.exe,yashvi raghuvanshi,IITG'27\nprofessional fun haver,1.0
...,...,...,...,...
239,sushruth__03_,SUSHRUT B.,IIT BHU '24\n-A Fun time in hell awaits.,1.0
240,nimishasingh19,Nimisha Singh,My instagram self is a little unhinged 🎀\nIIT ...,1.0
241,ananyabajpayi4,Ananya Bajpayi,,1.0
242,adityaujjwal,Aditya Ujjwal,"IIT BHU '24 | ECE\nfluent in sarcasm, expert i...",1.0


In [38]:
#combining the datasets to create a mixed data
cdf = pd.concat([df_nlp, df2_nlp], ignore_index=True)
cdf = cdf.sample(frac=1).reset_index(drop=True)

In [39]:
cdf

Unnamed: 0,Username,Full_Name,Bio,Real
0,zjordan71,Richard Johnson,Manga stacks over money stacks 📚💴,0.0
1,_artisticbends_,Sneha,"It's not what you look at that matters,\nIt's ...",1.0
2,matthew_wilkinson8549,Matthew Wilkinson,Within Congress agreement worry including kid ...,0.0
3,fitness_lot396,Jason Coleman,Sweat. Smile. Repeat 💪🏼🏋️ #LoveIt,0.0
4,subhashree_kedia,Subhashree Kedia,"IITGWT’27 \nCuttack | Guwahati \nChaotic mind,...",1.0
...,...,...,...,...
471,mrinayakdas,Mrinayak Das,,1.0
472,javier_rodriguez9123,Javier Rodriguez,Drive blood her writer politics course.,0.0
473,clinejoseph280,Victoria Green,Color palettes for your soul 🎭🌈,0.0
474,crazydarkvip905,Crazy Dark,"Click the link, change your life. bit.ly/fakepage",0.0


In [40]:
# storing the combined-shuffeled data into a csv
cdf.to_csv("combined_data.csv", index=False)

In [54]:
#further cleaning of data
cdf = cdf.fillna("")
cdf= cdf[cdf['Real'].astype(str).str.strip() != '']
cdf['Real'] = cdf['Real'].astype(float).astype(int)

In [55]:
#using bert as a feature extractor
#create a complete sentence to be fed into tokenizer
def combine_text(row):
    return f"Full Name: {row['Full_Name']} [SEP] Username: {row['Username']} [SEP] Bio: {row['Bio']}"

In [56]:
#using bert tokenizer
cdf['bert_input'] = cdf.apply(combine_text, axis=1)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')
model.eval()

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [57]:
#generating and storing the embeddings of data
def get_bert_embedding(text):
    with torch.no_grad():
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
        outputs = model(**inputs)
        cls_embedding = outputs.last_hidden_state[:, 0, :].squeeze().numpy()  # [CLS] token
        
    return cls_embedding
    
bert_embeddings = []
for text in tqdm(cdf['bert_input'], desc="Embedding with BERT"):
    emb = get_bert_embedding(text)
    bert_embeddings.append(emb)

Embedding with BERT: 100%|███████████████████████████████████████████████████████████| 475/475 [00:34<00:00, 13.92it/s]


In [58]:
#splitting the data into test and train set
X = np.array(bert_embeddings)
y = cdf['Real'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [59]:
#training the model
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

In [60]:
#running the model on test data
y_pred = rf.predict(X_test)
y_proba = rf.predict_proba(X_test)[:, 1]

In [61]:
#performance metrics
print("\n✅ Classification Report:")
print(classification_report(y_test, y_pred))
print(f"ROC AUC Score: {roc_auc_score(y_test, y_proba):.4f}")


✅ Classification Report:
              precision    recall  f1-score   support

           0       0.93      1.00      0.96        54
           1       1.00      0.90      0.95        41

    accuracy                           0.96        95
   macro avg       0.97      0.95      0.96        95
weighted avg       0.96      0.96      0.96        95

ROC AUC Score: 0.9948


In [62]:
#saving the model for future use
import joblib
joblib.dump(rf, "random_forest_model.pkl")
np.save("bert_features.npy", X)  # Save features if needed

In [71]:
#using tdfvectorize + logistic regression
def combine_text(row):
    return f"Full Name: {row['Full_Name']} Username: {row['Username']} Bio: {row['Bio']}"

cdf['combined_text'] = cdf.apply(combine_text, axis=1)

# Train/Test split
X = cdf['combined_text']
y = cdf['Real'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [72]:
#creating a pipeline 
pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(max_features=1000, ngram_range=(1, 2))),
    ("clf", LogisticRegression(solver='liblinear', random_state=42))
])

# Train the model
pipeline.fit(X_train, y_train)


In [74]:
#running predictions on the test data
y_pred = pipeline.predict(X_test)
y_proba = pipeline.predict_proba(X_test)[:, 1]

In [75]:
#evaluation metrics
print("\n✅ Classification Report:")
print(classification_report(y_test, y_pred))
print(f"ROC AUC Score: {roc_auc_score(y_test, y_proba):.4f}")


✅ Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.78      0.85        54
           1       0.76      0.93      0.84        41

    accuracy                           0.84        95
   macro avg       0.85      0.85      0.84        95
weighted avg       0.86      0.84      0.84        95

ROC AUC Score: 0.9512


In [76]:
#saving models for future use
import joblib
joblib.dump(pipeline, "tfidf_logreg_model.pkl")

['tfidf_logreg_model.pkl']