Fae News Detection

In [1]:
!pip install scikit-learn pandas joblib



In [3]:
# Step 2: Load the dataset
import pandas as pd

fake_df = pd.read_csv('/content/Fake.csv')
real_df = pd.read_csv('/content/True.csv')

fake_df['label'] = 0  # fake
real_df['label'] = 1  # real

# Step 3: Combine & prepare
df = pd.concat([fake_df, real_df], axis=0).sample(frac=1).reset_index(drop=True)
df['content'] = df['title'] + " " + df['text']
df = df[['content', 'label']]

In [4]:
# Step 4: Vectorize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

X_train, X_test, y_train, y_test = train_test_split(df['content'], df['label'], test_size=0.2, random_state=42)

vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [5]:
# Step 5: Train classifier
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score

model = PassiveAggressiveClassifier(max_iter=50)
model.fit(X_train_vec, y_train)

preds = model.predict(X_test_vec)
print("Accuracy:", accuracy_score(y_test, preds))

Accuracy: 0.9944320712694877


In [7]:
# Step 6: Save model + vectorizer
import joblib
joblib.dump(model, "fake_news_model.pkl")
joblib.dump(vectorizer, "vectorizer.pkl")

['vectorizer.pkl']

Deepfake Detection

In [8]:
from google.colab import files
files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"reyaoberoi2005","key":"5d1198e29869a78f461d887eca44be53"}'}

In [9]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [11]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("sanikatiwarekar/deep-fake-detection-dfd-entire-original-dataset")

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/deep-fake-detection-dfd-entire-original-dataset


In [19]:
!ls /kaggle/input/deep-fake-detection-dfd-entire-original-dataset

 DFD_manipulated_sequences  'DFD_original sequences'


In [14]:
original_path = "/kaggle/input/deep-fake-detection-dfd-entire-original-dataset/DFD_original sequences"
manipulated_path = "/kaggle/input/deep-fake-detection-dfd-entire-original-dataset/DFD_manipulated_sequences/DFD_manipulated_sequences"

In [20]:
import os

# List the root directory contents
print("Root contents:", os.listdir(path))

# Check original and manipulated folders if they exist
orig_path = os.path.join(path, "DFD_original sequences")
manip_path = os.path.join(path, "DFD_manipulated_sequences", "DFD_manipulated_sequences")

print("Original:", os.listdir(orig_path) if os.path.exists(orig_path) else "Not found")
print("Manipulated:", os.listdir(manip_path) if os.path.exists(manip_path) else "Not found")

Root contents: ['DFD_original sequences', 'DFD_manipulated_sequences']
Original: ['26__walking_down_street_outside_angry.mp4', '08__talking_against_wall.mp4', '14__walking_down_indoor_hall_disgust.mp4', '08__walking_down_street_outside_angry.mp4', '05__outside_talking_still_laughing.mp4', '14__exit_phone_room.mp4', '06__walk_down_hall_angry.mp4', '12__outside_talking_still_laughing.mp4', '02__talking_against_wall.mp4', '01__talking_against_wall.mp4', '24__outside_talking_still_laughing.mp4', '19__exit_phone_room.mp4', '03__hugging_happy.mp4', '12__walking_down_indoor_hall_disgust.mp4', '05__walk_down_hall_angry.mp4', '16__kitchen_pan.mp4', '18__walking_down_street_outside_angry.mp4', '16__walking_down_indoor_hall_disgust.mp4', '12__podium_speech_happy.mp4', '08__outside_talking_still_laughing.mp4', '03__secret_conversation.mp4', '27__kitchen_pan.mp4', '16__exit_phone_room.mp4', '11__talking_against_wall.mp4', '09__walk_down_hall_angry.mp4', '13__outside_talking_pan_laughing.mp4', '14__

In [21]:
from glob import glob
from collections import Counter

def list_extensions(folder):
    files = glob(os.path.join(folder, "**", "*.*"), recursive=True)
    return Counter([os.path.splitext(f)[-1].lower() for f in files])

print("Original extensions:", list_extensions(orig_path))
print("Manipulated extensions:", list_extensions(manip_path))

Original extensions: Counter({'.mp4': 363})
Manipulated extensions: Counter({'.mp4': 3068})


In [23]:
def extract_frames_from_videos(input_folder, output_folder, label, every_n_seconds=1, max_videos=200):
    os.makedirs(output_folder, exist_ok=True)

    video_counter = 0

    for root, _, files in os.walk(input_folder):
        for file in sorted(files):  # sort for reproducibility
            if file.endswith(".mp4"):
                if video_counter >= max_videos:
                    break

                video_path = os.path.join(root, file)
                cap = cv2.VideoCapture(video_path)
                fps = cap.get(cv2.CAP_PROP_FPS)
                frame_interval = int(fps * every_n_seconds)

                frame_count = 0
                saved_count = 0
                success = True

                while success:
                    success, frame = cap.read()
                    if not success:
                        break
                    if frame_count % frame_interval == 0:
                        frame_name = f"{label}_{file[:-4]}_frame{frame_count}.jpg"
                        save_path = os.path.join(output_folder, frame_name)
                        cv2.imwrite(save_path, frame)
                        saved_count += 1
                    frame_count += 1

                cap.release()
                print(f"[{label.upper()}] Extracted {saved_count} frames from {file}")
                video_counter += 1

In [25]:
extract_frames_from_videos(real_video_folder, "data/real", label="real", every_n_seconds=1, max_videos=50)
extract_frames_from_videos(fake_video_folder, "data/fake", label="fake", every_n_seconds=1, max_videos=50)

[REAL] Extracted 13 frames from 01__exit_phone_room.mp4
[REAL] Extracted 33 frames from 01__hugging_happy.mp4
[REAL] Extracted 24 frames from 01__kitchen_pan.mp4
[REAL] Extracted 34 frames from 01__kitchen_still.mp4
[REAL] Extracted 44 frames from 01__meeting_serious.mp4
[REAL] Extracted 27 frames from 01__outside_talking_pan_laughing.mp4
[REAL] Extracted 36 frames from 01__outside_talking_still_laughing.mp4
[REAL] Extracted 38 frames from 01__podium_speech_happy.mp4
[REAL] Extracted 41 frames from 01__secret_conversation.mp4
[REAL] Extracted 36 frames from 01__talking_against_wall.mp4
[REAL] Extracted 63 frames from 01__talking_angry_couch.mp4
[REAL] Extracted 16 frames from 01__walk_down_hall_angry.mp4
[REAL] Extracted 46 frames from 01__walking_and_outside_surprised.mp4
[REAL] Extracted 35 frames from 01__walking_down_indoor_hall_disgust.mp4
[REAL] Extracted 26 frames from 01__walking_down_street_outside_angry.mp4
[REAL] Extracted 13 frames from 01__walking_outside_cafe_disgusted.mp

In [27]:
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Dropout
from tensorflow.keras.optimizers import Adam
import os

# Paths and parameters
data_dir = "data"
img_size = (224, 224)
batch_size = 32

# Image augmentation
datagen = ImageDataGenerator(
    rescale=1./255,
    validation_split=0.2,
    shear_range=0.1,
    zoom_range=0.1,
    horizontal_flip=True
)

train_gen = datagen.flow_from_directory(
    data_dir,
    target_size=img_size,
    batch_size=batch_size,
    class_mode='binary',
    subset='training',
    shuffle=True
)

val_gen = datagen.flow_from_directory(
    data_dir,
    target_size=img_size,
    batch_size=batch_size,
    class_mode='binary',
    subset='validation'
)

# Load MobileNetV2 as base model
base_model = MobileNetV2(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

# Freeze base model
base_model.trainable = False

# Custom head
x = base_model.output
x = GlobalAveragePooling2D()(x)
x = Dropout(0.3)(x)
x = Dense(64, activation='relu')(x)
x = Dropout(0.3)(x)
predictions = Dense(1, activation='sigmoid')(x)

model = Model(inputs=base_model.input, outputs=predictions)

# Compile
model.compile(optimizer=Adam(learning_rate=1e-4), loss='binary_crossentropy', metrics=['accuracy'])

# Train
history = model.fit(
    train_gen,
    validation_data=val_gen,
    epochs=3  # You can increase later
)

# Save model
model.save("deepfake_detector.h5")

Found 4181 images belonging to 2 classes.
Found 1044 images belonging to 2 classes.
Epoch 1/3
[1m131/131[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m324s[0m 2s/step - accuracy: 0.7424 - loss: 0.5394 - val_accuracy: 0.6111 - val_loss: 0.7034
Epoch 2/3
[1m131/131[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m329s[0m 3s/step - accuracy: 0.8579 - loss: 0.3510 - val_accuracy: 0.5364 - val_loss: 0.8988
Epoch 3/3
[1m131/131[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m321s[0m 2s/step - accuracy: 0.9076 - loss: 0.2548 - val_accuracy: 0.5402 - val_loss: 0.9662


