In [187]:
!pip install better-profanity


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [188]:
import pandas as pd
import time
import random
import sys
import os
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
# Import necessary libraries
import nltk
from better_profanity import profanity  # Using better_profanity for profanity check
import textstat
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import joblib

## Step 1: Load the Saved Model

In [189]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch

# Specify the path to the saved model
model_folder = "/Users/shusritavenugopal/Documents/MSIS/FALL2024/InformationStorageAndRetrieval/video_classification_project/saved_BERT_pretrained_models/model_20241211_133348_v839069"

# Load the model and tokenizer
model = AutoModelForSequenceClassification.from_pretrained(model_folder)
tokenizer = AutoTokenizer.from_pretrained(model_folder)

## Step 2: Load the Sample for Inference

In [190]:
sample = "../data/raw/video_information_with_transcripts_network.xlsx"
sample_df = pd.read_excel(sample, engine='openpyxl')  # Existing file

## Step 2.1: Delete Duplicates

In [191]:
# Check for duplicates
print("Number of duplicates:", sample_df.duplicated().sum())

Number of duplicates: 0


In [192]:
# Remove duplicates
sample_df = sample_df.drop_duplicates()

# Confirm duplicates are removed
print("Number of duplicates after cleaning:", sample_df.duplicated().sum())

Number of duplicates after cleaning: 0


In [193]:
sample_df.columns

Index(['Video ID', 'Title', 'Description', 'Published At', 'Channel Title',
       'Tags', 'Category ID', 'Category Label', 'Default Audio Language',
       'Transcript'],
      dtype='object')

In [194]:
# Rename specific columns
sample_df = sample_df.rename(columns={
    "Video ID": "video_id",
    "Title": "title",
    "Description": "description",
    'Published At': "published_at",
    'Channel Title': "channel_title",
    'Tags': "tags",
    'Category ID': "category_id",
    'Category Label': "category_label",
    'Default Audio Language': "default_audio_language",
    'Transcript': "transcript"
})

print(sample_df.head())

      video_id                                              title  \
0  ju4KQT0wL0I                                      All The Stars   
1  7YaqzpitBXw            What are MLPs (Multilayer Perceptrons)?   
2  ZQbWWOzvyfo                                            Verizon   
3  u4gEBRSKi2E  Is Being Fat A Choice? Fit Women vs Fat Women ...   
4  hCcwCv3G1FQ                                         St. Chroma   

                                         description          published_at  \
0  Provided to YouTube by Universal Music Group\n...  2018-12-12T09:55:47Z   
1  Learn about watsonx: https://ibm.biz/BdvxRg\n\...  2022-07-11T12:00:07Z   
2  Switch to Fios Home Internet w/ no hidden fees...  2024-07-03T02:03:05Z   
3  Got injured in an accident? You could be one  ...  2024-10-16T16:01:03Z   
4  Provided to YouTube by Columbia\n\nSt. Chroma ...  2024-10-28T10:02:12Z   

                channel_title  \
0      Kendrick Lamar - Topic   
1              IBM Technology   
2                

## Step 2.2: Construct the necessary column

In [195]:
# Concatenate relevant fields into a dictionary-like structure for analysis
def get_text_dict(row):
    return {
        "title": row.get("title", ""),
        "description": row.get("description", ""),
        "transcript": row.get("transcript", ""),
        "category_label": row.get("category_label", ""),
        "channel_title": row.get("channel_title", ""),
    }

# Create a dictionary from text fields for analysis
sample_df["text_dict"] = sample_df.apply(get_text_dict, axis=1)
# Concatenate all text fields into a single string, handling missing values
sample_df["combined_text"] = sample_df["text_dict"].apply(lambda d: " ".join(str(v) for v in d.values() if pd.notnull(v)))

sample_df.head(5)

Unnamed: 0,video_id,title,description,published_at,channel_title,tags,category_id,category_label,default_audio_language,transcript,text_dict,combined_text
0,ju4KQT0wL0I,All The Stars,Provided to YouTube by Universal Music Group\n...,2018-12-12T09:55:47Z,Kendrick Lamar - Topic,"Kendrick Lamar, ã‚±ãƒ³ãƒ‰ãƒªãƒƒã‚¯ãƒ©ãƒžãƒ¼, ã‚±ãƒ³ãƒ‰ãƒªãƒƒã‚¯ãƒ»ãƒ©ãƒžãƒ¼, SZA, ã‚·ã‚¶...",10,Music,,Transcript not available,"{'title': 'All The Stars', 'description': 'Pro...",All The Stars Provided to YouTube by Universal...
1,7YaqzpitBXw,What are MLPs (Multilayer Perceptrons)?,Learn about watsonx: https://ibm.biz/BdvxRg\n\...,2022-07-11T12:00:07Z,IBM Technology,"IBM, IBM Cloud, AI, Artificial Intelligence, M...",27,Education,en-US,You've probably heard of AI that can do really...,{'title': 'What are MLPs (Multilayer Perceptro...,What are MLPs (Multilayer Perceptrons)? Learn ...
2,ZQbWWOzvyfo,Verizon,Switch to Fios Home Internet w/ no hidden fees...,2024-07-03T02:03:05Z,Verizon,,28,Science & Technology,en-US,[Music] my internet my rules introducing my ho...,"{'title': 'Verizon', 'description': 'Switch to...",Verizon Switch to Fios Home Internet w/ no hid...
3,u4gEBRSKi2E,Is Being Fat A Choice? Fit Women vs Fat Women ...,Got injured in an accident? You could be one ...,2024-10-16T16:01:03Z,Jubilee,"jubilee, jubilee media, jubilee project, middl...",24,Entertainment,en,if you see me you consider me obese I wouldn't...,{'title': 'Is Being Fat A Choice? Fit Women vs...,Is Being Fat A Choice? Fit Women vs Fat Women ...
4,hCcwCv3G1FQ,St. Chroma,Provided to YouTube by Columbia\n\nSt. Chroma ...,2024-10-28T10:02:12Z,"Tyler, The Creator - Topic","Tyler, The Creator Daniel Caesar CHROMAKOPIA S...",10,Music,,Transcript not available,"{'title': 'St. Chroma', 'description': 'Provid...",St. Chroma Provided to YouTube by Columbia\n\n...


In [196]:
sample_df.columns

Index(['video_id', 'title', 'description', 'published_at', 'channel_title',
       'tags', 'category_id', 'category_label', 'default_audio_language',
       'transcript', 'text_dict', 'combined_text'],
      dtype='object')

## Step 3: Preprocess the feature column (X)
### Text pre-processing

In [197]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re

# Download necessary NLTK data
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# Initialize tools
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Function to preprocess text
def preprocess_text(text):
    # Lowercase the text
    text = text.lower()
    
    # Remove special characters, numbers, and punctuation
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    
    # Tokenize the text
    tokens = word_tokenize(text)
    
    # Remove stop words and lemmatize tokens
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    
    # Rejoin tokens into a single string
    return ' '.join(tokens)

# Apply preprocessing to all text in X
sample_df["combined_text"] = sample_df["combined_text"].apply(preprocess_text)

# Check a few samples after preprocessing
print(sample_df.head())

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/shusritavenugopal/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/shusritavenugopal/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/shusritavenugopal/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


      video_id                                              title  \
0  ju4KQT0wL0I                                      All The Stars   
1  7YaqzpitBXw            What are MLPs (Multilayer Perceptrons)?   
2  ZQbWWOzvyfo                                            Verizon   
3  u4gEBRSKi2E  Is Being Fat A Choice? Fit Women vs Fat Women ...   
4  hCcwCv3G1FQ                                         St. Chroma   

                                         description          published_at  \
0  Provided to YouTube by Universal Music Group\n...  2018-12-12T09:55:47Z   
1  Learn about watsonx: https://ibm.biz/BdvxRg\n\...  2022-07-11T12:00:07Z   
2  Switch to Fios Home Internet w/ no hidden fees...  2024-07-03T02:03:05Z   
3  Got injured in an accident? You could be one  ...  2024-10-16T16:01:03Z   
4  Provided to YouTube by Columbia\n\nSt. Chroma ...  2024-10-28T10:02:12Z   

                channel_title  \
0      Kendrick Lamar - Topic   
1              IBM Technology   
2                

## BERT Tokenizer

In [198]:
# Load pre-trained BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the 'combined_text' column from your sample_df
def tokenize_texts(texts, tokenizer, max_length=512):
    return tokenizer(
        texts.tolist(),           # Convert to list for batch processing
        padding=True,             # Pad sequences to the max length
        truncation=True,          # Truncate longer texts
        max_length=max_length,   # Maximum length for tokenized input
        return_tensors="pt"      # Return as pytorch tensors
    )

# Tokenize the 'combined_text' column from sample_df
encoded_inputs = tokenize_texts(sample_df["combined_text"], tokenizer)


In [199]:
# Convert to PyTorch Dataset
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, tokenized_inputs):
        self.input_ids = tokenized_inputs['input_ids']
        self.attention_mask = tokenized_inputs['attention_mask']

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return {
            'input_ids': self.input_ids[idx],
            'attention_mask': self.attention_mask[idx],
        }

# Prepare the dataset
predict_dataset = CustomDataset(encoded_inputs)

In [200]:
# Create DataLoader for batching
predict_dataloader = torch.utils.data.DataLoader(predict_dataset, batch_size=64)

# Make predictions without moving to a specific device
predictions = []
with torch.no_grad():
    for batch in predict_dataloader:
        # Get the input_ids and attention_mask from the batch
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']

        # Get model outputs
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        batch_predictions = torch.argmax(logits, dim=1)
        
        predictions.extend(batch_predictions.numpy())  # Collect predictions

# Add predictions to the DataFrame
sample_df["kids_safe_BERT"] = predictions

# Save the DataFrame with predictions to a new file
sample_df.to_excel("predicted_sample_df.xlsx", index=False)
print("Predictions saved to 'predicted_sample_df.xlsx'.")

Predictions saved to 'predicted_sample_df.xlsx'.


In [201]:
sample_df

Unnamed: 0,video_id,title,description,published_at,channel_title,tags,category_id,category_label,default_audio_language,transcript,text_dict,combined_text,kids_safe_BERT
0,ju4KQT0wL0I,All The Stars,Provided to YouTube by Universal Music Group\n...,2018-12-12T09:55:47Z,Kendrick Lamar - Topic,"Kendrick Lamar, ã‚±ãƒ³ãƒ‰ãƒªãƒƒã‚¯ãƒ©ãƒžãƒ¼, ã‚±ãƒ³ãƒ‰ãƒªãƒƒã‚¯ãƒ»ãƒ©ãƒžãƒ¼, SZA, ã‚·ã‚¶...",10,Music,,Transcript not available,"{'title': 'All The Stars', 'description': 'Pro...",star provided youtube universal music group st...,0
1,7YaqzpitBXw,What are MLPs (Multilayer Perceptrons)?,Learn about watsonx: https://ibm.biz/BdvxRg\n\...,2022-07-11T12:00:07Z,IBM Technology,"IBM, IBM Cloud, AI, Artificial Intelligence, M...",27,Education,en-US,You've probably heard of AI that can do really...,{'title': 'What are MLPs (Multilayer Perceptro...,mlps multilayer perceptrons learn watsonx http...,0
2,ZQbWWOzvyfo,Verizon,Switch to Fios Home Internet w/ no hidden fees...,2024-07-03T02:03:05Z,Verizon,,28,Science & Technology,en-US,[Music] my internet my rules introducing my ho...,"{'title': 'Verizon', 'description': 'Switch to...",verizon switch fios home internet w hidden fee...,0
3,u4gEBRSKi2E,Is Being Fat A Choice? Fit Women vs Fat Women ...,Got injured in an accident? You could be one ...,2024-10-16T16:01:03Z,Jubilee,"jubilee, jubilee media, jubilee project, middl...",24,Entertainment,en,if you see me you consider me obese I wouldn't...,{'title': 'Is Being Fat A Choice? Fit Women vs...,fat choice fit woman v fat woman middle ground...,0
4,hCcwCv3G1FQ,St. Chroma,Provided to YouTube by Columbia\n\nSt. Chroma ...,2024-10-28T10:02:12Z,"Tyler, The Creator - Topic","Tyler, The Creator Daniel Caesar CHROMAKOPIA S...",10,Music,,Transcript not available,"{'title': 'St. Chroma', 'description': 'Provid...",st chroma provided youtube columbia st chroma ...,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2033,UuYKUgPQbDE,Subway Surfers The Animated Series | Rewind | ...,ðŸ“± Download Subway Surfers for FREE at: https:/...,2019-03-23T15:00:05Z,Subway Surfers,"subway surfers, subway surfers animated series...",1,Film & Animation,en-US,Transcript not available,{'title': 'Subway Surfers The Animated Series ...,subway surfer animated series rewind jake down...,0
2034,Labo4AdJS_c,Mr Bean Finds Ancient Gold | Mr Bean Animated ...,"Whilst digging for buried treasure, Mr Bean di...",2024-11-10T14:00:24Z,Mr Bean World,"Mr. Bean Cartoon World, funny cartoon, bean ca...",1,Film & Animation,en-GB,Transcript not available,{'title': 'Mr Bean Finds Ancient Gold | Mr Bea...,mr bean find ancient gold mr bean animated ful...,1
2035,5Kr886vhpcc,DELUSIONAL #animation #animationart #insideout...,,2024-08-08T18:29:49Z,Silent Jack,,24,Entertainment,,Transcript not available,{'title': 'DELUSIONAL #animation #animationart...,delusional animation animationart insideout in...,0
2036,T_evwI929vU,The New Norm: AWFUL Anti-Woke Cartoon,Twitter just unveiled their brand new â€œSouth P...,2024-06-28T01:54:26Z,LS Mark,,24,Entertainment,en-GB,Transcript not available,{'title': 'The New Norm: AWFUL Anti-Woke Carto...,new norm awful antiwoke cartoon twitter unveil...,0


In [202]:
# Sample function definitions

# Check for profanity using the better_profanity library
def check_profanity(text):
    return profanity.contains_profanity(text)

def check_themes(text):
    sensitive_keywords = [
        "violence", "drugs", "sex", "explicit", "alcohol", "bomb", "gun", "fuck", 
        "abuse", "die", "kill", "unalive", "poison", "damn", "ghost", "scary", 
        "horror", "blood", "dead", "intimacy", "suggestive", "couple", "controversial", 
        "hot", "injuries", "romantic", "vampire", "shit", "sexy", "battle", "girlfriend", 
        "boyfriend", "scene kiss", "intercourse", "creepy", "naked", 
        "murder", "assault", "war", "massacre", "brutality", "stabbing", "shooting", 
        "gore", "torture", "hostility", "crime", "cocaine", "heroin", "nicotine", 
        "vaping", "addiction", "overdose", "substance abuse", "pills", "intoxicated", 
        "nudity", "erotic", "pornography", "fetish", "sensual", "lust", "strip", 
        "adult", "escort", "provocative", "cleavage", "lingerie", "scream", 
        "haunted", "supernatural", "zombie", "demon", "devil", "satanic", "cursed", 
        "exorcism", "nightmare", "fear", "terror", "trauma", "bullying", "harassment", 
        "suicide", "self-harm", "torture", "exploitation", "manipulation", 
        "oppression", "bastard", "hell", "slut", "whore", "damnation", "crap", 
        "douche", "asshole", "piss", "racism", "sexism", "discrimination", 
        "genocide", "terrorism", "dictatorship", "oppression", "xenophobia", 
        "cult", "stunt", "choking", "prank", "fire", "explosion", "theft", 
        "crime", "arrest", "kidnapping", "hostage", "escape", "affair", "adultery", 
        "infidelity", "lustful", "intimate", "sensual", "crush", "love triangle", 
        "gambling", "casino", "money laundering", "trafficking", "cheating", 
        "virus", "hacking", "dark web"
    ]
    text_lower = text.lower()
    return any(keyword in text_lower for keyword in sensitive_keywords)

# Check readability of the text using Flesch-Kincaid readability test
def check_readability(text):
    score = textstat.flesch_kincaid_grade(text)
    return score <= 5  # Grade level 5 is typically appropriate for kids

# Check sentiment tone using Vader sentiment analysis
def check_tone(text):
    analyzer = SentimentIntensityAnalyzer()
    sentiment_score = analyzer.polarity_scores(text)["compound"]
    return sentiment_score > -0.2  # Positive tone

# Check for cultural sensitivity by looking for offensive phrases
def check_cultural_sensitivity(text):
    offensive_phrases = ["racist", "stereotype", "offensive"]
    text_lower = text.lower()
    return any(phrase in text_lower for phrase in offensive_phrases)

# Analyze the transcript for all criteria
def analyze_transcript(text):
    analysis_results = {
        "profanity": check_profanity(text),
        "themes": check_themes(text),
        "readability": check_readability(text),
        "tone": check_tone(text),
        "cultural_sensitivity": check_cultural_sensitivity(text)
    }
    return analysis_results

# Concatenate relevant fields into a dictionary-like structure for analysis
def get_text_dict(row):
    return {
        "title": row.get("title", ""),
        "description": row.get("description", ""),
        "transcript": row.get("transcript", ""),
        "category_label": row.get("Category Label", ""),
        "channel_title": row.get("channel_title", ""),
    }

# Analyze if content is safe based on keywords in channel title or tags
def is_kids_safe(row):
    # Safely extract and lower-case the channel title and tags
    channel_title = str(row.get("channel_title", "")).lower()  # Convert to string if not already
    tags = str(row.get("tags", "")).lower()  # Convert to string if not already
    
    # Check if any safe keyword exists in the channel title or tags
    if any(keyword.lower() in channel_title for keyword in safe_channels):
        return True
    if any(keyword.lower() in tags for keyword in safe_tags):
        return True
    return False

safe_channels = [
        "kids", "Cartoon Network India", "SpongeBob SquarePants Official", "WB kids", 
        "Masha and The Bear", "Peppa Pig - Official Channel", "Shaun the Sheep Official", 
        "Disney XD", "CARTOON AKG", "cartoon", "Videogyan Shows - Educational Videos For Kids", 
        "Bluey - Official Channel", "Disney Jr.", "Disney", "Mr Bean World", "Cartoonito", 
        "ariki Anime", "anime", "Nick Jr", "nursery", "rhymes", "cocomelon", "Dodo Kids", "Chewy"
    ]
    
safe_tags = [
        "learning", "toddler", "kids", "babies", "preschool", "family friendly", 
        "anime", "cartoon", "animation", "art", "drawing", "cute", "children", 
        "tom and jerry", "spongebob", "Scooby-Doo!", "Tom and Jerry", "Looney Tunes", 
        "Bugs Bunny", "spider-man", "phineas and ferb", "baby"
    ]


In [203]:
# Function to apply analysis to each row, considering all fields and specific keyword checks
def apply_analysis(df):
     # Perform analysis on the combined text
    df["profanity"] = df["combined_text"].apply(check_profanity)
    df["themes"] = df["combined_text"].apply(check_themes)
    df["readability"] = df["combined_text"].apply(check_readability)
    df["tone"] = df["combined_text"].apply(check_tone)
    df["cultural_sensitivity"] = df["combined_text"].apply(check_cultural_sensitivity)
    df["kids_safe_content"] = df.apply(is_kids_safe, axis=1)

    return df

# Apply the analysis functions to the DataFrame
sample_df = apply_analysis(sample_df)

In [204]:
sample_df.head(10)

Unnamed: 0,video_id,title,description,published_at,channel_title,tags,category_id,category_label,default_audio_language,transcript,text_dict,combined_text,kids_safe_BERT,profanity,themes,readability,tone,cultural_sensitivity,kids_safe_content
0,ju4KQT0wL0I,All The Stars,Provided to YouTube by Universal Music Group\n...,2018-12-12T09:55:47Z,Kendrick Lamar - Topic,"Kendrick Lamar, ã‚±ãƒ³ãƒ‰ãƒªãƒƒã‚¯ãƒ©ãƒžãƒ¼, ã‚±ãƒ³ãƒ‰ãƒªãƒƒã‚¯ãƒ»ãƒ©ãƒžãƒ¼, SZA, ã‚·ã‚¶...",10,Music,,Transcript not available,"{'title': 'All The Stars', 'description': 'Pro...",star provided youtube universal music group st...,0,False,False,False,True,False,False
1,7YaqzpitBXw,What are MLPs (Multilayer Perceptrons)?,Learn about watsonx: https://ibm.biz/BdvxRg\n\...,2022-07-11T12:00:07Z,IBM Technology,"IBM, IBM Cloud, AI, Artificial Intelligence, M...",27,Education,en-US,You've probably heard of AI that can do really...,{'title': 'What are MLPs (Multilayer Perceptro...,mlps multilayer perceptrons learn watsonx http...,0,False,True,False,True,False,True
2,ZQbWWOzvyfo,Verizon,Switch to Fios Home Internet w/ no hidden fees...,2024-07-03T02:03:05Z,Verizon,,28,Science & Technology,en-US,[Music] my internet my rules introducing my ho...,"{'title': 'Verizon', 'description': 'Switch to...",verizon switch fios home internet w hidden fee...,0,False,False,False,False,False,False
3,u4gEBRSKi2E,Is Being Fat A Choice? Fit Women vs Fat Women ...,Got injured in an accident? You could be one ...,2024-10-16T16:01:03Z,Jubilee,"jubilee, jubilee media, jubilee project, middl...",24,Entertainment,en,if you see me you consider me obese I wouldn't...,{'title': 'Is Being Fat A Choice? Fit Women vs...,fat choice fit woman v fat woman middle ground...,0,True,True,False,True,False,False
4,hCcwCv3G1FQ,St. Chroma,Provided to YouTube by Columbia\n\nSt. Chroma ...,2024-10-28T10:02:12Z,"Tyler, The Creator - Topic","Tyler, The Creator Daniel Caesar CHROMAKOPIA S...",10,Music,,Transcript not available,"{'title': 'St. Chroma', 'description': 'Provid...",st chroma provided youtube columbia st chroma ...,1,False,False,False,True,False,False
5,eMFuvGuDuBk,Be A Man.,#motivation #hopecore #\nEmbrace your masculin...,2024-04-30T17:33:59Z,Audentes Monarch,"Audentes Monarch, adrenaline, discipline, gym ...",22,People & Blogs,en-US,you're a man you are a man stand up and be a m...,"{'title': 'Be A Man.', 'description': '#motiva...",man motivation hopecore embrace masculinity le...,0,True,True,False,True,False,True
6,Ynu1FqBr3C4,QTMO2968100H_Q3 24_TMO Highlights_iPhone 16 Pr...,Subscribe to the T-Mobile YouTube channel:\nht...,2024-09-24T23:34:47Z,T-Mobile,"tmobile, t-mobile, t mobile, internet, wireles...",28,Science & Technology,en-US,check out Mahomes top three plays of the day h...,{'title': 'QTMO2968100H_Q3 24_TMO Highlights_i...,qtmohq tmo highlightsiphone pro usfamilies sav...,0,False,False,False,True,False,True
7,T5niXqwAnME,Big Muscle Guy Scars RELEASED #health #muscles...,Support the channel with a membership! \nMembe...,2024-06-14T13:30:15Z,Mondragon Chiropractic,"asmr, relax, satisfying",27,Education,en,I don't have to get rid of the muscle just the...,{'title': 'Big Muscle Guy Scars RELEASED #heal...,big muscle guy scar released health muscle kno...,0,False,True,False,True,False,False
8,EBGJuw6-2jE,Top gear x Osama bin ladenðŸ’€ðŸ’€ðŸ’€ #topgear #edit,,2023-11-03T08:23:14Z,Droopl,,22,People & Blogs,,here we are the cameras poised and ready and h...,{'title': 'Top gear x Osama bin ladenðŸ’€ðŸ’€ðŸ’€ #topg...,top gear x osama bin laden topgear edit camera...,0,False,False,False,True,False,False
9,mHZRFs-faX8,Cooking Curry During College Lecture!,Love you guys thank you so much for the suppor...,2024-10-13T02:00:02Z,Fique,"D'Aydrian Harding, Fred Beyer, Ash Alk, Zople,...",24,Entertainment,,[Music] so last video I asked you guys to get ...,{'title': 'Cooking Curry During College Lectur...,cooking curry college lecture love guy thank m...,0,True,True,False,True,False,False


In [205]:
# Drop rows where `kids_safe_BERT` == 1 or `kids_safe_content` == True
filtered_df = sample_df[~((sample_df["kids_safe_BERT"] == 1) | (sample_df["kids_safe_content"] == True))]

filtered_df.head()

Unnamed: 0,video_id,title,description,published_at,channel_title,tags,category_id,category_label,default_audio_language,transcript,text_dict,combined_text,kids_safe_BERT,profanity,themes,readability,tone,cultural_sensitivity,kids_safe_content
0,ju4KQT0wL0I,All The Stars,Provided to YouTube by Universal Music Group\n...,2018-12-12T09:55:47Z,Kendrick Lamar - Topic,"Kendrick Lamar, ã‚±ãƒ³ãƒ‰ãƒªãƒƒã‚¯ãƒ©ãƒžãƒ¼, ã‚±ãƒ³ãƒ‰ãƒªãƒƒã‚¯ãƒ»ãƒ©ãƒžãƒ¼, SZA, ã‚·ã‚¶...",10,Music,,Transcript not available,"{'title': 'All The Stars', 'description': 'Pro...",star provided youtube universal music group st...,0,False,False,False,True,False,False
2,ZQbWWOzvyfo,Verizon,Switch to Fios Home Internet w/ no hidden fees...,2024-07-03T02:03:05Z,Verizon,,28,Science & Technology,en-US,[Music] my internet my rules introducing my ho...,"{'title': 'Verizon', 'description': 'Switch to...",verizon switch fios home internet w hidden fee...,0,False,False,False,False,False,False
3,u4gEBRSKi2E,Is Being Fat A Choice? Fit Women vs Fat Women ...,Got injured in an accident? You could be one ...,2024-10-16T16:01:03Z,Jubilee,"jubilee, jubilee media, jubilee project, middl...",24,Entertainment,en,if you see me you consider me obese I wouldn't...,{'title': 'Is Being Fat A Choice? Fit Women vs...,fat choice fit woman v fat woman middle ground...,0,True,True,False,True,False,False
7,T5niXqwAnME,Big Muscle Guy Scars RELEASED #health #muscles...,Support the channel with a membership! \nMembe...,2024-06-14T13:30:15Z,Mondragon Chiropractic,"asmr, relax, satisfying",27,Education,en,I don't have to get rid of the muscle just the...,{'title': 'Big Muscle Guy Scars RELEASED #heal...,big muscle guy scar released health muscle kno...,0,False,True,False,True,False,False
8,EBGJuw6-2jE,Top gear x Osama bin ladenðŸ’€ðŸ’€ðŸ’€ #topgear #edit,,2023-11-03T08:23:14Z,Droopl,,22,People & Blogs,,here we are the cameras poised and ready and h...,{'title': 'Top gear x Osama bin ladenðŸ’€ðŸ’€ðŸ’€ #topg...,top gear x osama bin laden topgear edit camera...,0,False,False,False,True,False,False


In [206]:
# Drop rows where all the specified conditions are met
filtered_df = filtered_df[~(
    (sample_df["profanity"] == False) &
    (sample_df["themes"] == False) &
    (sample_df["tone"] == True) &
    (sample_df["cultural_sensitivity"] == False)
)]
filtered_df.head()

  filtered_df = filtered_df[~(


Unnamed: 0,video_id,title,description,published_at,channel_title,tags,category_id,category_label,default_audio_language,transcript,text_dict,combined_text,kids_safe_BERT,profanity,themes,readability,tone,cultural_sensitivity,kids_safe_content
2,ZQbWWOzvyfo,Verizon,Switch to Fios Home Internet w/ no hidden fees...,2024-07-03T02:03:05Z,Verizon,,28,Science & Technology,en-US,[Music] my internet my rules introducing my ho...,"{'title': 'Verizon', 'description': 'Switch to...",verizon switch fios home internet w hidden fee...,0,False,False,False,False,False,False
3,u4gEBRSKi2E,Is Being Fat A Choice? Fit Women vs Fat Women ...,Got injured in an accident? You could be one ...,2024-10-16T16:01:03Z,Jubilee,"jubilee, jubilee media, jubilee project, middl...",24,Entertainment,en,if you see me you consider me obese I wouldn't...,{'title': 'Is Being Fat A Choice? Fit Women vs...,fat choice fit woman v fat woman middle ground...,0,True,True,False,True,False,False
7,T5niXqwAnME,Big Muscle Guy Scars RELEASED #health #muscles...,Support the channel with a membership! \nMembe...,2024-06-14T13:30:15Z,Mondragon Chiropractic,"asmr, relax, satisfying",27,Education,en,I don't have to get rid of the muscle just the...,{'title': 'Big Muscle Guy Scars RELEASED #heal...,big muscle guy scar released health muscle kno...,0,False,True,False,True,False,False
9,mHZRFs-faX8,Cooking Curry During College Lecture!,Love you guys thank you so much for the suppor...,2024-10-13T02:00:02Z,Fique,"D'Aydrian Harding, Fred Beyer, Ash Alk, Zople,...",24,Entertainment,,[Music] so last video I asked you guys to get ...,{'title': 'Cooking Curry During College Lectur...,cooking curry college lecture love guy thank m...,0,True,True,False,True,False,False
10,yeSbUxW2M-4,clips that made Kai Cenat famous,clips that made Kai Cenat famous\nFOLLOW ME ON...,2022-12-31T17:00:38Z,Kai Cenat Live,"Kai Cenat, Kai Cenat Live, Kai Cenat Livestrea...",24,Entertainment,en-US,from 1 to 10 like how how like from 1 through ...,"{'title': 'clips that made Kai Cenat famous', ...",clip made kai cenat famous clip made kai cenat...,0,True,True,False,True,False,False


In [207]:
from better_profanity import profanity
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textstat import textstat

# Initialize better-profanity
profanity.load_censor_words()

# Function to check profanity and return unique words
def check_profanity(text):
    profanity_words = list(set([word for word in text.split() if profanity.contains_profanity(word)]))
    return profanity_words if profanity_words else None

# Function to check themes and return unique identified themes
def check_themes(text):
    sensitive_keywords = [
        "violence", "drugs", "sex", "explicit", "alcohol", "bomb", "gun", "fuck", 
        "abuse", "die", "kill", "unalive", "poison", "damn", "ghost", "scary", 
        "horror", "blood", "dead", "intimacy", "suggestive", "couple", "controversial", 
        "hot", "injuries", "romantic", "vampire", "shit", "sexy", "battle", "girlfriend", 
        "boyfriend", "scene kiss", "intercourse", "creepy", "naked", 
        "murder", "assault", "war", "massacre", "brutality", "stabbing", "shooting", 
        "gore", "torture", "hostility", "crime", "cocaine", "heroin", "nicotine", 
        "vaping", "addiction", "overdose", "substance abuse", "pills", "intoxicated", 
        "nudity", "erotic", "pornography", "fetish", "sensual", "lust", "strip", 
        "adult", "escort", "provocative", "cleavage", "lingerie", "scream", 
        "haunted", "supernatural", "zombie", "demon", "devil", "satanic", "cursed", 
        "exorcism", "nightmare", "fear", "terror", "trauma", "bullying", "harassment", 
        "suicide", "self-harm", "torture", "exploitation", "manipulation", 
        "oppression", "bastard", "hell", "slut", "whore", "damnation", "crap", 
        "douche", "asshole", "piss", "racism", "sexism", "discrimination", 
        "genocide", "terrorism", "dictatorship", "oppression", "xenophobia", 
        "cult", "stunt", "choking", "prank", "fire", "explosion", "theft", 
        "crime", "arrest", "kidnapping", "hostage", "escape", "affair", "adultery", 
        "infidelity", "lustful", "intimate", "sensual", "crush", "love triangle", 
        "gambling", "casino", "money laundering", "trafficking", "cheating", 
        "virus", "hacking", "dark web"
    ]
    text_lower = text.lower()
    identified_themes = list(set([keyword for keyword in sensitive_keywords if keyword in text_lower]))
    return identified_themes if identified_themes else None

# Function to check sentiment tone
def check_tone(text):
    analyzer = SentimentIntensityAnalyzer()
    sentiment_score = analyzer.polarity_scores(text)["compound"]
    return sentiment_score

# Function to check cultural sensitivity
def check_cultural_sensitivity(text):
    offensive_phrases = ["racist", "stereotype", "offensive"]
    text_lower = text.lower()
    identified_offensive_phrases = list(set([phrase for phrase in offensive_phrases if phrase in text_lower]))
    return bool(identified_offensive_phrases), identified_offensive_phrases

# Function to apply analysis to each row, considering all fields and specific keyword checks
def apply_analysis(df):
    # Create a dictionary from text fields for analysis
    df["profanity_words"], df["themes_identified"], df["tone_score"], df["contains_cultural_sensitivity"], df["cultural_sensitivity_identified"] = zip(
        *df["combined_text"].apply(lambda x: (
            check_profanity(x), 
            check_themes(x), 
            check_tone(x), 
            *check_cultural_sensitivity(x)
        ))
    )
    return df

# Apply the analysis functions to the DataFrame
filtered_df = apply_analysis(filtered_df)


In [208]:
filtered_df

Unnamed: 0,video_id,title,description,published_at,channel_title,tags,category_id,category_label,default_audio_language,transcript,...,themes,readability,tone,cultural_sensitivity,kids_safe_content,profanity_words,themes_identified,tone_score,contains_cultural_sensitivity,cultural_sensitivity_identified
2,ZQbWWOzvyfo,Verizon,Switch to Fios Home Internet w/ no hidden fees...,2024-07-03T02:03:05Z,Verizon,,28,Science & Technology,en-US,[Music] my internet my rules introducing my ho...,...,False,False,False,False,False,,,-0.2023,False,[]
3,u4gEBRSKi2E,Is Being Fat A Choice? Fit Women vs Fat Women ...,Got injured in an accident? You could be one ...,2024-10-16T16:01:03Z,Jubilee,"jubilee, jubilee media, jubilee project, middl...",24,Entertainment,en,if you see me you consider me obese I wouldn't...,...,True,False,True,False,False,"[suck, crap, hell, fat, god]","[boyfriend, adult, kill, sex, lingerie, war, c...",1.0000,False,[]
7,T5niXqwAnME,Big Muscle Guy Scars RELEASED #health #muscles...,Support the channel with a membership! \nMembe...,2024-06-14T13:30:15Z,Mondragon Chiropractic,"asmr, relax, satisfying",27,Education,en,I don't have to get rid of the muscle just the...,...,True,False,True,False,False,,[war],0.9136,False,[]
9,mHZRFs-faX8,Cooking Curry During College Lecture!,Love you guys thank you so much for the suppor...,2024-10-13T02:00:02Z,Fique,"D'Aydrian Harding, Fred Beyer, Ash Alk, Zople,...",24,Entertainment,,[Music] so last video I asked you guys to get ...,...,True,False,True,False,False,"[hell, god, dong]","[escort, hell, fire, hot]",0.9998,False,[]
10,yeSbUxW2M-4,clips that made Kai Cenat famous,clips that made Kai Cenat famous\nFOLLOW ME ON...,2022-12-31T17:00:38Z,Kai Cenat Live,"Kai Cenat, Kai Cenat Live, Kai Cenat Livestrea...",24,Entertainment,en-US,from 1 to 10 like how how like from 1 through ...,...,True,False,True,False,False,"[hell, god, damn]","[hell, fire, damn, hot, gun]",0.9990,False,[]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2013,qEP2Uujafr4,Beetlejuice: The Forgotten Animated Series #sh...,#Beetlejuice #JennaOrtega #MichaelKeaton #timb...,2024-09-04T08:15:05Z,Barry_Is_H3r3,,24,Entertainment,en-US,Transcript not available,...,True,False,False,False,False,[hell],"[scream, kill, nightmare, demon, haunted, hell...",-0.9792,False,[]
2022,_jo6CEr-LKQ,Pre-Ranking Marvel Animation TV Shows Based On...,"In this video, we pre-rank the upcoming Marvel...",2024-06-30T13:42:54Z,DoomBlazer,,24,Entertainment,en-US,Transcript not available,...,True,False,True,False,False,,[zombie],0.9559,False,[]
2026,-WsZ2fUXbZg,The Night (Fan Animated),Watch the rest of the series: https://www.yout...,2016-07-15T15:19:09Z,Daria Cohen,"voltaire vampires the night twilight goth, hal...",1,Film & Animation,en,Transcript not available,...,True,False,True,False,False,,"[vampire, lust]",0.9001,False,[]
2036,T_evwI929vU,The New Norm: AWFUL Anti-Woke Cartoon,Twitter just unveiled their brand new â€œSouth P...,2024-06-28T01:54:26Z,LS Mark,,24,Entertainment,en-GB,Transcript not available,...,False,False,False,False,False,,,-0.5719,False,[]


In [209]:
filtered_df.columns

Index(['video_id', 'title', 'description', 'published_at', 'channel_title',
       'tags', 'category_id', 'category_label', 'default_audio_language',
       'transcript', 'text_dict', 'combined_text', 'kids_safe_BERT',
       'profanity', 'themes', 'readability', 'tone', 'cultural_sensitivity',
       'kids_safe_content', 'profanity_words', 'themes_identified',
       'tone_score', 'contains_cultural_sensitivity',
       'cultural_sensitivity_identified'],
      dtype='object')

In [210]:
filtered_df = filtered_df.drop(columns=["description", "tags", "default_audio_language", "transcript", "text_dict", "combined_text"])

print(filtered_df.head())  # View the modified DataFrame

       video_id                                              title  \
2   ZQbWWOzvyfo                                            Verizon   
3   u4gEBRSKi2E  Is Being Fat A Choice? Fit Women vs Fat Women ...   
7   T5niXqwAnME  Big Muscle Guy Scars RELEASED #health #muscles...   
9   mHZRFs-faX8              Cooking Curry During College Lecture!   
10  yeSbUxW2M-4                   clips that made Kai Cenat famous   

            published_at           channel_title  category_id  \
2   2024-07-03T02:03:05Z                 Verizon           28   
3   2024-10-16T16:01:03Z                 Jubilee           24   
7   2024-06-14T13:30:15Z  Mondragon Chiropractic           27   
9   2024-10-13T02:00:02Z                  Fique            24   
10  2022-12-31T17:00:38Z          Kai Cenat Live           24   

          category_label  kids_safe_BERT  profanity  themes  readability  \
2   Science & Technology               0      False   False        False   
3          Entertainment            

In [211]:
import datetime

# Get the current datetime and format it
current_time = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

# Save the DataFrame to an Excel file
excel_file_name = f"../data/raw/video_information_with_transcripts_network{current_time}.xlsx"
filtered_df.to_excel(excel_file_name, index=False)

# Save the DataFrame to a JSON file
json_file_name = f"../data/raw/video_information_with_transcripts_network{current_time}.json"
filtered_df.to_json(json_file_name, orient="records", lines=True)

print(f"Analysis complete. Results saved to '{excel_file_name}' and '{json_file_name}'.")


Analysis complete. Results saved to '../data/raw/video_information_with_transcripts_network2024-12-13_19-39-57.xlsx' and '../data/raw/video_information_with_transcripts_network2024-12-13_19-39-57.json'.
