---
title: "Data Cleaning"
format:
    html: 
        code-fold: false
---

{{< include overview.qmd >}} 

<!-- After digesting the instructions, you can delete this cell, these are assignment instructions and do not need to be included in your final submission.  -->
{{< include methods.qmd >}} 

# Code 

In [1]:
import pandas as pd
import re
from langdetect import detect
import nltk
from nltk.corpus import stopwords
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import numpy as np
import re
from langdetect import detect
from nltk.corpus import stopwords


# Download the NLTK 'stopwords' corpus that stopwords.words('english') reads later on.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/shenyuxi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Load the raw CSV; its first column becomes the DataFrame index.
data = pd.read_csv("../../data/raw-data/youtube_data_raw.csv", index_col=0)

In [3]:

import pandas as pd
import numpy as np
import re
from langdetect import detect
from sklearn.preprocessing import MinMaxScaler
from nltk.corpus import stopwords

# Three-way popularity label derived from a view count
def classify_popularity(view_count):
    """Map a view count to "low", "medium" or "high".

    Counts below 1,000,000 are "low"; counts from 1,000,000 up to (but not
    including) 6,000,000 are "medium"; everything else is "high". A NaN
    count fails both comparisons and therefore also lands in "high".
    """
    if view_count < 1_000_000:
        return "low"
    if view_count < 6_000_000:
        return "medium"
    return "high"

# Two-way popularity label derived from a view count
def classify_binary_popularity(view_count):
    """Return "high" for counts of at least 6,000,000, otherwise "low".

    A NaN count fails the comparison and is therefore labelled "low".
    """
    if view_count >= 6_000_000:
        return "high"
    return "low"

# Fill missing engagement counts with 0 *before* the blanket string fill below.
# Running the string fill first would turn every NaN into the text "nan",
# leaving nothing for fillna(0) to replace and putting strings into the
# count columns.
for count_column in ("likeCount", "dislikeCount", "commentCount"):
    if count_column in data.columns:
        data[count_column] = data[count_column].fillna(0)

# Every remaining missing value becomes the placeholder string "nan"
data = data.fillna("nan")
# Strip punctuation characters from a single text value
def remove_punctuation(text):
    """Return `text` as a string with punctuation removed.

    Values that `pd.isna` reports as missing are handed back untouched.
    Otherwise the value is converted with `str` and every character that is
    neither a word character nor whitespace is dropped (underscores count as
    word characters and are kept).
    """
    if not pd.isna(text):
        return re.sub(r'[^\w\s]', '', str(text))
    return text

# Stop-word sets that have already been built, keyed by language, so that
# stopwords.words(...) is called once per language instead of once per row.
_STOP_WORDS_CACHE = {}


def _get_stop_words(language):
    """Return the NLTK stop words for `language` as a set, building it at most once."""
    if language not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE[language] = set(stopwords.words(language))
    return _STOP_WORDS_CACHE[language]


# Function to clean and remove unnecessary symbols from tags
def remove_tags(text):
    """Normalise a tags value into a space-separated string of keywords.

    Missing values (per `pd.isna`) yield the placeholder string "nan".
    Otherwise the value is converted with `str`, every character that is
    neither a word character nor whitespace is removed, the result is
    lower-cased and split on whitespace, English stop words are dropped, and
    the remaining words are joined with single spaces.
    """
    if pd.isna(text):
        return "nan"

    stop_words = _get_stop_words('english')

    # Punctuation is stripped before tokenising, so words are compared with
    # the stop-word list in their punctuation-free, lower-case form.
    cleaned = re.sub(r"[^\w\s]", "", str(text))
    kept_words = [word for word in cleaned.lower().split() if word not in stop_words]
    return ' '.join(kept_words)

# Pull the topic names out of a 'topicCategories' value
def extract_topics(topic_str):
    """Return the space-separated topic names found in `topic_str`.

    A topic name is whatever follows "wiki/" up to the next single quote.
    Missing values (per `pd.isna`) yield the placeholder "nan"; a string
    without any "wiki/" segment yields an empty string.
    """
    if pd.isna(topic_str):
        return "nan"
    found = re.finditer(r"wiki/([^\']+)", topic_str)
    return " ".join(hit.group(1) for hit in found)

# Function to check if a text is in English
def is_english(text):
    """Return True when `detect` labels `text` as "en", otherwise False.

    Any ordinary error raised by `detect` is treated as "not English" and
    yields False.

    NOTE(review): the langdetect README describes detection as
    non-deterministic unless `DetectorFactory.seed` is set — consider seeding
    it if the filtered row count must be reproducible. Confirm against the
    installed version.
    """
    try:
        return detect(text) == "en"
    except Exception:
        # Deliberately not a bare `except`: KeyboardInterrupt and SystemExit
        # must still propagate so a long-running .apply() can be interrupted.
        return False

# Function to convert duration to seconds
def convert_to_seconds(duration):
    """Convert an ISO 8601 style duration string into a number of seconds.

    Accepts the time-only form ("PT1H2M3S", "PT45S") as well as durations
    that carry a day component ("P1DT2H3M4S"). Every component is optional;
    absent components count as 0.

    Parameters
    ----------
    duration : str
        Duration text such as "PT4M13S". Only the start of the string has to
        match; trailing text is ignored.

    Returns
    -------
    int
        Total seconds. 0 is returned when `duration` is not a string or does
        not start with a duration pattern (for example the placeholder "nan").
    """
    # Placeholders or missing values may arrive as non-strings; treat them
    # like any other unparseable duration instead of raising inside re.match.
    if not isinstance(duration, str):
        return 0

    match = re.match(
        r"P(?:(\d+)D)?(?:T(?:(\d+)H)?(?:(\d+)M)?(?:(\d+)S)?)?",
        duration,
    )
    if not match:
        return 0

    # Unmatched optional groups come back as None and count as 0.
    days, hours, minutes, seconds = (
        int(part) if part else 0 for part in match.groups()
    )
    return ((days * 24 + hours) * 60 + minutes) * 60 + seconds

# Min-max scale a sequence of counts
def normalize_view_count(view_counts):
    """Rescale `view_counts` with a freshly fitted MinMaxScaler.

    The values are arranged as a single column, fitted and transformed in one
    step, and handed back as a flat one-dimensional array.
    """
    column = np.array(view_counts).reshape(-1, 1)  # one feature column for the scaler
    scaled = MinMaxScaler().fit_transform(column)
    return scaled.flatten()

# Derive both popularity labels from 'viewCount' while it still holds the unscaled counts
if 'viewCount' in data.columns:
    raw_view_counts = data["viewCount"]
    data["popularity_multi_class"] = raw_view_counts.apply(classify_popularity)
    data["popularity"] = raw_view_counts.apply(classify_binary_popularity)

# Iterate through the columns and apply the respective cleaning functions.
# The column list is snapshotted up front because `data` is rebound inside
# the loop when non-English rows are filtered out.
for column in list(data.columns):
    if column == 'topicCategories':
        # Reduce the raw category value to its topic names
        data[column] = data[column].apply(extract_topics)
    elif column == 'title':
        # Keep only rows with English titles, then remove punctuation.
        # The explicit .copy() makes the filtered frame independent of the
        # original, so the column assignments that follow do not trigger
        # SettingWithCopyWarning.
        english_rows = data[column].apply(is_english)
        data = data.loc[english_rows].copy()
        data[column] = data[column].apply(remove_punctuation)
    elif column == 'duration':
        # Replace the duration text with a number of seconds
        data[column] = data[column].apply(convert_to_seconds)
    elif column == 'tags':
        # Clean tags and remove unwanted symbols
        data[column] = data[column].apply(remove_tags)

# Min-max scale each of the count columns that is present in the data
for count_column in ('viewCount', 'likeCount', 'commentCount'):
    if count_column in data.columns:
        data[count_column] = normalize_view_count(data[count_column].astype(float))

# Keep a single row per 'video_id'; the first occurrence is the one retained
data = data.drop_duplicates(subset="video_id", keep="first")

# Destination of the cleaned dataset (a relative path, so it is resolved
# against the working directory the notebook runs in)
output_path = '../../data/processed-data/cleaned_data.csv'  # Relative path from current script
data.to_csv(output_path, index=False)  # Write the cleaned rows without the DataFrame index
print(f"Cleaned data saved to {output_path}")




Cleaned data saved to ../../data/processed-data/cleaned_data.csv


In [4]:
# Number of rows left in the cleaned DataFrame
len(data)

2202

{{< include closing.qmd >}} 