# Text & Image & Video Feature Extraction

In [1]:
import os
import numpy as np
import pandas as pd

In [2]:
# Enter the data directory only when we are not already inside it, so the cell
# can be re-run safely (a second bare chdir would look for 'data/data' and fail).
if os.path.basename(os.getcwd()) != 'data':
    os.chdir('data')
os.getcwd()

'/root/preprocessing/data'

## Load text data

In [3]:
# Column names used by the text pipeline.
i_id = 'itemID'
item_str = 'item'

# Location of the metadata CSV, relative to the working directory.
file_path = './'
file_name = 'meta_MicroLens.csv'
meta_file = os.path.join(file_path, file_name)

# Read the metadata and order the rows by itemID.
df = pd.read_csv(meta_file).sort_values(by=[i_id])

print('data loaded!')
print(f'shape: {df.shape}')

df.head(3)


data loaded!
shape: (19738, 4)


Unnamed: 0,itemID,item,title,tags
9579,0,9580,the best place to destroy the most wonderful ...,Game
14630,1,14631,# King Glory national Service Dharma being sp...,Game
7314,2,7315,# King glory country serve Damo old table pla...,Game


In [4]:

# Rows whose title is missing; the first entry of the printed shape is their count.
title_na_df = df.loc[df['title'].isna()]
print(title_na_df.shape)

# Same check for the tags column (stored as desc_na_df).
desc_na_df = df.loc[df['tags'].isna()]
print(desc_na_df.shape)


(0, 4)
(0, 4)


In [5]:
# Replace missing title / tags values with a single space.
for text_col in ('title', 'tags'):
    df[text_col] = df[text_col].fillna(" ")

In [6]:
# Build one sentence per item from BOTH its title and its tags, in the form
# "<title> <tags> ", with newlines replaced by spaces. The string operations are
# vectorised, so no per-row Python loop is needed.
sentences = (
    (df['title'] + ' ' + df['tags'] + ' ')
    .str.replace('\n', ' ', regex=False)
    .tolist()
)

sentences[:10]

[' the best place to destroy the most wonderful team with the fastest speed Game ',
 ' # King Glory national Service Dharma being sprayed vegetable is really like this Game ',
 ' # King glory country serve Damo old table play Lv Bu Game ',
 ' MC sees creatures that replicate? Raise an army of iron puppets for the village! Real Village Heroes # QuickHands My World # Myworld #MC  Game ',
 " # King of Glory served Dharma hit the peak of ridicule. I'm back Game ",
 ' this is a 2100 division robbery bureau Game ',
 " campus weird talk: roommate Ajie's prank! # thrillsuspenseful # schoolyard Weirtalk Daily Sharing ",
 '  Daily Sharing ',
 ' made a big box of star cedars ❄️ handmade soap ~ The New Year also want to be like a small pine 🌲 hard and lucky life ✨ #Vlog daily # handmade soap cold soap Daily Sharing ',
 ' too hard # assist Game ']

In [7]:
# Features are written in this (sorted) row order, so the ids must be exactly
# 0..n-1. Comparing the whole sequence also catches gaps and duplicates, which a
# check on the last id alone would let through.
course_list = df[i_id].tolist()
assert course_list == list(range(len(course_list))), \
    'itemID values must be exactly 0..n-1 after sorting'

In [8]:

from sentence_transformers import SentenceTransformer

# Sentence encoder used for the text modality.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode all sentences in one call.
sentence_embeddings = model.encode(sentences)
print('text encoded!')

# There must be exactly one embedding row per metadata row before saving.
n_embedded, n_items = sentence_embeddings.shape[0], df.shape[0]
assert n_embedded == n_items
text_feat_path = os.path.join(file_path, 'text_feat.npy')
np.save(text_feat_path, sentence_embeddings)
print('done!')


  return self.fget.__get__(instance, owner)()


text encoded!
done!


In [9]:
# Dimensions of the embedding matrix (rows x embedding size).
sentence_embeddings.shape

(19738, 384)

In [10]:
# Reload the saved text features from the same path they were written to, as a
# round-trip check. The file holds a plain numeric array, so pickle support is
# left at np.load's safe default (disabled).
load_txt_feat = np.load(os.path.join(file_path, 'text_feat.npy'))
print(load_txt_feat.shape)
load_txt_feat[:10]

(19738, 384)


array([[ 3.4294955e-02,  8.4141500e-02, -4.6296481e-02, ...,
         5.9475612e-02,  5.2324805e-02,  1.5298225e-02],
       [-4.8817385e-02,  2.5659388e-02, -4.2193025e-02, ...,
        -8.7688394e-02,  1.8543907e-03, -6.9620837e-05],
       [-1.1217424e-05,  3.7120316e-02, -4.7777954e-02, ...,
        -2.2200752e-02, -3.2746003e-03, -5.9879858e-02],
       ...,
       [-5.6936350e-02,  2.4288543e-02,  6.9684954e-03, ...,
         3.8671199e-02, -1.2934662e-01,  5.9208952e-02],
       [-7.6627605e-02,  4.4511382e-02,  6.9404043e-02, ...,
         9.6225264e-03, -1.0243885e-01,  8.3619624e-02],
       [-2.0027051e-02, -4.3830775e-02,  9.6338410e-03, ...,
         4.6530124e-02,  7.5272918e-03, -4.7684433e-03]], dtype=float32)

# Image encoder (ViT)

In [11]:
from transformers import ViTFeatureExtractor, ViTModel
import numpy as np
from PIL import Image
import pandas as pd
import torch
# Read the metadata for the image pipeline, ordered by itemID.
df = pd.read_csv('meta_MicroLens.csv').sort_values(by='itemID')
print(df.head())

       itemID   item                                              title  tags
9579        0   9580   the best place to destroy the most wonderful ...  Game
14630       1  14631   # King Glory national Service Dharma being sp...  Game
7314        2   7315   # King glory country serve Damo old table pla...  Game
1954        3   1955   MC sees creatures that replicate? Raise an ar...  Game
9942        4   9943   # King of Glory served Dharma hit the peak of...  Game


In [13]:

# Use CUDA when PyTorch can see a GPU; otherwise fall back to the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda") if use_cuda else torch.device("cpu")
print(f"Using device: {device}")

# Load the ViT preprocessor and backbone from the same checkpoint; only the
# model is moved to the selected device.
vit_checkpoint = 'vit-base-patch16-224'
feature_extractor = ViTFeatureExtractor.from_pretrained(vit_checkpoint)
model = ViTModel.from_pretrained(vit_checkpoint).to(device)

def extract_features(image_path, feature_extractor, model, device):
    """Return the first-token embedding of the model's last layer for one image.

    Args:
        image_path: Path of the image file to open.
        feature_extractor: Callable invoked as
            ``feature_extractor(images=..., return_tensors="pt")`` that returns a
            mapping of tensors.
        model: Callable invoked with those tensors as keyword arguments; its
            result must expose ``last_hidden_state``.
        device: Device the input tensors are moved to before inference.

    Returns:
        numpy array holding ``last_hidden_state[:, 0, :]`` with singleton
        dimensions squeezed out.
    """
    # Open inside a context manager so the file handle is always released, and
    # force 3-channel RGB so grayscale / RGBA / CMYK files are fed to the
    # preprocessor in the same layout as ordinary colour images.
    with Image.open(image_path) as img:
        image = img.convert('RGB')
    inputs = feature_extractor(images=image, return_tensors="pt")
    inputs = {k: v.to(device) for k, v in inputs.items()}  # Move inputs to the device

    # Inference only: no gradients are needed.
    with torch.no_grad():
        outputs = model(**inputs)
    # Keep the token at index 0 and bring it back to the CPU as a numpy array.
    features = outputs.last_hidden_state[:, 0, :].cpu().detach().numpy()
    return features.squeeze()

# Preset the image directory path
images_dir = '/root/autodl-tmp/MicroLens-100k_covers'

# Extract features. Exactly one vector is appended per metadata row, in row
# order, so row k of the saved array always belongs to the k-th row of df.
features_list = []
failed_items = []
# Zero vector used for images that cannot be processed; its length is the
# model's hidden size, i.e. the length of a real feature vector.
placeholder = np.zeros(model.config.hidden_size, dtype=np.float32)
for _, row in df.iterrows():
    item = row['item']
    image_path = f'{images_dir}/{item}.jpg'
    print(f"Processing item {item}.")
    try:
        features = extract_features(image_path, feature_extractor, model, device)
    except Exception as e:
        print(f"Error processing image {image_path}: {e}")
        # Keep the row (zero-filled) instead of skipping it; skipping would shift
        # every later item into the wrong row of the feature matrix.
        features = placeholder
        failed_items.append(item)
    features_list.append(features)

# Convert to numpy array and save
features_array = np.array(features_list)
assert features_array.shape[0] == df.shape[0]
np.save('image_feat.npy', features_array)

if failed_items:
    print(f"{len(failed_items)} image(s) could not be processed and were zero-filled: {failed_items}")
print('Image features extracted and saved!')



Using device: cuda


Some weights of ViTModel were not initialized from the model checkpoint at vit-base-patch16-224 and are newly initialized: ['vit.pooler.dense.bias', 'vit.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Processing item 9580.
Processing item 14631.
Processing item 7315.
Processing item 1955.


Processing item 9943.
Processing item 12077.
Processing item 8411.
Processing item 16821.
Processing item 13832.
Processing item 7771.
Processing item 9973.
Processing item 6763.
Processing item 2550.
Processing item 5266.
Processing item 8316.
Processing item 11733.
Processing item 16425.
Processing item 4253.
Processing item 1148.
Processing item 10589.
Processing item 621.
Processing item 11802.
Processing item 14283.
Processing item 18491.
Processing item 14394.
Processing item 5865.
Processing item 18490.
Processing item 3129.
Processing item 14044.
Processing item 11246.
Processing item 5535.
Processing item 18257.
Processing item 11698.
Processing item 15194.
Processing item 19120.
Processing item 9949.
Processing item 18584.
Processing item 12164.
Processing item 10509.
Processing item 14958.
Processing item 11527.
Processing item 12104.
Processing item 12109.
Processing item 1099.
Processing item 7454.
Processing item 5614.
Processing item 12735.
Processing item 4148.
Processi

In [14]:
# Dimensions of the stacked image features (one row per appended feature vector).
features_array.shape

(19738, 768)

## Video Encoder (Using the original video data)
## "num_frames": 16

In [15]:
import cv2
import numpy as np
import pandas as pd
import torch
import os
import gc  
from video_swin_transformer import SwinTransformer3D
from collections import OrderedDict

In [16]:


# Set device for PyTorch operations (GPU when available, CPU otherwise)
gpu_available = torch.cuda.is_available()
device = torch.device("cuda") if gpu_available else torch.device("cpu")
print(f"Using device: {device}")

# Load the metadata table, ordered by itemID
df = pd.read_csv('meta-simple.csv').sort_values(by='itemID')

# Make sure the directory for the per-item feature files exists
features_dir = 'video_features'
os.makedirs(features_dir, exist_ok=True)

# Paths of the per-item feature files written during extraction
temp_features_list = list()

def load_and_preprocess_video(video_path, desired_num_frames=16):
    """Read the leading frames of a video and return them as a normalized clip.

    Frames are read sequentially from the start of the file, converted from BGR
    to RGB, normalized per channel and resized to 224x224.

    Args:
        video_path: Path of the video file handed to ``cv2.VideoCapture``.
        desired_num_frames: Number of frames to read. If the video yields fewer
            frames, the last decoded frame is repeated so that every clip has
            the same length.

    Returns:
        Array of shape (1, num_frames, 3, 224, 224).

    Raises:
        ValueError: If not a single frame can be read from ``video_path``.
    """
    # Per-channel RGB mean / std; loop-invariant, so built once.
    mean = np.array([123.675, 116.28, 103.53])
    std = np.array([58.395, 57.12, 57.375])

    cap = cv2.VideoCapture(video_path)
    frames = []
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frames.append((frame - mean) / std)
            if len(frames) == desired_num_frames:
                break
    finally:
        # Always release the capture handle, even if decoding raised.
        cap.release()

    if not frames:
        raise ValueError(f"No frames could be read from video: {video_path}")

    # Pad short videos by repeating the last frame; a shorter clip would produce
    # a differently shaped feature and break stacking across items.
    frames.extend([frames[-1]] * (desired_num_frames - len(frames)))

    target_size = (224, 224)
    resized_frames = [cv2.resize(frame, target_size) for frame in frames]
    batch = np.stack(resized_frames)        # (T, H, W, C)
    batch = batch.transpose((0, 3, 1, 2))   # (T, C, H, W)
    batch = np.expand_dims(batch, axis=0)   # (1, T, C, H, W)
    return batch

# Initialize video feature extractor
extract_video = SwinTransformer3D(
    embed_dim=128,
    depths=[2, 2, 18, 2],
    num_heads=[4, 8, 16, 32],
    patch_size=(2, 4, 4),
    window_size=(8, 7, 7),
    drop_path_rate=0.0,
    patch_norm=True
)

# Load the checkpoint onto the CPU first so it can be read on machines without
# a GPU as well; the model is moved to the target device afterwards.
checkpoint = torch.load('./swin_base_patch244_window877_kinetics600_22k.pth', map_location='cpu')

# Keep only the backbone weights and strip the 'backbone.' prefix so the keys
# match the bare SwinTransformer3D module. Matching on the prefix (rather than
# on the substring anywhere in the key) keeps the slice offset correct.
backbone_prefix = 'backbone.'
new_state_dict = OrderedDict()
for k, v in checkpoint['state_dict'].items():
    if k.startswith(backbone_prefix):
        new_state_dict[k[len(backbone_prefix):]] = v
extract_video.load_state_dict(new_state_dict)
extract_video = extract_video.to(device)
# The network is used for inference only.
extract_video.eval()


def extract_video_features(video_path, desired_num_frames):
    """Extract spatially pooled backbone features for one video.

    Args:
        video_path: Path of the video, forwarded to ``load_and_preprocess_video``.
        desired_num_frames: Number of frames requested from the loader.

    Returns:
        numpy array of shape (batch, temporal positions, channels): the backbone
        output averaged over its last two (spatial) axes.
    """
    batch = load_and_preprocess_video(video_path, desired_num_frames)
    inputs = torch.as_tensor(batch).float().to(device)
    with torch.no_grad():
        # The loader returns (B, T, C, H, W); the backbone is fed (B, C, T, H, W).
        features = extract_video(inputs.permute(0, 2, 1, 3, 4))
        # Average over the two spatial axes. This equals adaptive average pooling
        # to 1x1 followed by squeezing, but is written as an explicit mean because
        # adaptive_avg_pool2d is only documented for 3-D / 4-D inputs.
        features = features.mean(dim=(3, 4)).permute(0, 2, 1)
    print(features.shape)
    return features.cpu().numpy()

videos_dir = '/root/autodl-tmp/simple/downloaded_videos'

# Process videos and extract features.
# row_feature_paths has exactly one entry per metadata row (None where the
# extraction failed), so the stacked array stays aligned with df's row order.
row_feature_paths = []
failed_items = []
for _, row in df.iterrows():
    item = row['item']
    video_path = os.path.join(videos_dir, f'{item}.mp4')
    temp_features_path = os.path.join(features_dir, f'temp_{item}.npy')
    print(f"Processing item {item}.")
    try:
        features = extract_video_features(video_path, desired_num_frames=16)
        np.save(temp_features_path, features.squeeze())
        temp_features_list.append(temp_features_path)
        row_feature_paths.append(temp_features_path)
    except Exception as e:
        print(f"Error processing video {video_path}: {e}")
        row_feature_paths.append(None)
        failed_items.append(item)
    # Free host and GPU memory between videos.
    gc.collect()
    if device.type == 'cuda':
        torch.cuda.empty_cache()

# Load and stack all features. Only files written during THIS run are read back;
# items that failed get a zero array of the same shape instead of raising on a
# missing file or silently picking up a stale file from an earlier run.
extracted_paths = [path for path in row_feature_paths if path is not None]
if not extracted_paths:
    raise RuntimeError('No video features were extracted; nothing to stack.')
placeholder = np.zeros_like(np.load(extracted_paths[0]))
all_features = [
    np.load(path) if path is not None else placeholder
    for path in row_feature_paths
]

all_features_array = np.stack(all_features)
np.save('oral_video_feat.npy', all_features_array)

if failed_items:
    print(f"{len(failed_items)} video(s) could not be processed and were zero-filled: {failed_items}")
print('All video features extracted and saved into a single file!')


In [17]:

# Peek at the first entry of the stacked video features.
all_features_array[:1]

In [18]:
# Dimensions of the stacked video features.
all_features_array.shape

In [19]:
# Input: stacked per-item features; output: the same features averaged over axis 1.
npy_file_path = 'oral_video_feat.npy'  # Change this to your file's path
modified_npy_file_path = 'video_feat.npy'  # Change this to your desired output file's path

npy_array = np.load(npy_file_path)

# Collapse the second dimension by taking its mean.
npy_array_modified = np.mean(npy_array, axis=1)

# Persist the averaged features.
np.save(modified_npy_file_path, npy_array_modified)

# Report the resulting dimensions.
print(f"Modified array shape: {npy_array_modified.shape}")

## Video Encoder (Using video slicing data)
## "num_frames": 5

In [21]:
import torch
import numpy as np
import pandas as pd
import os
import cv2
import gc
from video_swin_transformer import SwinTransformer3D

from collections import OrderedDict


In [22]:
# Running counter of processed items
count = 0

# Set device for PyTorch operations (GPU when available, CPU otherwise)
gpu_available = torch.cuda.is_available()
device = torch.device("cuda") if gpu_available else torch.device("cpu")
print(f"Using device: {device}")

# Load the metadata table, ordered by itemID
df = pd.read_csv('meta_MicroLens.csv').sort_values(by='itemID')

# Make sure the directory for the per-item feature files exists
features_dir = 'video_features'
os.makedirs(features_dir, exist_ok=True)

# Paths of the per-item feature files written during extraction
temp_features_list = list()

def load_and_preprocess_images(image_paths, target_size=(224, 224)):
    """Load frame images and return them as one normalized clip.

    Each image is read with OpenCV, converted from BGR to RGB, normalized per
    channel and resized to ``target_size``.

    Args:
        image_paths: Iterable of image file paths, in frame order.
        target_size: Size passed to ``cv2.resize``.

    Returns:
        Array of shape (1, number of images, channels, height, width).

    Raises:
        OSError: If an image is missing or cannot be decoded.
    """
    # Per-channel RGB mean / std; loop-invariant, so built once.
    mean = np.array([123.675, 116.28, 103.53])
    std = np.array([58.395, 57.12, 57.375])

    frames = []
    for path in image_paths:
        frame = cv2.imread(path)
        if frame is None:
            # cv2.imread reports a missing or undecodable file by returning None
            # rather than raising; fail here with a message naming the file.
            raise OSError(f"Image is missing or unreadable: {path}")
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        frame = (frame - mean) / std
        frame = cv2.resize(frame, target_size)
        frames.append(frame)
    batch = np.stack(frames)                # (T, H, W, C)
    batch = batch.transpose((0, 3, 1, 2))   # (T, C, H, W)
    batch = np.expand_dims(batch, axis=0)   # (1, T, C, H, W)
    return batch

# Initialize video feature extractor
extract_video = SwinTransformer3D(
    embed_dim=128,
    depths=[2, 2, 18, 2],
    num_heads=[4, 8, 16, 32],
    patch_size=(2, 4, 4),
    window_size=(8, 7, 7),
    drop_path_rate=0.0,
    patch_norm=True
)

# Load the checkpoint onto the CPU first so it can be read on machines without
# a GPU as well; the model is moved to the target device afterwards.
checkpoint = torch.load('./swin_base_patch244_window877_kinetics600_22k.pth', map_location='cpu')

# Keep only the backbone weights and strip the 'backbone.' prefix so the keys
# match the bare SwinTransformer3D module. Matching on the prefix (rather than
# on the substring anywhere in the key) keeps the slice offset correct.
backbone_prefix = 'backbone.'
new_state_dict = OrderedDict()
for k, v in checkpoint['state_dict'].items():
    if k.startswith(backbone_prefix):
        new_state_dict[k[len(backbone_prefix):]] = v
extract_video.load_state_dict(new_state_dict)
extract_video = extract_video.to(device)
# The network is used for inference only.
extract_video.eval()

def extract_features_from_images(image_folder, item, desired_num_frames=5):
    """Extract spatially pooled backbone features from an item's frame images.

    The frames are expected at ``<image_folder>/<item>-<i>.jpg`` for
    ``i = 1 .. desired_num_frames``.

    Args:
        image_folder: Directory containing the frame images.
        item: Item identifier used as the file-name prefix.
        desired_num_frames: Number of frame images to load.

    Returns:
        numpy array of shape (batch, temporal positions, channels): the backbone
        output averaged over its last two (spatial) axes.
    """
    image_paths = [
        os.path.join(image_folder, f'{item}-{i}.jpg')
        for i in range(1, desired_num_frames + 1)
    ]
    batch = load_and_preprocess_images(image_paths)
    inputs = torch.as_tensor(batch).float().to(device)
    with torch.no_grad():
        # The loader returns (B, T, C, H, W); the backbone is fed (B, C, T, H, W).
        features = extract_video(inputs.permute(0, 2, 1, 3, 4))
        # Average over the two spatial axes. This equals adaptive average pooling
        # to 1x1 followed by squeezing, but is written as an explicit mean because
        # adaptive_avg_pool2d is only documented for 3-D / 4-D inputs.
        features = features.mean(dim=(3, 4)).permute(0, 2, 1)
    print(features.shape)
    return features.cpu().numpy()

images_dir = '/root/autodl-tmp/MicroLens-100k_frames_interval_1_number_5'

# Process images and extract features.
# row_feature_paths has exactly one entry per metadata row (None where the
# extraction failed), so the stacked array stays aligned with df's row order.
row_feature_paths = []
failed_items = []
for count, (_, row) in enumerate(df.iterrows(), start=1):
    item = row['item']
    temp_features_path = os.path.join(features_dir, f'temp_{item}.npy')
    print(f"Processing item {item}.")
    print(f"number {count}.")
    try:
        features = extract_features_from_images(images_dir, item, desired_num_frames=5)
        np.save(temp_features_path, features.squeeze())
        temp_features_list.append(temp_features_path)
        row_feature_paths.append(temp_features_path)
    except Exception as e:
        print(f"Error processing images for item {item}: {e}")
        row_feature_paths.append(None)
        failed_items.append(item)
    # Free host and GPU memory between items.
    gc.collect()
    if device.type == 'cuda':
        torch.cuda.empty_cache()

# Load and stack all features. Items that failed get a zero array of the same
# shape; dropping them would shift every later item into the wrong row.
extracted_paths = [path for path in row_feature_paths if path is not None]
if not extracted_paths:
    raise RuntimeError('No features were extracted; nothing to stack.')
placeholder = np.zeros_like(np.load(extracted_paths[0]))
all_features = [
    np.load(path) if path is not None else placeholder
    for path in row_feature_paths
]

all_features_array = np.stack(all_features)
assert all_features_array.shape[0] == df.shape[0]
np.save('oral_video_feat.npy', all_features_array)

if failed_items:
    print(f"{len(failed_items)} item(s) could not be processed and were zero-filled: {failed_items}")
print('All features extracted and saved into a single file!')


Using device: cuda


  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]


Processing item 9580.
number 1.
torch.Size([1, 3, 1024])
Processing item 14631.
number 2.
torch.Size([1, 3, 1024])
Processing item 7315.
number 3.
torch.Size([1, 3, 1024])
Processing item 1955.
number 4.
torch.Size([1, 3, 1024])
Processing item 9943.
number 5.
torch.Size([1, 3, 1024])
Processing item 12077.
number 6.
torch.Size([1, 3, 1024])
Processing item 8411.
number 7.
torch.Size([1, 3, 1024])
Processing item 16821.
number 8.
torch.Size([1, 3, 1024])
Processing item 13832.
number 9.
torch.Size([1, 3, 1024])
Processing item 7771.
number 10.
torch.Size([1, 3, 1024])
Processing item 9973.
number 11.
torch.Size([1, 3, 1024])
Processing item 6763.
number 12.
torch.Size([1, 3, 1024])
Processing item 2550.
number 13.
torch.Size([1, 3, 1024])
Processing item 5266.
number 14.
torch.Size([1, 3, 1024])
Processing item 8316.
number 15.
torch.Size([1, 3, 1024])
Processing item 11733.
number 16.
torch.Size([1, 3, 1024])
Processing item 16425.
number 17.
torch.Size([1, 3, 1024])
Processing item 4

In [23]:
# Dimensions of the stacked frame-slice features.
all_features_array.shape

(19738, 3, 1024)

In [24]:
# Input: stacked per-item features; output: the same features averaged over axis 1.
npy_file_path = 'oral_video_feat.npy'  # Change this to your file's path
modified_npy_file_path = 'video_feat.npy'  # Change this to your desired output file's path

npy_array = np.load(npy_file_path)

# Collapse the second dimension by taking its mean.
npy_array_modified = np.mean(npy_array, axis=1)

# Persist the averaged features.
np.save(modified_npy_file_path, npy_array_modified)

# Report the resulting dimensions.
print(f"Modified array shape: {npy_array_modified.shape}")

Modified array shape: (19738, 1024)


In [3]:
import numpy as np

# Load the .npy file
original_data = np.load('oral_video_feat.npy')

# Check the original shape
print("Original shape:", original_data.shape)

# Flatten everything after the first (item) axis: (items, a, b) -> (items, a * b).
# The item count is taken from the array itself instead of being hard-coded, so
# the cell keeps working when the number of items changes.
reshaped_data = original_data.reshape(original_data.shape[0], -1)

# Check the new shape
print("Reshaped shape:", reshaped_data.shape)

# Save the reshaped array to a new .npy file
np.save('oral_video_feat_reshaped.npy', reshaped_data)

print("Reshaped data saved successfully.")


Original shape: (19738, 3, 1024)
Reshaped shape: (19738, 3072)
Reshaped data saved successfully.
