In [22]:
import os
import numpy as np
from keras.preprocessing.image import ImageDataGenerator
from keras.applications.mobilenet_v2 import MobileNetV2, preprocess_input
import cv2
import pandas as pd

# Set the path to your dataset
dataset_path = '/Users/raghavraahul/Downloads/Machine Learning under a Modern Optimization Lens/ML Project'

# Define the image size based on the pre-trained model's requirements
img_size = (224, 224)

# Define the batch size
batch_size = 32

def preprocess_with_denoising(image):
    # Gaussian Blur for noise reduction
    denoised_image = cv2.GaussianBlur(image, (5, 5), 0)
    
    # Original preprocess_input function
    preprocessed_image = preprocess_input(denoised_image)

    return preprocessed_image

In [23]:
# ImageDataGenerator with noise reduction in preprocessing
datagen = ImageDataGenerator(
    preprocessing_function=preprocess_with_denoising,
    rescale=1./255,  # Rescale pixel values to [0, 1]
    rotation_range=20,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
    fill_mode='nearest'
)

# Create a MobileNetV2 model (assuming weights are set to 'imagenet')
base_model = MobileNetV2(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

# Create a data generator for your dataset
data_generator = datagen.flow_from_directory(
    os.path.join(dataset_path, 'trainUpdated'),
    target_size=img_size,
    batch_size=batch_size,
    class_mode='categorical',  # Use 'categorical' for one-hot encoded labels
    shuffle=False  # Set to False if you want to keep the order of predictions
)

Found 5000 images belonging to 2 classes.


In [24]:
# Generate embeddings for your images
image_embeddings = base_model.predict(data_generator, verbose=1)

# Get filenames and class labels from the generator
filenames = data_generator.filenames
class_labels = data_generator.classes

# Create a DataFrame to store results
embedding_df = pd.DataFrame({
    'filename': filenames,
    'class_label': class_labels,
    'embedding': [embedding.flatten() for embedding in image_embeddings]
})

# Now 'embedding_df' contains filenames, class labels, and corresponding embeddings
# You can save this DataFrame to a CSV file or use it for further analysis



In [25]:
embedding_df.to_csv("train_emb_5000samples.csv")

In [21]:
for embedding in image_embeddings[0:1]:
    print(sum(sum(sum(embedding == 0))))
    print(embedding.shape)
    print(sum(embedding.flatten() == 0))
    
    print(embedding)

52586
(7, 7, 1280)
52586
[[[0.         0.         0.         ... 0.         0.         0.        ]
  [0.         0.         0.         ... 0.         0.         0.        ]
  [0.         0.         0.         ... 0.         0.         0.        ]
  ...
  [0.         0.         0.         ... 0.         0.         0.        ]
  [0.         0.         0.         ... 0.         0.         0.        ]
  [0.         0.         0.         ... 0.         0.         0.        ]]

 [[0.         0.         0.         ... 0.         0.         0.        ]
  [0.         0.         0.         ... 0.         0.         0.        ]
  [2.0735483  0.         0.         ... 0.         0.         0.7089591 ]
  ...
  [0.         0.         0.         ... 0.         0.         1.5477629 ]
  [0.         0.         0.         ... 0.         0.         0.        ]
  [0.         0.         0.         ... 0.         0.         0.        ]]

 [[0.         0.         0.         ... 0.         0.         0.       

In [20]:
len(embedding_df['embedding'][0])

62720

In [28]:
embedding_df = embedding_df.rename(columns = {"filename": "id"})
embedding_df

Unnamed: 0,id,class_label,embedding
0,notHate/1024019861448667137.jpg,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,notHate/1024044331097882624.jpg,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,notHate/1024156840639299589.jpg,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,notHate/1024204435650764800.jpg,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,notHate/1024242185967943680.jpg,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...
4995,zhate/1117633930360913920.jpg,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4996,zhate/1117645485605556225.jpg,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4997,zhate/1117648377049694208.jpg,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4998,zhate/1117654133836849153.jpg,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [34]:
id_embeddings = []
for id_ in embedding_df['id']:
    id_embeddings.append(id_.split("/")[-1][:-4])

In [37]:
embedding_df['id'] = id_embeddings

In [38]:
embedding_df

Unnamed: 0,id,class_label,embedding
0,1024019861448667137,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,1024044331097882624,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,1024156840639299589,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,1024204435650764800,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,1024242185967943680,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...
4995,1117633930360913920,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4996,1117645485605556225,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4997,1117648377049694208,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4998,1117654133836849153,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [40]:
embedding_df = embedding_df.rename(columns = {"embedding": "image_embedding"})
train_embedding_df = embedding_df

In [42]:
data_generator = datagen.flow_from_directory(
    os.path.join(dataset_path, 'test'),
    target_size=img_size,
    batch_size=batch_size,
    class_mode='categorical',  # Use 'categorical' for one-hot encoded labels
    shuffle=False  # Set to False if you want to keep the order of predictions
)

Found 10000 images belonging to 2 classes.


In [43]:
# Generate embeddings for your images
image_embeddings = base_model.predict(data_generator, verbose=1)

# Get filenames and class labels from the generator
filenames = data_generator.filenames
class_labels = data_generator.classes

# Create a DataFrame to store results
test_embedding_df = pd.DataFrame({
    'filename': filenames,
    'class_label': class_labels,
    'embedding': [embedding.flatten() for embedding in image_embeddings]
})



In [44]:
test_embedding_df = test_embedding_df.rename(columns = {"filename": "id"})
id_embeddings = []
for id_ in test_embedding_df['id']:
    id_embeddings.append(id_.split("/")[-1][:-4])
test_embedding_df['id'] = id_embeddings

In [45]:
test_embedding_df

Unnamed: 0,id,class_label,embedding
0,1023943955023519744,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,1023945847212568577,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,1023959632648331269,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,1023962536549523461,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,1023967792096325632,0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...
9995,1117663481652752385,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
9996,1117665258359988224,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
9997,1117687201913954304,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
9998,1117699493301035008,1,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [46]:
text_test_embedding = pd.read_csv("test_emb_.csv")
text_train_embedding = pd.read_csv("train_emb_.csv")

In [48]:
ids = []
for id_ in text_train_embedding['id']:
    ids.append(str(id_))

In [51]:
text_train_embedding['id'] = ids

In [54]:
ids = []
for id_ in text_test_embedding['id']:
    ids.append(str(id_))

In [55]:
text_test_embedding['id'] = ids

In [61]:
combined_test = pd.merge(text_test_embedding, test_embedding_df, on='id', how='inner').drop(["class_label"], axis = 1)

In [63]:
X_test = combined_test.drop(["label"], axis = 1)
X_test

Unnamed: 0,id,0,1,2,3,4,5,6,7,8,...,759,760,761,762,763,764,765,766,767,embedding
0,1115275629476372480,0.228511,-0.163395,2.021772,0.359851,0.392020,0.040908,-0.343978,0.705852,-0.412955,...,-0.453159,0.232846,-0.714774,-0.354411,-0.225023,-0.116535,-0.181649,-0.209135,-0.226190,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,1114404944499625984,0.262369,0.455420,1.584039,0.253371,0.684186,0.088838,-0.160183,1.058388,-0.151010,...,-0.764990,-0.293814,-0.144211,-0.040668,-0.135203,-0.000222,-0.313114,0.066856,-0.019078,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,1107329660558471171,0.263806,1.492010,0.998904,0.384446,-0.364752,-0.132873,1.341616,0.458849,0.146303,...,0.404646,-0.548889,-1.670956,-0.238885,0.205108,-0.219107,-0.036667,0.100636,0.685181,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,1107665629409099777,-0.135046,0.394240,0.732052,0.053892,0.332932,-0.074404,1.224470,-0.352085,0.503350,...,0.019033,-0.460863,-0.364556,-0.339883,0.128724,-0.581635,-0.475554,-0.432779,0.282620,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,1117318148275961856,-0.065361,0.627089,1.278918,0.247762,-0.092058,-0.167238,0.240847,0.035854,0.065943,...,-0.755343,-0.918214,-1.348494,0.133400,-0.689744,-0.135027,0.165788,0.311215,0.403350,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,1115510593635737600,0.388776,0.309589,1.625800,0.386981,-0.170552,0.350368,0.974769,0.694503,-0.031587,...,0.110999,-0.659299,-1.698783,0.023936,0.243084,0.138803,-0.648079,0.062269,0.087376,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
9996,1106873491771527173,0.153582,1.734947,-0.012495,-0.117608,0.123374,-0.190368,0.597460,-0.027715,0.089972,...,1.001420,-1.172329,-0.749610,-0.036686,-0.740092,-0.504160,-0.190793,0.798316,0.281052,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
9997,1105704363635429376,0.044173,0.459099,1.986406,0.390443,-0.106719,-0.294572,0.042843,0.670211,0.003767,...,-0.714606,-0.064828,-1.140476,-0.480226,-0.461627,-0.210900,0.414037,-0.322381,-0.081314,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
9998,1105647057061191680,0.017108,-0.140205,2.510149,0.195211,0.069391,0.905015,-0.258025,-0.040380,-0.247617,...,0.344900,0.337780,-1.316026,-0.600757,-0.497245,-0.302731,0.117333,-0.744669,0.108352,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [64]:
y_test = combined_test["label"]
y_test

0       0
1       0
2       0
3       0
4       1
       ..
9995    0
9996    0
9997    0
9998    1
9999    1
Name: label, Length: 10000, dtype: int64

In [65]:
combined_train = pd.merge(text_train_embedding, train_embedding_df, on='id', how='inner').drop(["class_label"], axis = 1)

In [66]:
combined_train

Unnamed: 0,id,label,0,1,2,3,4,5,6,7,...,759,760,761,762,763,764,765,766,767,image_embedding
0,1106854691105726464,0,-0.469343,0.678255,0.106478,-0.212503,-1.098321,0.343147,0.965310,0.197648,...,0.661813,-0.497664,-1.750491,-0.270639,-0.318183,0.092757,-0.459996,0.340841,0.476743,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,1115009758514360320,0,0.123494,0.052396,2.078528,0.103092,0.576196,0.486698,0.361882,0.045688,...,-0.431231,-0.653512,-0.360195,0.075113,-0.489045,-0.368887,-0.299407,-0.194543,0.244075,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,1037441000061337600,0,0.408084,0.615082,0.860860,0.580481,0.335167,-0.178345,-0.114313,0.037871,...,0.140666,-0.432491,-0.767938,0.014775,-1.008069,0.308640,-0.470854,0.193626,0.816066,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,1115672935912677376,0,0.395853,1.040859,0.843173,0.640891,0.174991,0.150899,0.461329,0.220517,...,0.737009,-0.428347,0.088643,-0.092198,-0.544194,0.040380,-0.016858,0.138859,-0.252325,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,1107373811547209729,0,-0.025087,0.777523,0.397383,-0.324872,-0.390356,-0.347750,0.934251,0.851826,...,0.179234,-0.732651,-1.857142,0.102758,-0.241321,-0.294072,-0.576955,0.400550,0.772557,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,1113466187487285249,1,0.146537,0.597480,0.683606,0.059339,-0.105274,0.495159,0.629954,0.238374,...,-0.473266,-0.873265,-1.424416,0.348938,-1.114116,-0.030731,-0.074233,-0.047757,0.287868,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4996,1109074586551894016,1,0.240540,-0.138524,1.596464,0.160773,0.559376,0.148500,-0.634073,0.578082,...,-0.992453,-0.438521,0.313641,0.342303,-0.833256,-0.340481,0.262743,0.321054,0.088676,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4997,1045845047533596672,1,-0.188668,0.476852,0.223402,-0.398933,0.187382,0.030374,0.576362,-0.268622,...,-0.149204,-0.280591,-0.573578,0.490640,-0.567130,0.154618,-0.720426,0.218188,0.004827,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4998,1105465552544374786,1,0.332450,0.677665,2.233109,-0.101504,0.260277,0.033019,0.189212,1.030952,...,-0.678651,0.412314,-1.554112,0.101258,-0.270540,-0.613195,-0.009996,-0.506984,-0.158411,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [67]:
X_train = combined_train.drop(["label"], axis = 1)
X_train

Unnamed: 0,id,0,1,2,3,4,5,6,7,8,...,759,760,761,762,763,764,765,766,767,image_embedding
0,1106854691105726464,-0.469343,0.678255,0.106478,-0.212503,-1.098321,0.343147,0.965310,0.197648,0.542697,...,0.661813,-0.497664,-1.750491,-0.270639,-0.318183,0.092757,-0.459996,0.340841,0.476743,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,1115009758514360320,0.123494,0.052396,2.078528,0.103092,0.576196,0.486698,0.361882,0.045688,-0.507420,...,-0.431231,-0.653512,-0.360195,0.075113,-0.489045,-0.368887,-0.299407,-0.194543,0.244075,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,1037441000061337600,0.408084,0.615082,0.860860,0.580481,0.335167,-0.178345,-0.114313,0.037871,-0.243999,...,0.140666,-0.432491,-0.767938,0.014775,-1.008069,0.308640,-0.470854,0.193626,0.816066,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,1115672935912677376,0.395853,1.040859,0.843173,0.640891,0.174991,0.150899,0.461329,0.220517,-0.441235,...,0.737009,-0.428347,0.088643,-0.092198,-0.544194,0.040380,-0.016858,0.138859,-0.252325,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,1107373811547209729,-0.025087,0.777523,0.397383,-0.324872,-0.390356,-0.347750,0.934251,0.851826,0.442442,...,0.179234,-0.732651,-1.857142,0.102758,-0.241321,-0.294072,-0.576955,0.400550,0.772557,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,1113466187487285249,0.146537,0.597480,0.683606,0.059339,-0.105274,0.495159,0.629954,0.238374,-0.185707,...,-0.473266,-0.873265,-1.424416,0.348938,-1.114116,-0.030731,-0.074233,-0.047757,0.287868,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4996,1109074586551894016,0.240540,-0.138524,1.596464,0.160773,0.559376,0.148500,-0.634073,0.578082,-0.292847,...,-0.992453,-0.438521,0.313641,0.342303,-0.833256,-0.340481,0.262743,0.321054,0.088676,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4997,1045845047533596672,-0.188668,0.476852,0.223402,-0.398933,0.187382,0.030374,0.576362,-0.268622,0.228631,...,-0.149204,-0.280591,-0.573578,0.490640,-0.567130,0.154618,-0.720426,0.218188,0.004827,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4998,1105465552544374786,0.332450,0.677665,2.233109,-0.101504,0.260277,0.033019,0.189212,1.030952,-0.043322,...,-0.678651,0.412314,-1.554112,0.101258,-0.270540,-0.613195,-0.009996,-0.506984,-0.158411,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [68]:
y_train = combined_train["label"]

In [69]:
y_train

0       0
1       0
2       0
3       0
4       0
       ..
4995    1
4996    1
4997    1
4998    1
4999    1
Name: label, Length: 5000, dtype: int64

In [70]:
X_train.to_csv("X_train.csv")

In [71]:
X_test.to_csv("X_test.csv")

In [72]:
y_train.to_csv("y_train.csv")

In [73]:
y_test.to_csv("y_test.csv")

In [74]:
pd.DataFrame(X_train['image_embedding'].apply(pd.Series))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,62710,62711,62712,62713,62714,62715,62716,62717,62718,62719
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.892784,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.956246,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.819044,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.865078,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.236076,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.036329,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.356247,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.492528,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.837289,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [75]:
X_train_df = pd.concat([X_train, pd.DataFrame(X_train['image_embedding'].apply(pd.Series))], axis=1).drop(columns=['image_embedding'])

In [76]:
X_train_df

Unnamed: 0,id,0,1,2,3,4,5,6,7,8,...,62710,62711,62712,62713,62714,62715,62716,62717,62718,62719
0,1106854691105726464,-0.469343,0.678255,0.106478,-0.212503,-1.098321,0.343147,0.965310,0.197648,0.542697,...,0.0,0.0,0.892784,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1115009758514360320,0.123494,0.052396,2.078528,0.103092,0.576196,0.486698,0.361882,0.045688,-0.507420,...,0.0,0.0,0.956246,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1037441000061337600,0.408084,0.615082,0.860860,0.580481,0.335167,-0.178345,-0.114313,0.037871,-0.243999,...,0.0,0.0,0.819044,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1115672935912677376,0.395853,1.040859,0.843173,0.640891,0.174991,0.150899,0.461329,0.220517,-0.441235,...,0.0,0.0,0.865078,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1107373811547209729,-0.025087,0.777523,0.397383,-0.324872,-0.390356,-0.347750,0.934251,0.851826,0.442442,...,0.0,0.0,1.236076,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,1113466187487285249,0.146537,0.597480,0.683606,0.059339,-0.105274,0.495159,0.629954,0.238374,-0.185707,...,0.0,0.0,1.036329,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4996,1109074586551894016,0.240540,-0.138524,1.596464,0.160773,0.559376,0.148500,-0.634073,0.578082,-0.292847,...,0.0,0.0,0.356247,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4997,1045845047533596672,-0.188668,0.476852,0.223402,-0.398933,0.187382,0.030374,0.576362,-0.268622,0.228631,...,0.0,0.0,1.492528,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4998,1105465552544374786,0.332450,0.677665,2.233109,-0.101504,0.260277,0.033019,0.189212,1.030952,-0.043322,...,0.0,0.0,0.837289,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [80]:
X_test_df = pd.concat([X_test, pd.DataFrame(X_test['embedding'].apply(pd.Series))], axis=1).drop(columns=['embedding'])

In [81]:
X_test_df

Unnamed: 0,id,0,1,2,3,4,5,6,7,8,...,62710,62711,62712,62713,62714,62715,62716,62717,62718,62719
0,1115275629476372480,0.228511,-0.163395,2.021772,0.359851,0.392020,0.040908,-0.343978,0.705852,-0.412955,...,0.0,0.0,1.008579,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1114404944499625984,0.262369,0.455420,1.584039,0.253371,0.684186,0.088838,-0.160183,1.058388,-0.151010,...,0.0,0.0,1.132815,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1107329660558471171,0.263806,1.492010,0.998904,0.384446,-0.364752,-0.132873,1.341616,0.458849,0.146303,...,0.0,0.0,1.205007,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1107665629409099777,-0.135046,0.394240,0.732052,0.053892,0.332932,-0.074404,1.224470,-0.352085,0.503350,...,0.0,0.0,0.978668,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1117318148275961856,-0.065361,0.627089,1.278918,0.247762,-0.092058,-0.167238,0.240847,0.035854,0.065943,...,0.0,0.0,1.064722,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,1115510593635737600,0.388776,0.309589,1.625800,0.386981,-0.170552,0.350368,0.974769,0.694503,-0.031587,...,0.0,0.0,0.601591,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9996,1106873491771527173,0.153582,1.734947,-0.012495,-0.117608,0.123374,-0.190368,0.597460,-0.027715,0.089972,...,0.0,0.0,0.998096,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9997,1105704363635429376,0.044173,0.459099,1.986406,0.390443,-0.106719,-0.294572,0.042843,0.670211,0.003767,...,0.0,0.0,0.048081,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9998,1105647057061191680,0.017108,-0.140205,2.510149,0.195211,0.069391,0.905015,-0.258025,-0.040380,-0.247617,...,0.0,0.0,1.015086,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [82]:
X_test_df.to_csv("X_test_updated.csv")
X_train_df.to_csv("X_train_updated.csv")