In [10]:
# Import necessary libraries
import pandas as pd

# Load all datasets (paths are relative to the notebook's working directory)
train_data = pd.read_csv('train_data.csv')
train_labels = pd.read_csv('train_labels.csv')
train_embeddings = pd.read_csv('train_embeddings.csv')
test_data = pd.read_csv('test_data.csv')
test_embeddings = pd.read_csv('test_embeddings.csv')

# Display basic info for each dataset.
# The dict maps a human-readable title to its frame so that the overview
# loop below can iterate over all of them in one place.
datasets = {
    "Train Data": train_data,
    "Train Labels": train_labels,
    "Train Embeddings": train_embeddings,
    "Test Data": test_data,
    "Test Embeddings": test_embeddings,
}

for name, df in datasets.items():
    print(f"\n{name} Overview:")
    # DataFrame.info() writes its report to stdout itself and returns None,
    # so wrapping it in print() would append a stray "None" line.
    df.info()
    print(df.describe())
    print(df.head())



Train Data Overview:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 128 entries, 0 to 127
Columns: 785 entries, label to 28x28
dtypes: int64(785)
memory usage: 785.1 KB
None
            label    1x1    1x2    1x3    1x4    1x5    1x6    1x7    1x8  \
count  128.000000  128.0  128.0  128.0  128.0  128.0  128.0  128.0  128.0   
mean     4.109375    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
std      2.781362    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
min      0.000000    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
25%      2.000000    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
50%      4.000000    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
75%      7.000000    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   
max      9.000000    0.0    0.0    0.0    0.0    0.0    0.0    0.0    0.0   

         1x9  ...  28x19  28x20  28x21  28x22  28x23  28x24  28x25  28x26  \
count  128.0  ...  128.0  128.0  128.0  128.0  128.

In [11]:
# Report the shape of every loaded dataset, one line per dataset
for dataset_name, frame in datasets.items():
    shape_line = f"{dataset_name} Shape: {frame.shape}"
    print(shape_line)


Train Data Shape: (128, 785)
Train Labels Shape: (128, 2)
Train Embeddings Shape: (128, 128)
Test Data Shape: (10000, 785)
Test Embeddings Shape: (10000, 128)


In [12]:
# Count the missing entries in each column of every dataset
for dataset_name, frame in datasets.items():
    null_counts = frame.isnull().sum()
    # One call, newline-separated: header line first, then the per-column counts.
    print(f"\nMissing Values in {dataset_name}:", null_counts, sep="\n")



Missing Values in Train Data:
label    0
1x1      0
1x2      0
1x3      0
1x4      0
        ..
28x24    0
28x25    0
28x26    0
28x27    0
28x28    0
Length: 785, dtype: int64

Missing Values in Train Labels:
Unnamed: 0    0
label         0
dtype: int64

Missing Values in Train Embeddings:
x_0      0
x_1      0
x_2      0
x_3      0
x_4      0
        ..
x_123    0
x_124    0
x_125    0
x_126    0
x_127    0
Length: 128, dtype: int64

Missing Values in Test Data:
label    0
1x1      0
1x2      0
1x3      0
1x4      0
        ..
28x24    0
28x25    0
28x26    0
28x27    0
28x28    0
Length: 785, dtype: int64

Missing Values in Test Embeddings:
x_0      0
x_1      0
x_2      0
x_3      0
x_4      0
        ..
x_123    0
x_124    0
x_125    0
x_126    0
x_127    0
Length: 128, dtype: int64


In [33]:
import pandas as pd
import numpy as np

def print_file_info(file_name):
    """Print a short report about one CSV file.

    The report consists of a banner with the file name, the frame's shape,
    the rows returned by ``DataFrame.head()`` and the dtype of every column.

    Parameters
    ----------
    file_name
        Path of the CSV file; handed to ``pd.read_csv`` unchanged.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    # The banner is emitted before reading so it appears even if the read fails.
    print(f"\n--- {file_name} ---")
    frame = pd.read_csv(file_name)
    preview = frame.head()
    column_types = frame.dtypes
    print(
        f"Shape: {frame.shape}",
        "First 5 rows:",
        preview,
        f"Data types:\n{column_types}",
        sep="\n",
    )

# CSV files to summarise, in the order their reports are printed
files = [
    'train_data.csv',
    'train_embeddings.csv',
    'train_labels.csv',
    'test_data.csv',
    'test_embeddings.csv',
]

for csv_name in files:
    print_file_info(csv_name)


--- train_data.csv ---
Shape: (128, 785)
First 5 rows:
   label  1x1  1x2  1x3  1x4  1x5  1x6  1x7  1x8  1x9  ...  28x19  28x20  \
0      4    0    0    0    0    0    0    0    0    0  ...      0      0   
1      6    0    0    0    0    0    0    0    0    0  ...      0      0   
2      8    0    0    0    0    0    0    0    0    0  ...      0      0   
3      5    0    0    0    0    0    0    0    0    0  ...      0      0   
4      9    0    0    0    0    0    0    0    0    0  ...      0      0   

   28x21  28x22  28x23  28x24  28x25  28x26  28x27  28x28  
0      0      0      0      0      0      0      0      0  
1      0      0      0      0      0      0      0      0  
2      0      0      0      0      0      0      0      0  
3      0      0      0      0      0      0      0      0  
4      0      0      0      0      0      0      0      0  

[5 rows x 785 columns]
Data types:
label    int64
1x1      int64
1x2      int64
1x3      int64
1x4      int64
         ...  
2

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

def _l2_normalize_rows(matrix):
    """Scale every row of ``matrix`` to unit Euclidean length.

    Rows whose norm is zero are returned unchanged (all zeros) rather than
    being divided by zero, which would fill them with NaN.
    """
    matrix = np.asarray(matrix, dtype=float)
    norms = np.linalg.norm(matrix, axis=1, keepdims=True)
    return matrix / np.where(norms == 0, 1.0, norms)


def vote_labels(data_similarity, embedding_similarity, train_labels, k=5):
    """Predict one label per test row by majority vote over nearest neighbours.

    For every test row the ``k`` most similar training rows are taken from
    each of the two similarity matrices; each of those neighbours casts one
    vote for its training label and the label with the most votes wins.
    Ties are resolved in favour of the smallest label value.

    Parameters
    ----------
    data_similarity : array-like, shape (n_test, n_train)
        Similarity of each test row to each training row in the first view.
    embedding_similarity : array-like, shape (n_test, n_train)
        Same, for the second view.
    train_labels : array-like, length n_train
        Label of every training row.
    k : int, default 5
        Neighbours taken per view; clipped to the range [1, n_train].

    Returns
    -------
    numpy.ndarray, length n_test
        The winning label for every test row.

    Raises
    ------
    ValueError
        If there are no training labels or the matrix shapes do not agree
        with each other and with ``train_labels``.
    """
    train_labels = np.asarray(train_labels).ravel()
    data_similarity = np.asarray(data_similarity)
    embedding_similarity = np.asarray(embedding_similarity)

    if train_labels.size == 0:
        raise ValueError("train_labels is empty; there is nothing to vote with")
    if data_similarity.ndim != 2 or data_similarity.shape != embedding_similarity.shape:
        raise ValueError("both similarity matrices must be 2-D and share one shape")
    if data_similarity.shape[1] != train_labels.size:
        raise ValueError("similarity matrices need one column per training label")

    k = max(1, min(int(k), train_labels.size))
    n_test = data_similarity.shape[0]

    # Map labels to consecutive column indices so votes fit in a dense table.
    classes, class_of_train_row = np.unique(train_labels, return_inverse=True)
    votes = np.zeros((n_test, classes.size), dtype=np.int64)
    # Row index of every (test row, neighbour) pair, in row-major order.
    row_ids = np.repeat(np.arange(n_test), k)

    for similarity in (data_similarity, embedding_similarity):
        # Indices of the k largest similarities per row (order within the k is irrelevant).
        nearest = np.argpartition(similarity, -k, axis=1)[:, -k:]
        # add.at accumulates correctly when the same (row, class) pair repeats.
        np.add.at(votes, (row_ids, class_of_train_row[nearest].ravel()), 1)

    return classes[votes.argmax(axis=1)]


# Inside a notebook kernel __name__ is "__main__", so this cell still runs as a
# whole; the guard only keeps the file I/O out of the way when the helpers
# above are imported from elsewhere.
if __name__ == "__main__":
    # Load the data
    train_data = pd.read_csv('train_data.csv').iloc[:, 1:].values  # Exclude the label column
    train_embeddings = pd.read_csv('train_embeddings.csv').values
    train_labels = pd.read_csv('train_labels.csv')['label'].values
    test_data = pd.read_csv('test_data.csv').iloc[:, 1:].values  # Exclude the label column
    test_embeddings = pd.read_csv('test_embeddings.csv').values

    # Normalize the data (zero rows stay zero instead of turning into NaN)
    train_data = _l2_normalize_rows(train_data)
    train_embeddings = _l2_normalize_rows(train_embeddings)
    test_data = _l2_normalize_rows(test_data)
    test_embeddings = _l2_normalize_rows(test_embeddings)

    # Calculate cosine similarities: one row per test sample, one column per training sample
    test_data_to_train_data = cosine_similarity(test_data, train_data)
    test_embeddings_to_train_embeddings = cosine_similarity(test_embeddings, train_embeddings)

    # Predict a label for every test row from its top-5 neighbours in both views
    matches = vote_labels(
        test_data_to_train_data,
        test_embeddings_to_train_embeddings,
        train_labels,
        k=5,
    ).tolist()

    # Create submission file
    submission = pd.DataFrame({
        'row ID': range(len(matches)),
        'label': matches
    })
    submission.to_csv('submission.csv', index=False)

    print("Submission file created successfully!")