In [None]:
import numpy as np 
import pandas as pd 
from collections import Counter
import os
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
# Print the full path of every file available under the Kaggle input directory.
for folder, _, names_in_folder in os.walk('/kaggle/input'):
    for name in names_in_folder:
        full_path = os.path.join(folder, name)
        print(full_path)

## Load data

In [None]:
# Read the competition's train and test CSV files.
train_data = pd.read_csv("/kaggle/input/20-newsgroups-ciphertext-challenge/train.csv")
test_data = pd.read_csv("/kaggle/input/20-newsgroups-ciphertext-challenge/test.csv")

# Pull the columns used later out as NumPy arrays:
# ciphertexts for both splits, and the target labels for the training split.
labels = train_data['target'].to_numpy()
cipher_train_data = train_data['ciphertext'].to_numpy()
cipher_test_data = test_data['ciphertext'].to_numpy()

# Preview the first rows of the training frame.
train_data.head()

In [None]:
# Distinct label values that occur in the training set
# (the number of labels is the length of the displayed array).
target_column = train_data['target']
target_column.unique()

## Problem

This is a classification problem with 20 labels


### I don't know, just try something basic; maybe character occurrences are good features

In [None]:
# Show the ciphertext of the first training row as an example.
train_data['ciphertext'].loc[0]

Probably, ';' is newline or space or nothing :))

In [None]:
## Count character occurrences
# Tally how often each character appears in the first training ciphertext.
cipher_example = train_data['ciphertext'].loc[0]
counter = Counter()
counter.update(cipher_example)
print(counter)

In [None]:
## Make the vocabulary
# Collect every distinct character that appears in any training ciphertext.
cipher_corpus = train_data['ciphertext'].to_numpy()
vocab_chars = set()
for cp in cipher_corpus:
    # In-place update: avoids copying the whole set on every iteration,
    # which `vocab = vocab.union(...)` did.
    vocab_chars.update(cp)
# Sort so the feature-column order is the same on every kernel restart.
# Iteration order of a set of strings can differ between Python processes
# because string hashing is randomized.
vocab = sorted(vocab_chars)

In [None]:
## Convert cipher text to vector
def cp2vec(text,vocab):
    """Return the per-character occurrence counts of ``text`` as a vector.

    Parameters
    ----------
    text : iterable of hashable items (a string in this notebook)
        The ciphertext whose characters are counted.
    vocab : sequence
        Characters that define the vector layout; entry ``k`` of the result
        is the number of times ``vocab[k]`` occurs in ``text``.

    Returns
    -------
    numpy.ndarray
        1-D float64 array of length ``len(vocab)``. Characters of ``text``
        that are not in ``vocab`` are ignored; vocabulary entries that never
        occur in ``text`` get ``0``.
    """
    occurrences = Counter(text)
    # A Counter yields 0 for missing keys, so absent characters need no special case.
    return np.array([occurrences[symbol] for symbol in vocab], dtype=np.float64)
# Sanity check: vectorize the example ciphertext and display the result.
cp2vec(text=cipher_example, vocab=vocab)

In [None]:
## Convert all ciphertexts to occurrence vectors
# One row per training ciphertext, one column per vocabulary character.
n_train = len(cipher_train_data)
train_data_1 = np.zeros((n_train, len(vocab)))
for row, ciphertext in enumerate(cipher_train_data):
    train_data_1[row] = cp2vec(ciphertext, vocab)

In [None]:
## Why not use SVM to classify?
# Standardize each character-count feature, then fit an SVC
# (no kernel is passed, so its default kernel is used) on the training matrix.
clf = make_pipeline(
    StandardScaler(),
    SVC(gamma='auto'),
)
clf.fit(train_data_1, labels)

In [None]:
## Let's check :))
# Build the occurrence matrix for the test ciphertexts with the same vocabulary,
# then predict a label for every row with the fitted pipeline.
n_test = len(cipher_test_data)
test_data_1 = np.zeros((n_test, len(vocab)))
for row, ciphertext in enumerate(cipher_test_data):
    test_data_1[row] = cp2vec(ciphertext, vocab)
predict_1 = clf.predict(test_data_1)

In [None]:
## First submission
# Pair each test Id with its predicted label and write the result to
# submission.csv without the DataFrame index column.
submission_1 = dict(Id=test_data['Id'].to_numpy(), Predicted=predict_1)
submission_df = pd.DataFrame(submission_1)
submission_df.to_csv('submission.csv', index=False)