In [3]:
# No words, just work => Goal: TOP #1 🍋

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Ignore any warnings: chill..
import warnings
warnings.filterwarnings('ignore')

# Natural Language Processing | Machine Learning models
import tensorflow as tf
from sklearn.metrics.pairwise import cosine_similarity
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [5]:
# Training Datasets
df_articles = pd.read_csv('articles.csv')
df_lifes = pd.read_csv('life_situations.csv')
df_news = pd.read_csv('news.csv')
df_services = pd.read_csv('services.csv')

# Testing Datasets
df_test = pd.read_csv('epir_test.csv')

In [None]:
df_lifes.drop(columns=['subid'], inplace=True)
df_lifes.rename(columns={'URL': 'url'}, inplace=True)

def df_generalize(df, category):
    if 'content' in df.columns:
        df.rename(columns={'content': 'x'}, inplace=True)
    drop_columns = df.columns[3:-1]
    df['content'] = df.iloc[:, 3:-1].apply(lambda row: ' '.join(map(str, row)), axis=1)
    df.drop(columns=drop_columns, inplace=True)
    df.drop(['id'], axis=1, inplace=True)
    df.rename(columns={'Unnamed: 0': 'id'}, inplace=True)
    df = df.assign(**{'category': category})
    return df

In [None]:
df_news = df_generalize(df_news, 'news')
df_lifes = df_generalize(df_lifes, 'lifes')
df_articles = df_generalize(df_articles, 'articles')
df_services = df_generalize(df_services, 'services')

In [None]:
df_train = pd.concat([df_news, df_lifes, df_articles, df_services])

In [None]:
df_train = df_train.drop_duplicates(subset='url', keep='first')

In [None]:
df_train = df_train[df_train['sys_lang'].isin(['en', 'ru', 'qq', 'kk'])]

In [None]:
max_sequence_length = 100
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df_train['content'])

In [None]:
def summary(df):
    summary_df = pd.DataFrame(df.dtypes, columns=['dtypes'])
    summary_df['missing#'] = df.isna().sum()
    summary_df['missing%'] = (df.isna().sum())/len(df)
    summary_df['uniques'] = df.nunique().values
    summary_df['count'] = df.count().values
    return summary_df

summary(df_train).style.background_gradient(cmap='Greens')

Unnamed: 0,dtypes,missing#,missing%,uniques,count
id,int64,0,0.0,207292,207292
sys_lang,object,0,0.0,46,207292
url,object,0,0.0,203993,207292
content,object,65376,0.315381,131730,141916
category,object,0,0.0,4,207292


In [None]:
def short_describe(df):
    rows, cols = df.shape
    col_names = ', '.join(df.columns.tolist())
    print(f'* Number of Rows: {rows}')
    print(f'* Number of Columns: {cols}')
    print(f'* Column names:\n {col_names}')

short_describe(df_train)

* Number of Rows: 207292
* Number of Columns: 5
* Column names:
 id, sys_lang, url, content, category


In [None]:
train_sequences = tokenizer.texts_to_sequences(df_train['content'])
test_sequences = tokenizer.texts_to_sequences(df_test['question'])

In [None]:
train_padded = pad_sequences(train_sequences, maxlen=max_sequence_length, padding='post')
test_padded = pad_sequences(test_sequences, maxlen=max_sequence_length, padding='post')

In [None]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=100, input_length=max_sequence_length),
    tf.keras.layers.LSTM(128),
    tf.keras.layers.Dense(100, activation='relu')
])

In [None]:
model.compile(optimizer='adam', loss='cosine_similarity')

In [None]:
model.fit(train_padded, train_padded, epochs=10)

In [None]:
similarity_scores = cosine_similarity(model.predict(test_padded), model.predict(train_padded))

In [None]:
top_indices = np.argmax(similarity_scores, axis=1)

In [None]:
submission = pd.DataFrame()
submission['id'] = df_test['id']
submission['index'] = top_indices

submission.to_csv('submission.csv', index=False)
submission.head(10)