## MeLi Data Challenge

This notebook is a quick start: it preprocesses the original data and merges it into a more "pandas-like" format. Intermediate datasets are saved to pickle files.


In [None]:
import os
from datetime import datetime
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# lines=True: the file is parsed as one JSON record per line.
TRAIN_PATH = "/kaggle/input/meli-data-challenge-2020/train_dataset.jl"
train_data = pd.read_json(TRAIN_PATH, orient='columns', lines=True)
train_data.head(2)

In [None]:
# Adapted from https://github.com/Santivg/ml-challenge

# Reference point used to turn a parsed wall-clock datetime into epoch seconds.
_UNIX_EPOCH = datetime(1970, 1, 1)


def _event_epoch(raw_timestamp):
    """Convert an event timestamp string into whole epoch seconds.

    The 'T' separator is replaced by a space and everything from the first '.'
    onwards is discarded (fractional seconds and whatever follows them).
    NOTE(review): if the raw strings carry a UTC offset after the fraction it
    is ignored here -- confirm against the data.

    The remaining wall-clock value is interpreted as UTC, so the result is the
    same on every machine instead of depending on the local timezone/DST rules
    of whoever runs the notebook.
    """
    wall_clock = raw_timestamp.replace("T", " ").split('.')[0]
    parsed = datetime.strptime(wall_clock, '%Y-%m-%d %H:%M:%S')
    return int((parsed - _UNIX_EPOCH).total_seconds())


def preprocess_hist(df):
    """Split each row's ``user_history`` event list into four list columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a ``user_history`` column whose cells are iterables of dicts
        with the keys ``event_type``, ``event_info`` and ``event_timestamp``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object (it is modified in place) with these columns
        added, each cell holding a list:

        * ``user_view``         -- ``int(event_info)`` of every 'view' event
        * ``timestamps``        -- epoch seconds of every 'view' event
        * ``user_search``       -- ``event_info`` of every 'search' event
        * ``search_timestamps`` -- epoch seconds of every 'search' event

        Events of any other type are ignored. Lists keep the order in which
        the events appear in ``user_history``.
    """
    views, view_times, searches, search_times = [], [], [], []

    for history in df['user_history']:
        row_views, row_view_times = [], []
        row_searches, row_search_times = [], []
        for event in history:
            kind = event['event_type']
            if kind == 'view':
                row_views.append(int(event['event_info']))
                row_view_times.append(_event_epoch(event['event_timestamp']))
            elif kind == 'search':
                row_searches.append(event['event_info'])
                row_search_times.append(_event_epoch(event['event_timestamp']))
        views.append(row_views)
        view_times.append(row_view_times)
        searches.append(row_searches)
        search_times.append(row_search_times)

    # Assign each column once instead of writing cell by cell with df.at.
    # Wrapping in an object Series keyed by df.index keeps every cell a plain
    # Python list, even when all lists happen to have the same length.
    new_columns = {
        'user_view': views,
        'timestamps': view_times,
        'user_search': searches,
        'search_timestamps': search_times,
    }
    for column, values in new_columns.items():
        df[column] = pd.Series(values, index=df.index, dtype='object')
    return df

In [None]:
# train part: build the list columns, discard the raw history, save the result
train_data = preprocess_hist(train_data).drop(columns='user_history')
train_data.to_pickle('train.pickle')
train_data.head(4)

**item_bought:** ID for the purchased item

**user_view:** List of IDs for each item that the user has seen in the last week

**timestamps:** List of timestamps, one per viewed item (same order as user_view)

**user_search:** List of search tokens in the last week

**search_timestamps:** List of timestamps for each search

In [None]:
# same for the test set
TEST_PATH = "/kaggle/input/meli-data-challenge-2020/test_dataset.jl"
test_data = preprocess_hist(pd.read_json(TEST_PATH, orient='columns', lines=True))
test_data = test_data.drop(columns='user_history')
test_data.to_pickle('test.pickle')
test_data.head(2)

In [None]:
# (rows, columns) of the train frame, then of the test frame
print(*(frame.shape for frame in (train_data, test_data)))

## Products

In [None]:
# Load the item metadata and save a copy as an intermediate dataset
ITEMS_PATH = "/kaggle/input/meli-data-challenge-2020/item_data.jl"
item_data = pd.read_json(ITEMS_PATH, orient='columns', lines=True)
item_data.to_pickle('products.pickle')
item_data.head(2)

## Join item_bought metadata (only available for training set)

In [None]:
# remove/change two columns from item data: encode the condition as an integer
# code and discard product_id.
# NOTE: run this cell once per session -- on a second run the already-encoded
# condition values would map to NaN and the product_id drop would raise.
condition_codes = {'new': 0, 'used': 1, 'not_specified': 2}
item_data['condition'] = item_data['condition'].map(condition_codes)
item_data = item_data.drop(columns='product_id')

# Metadata columns coming from the purchased item get a "bought_" prefix
rename_dict = {
    column: f'bought_{column}'
    for column in ('title', 'domain_id', 'price', 'category_id', 'condition')
}

# merge on item_bought id
train_data = (
    train_data
    .merge(item_data, how='left', left_on='item_bought', right_on='item_id')
    .drop(columns='item_id')
    .rename(columns=rename_dict)
)
train_data.head(2)

## Melt data and merge user_view data

Melt the list of products that each user has seen (user_view column) and merge the metadata for the product.

In [None]:
def melt_views(df, items=None):
    """Explode each row's viewed-item list into one row per view and attach item metadata.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per purchase, with a ``user_view`` list column and, optionally,
        a parallel ``timestamps`` list column. ``user_search`` and
        ``search_timestamps`` are left out of the result if present. ``df``
        itself is not modified.
    items : pandas.DataFrame, optional
        Item metadata with an ``item_id`` column. When omitted, the
        notebook-level ``item_data`` frame is used, so existing
        ``melt_views(df)`` calls keep working.

    Returns
    -------
    pandas.DataFrame
        One row per viewed item. ``row_id`` holds the index label of the
        originating row of ``df``; rows whose view list was empty are dropped.
        The columns of ``items`` (except ``item_id``) are left-joined on
        ``user_view``.

    Raises
    ------
    ValueError
        If ``user_view`` and ``timestamps`` do not explode to the same number
        of rows, i.e. the two list columns are not parallel.
    """
    if items is None:
        items = item_data  # notebook-level item metadata frame

    views = (
        df.drop(columns=['user_search', 'search_timestamps'], errors='ignore')
        .reset_index()
        .rename(columns={'index': 'row_id'})
    )

    # Explode the list columns explicitly rather than through
    # DataFrame.apply(pd.Series.explode), which depends on pandas re-aligning
    # per-column results that carry duplicate index labels.
    exploded = views.explode('user_view')
    if 'timestamps' in views.columns:
        exploded_times = views['timestamps'].explode()
        if len(exploded_times) != len(exploded):
            raise ValueError("user_view and timestamps lists have different lengths")
        # Both columns were exploded from the same rows in the same order, so
        # a positional assignment keeps each view paired with its timestamp.
        exploded['timestamps'] = exploded_times.to_numpy()
    exploded = exploded.reset_index(drop=True)

    # An empty view list explodes to a single NaN row: remove purchases with no previous views
    exploded = exploded[exploded['user_view'].notna()]
    merged = exploded.merge(items, left_on='user_view', right_on='item_id', how='left')
    return merged.drop(columns='item_id')

# Training purchases melted to one row per viewed item
views_data = melt_views(df=train_data)
views_data.head(3)

In [None]:
# Save the melted views as another intermediate dataset
views_pickle = 'train_views.pickle'
views_data.to_pickle(views_pickle)

## Data Exploration

In [None]:
# Number of distinct values in the main identifier columns of the item data
cardinality_labels = {
    'item_id': "Corpus size (number of itens):",
    'domain_id': "Number of domains:",
    'category_id': "Number of categories:",
}
for column, label in cardinality_labels.items():
    print(label, item_data[column].nunique())

# Most frequent domains (head() keeps the first five)
print("\nTop domains:")
print(item_data['domain_id'].value_counts().head())

There are 2.1 million items in 7,894 domains and 11,493 categories.

In [None]:
# Count the non-null viewed items recorded for each originating row (row_id)
views_by_row = views_data['user_view'].groupby(views_data['row_id'])
items_count = views_by_row.count()

plt.figure(figsize=(10, 4))
plt.title("Views distribution number for user")
# NOTE(review): distplot is deprecated in recent seaborn releases -- consider
# switching once the seaborn version of the target environment is confirmed.
p = sns.distplot(items_count, bins=100)

In [None]:
# Flag each view row whose viewed item is the one that ended up being bought
views_data['same_item'] = (views_data.item_bought == views_data.user_view).astype('int8')
# A purchase (row_id) counts as "seen before" when at least one of its views matches
same_item = views_data.groupby('row_id')['same_item'].max()
plt.title("Has seen the item before buying it? (item_bought is in user_view list)")
# Pass the series as x= explicitly: older seaborn took the first positional
# argument as x, newer releases take it as `data`, which changes the plot.
p = sns.countplot(x=same_item)

Hope this notebook helps as a quick start for this dataset.