#### Notebook purpose
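This notebook loads the labelled article samples (sports, politics and other), combines them into a single dataset, and splits that dataset into training and test sets that are saved as CSV files.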

#### Read and prepare data

In [1]:
import pandas as pd
from ast import literal_eval

In [2]:
# Directory prefixes, relative to the notebook's working directory.
# The trailing slash matters: file names are appended by plain string
# interpolation (e.g. f'{data_path}sports_10k.csv').
data_path = '../../data/'
# NOTE(review): not referenced by any cell visible in this notebook;
# presumably kept for consistency with sibling notebooks - confirm before removing.
models_path = '../../models/'

In [3]:
# Load the three category samples and attach the integer class label as column `y`:
# sports = 2, politics = 1, other = 0.
# `event_list` and `person_list` are parsed with `literal_eval` while reading
# (presumably they are stored in the CSVs as Python list literals).
_list_converters = {'event_list': literal_eval, 'person_list': literal_eval}


def _load_category(file_name, label):
    """Read one category CSV from `data_path` and add its class label as column `y`."""
    frame = pd.read_csv(f'{data_path}{file_name}', converters=_list_converters)
    return frame.assign(y=label)


sports_df = _load_category('sports_10k.csv', 2)
politics_df = _load_category('politics_10k.csv', 1)
other_df = _load_category('other_10k.csv', 0)

In [4]:
# Lookup of the article ids that appear in BOTH the sports and the politics
# samples (6 cases according to the original author). The constant
# `intersection` column is a flag: after a left join against this lookup,
# rows without a match have NaN there and can be told apart.
intersection = (
    sports_df
    .merge(politics_df, on='id')   # default inner join: ids present in both frames
    .loc[:, ['id']]                # keep only the key column
    .assign(intersection=1)
)

In [5]:
# Build the full sample: sports and politics articles that are NOT in both
# categories, plus all "other" articles, restricted to the modelling columns.
def _without_intersection(frame):
    """Return the rows of `frame` that have no match in the `intersection` lookup.

    The left join uses the columns shared with the lookup (expected to be just
    `id`); unmatched rows end up with NaN in the `intersection` flag column.
    """
    flagged = frame.merge(intersection, how='left')
    unmatched = flagged['intersection'].isna()
    return flagged[unmatched].drop(columns='intersection')


_kept_columns = ['title', 'body', 'source', 'event_list', 'person_list', 'y']

df = pd.concat(
    [_without_intersection(sports_df),
     _without_intersection(politics_df),
     other_df],
    axis=0,
)[_kept_columns].assign(dummy=1)  # constant column of 1s; its purpose is not visible here - TODO document

In [6]:
# Keep only the first BODY_MAX_CHARS characters of each article body, then
# replace every remaining missing value (a few titles and sources are missing)
# with the literal string 'missing'.
# The vectorised `.str` slice leaves NaN untouched, whereas a plain Python
# slice on a NaN float raises TypeError; a missing body therefore simply
# flows through to `fillna` instead of crashing the cell.
BODY_MAX_CHARS = 250
df = df.assign(body=df['body'].str[:BODY_MAX_CHARS]).fillna('missing')

#### Split training and test datasets and save results

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
# Shuffled hold-out split with 33% of the rows assigned to the test set.
# Stratifying on `y` keeps the class proportions comparable in both parts,
# and the fixed random_state makes the split reproducible across runs.
# The target is selected as a one-column DataFrame (df[['y']]), so
# y_train / y_test are DataFrames as well.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop(columns='y'),
    df[['y']],
    test_size=0.33,
    stratify=df['y'],
    shuffle=True,
    random_state=42,
)

In [9]:
# Persist the four parts of the split as CSV files under `data_path`,
# without writing the index as a column.
for _name, _part in (('X_train', X_train), ('X_test', X_test),
                     ('y_train', y_train), ('y_test', y_test)):
    _part.to_csv(f'{data_path}{_name}.csv', index=False)