## 1.1 BCI dataset

In [2]:
%python
import pandas as pd
# Load the BCI table, parse its `Date` column into a lower-case `date`
# column, and normalise every column name to lower case.
bci_pdf = (
    pd.read_csv('/dbfs/mnt/group-ma707/data/5tc_plus_ind_vars.csv')
    .rename(columns={'P3A~IV': 'P3A_IV'})
    .assign(date=lambda pdf: pd.to_datetime(pdf.Date))
    .drop(columns='Date')
    # NOTE(review): the index is still the default one produced by read_csv,
    # so this sort leaves the row order unchanged -- confirm whether
    # sort_values('date') was intended.
    .sort_index(ascending=True)
    .rename(columns=str.lower)
)
bci_pdf.info()

## 1.2 Coal dataset

In [4]:
%python
import numpy as np
import pandas as pd
# Load the coal articles: keep only rows that have tags, content and title,
# truncate each timestamp to its calendar day, and order the rows by date.
coal_pdf = (
    pd.read_csv(
        '/dbfs/mnt/group-ma707/data/mining_com_coal.csv',
        encoding='ISO-8859-1',
    )
    .dropna(axis=0, subset=['tags', 'content', 'title'])
    .assign(date=lambda pdf: pd.to_datetime(pd.to_datetime(pdf.date).dt.date))
    .loc[:, ['date', 'tags', 'title', 'content']]
    .sort_values('date', ascending=True)
)
coal_pdf.info()

In [5]:
%python 
# Dates that occur exactly twice in the coal frame, in chronological order.
dup_date_ser = (
    coal_pdf['date']
    .value_counts()
    .sort_index(ascending=True)
    .loc[lambda counts: counts.eq(2)]
)
dup_date_ser.tail()

In [6]:
# Latest date that appears twice, as a plain `datetime.date`.
dup_date = dup_date_ser.index[-1].date()
dup_date

In [7]:
%python coal_pdf.set_index('date').loc[dup_date].tags

In [8]:
%python 
# Per-date sum of the text columns, then the combined tags for dup_date.
coal_pdf.groupby('date').sum().loc[dup_date, 'tags']

Problem above: summing the text columns per date concatenates the values with no separator between them. Fix below.

Put a space at the end of each value in the `tags`, `title` and `content` columns. See the code cell below.

In [10]:
%python
import numpy as np
import pandas as pd
# Reload the coal articles and append a trailing space to every `tags`,
# `content` and `title` value, so that a later per-date string sum keeps the
# individual entries separated.
coal_pdf = (
    pd.read_csv(
        '/dbfs/mnt/group-ma707/data/mining_com_coal.csv',
        encoding='ISO-8859-1',
    )
    .dropna(axis=0, subset=['tags', 'content', 'title'])
    .assign(
        date=lambda pdf: pd.to_datetime(pd.to_datetime(pdf.date).dt.date),
        # Concatenate on the column itself so the result keeps the frame's
        # index. A Series rebuilt from a list gets a fresh 0..n-1 index and,
        # once dropna has removed rows, `assign` would align it onto the wrong
        # rows and leave NaN in the rest.
        tags=lambda pdf: pdf.tags + ' ',
        content=lambda pdf: pdf.content + ' ',
        title=lambda pdf: pdf.title + ' ',
    )
    .loc[:, ['date', 'tags', 'title', 'content']]
    .sort_values('date', ascending=True)
)
coal_pdf.info()

In [11]:
%python coal_pdf.set_index('date').loc[dup_date].tags

In [12]:
%python 
# Per-date sum of the text columns, then the combined tags for dup_date.
coal_pdf.groupby('date').sum().loc[dup_date, 'tags']

In [13]:
%python
import numpy as np
import pandas as pd
# Reload the coal articles and collapse them to one row per calendar day by
# string-summing the space-terminated `tags`, `title` and `content` values.
coal_pdf = (
    pd.read_csv(
        '/dbfs/mnt/group-ma707/data/mining_com_coal.csv',
        encoding='ISO-8859-1',
    )
    .dropna(axis=0, subset=['tags', 'content', 'title'])
    .assign(
        date=lambda pdf: pd.to_datetime(pd.to_datetime(pdf.date).dt.date),
        # Concatenate on the column itself so the result keeps the frame's
        # index. A Series rebuilt from a list gets a fresh 0..n-1 index and,
        # once dropna has removed rows, `assign` would align it onto the wrong
        # rows and leave NaN in the rest.
        tags=lambda pdf: pdf.tags + ' ',
        content=lambda pdf: pdf.content + ' ',
        title=lambda pdf: pdf.title + ' ',
    )
    .loc[:, ['date', 'tags', 'title', 'content']]
    .groupby(by='date')
    .sum()
    .reset_index()
    .sort_values('date', ascending=True)
)
coal_pdf.info()

In [14]:
%python 
# Count of counts: how many dates occur once, twice, ... in the coal frame.
(
    coal_pdf['date']
    .value_counts()
    .sort_index(ascending=True)
    .value_counts()
)

In [15]:
# Number of coal rows per day of the week (0 = Monday ... 6 = Sunday).
coal_pdf['date'].dt.dayofweek.value_counts()

In [16]:
%python 
# Debug view: the ten earliest rows once the frame is indexed by date.
coal_pdf.set_index('date').sort_index().iloc[:10]

In [17]:
%python 
# Debug view: tags of the most recent date. `.iloc[-1]` selects by position
# explicitly; a bare `[-1]` on a date-indexed Series relies on the deprecated
# fallback that treats integer keys as positions.
coal_pdf.set_index('date').sort_index(ascending=True).tags.iloc[-1]

In [18]:
%python 
# Upsample to one row per calendar day, carrying the previous day's values
# forward into the gaps, and show the last 20 days.
# `ffill()` replaces `Resampler.pad()`, which newer pandas no longer provides.
coal_pdf.set_index('date').resample('D').ffill().sort_index(ascending=True).tail(20)

In [19]:
%python 
# After daily forward-filling, count rows per day of the week.
# `ffill()` replaces `Resampler.pad()`, which newer pandas no longer provides.
coal_pdf.set_index('date').resample('D').ffill().reset_index().date.dt.weekday.value_counts()

In [20]:
%python
import numpy as np
import pandas as pd
# Final coal frame: one row per calendar day. Text from the same day is
# string-summed, missing days are forward-filled from the previous day, and
# every column except `date` gets a `_coal` suffix.
coal_pdf = (
    pd.read_csv(
        '/dbfs/mnt/group-ma707/data/mining_com_coal.csv',
        encoding='ISO-8859-1',
    )
    .dropna(axis=0, subset=['tags', 'content', 'title'])
    .assign(
        date=lambda pdf: pd.to_datetime(pd.to_datetime(pdf.date).dt.date),
        # Concatenate on the column itself so the result keeps the frame's
        # index. A Series rebuilt from a list gets a fresh 0..n-1 index and,
        # once dropna has removed rows, `assign` would align it onto the wrong
        # rows and leave NaN in the rest.
        tags=lambda pdf: pdf.tags + ' ',
        content=lambda pdf: pdf.content + ' ',
        title=lambda pdf: pdf.title + ' ',
    )
    .loc[:, ['date', 'tags', 'title', 'content']]
    .groupby(by='date')
    .sum()
    .resample('D')
    # `ffill()` replaces `Resampler.pad()`, which newer pandas no longer provides.
    .ffill()
    .reset_index()
    .sort_values('date', ascending=True)
    .add_suffix('_coal')
    # add_suffix also renamed the key column; restore it for later joins.
    .rename(columns={"date_coal": "date"})
)
coal_pdf.info()

## 1.3 Iron ore dataset

In [22]:
%python
import numpy as np
import pandas as pd
# Iron ore frame: one row per calendar day. Missing text is replaced by empty
# strings, text from the same day is joined with single spaces, missing days
# are forward-filled, and every column except `date` gets an `_ore` suffix.
ore_pdf = (
    pd.read_csv(
        '/dbfs/mnt/group-ma707/data/mining_com_iron_ore.csv',
        encoding='ISO-8859-1',
    )
    .loc[:, ['date', 'tags', 'title', 'content']]
    .fillna({'tags': '', 'content': '', 'title': ''})
    # Parse as UTC, truncate to the day, then rebuild a tz-naive datetime column.
    .assign(date=lambda pdf: pd.to_datetime(
        pd.to_datetime(pdf.date, utc=True).dt.normalize().dt.date))
    .groupby(by='date')
    .agg({'tags':    lambda ser: ' '.join(ser),
          'content': lambda ser: ' '.join(ser),
          'title':   lambda ser: ' '.join(ser)})
    .resample('D')
    # `ffill()` replaces `Resampler.pad()`, which newer pandas no longer provides.
    .ffill()
    .reset_index()
    .add_suffix('_ore')
    # add_suffix also renamed the key column; restore it for later joins.
    .rename(columns={"date_ore": "date"})
)
# The first positional argument of info() is `verbose`; the original passed a
# truthy 10 there, so spell the same request out explicitly.
ore_pdf.info(verbose=True)

In [23]:
# Column names, dtypes and non-null counts of the coal frame.
coal_pdf.info()

In [24]:
# Column names, dtypes and non-null counts of the BCI frame.
bci_pdf.info()

In [25]:
# Column names, dtypes and non-null counts of the iron ore frame.
ore_pdf.info()

## 1.4 Initial dataset (TBD)

The "initial dataset" is created by merging the BCI and the two mining datasets.
Below, the coal and BCI datasets are merged to create the `bci_coal_pdf` dataframe.

In [27]:
%python
import pandas as pd
# Column-wise inner join of the BCI and coal frames on their shared dates;
# `date` is turned back into an ordinary column afterwards.
bci_coal_pdf = (
    pd.concat(
        [bci_pdf.set_index('date'), coal_pdf.set_index('date')],
        axis=1,
        join='inner',
    )
    .reset_index()
)
bci_coal_pdf.info()

## 1.5 Initial dataset (TBD)

The "initial dataset" is created by merging the BCI and the two mining datasets.
Below, the iron ore and BCI datasets are merged to create the `bci_ironore_pdf` dataframe.

In [29]:
%python
import pandas as pd
# Column-wise inner join of the BCI and iron ore frames on their shared dates;
# `date` is turned back into an ordinary column afterwards.
bci_ironore_pdf = (
    pd.concat(
        [bci_pdf.set_index('date'), ore_pdf.set_index('date')],
        axis=1,
        join='inner',
    )
    .reset_index()
)
bci_ironore_pdf.info()

## 1.6 Initial dataset (TBD)

The "initial dataset" is created by merging the BCI and the two mining datasets.
Below, the `bci_pdf`, `coal_pdf` and `ore_pdf` datasets are merged to create the `bci_dual_pdf` dataframe.

In [31]:
# Column-wise inner join of the BCI, iron ore and coal frames on their shared
# dates; `date` is turned back into an ordinary column afterwards.
bci_dual_pdf = (
    pd.concat(
        [
            bci_pdf.set_index('date'),
            ore_pdf.set_index('date'),
            coal_pdf.set_index('date'),
        ],
        axis=1,
        join='inner',
    )
    .reset_index()
)
bci_dual_pdf.info()