#MA707 Metrics & Models
%md ## References
- [sklearn.metrics](https://scikit-learn.org/stable/modules/classes.html#sklearn-metrics-metrics)
%md The text and links below are taken from the scikit-learn reference linked above:
- [metrics.explained_variance_score](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.explained_variance_score.html#sklearn.metrics.explained_variance_score): 
Explained variance regression score function
- [metrics.mean_absolute_error](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.mean_absolute_error.html#sklearn.metrics.mean_absolute_error): 
Mean absolute error regression loss
- [metrics.mean_squared_error](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.mean_squared_error.html#sklearn.metrics.mean_squared_error): 
Mean squared error regression loss
- [metrics.mean_squared_log_error](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.mean_squared_log_error.html#sklearn.metrics.mean_squared_log_error): 
Mean squared logarithmic error regression loss
- [metrics.median_absolute_error](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.median_absolute_error.html#sklearn.metrics.median_absolute_error): 
Median absolute error regression loss
- [metrics.r2_score](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.r2_score.html#sklearn.metrics.r2_score): 
R^2 (coefficient of determination) regression score function.
%run "../Report/0.2 Feature creation (inc)"
%python
import pandas as pd
# Load the 5TC series with its independent variables, expose the parsed
# `Date` column as `date`, and lower-case every column name.
bci_pdf = (
    pd.read_csv('/dbfs/mnt/group-ma707/data/5tc_plus_ind_vars.csv')
    # '~' is replaced so the column name is a plain identifier.
    .rename(columns={'P3A~IV': 'P3A_IV'})
    .assign(date=lambda raw: pd.to_datetime(raw['Date']))
    .drop(columns='Date')
    # NOTE(review): this orders by the row index produced by read_csv, not by
    # `date` -- confirm that is what was intended.
    .sort_index()
    .rename(columns=str.lower)
)
# Original author's note: 1602 non-null for all vars, same count as the index.
bci_pdf.info()
%python
import numpy as np
import pandas as pd
# Build a daily frame of mining.com coal article text: one row per calendar
# day, with all of that day's tags / content / title strings space-joined.
coal_pdf = (
    pd.read_csv('/dbfs/mnt/group-ma707/data/mining_com_coal.csv',
                encoding='ISO-8859-1')
    .loc[:, ['date', 'tags', 'title', 'content']]
    # Missing text becomes '' so the per-day string joins below never see NaN.
    .fillna({'tags': '', 'content': '', 'title': ''})
    # Truncate each timestamp to its calendar day (kept as datetime64).
    .assign(date=lambda pdf: pd.to_datetime(pd.to_datetime(pdf.date).dt.date))
    .groupby(by='date')
    .agg({'tags':    lambda ser: ' '.join(ser),
          'content': lambda ser: ' '.join(ser),
          'title':   lambda ser: ' '.join(ser)})
    # Upsample to one row per day; days without articles repeat the most
    # recent earlier day's text.  `Resampler.pad()` was deprecated in
    # pandas 1.4 and removed in 2.0 -- `ffill()` is the supported equivalent.
    .resample('D')
    .ffill()
    .reset_index()
)
coal_pdf.info()
%python
import numpy as np
import pandas as pd
# Build a daily frame of mining.com iron-ore article text: one row per
# calendar day, with that day's tags / content / title strings space-joined.
ore_pdf = (
    pd.read_csv('/dbfs/mnt/group-ma707/data/mining_com_iron_ore.csv',
                encoding='ISO-8859-1')
    .loc[:, ['date', 'tags', 'title', 'content']]
    # Missing text becomes '' so the per-day string joins below never see NaN.
    .fillna({'tags': '', 'content': '', 'title': ''})
    # Parse as UTC, then truncate each timestamp to its calendar day; the
    # outer to_datetime turns the date objects back into naive datetime64.
    .assign(date=lambda pdf: pd.to_datetime(
        pd.to_datetime(pdf.date, utc=True).dt.normalize().dt.date))
    .groupby(by='date')
    .agg({'tags':    lambda ser: ' '.join(ser),
          'content': lambda ser: ' '.join(ser),
          'title':   lambda ser: ' '.join(ser)})
    # Upsample to one row per day; days without articles repeat the most
    # recent earlier day's text.  `Resampler.pad()` was deprecated in
    # pandas 1.4 and removed in 2.0 -- `ffill()` is the supported equivalent.
    .resample('D')
    .ffill()
    .reset_index()
)
# The original passed a bare positional 10, which binds to `verbose` and is
# simply truthy; spell that out explicitly.
ore_pdf.info(verbose=True)
%python
import pandas as pd
# Column-wise inner join of the BCI frame and the daily coal-text frame on
# `date`: only dates present in both survive, and `date` is restored as a
# regular column afterwards.
bci_coal_pdf = pd.concat(
    [frame.set_index('date') for frame in (bci_pdf, coal_pdf)],
    axis=1,
    join='inner',
).reset_index()
bci_coal_pdf.info()
%python
import pandas as pd
# Column-wise inner join of the BCI frame and the daily iron-ore-text frame
# on `date`: only dates present in both survive, and `date` is restored as a
# regular column afterwards.
bci_ironore_pdf = pd.concat(
    [frame.set_index('date') for frame in (bci_pdf, ore_pdf)],
    axis=1,
    join='inner',
).reset_index()
bci_ironore_pdf.info()
%python
import pandas as pd
# Column-wise inner join, on `date`, of the BCI + coal frame with the daily
# iron-ore-text frame.
# NOTE(review): both inputs carry `tags`, `content` and `title` columns, so
# the result has duplicate column labels -- confirm downstream code copes.
# NOTE(review): `bci_dual_pdf` is rebound by the following cell, which
# concatenates bci_pdf, ore_pdf and coal_pdf directly.
bci_dual_pdf = pd.concat(
    [frame.set_index('date') for frame in (bci_coal_pdf, ore_pdf)],
    axis=1,
    join='inner',
).reset_index()
bci_dual_pdf.info()
%python
import pandas as pd
# Column-wise inner join, on `date`, of the BCI frame with the daily
# iron-ore-text and coal-text frames (in that column order).
# NOTE(review): ore_pdf and coal_pdf both carry `tags`, `content` and
# `title` columns, so the result has duplicate column labels -- confirm
# downstream code copes.
bci_dual_pdf = pd.concat(
    [frame.set_index('date') for frame in (bci_pdf, ore_pdf, coal_pdf)],
    axis=1,
    join='inner',
).reset_index()
bci_dual_pdf.info()

%python 
def get_count_vect_all_three_plus_all_ts_pipe():
  """Build the count-vectorizer feature pipeline over all three text columns.

  The pipeline has four steps, in order:

  1. ``fea_one``: a ``FeatureUnionDF`` of ``CreateTargetVarDF`` on
     ``bci_5tc``, ``CreateDatetimeVarsDF`` on ``date`` and ``CreateLagVarsDF``
     producing lag-3 versions of ``tags``, ``content`` and ``title``.
  2. ``drop_na_rows``: ``DropNaRowsDF(how='any')``.
  3. ``fea_two``: a ``FeatureUnionDF`` of ``CreateNamedVarsDF`` (excluding the
     three lagged text columns) and one ``CountVectColDF`` per lagged text
     column, each with its own ``cnt_<col>_`` prefix.
  4. ``drop_na_rows_again``: ``DropNaRowsDF(how='any')``.

  The ``*DF`` transformer classes are not defined in this cell; presumably
  they come from the ``%run`` include earlier in the notebook -- TODO confirm.

  Returns:
    A new, unfitted ``sklearn.pipeline.Pipeline``.
  """
  from sklearn.pipeline import Pipeline

  # Single source of truth for the text columns and the lag, so the lagged
  # names used by the second feature union cannot drift from the first.
  text_cols = ['tags', 'content', 'title']
  lag = 3
  lagged_cols = ['{}_lag{}'.format(col, lag) for col in text_cols]

  first_union = FeatureUnionDF(transformer_list=[
    ('tgt_var', CreateTargetVarDF(var='bci_5tc')),
    ('dt_vars', CreateDatetimeVarsDF(var='date')),
    ('lag_txt_vars', CreateLagVarsDF(var_list=text_cols, lag_list=[lag])),
  ])

  # One count vectorizer per lagged text column: cnt_tags, cnt_content,
  # cnt_title (same order as text_cols).
  count_vectorizers = [
    ('cnt_' + col,
     CountVectColDF(col_name=lagged,
                    prefix='cnt_' + col + '_',
                    add_stop_words=[]))
    for col, lagged in zip(text_cols, lagged_cols)
  ]
  second_union = FeatureUnionDF(transformer_list=[
    ('named_vars', CreateNamedVarsDF(except_list=lagged_cols)),
  ] + count_vectorizers)

  return Pipeline(steps=[
    ('fea_one', first_union),
    ('drop_na_rows', DropNaRowsDF(how='any')),
    ('fea_two', second_union),
    ('drop_na_rows_again', DropNaRowsDF(how='any')),
  ])
%md Ask about this error. Creating new feature/target pipelines is simple once we determine why this isn't working, because it is just a matter of changing what we count and what we look at in terms of lagged variables, TF-IDF vectorizer, etc.
# Fit the count-vectorizer pipeline on the BCI + coal frame and transform
# that same frame.  Left as the final bare expression so the notebook
# displays the result.
(
    get_count_vect_all_three_plus_all_ts_pipe()
    .fit(bci_coal_pdf)
    .transform(bci_coal_pdf)
)