In [None]:
import category_encoders
import numpy as np
import pandas as pd
import pandas_profiling
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, cross_val_predict, GridSearchCV, cross_validate
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler, StandardScaler

# 1 Read data
- read from csv, json or database into dataframes
- create pandas profiles

In [None]:
# Template placeholders: every reader below is called without a source argument,
# so a path / buffer has to be filled in before this cell can run.
df1 = pd.read_csv()
# NOTE(review): unpickling can execute arbitrary code - only load pickle files from a trusted source.
df2 = pd.read_pickle()
# NOTE(review): df3 is not referenced by any later cell in this notebook.
df3 = pd.read_json()

In [None]:
# Check the data profile and decide how to treat categoricals, missing values, ...
# NOTE(review): only the last expression of a cell is rendered, so this summary
# is computed but not displayed here.
df1.describe()
# Build the profiling report for df1 (presumably the DataFrame accessor
# registered by the pandas_profiling import) and write it out as HTML.
profile = df1.profile_report(title='df1 report')
profile.to_file(output_file="df1_profile.html")

# 2 Clean data
For each dataframe:
- handle categoricals
- remove unwanted rows 
- remove unwanted columns

In [None]:
# Collapse unwanted categories into a single 'others' bucket
# (repeat separately for each categorical column).
df1.loc[~df1.col1.isin(['a', 'b', 'c']), 'col1'] = 'others'

# Convert to pandas categorical first, then replace each value by its integer code.
# Note: missing values receive the code -1.
categorical_columns = ['col1', 'col2']
for col in categorical_columns:
    # Read from df1 itself (the original referenced an undefined name `fd1`).
    df1[col] = df1[col].astype('category').cat.codes

# Use any other encoding method (e.g. one-hot encoding or hashing) if needed.
selected_columns = ['col1']
ohe = category_encoders.OneHotEncoder(cols=selected_columns)
df1 = ohe.fit_transform(df1)

# 3 Feature engineering
- merge dataframes &rarr; `data`
- handle missing values
- add extra features &rarr; `features`

In [None]:
# Left join: keep every row of df1 and attach the matching df2 columns.
# NOTE(review): 'col1' is one-hot encoded in the cleaning step; presumably the
# encoder replaces it with indicator columns - confirm the join key still exists.
data = df1.merge(df2, how='left', on='col1')

In [None]:
# Remove rows or columns with lots of missing values first (if needed),
# then impute what is left with the column mean (or another statistic).
selected_columns = ['col1', 'col2']
for col in selected_columns:
    # Index with the loop variable: `data.col` would look up a column literally
    # named "col". The result is assigned back rather than filled in place,
    # because an in-place fill on a selected column is not guaranteed to reach
    # the parent frame.
    data[col] = data[col].fillna(data[col].mean())

In [None]:
# Derive extra features on a copy so that `data` itself stays untouched.
features = data.copy()
# Ratio of the two columns, computed element-wise.
features['new_col'] = features['col1'].div(features['col2'])

In [None]:
# Drop every row that still contains a missing value, then renumber the index from 0.
features = features.dropna()
features = features.reset_index(drop=True)

# 4 Split train-test
- output: `train` `test`

In [None]:
# Hold out 20% of the rows for the final evaluation.
# A fixed random_state makes the split (and every downstream score)
# reproducible across kernel restarts.
train, test = train_test_split(features, test_size=0.2, random_state=42)

# 5 Train models
For each model, define a new function in the following style (pay attention to normalization):
```python
# pseudocode
def train_pipeline(train, model, scaler):
    """Cross-validate and fit a scaler + estimator pipeline on the training frame.

    Parameters
    ----------
    train : pandas.DataFrame
        Training data holding both the feature columns and the label column.
    model : estimator
        Unfitted scikit-learn style estimator used as the final pipeline step.
    scaler : transformer or None
        Scaling step placed in front of the estimator; None skips scaling.

    Returns
    -------
    pipeline : sklearn.pipeline.Pipeline
        Pipeline (scaler + model) fitted on the whole training frame, so that
        `predict` applies the same scaling that was used during training.
    train_pred_cv : array-like
        5-fold out-of-fold predictions for the training rows.
    """
    train_features = [...]  # placeholder: list of feature column names
    label = ...  # placeholder: name of the target column
    xtrain = train[train_features]
    ytrain = train[label]
    pipeline = Pipeline([('transformer', scaler), ('estimator', model)])
    print('cv score: ', cross_val_score(pipeline, xtrain, ytrain, cv=5).mean())
    # Predict with the full pipeline so the scaler is re-fitted inside every
    # fold and the predictions match the score printed above.
    train_pred_cv = cross_val_predict(pipeline, xtrain, ytrain, cv=5)
    pipeline.fit(xtrain, ytrain)
    return pipeline, train_pred_cv

```

In [None]:
# Train the base models; each call returns the fitted model together with its
# cross-validated predictions on the training rows.
# NOTE(review): train_pipeline is only shown inside a markdown cell in this
# notebook - it has to be defined in a code cell before this cell can run.
model1, train_predictions_1 = train_pipeline(train, model=RandomForestRegressor(), scaler=RobustScaler())
model2, train_predictions_2 = train_pipeline(train, model=KNeighborsRegressor(), scaler=StandardScaler())

# 6 Stack models

In [None]:
# Training frame for the stacking model: one column per base model holding its
# cross-validated predictions, plus the true label.
# The key is the string 'label' (the bare name `label` is not defined at
# notebook scope), matching the `train.label` column that was already read here.
stacked_features = pd.DataFrame(
    {'model1': train_predictions_1, 'model2': train_predictions_2, 'label': train['label']}
)

In [None]:

# Fit the meta-model on the base models' predictions; no scaling is applied.
stacked_model, train_pred_stack = train_pipeline(stacked_features, LinearRegression(), scaler=None)

# 7 Evaluate on test

In [None]:
# NOTE(review): xtest is not defined in any visible cell. stacked_model is
# trained on a frame with 'model1' / 'model2' prediction columns, so presumably
# xtest must hold the base models' predictions for the `test` split - confirm.
test_pred = stacked_model.predict(xtest)

In [None]:
# R^2 of the stacked predictions on the held-out rows.
# NOTE(review): ytest is not defined in any visible cell; presumably it is the
# label column of `test` - confirm.
r2_score(ytest, test_pred)