In [1]:
import time
import pandas as pd
import altair as alt
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import OneHotEncoder, PowerTransformer
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import TimeSeriesSplit

from xgboost import XGBRegressor

# import other functions
from imputer import *
from feature_eng import *
from drop import *


In [2]:
# Load the training data; pandas reads the CSV straight from the zip archive path.
df = pd.read_csv('../data/train_data.zip')

In [4]:
# Count the rows whose `unacast_session_count` value is null.
df['unacast_session_count'].isnull().sum()

610

In [21]:
# Display the (rows, columns) dimensions of `df`.
df.shape

(50100, 861)

In [22]:
# Keep only the rows at positions 100-1999 (all columns) — presumably a small
# working subset for faster experimentation; TODO confirm the bounds are intentional.
# NOTE(review): this rebinds `df` to a slice of itself, so re-running the cell
# slices the already-sliced frame; re-execute the load cell first for a clean run.
df = df.iloc[100:2000,:]

In [27]:
# Re-check the (rows, columns) dimensions of `df`.
df.shape

(1878, 861)

In [24]:
# Remove the rows that have no value for the target variable.
df = drop_missing_unacast(df)
# Temporary exclusion of a single external_id; remove once Sirine's PR is merged.
df = df.query("external_id != 'CA00070678'")

# Separate the predictors (X) from the target column (y).
X = df.drop(columns='unacast_session_count')
y = df['unacast_session_count']

# Hold out 20% of the rows for validation, with a fixed seed for reproducibility.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=2020
)

# Impute NaN values; the helper returns the processed train and validation
# frames at positions 0 and 1.
result = impute_data(X_train, X_valid)
X_train, X_valid = result[0], result[1]

# Feature engineering, applied to the training split first and then validation.
X_train, X_valid = comb_cols(X_train), comb_cols(X_valid)

# Column dropping, again training split first and then validation.
X_train, X_valid = drop_columns(X_train), drop_columns(X_valid)

In [25]:
# Encode the categorical columns of both splits; the helper's result is read
# back as (train, valid) at positions 0 and 1.
# NOTE(review): this notebook also defines `clean_categorical(input_data, categoricals)`
# in a later cell. This call passes two frames, so it presumably targets the
# version brought in by the star imports — confirm which definition is in scope.
X_train_valid = clean_categorical(X_train, X_valid)
X_train, X_valid = X_train_valid[0], X_train_valid[1]

In [28]:
# Display the (rows, columns) dimensions of the processed training features.
X_train.shape

(1502, 632)

In [None]:
# Walk through the folds produced by TimeSeriesSplit on the feature frame.
# This cell was a pasted REPL transcript (">>>" / "..." prompts plus an output
# line) and could not run; it is rewritten here as plain Python.
tscv = TimeSeriesSplit()
print(tscv)
for train_index, test_index in tscv.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    # split() yields positional indices, so select rows with .iloc; plain []
    # on a DataFrame/Series would be treated as a column/label lookup.
    # Fold-local names are used so the preprocessed X_train / y_train from the
    # earlier cells are not overwritten.
    X_fold_train, X_fold_test = X.iloc[train_index], X.iloc[test_index]
    y_fold_train, y_fold_test = y.iloc[train_index], y.iloc[test_index]


In [29]:
# Instantiate the time-series cross-validation splitter with its default settings.
tscv = TimeSeriesSplit()

In [30]:
# Print the splitter's repr to inspect its configuration.
print(tscv)

TimeSeriesSplit(max_train_size=None, n_splits=5)


In [15]:
def clean_categorical(input_data, categoricals=['income_class', 'density_class', 'climate']):
    """
    One-hot encode the given categorical columns of a dataframe.

    Each column listed in `categoricals` is replaced by one 0/1 integer
    indicator column per observed category, named ``<column>_<category>``.
    The remaining columns, the row order and the index are preserved, and
    `input_data` itself is not modified.

    Parameters
    ----------
    input_data : pandas.core.frame.DataFrame
        The dataframe containing the categorical columns.
    categoricals : list
        The names of the categorical variables on which we want to apply OHE.
        A single column name given as a string is also accepted.
        The default list is only read, never mutated.

    Returns
    -------
    output_data : pandas.core.frame.DataFrame
        A new dataframe with the non-encoded columns first, followed by the
        indicator columns of each encoded variable.

    Raises
    ------
    KeyError
        If a name in `categoricals` is not a column of `input_data`.

    """
    # Normalise the column selection to a list (a bare string would otherwise
    # be split into characters by list()).
    if isinstance(categoricals, str):
        encode_cols = [categoricals]
    else:
        encode_cols = list(categoricals)

    # Nothing to encode: return an independent copy, as callers expect a new frame.
    if not encode_cols:
        return input_data.copy()

    # get_dummies encodes every requested column at once, keeps the original
    # index (so no misalignment on shuffled/filtered frames) and removes the
    # source columns from the result.
    output_data = pd.get_dummies(input_data, columns=encode_cols, dtype=int)

    #Check that the number of rows is unchanged
    assert input_data.shape[0] == output_data.shape[0]

    #Check that none of the encoded columns is left in `output_data`
    assert not set(encode_cols) & set(output_data.columns)

    return output_data