In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the /kaggle/input directory tree and print the full path of every file in it,
# so the available input files are visible in the cell output.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # Join the directory being visited with the file name to get a usable path.
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the competition's training and test tables into DataFrames.
# Both paths are relative to the notebook's working directory.
train = pd.read_csv("../input/tabular-playground-series-jan-2021/train.csv")
test = pd.read_csv("../input/tabular-playground-series-jan-2021/test.csv")

In [None]:
from itertools import combinations
from sklearn.feature_selection import mutual_info_regression
import pickle

class Create_Features:
    """Builds pairwise interaction features and quantile-bin features for DataFrames.

    State learned while fitting is stored on the instance so the same
    transformation can be replayed on another frame:

    * ``features_added`` -- names of accepted interaction features, formatted
      ``<left>_<op>_<right>`` where ``<op>`` is one of ``+``, ``x``, ``-``, ``/``.
    * ``feature_bins`` -- per-column bin edges returned by ``pd.qcut``.
    * ``bin_labels`` -- per-column numeric labels assigned to those bins.
    """

    # Operator token embedded in a feature name -> element-wise operation.
    # Insertion order is the order in which candidates are tried.
    _OPERATORS = {
        '+': lambda left, right: left + right,
        'x': lambda left, right: left * right,
        '-': lambda left, right: left - right,
        '/': lambda left, right: left / right,
    }

    def __init__(self):
        self.features_added = []
        self.feature_bins = {}
        self.bin_labels = {}

    def find_interaction_features(self, df, target, sample_size = 0.5, random_state = None):
        """Search all column pairs for interactions that out-score every raw column.

        Each pair of columns is combined with +, x, - and /. A candidate is kept
        when its mutual-information score (see ``score_feature``) is greater than
        the highest score obtained by any single column of ``df``.

        Args:
            df: DataFrame of numeric feature columns.
            target: Target values, indexable by ``df``'s index labels.
            sample_size: Fraction of rows sampled for each scoring call.
            random_state: Optional seed forwarded to ``score_feature`` so that
                repeated runs select the same features.

        Returns:
            ``df`` with the accepted interaction columns appended. The accepted
            names are also appended to ``self.features_added``.
        """
        feature_list = df.columns.tolist()

        print("Calculating Score Threshold...")
        mi = self.score_feature(df, target, sample_size, random_state)
        # A candidate has to beat the best-scoring original column.
        threshold = mi.max()
        print(f'Scores: {mi}, Score Threshold: {threshold}')
        print()

        # Collect accepted columns in a dict and build the frame once at the end,
        # instead of inserting into a DataFrame one column at a time.
        accepted = {}
        for left, right in combinations(feature_list, 2):
            for symbol, operation in self._OPERATORS.items():
                feature_name = f'{left}_{symbol}_{right}'
                print(f'Trying {feature_name}...', end = '\r')
                candidate = operation(df[left], df[right])

                # Division by zero (or overflow) yields inf/NaN, which the scorer
                # rejects with an exception; skip such candidates instead.
                if not np.isfinite(candidate.to_numpy()).all():
                    print(f'Skipped {feature_name}: contains non-finite values')
                    continue

                # score is a length-1 array because a single column is scored.
                score = self.score_feature(candidate, target, sample_size, random_state)
                if score > threshold:
                    accepted[feature_name] = candidate
                    print(f'Added {feature_name}: {score}')
                    self.features_added.append(feature_name)

        print(f'Finished -- Total Features Added: {len(self.features_added)}', end = '\r')
        return pd.concat([df, pd.DataFrame(accepted)], axis = 1)

    @classmethod
    def _split_interaction_name(cls, feature, columns):
        """Split ``<left>_<op>_<right>`` into its parts, tolerating '_' in column names.

        Every occurrence of each ``_<op>_`` separator is tried, and the first
        split whose two sides are both present in ``columns`` is returned.

        Raises:
            KeyError: if no split of ``feature`` names two existing columns.
        """
        for symbol in cls._OPERATORS:
            separator = f'_{symbol}_'
            position = feature.find(separator)
            while position != -1:
                left = feature[:position]
                right = feature[position + len(separator):]
                if left in columns and right in columns:
                    return left, symbol, right
                position = feature.find(separator, position + 1)
        raise KeyError(f'Cannot resolve interaction feature {feature!r} against the given columns')

    def interaction_transform(self, df):
        """Recreate every feature listed in ``self.features_added`` on ``df``.

        Args:
            df: DataFrame containing the source columns named in the features.

        Returns:
            ``df`` with one appended column per stored interaction feature.

        Raises:
            KeyError: if a stored feature refers to a column missing from ``df``.
        """
        rebuilt = {}
        for feature in self.features_added:
            left, symbol, right = self._split_interaction_name(feature, df.columns)
            rebuilt[feature] = self._OPERATORS[symbol](df[left], df[right])
        return pd.concat([df, pd.DataFrame(rebuilt)], axis = 1)

    def load_interactions(self, interactions, filepath, display_features = False):
        """Load a previously saved list of interaction feature names.

        Args:
            interactions: Unused; kept so existing callers keep working.
            filepath: Path of the pickle file written by ``save_interactions``.
            display_features: When True, print the loaded feature names.
        """
        # SECURITY: pickle.load can execute arbitrary code; only open files you trust.
        with open(filepath, 'rb') as fp:
            self.features_added = pickle.load(fp)
            print(f'Loaded interaction features. Please use interaction_transform() to apply interactions.')
        if display_features:
            print(f'Features Loaded: {self.features_added}')

    def save_interactions(self, filepath):
        """Pickle ``self.features_added`` to ``filepath``."""
        with open(filepath, 'wb') as fp:
            pickle.dump(self.features_added, fp)
            print(f'Features have been saved at {filepath}')

    def score_feature(self, feature, target, sample_size = 0.5, random_state = None):
        """Estimate mutual information between ``feature`` and ``target`` on a row sample.

        Args:
            feature: DataFrame (several columns) or Series (one column).
            target: Target values, indexed with the sampled rows' index labels.
            sample_size: Fraction of rows to sample before scoring.
            random_state: Optional seed used for both the row sample and the
                mutual-information estimator.

        Returns:
            Array of scores from ``mutual_info_regression``, one per column.
        """
        subset = feature.sample(frac = sample_size, random_state = random_state)
        sample_target = target[subset.index]
        if isinstance(subset, pd.DataFrame):
            return mutual_info_regression(subset, sample_target, random_state = random_state)
        # A Series is one feature; the estimator expects a 2-D (rows, 1) array.
        return mutual_info_regression(
            subset.values.reshape(-1,1), sample_target, random_state = random_state
        )

    @staticmethod
    def _resolve_features(df, features):
        """Turn the ``features`` argument into a concrete list of column names."""
        if isinstance(features, str):
            # 'all' selects every column; any other string names a single column.
            return df.columns.tolist() if features == 'all' else [features]
        return list(features)

    def bin_features(self, df,features = 'all', n_bins = 10):
        """
        Creates the binned features and stores the bins for later use

        Each selected column is cut into ``n_bins`` quantile bins with ``pd.qcut``.
        Bin ``i`` is labelled ``i / n_bins``. The edges and labels are stored in
        ``self.feature_bins`` / ``self.bin_labels`` for ``bin_transform``.

        Args:
            df: DataFrame holding the columns to bin.
            features: ``'all'`` for every column, or an iterable of column names.
            n_bins: Number of quantile bins per column.

        Returns:
            ``df`` with one ``<col>_bin`` float column appended per binned column.
        """
        features = self._resolve_features(df, features)
        binned_features = {}
        for col in features:
            self.bin_labels[col] = [i/n_bins for i in range(n_bins)]
            binned, self.feature_bins[col] = pd.qcut(
                df[col], q = n_bins, labels = self.bin_labels[col], retbins = True
            )
            binned_features[f'{col}_bin'] = binned.astype(float)

        bf = pd.DataFrame.from_dict(binned_features)
        return pd.concat([df, bf], axis = 1)

    def bin_transform(self, df, features = 'all'):
        """
        Uses the defined bins to bin another set of the same features

        Args:
            df: DataFrame holding columns that were previously passed to
                ``bin_features``.
            features: ``'all'`` for every column, or an iterable of column names.

        Returns:
            ``df`` with one ``<col>_bin`` float column appended per column. Values
            outside the stored edges become NaN.

        Raises:
            KeyError: if a selected column has no stored bins.
        """
        features = self._resolve_features(df, features)
        bf = {}
        for col in features:
            # Use this instance's own edges. include_lowest=True matches pd.qcut,
            # which puts the minimum value in the first bin rather than NaN.
            binned = pd.cut(
                df[col],
                bins = self.feature_bins[col],
                labels = self.bin_labels[col],
                include_lowest = True,
            )
            bf[f'{col}_bin'] = binned.astype(float)
        bf = pd.DataFrame.from_dict(bf)
        return pd.concat([df, bf], axis = 1)

    def flag_outliers(self, df):
        """Placeholder: not implemented, returns None."""
        pass

    def outlier_transform(self, df):
        """Placeholder: not implemented, returns None."""
        pass
            
    

In [None]:
# Positional split of the loaded tables into model inputs and target.
# NOTE(review): presumably the first column is the row id and the last training
# column is the target -- confirm against the CSV headers.
X = train.iloc[:,1:-1]   # every training column except the first and the last
y = train.iloc[:,-1]     # last training column
X_val = test.iloc[:,1:]  # every test column except the first

In [None]:
# Search for useful pairwise interaction features on a 10% row sample of the
# training data, then recreate the selected features on the test-set frame.
# NOTE: X is reassigned to the augmented frame, so re-running this cell without
# re-running the cell that defines X searches over the already-augmented columns.
cf = Create_Features()
X = cf.find_interaction_features(X, y, sample_size = 0.1)
X_val = cf.interaction_transform(X_val)

In [None]:
# Learn 500 quantile bins per column from the training and test rows together.
# The frame returned here is discarded; only the edges stored on `cf` are used.
cf.bin_features(pd.concat([X, X_val], axis = 0), n_bins = 500)

# Apply the stored bins to each frame and replace any resulting NaN with -1.
X = cf.bin_transform(X).fillna(-1)
X_val = cf.bin_transform(X_val).fillna(-1)

X_val.info()

In [None]:
# Display the column labels of the engineered training frame.
X.columns

In [None]:
# Columns removed from both frames before modelling: the fourteen raw `cont*`
# columns plus three interaction columns.
drop = [
    'cont1', 'cont2', 'cont3', 'cont4', 'cont5', 'cont6', 'cont7',
    'cont8', 'cont9', 'cont10', 'cont11', 'cont12', 'cont13', 'cont14',
    'cont3_+_cont12', 'cont4_x_cont11', 'cont7_x_cont12',
]
X = X.drop(columns = drop)
X_val = X_val.drop(columns = drop)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import joblib
import xgboost as xg

# Hold out a shuffled 25% of the training rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.25, shuffle = True
)

# Fit a gradient-boosted regressor with squared-error loss.
clf = xg.XGBRegressor(objective ='reg:squarederror', eta = .3)
clf.fit(X_train, y_train)

# Root-mean-squared error on the held-out rows (squared = False returns the root).
y_pred = clf.predict(X_test)
rms = mean_squared_error(y_test, y_pred, squared = False)
rms

In [None]:
# Display the fitted XGBoost model's per-feature importance scores.
clf.feature_importances_

In [None]:
# Build the submission table: the test ids alongside the XGBoost predictions.
submission = pd.DataFrame({
    'id': test['id'],
    'target': clf.predict(X_val),
})

# Write it to the working directory without the index column.
submission.to_csv("2021_01_27_xgb_allfeatures_500bins.csv", index = False)

# Voting Ensembles

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import mean_squared_error
import xgboost as xg
from sklearn.ensemble import VotingRegressor


# Hold out a shuffled 25% of the training rows for evaluating the ensemble.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, shuffle = True) 

# Regressors
r1 = ElasticNet(alpha = 1.5)
r2 = RandomForestRegressor(n_estimators = 150, max_depth = 4)
r3 = xg.XGBRegressor(objective ='reg:squarederror',eta = 0.1, gamma = 2, max_depth = 4, alpha = 0.5) 
r4 = HistGradientBoostingRegressor(learning_rate = 0.1, max_depth = 4, l2_regularization = 0.5)

# Equal-weight average of the four regressors' predictions.
er = VotingRegressor([
    ('lr', r1), 
    ('rf', r2),
    ('xg', r3),
    ('gb', r4),
], weights = [1, 1, 1, 1])

er.fit(X_train, y_train)
# Score the ensemble that was just fitted. Using `clf` here would report the
# earlier XGBoost model, which was trained on a different train/test split.
y_pred = er.predict(X_test)
rms = mean_squared_error(y_test, y_pred, squared = False)
rms

In [None]:
# Build the submission table: the test ids alongside the voting ensemble's predictions.
submission = pd.DataFrame({
    'id': test['id'],
    'target': er.predict(X_val),
})

# Write it to the working directory without the index column.
submission.to_csv("2021_01_27_voting_withMoarReg_allfeatures.csv", index = False)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
# Heatmap of which cells of X_val are +/-inf (e.g. produced by division by zero).
# pandas DataFrames have no `.isinf()` method, so numpy's element-wise test is
# used; it returns a boolean frame with the same shape as X_val.
plt.figure(figsize = (8,4))
sns.heatmap(np.isinf(X_val))
plt.show()

In [None]:
# Print a summary (columns, non-null counts, dtypes) of the training split.
X_train.info()