In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pylab as plt
from tqdm.notebook import tqdm
import pickle
from sklearn.model_selection import GroupKFold
from scipy.stats import pearsonr
from sklearn import metrics
import lightgbm as lgb
from lightgbm import plot_importance
from sklearn.model_selection import train_test_split
import xgboost as xgb
#from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold 
import joblib
import warnings

warnings.filterwarnings("ignore")


In [None]:
# gpu only
import cudf
from cuml.linear_model import LinearRegression

In [None]:
# from https://www.kaggle.com/valleyzw/ubiquant-lgbm-baseline
def reduce_mem_usage(df):
    """Downcast the columns of ``df`` to smaller dtypes to reduce memory usage.

    The frame is modified column by column and also returned.

    * ``object`` columns become ``category``.
    * Signed / unsigned NumPy integer columns are cast to the smallest
      signed / unsigned integer type that holds their min and max.
    * NumPy float columns are cast to float16 or float32 when their range
      fits strictly inside that type's finite range, otherwise float64.
      NOTE: float16 keeps the range but not the precision of the values.
    * Every other dtype (bool, datetime, category, pandas extension dtypes)
      is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with downcast columns.
    """
    start_mem = df.memory_usage().sum() / 1024 ** 2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)

    for col in tqdm(df.columns):
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Dispatch on the NumPy kind code instead of the dtype's string name:
        # a name test such as str(dtype)[:3] == 'int' sends 'uint8' and 'bool'
        # down the float path and turns them into float16.
        kind = col_type.kind if isinstance(col_type, np.dtype) else None
        if kind not in ('i', 'u', 'f'):
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if kind in ('i', 'u'):
            candidates = signed_types if kind == 'i' else unsigned_types
            for candidate in candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the type's limit still fits.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
            df[col] = df[col].astype(np.float16)
        elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
            df[col] = df[col].astype(np.float32)
        else:
            df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024 ** 2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [None]:
import gc
# Column groups: the feature columns f_0 .. f_299 and the two id columns.
features = ['f_{}'.format(idx) for idx in range(300)]
add_feat = ['investment_id', 'time_id']

# Read only the columns used further down (features, ids and the target).
DATA_ = pd.read_parquet(
    '../input/ubiquant-parquet-low-mem/train_low_mem.parquet',
    columns=[*features, *add_feat, 'target'],
)
gc.collect()

In [None]:
# Remove rows that hold extreme feature values.
# A feature column flags rows when
#   * at least one value lies more than 70 standard deviations from the column mean, or
#   * otherwise, between 1 and 5 values lie more than 35 standard deviations away.
outlier_list = []
outlier_col = []

for col in (f"f_{i}" for i in range(300)):
    values = DATA_[col]
    _mean, _std = values.mean(), values.std()

    # Only the index labels are needed, so select on the single column
    # instead of materialising full-width copies of the matching rows.
    flagged = values.loc[(values > (_mean + _std * 70)) | (values < (_mean - _std * 70))].index
    if len(flagged) == 0:
        # The looser 35-sigma rule is evaluated only when the 70-sigma rule found nothing.
        flagged = values.loc[(values > (_mean + _std * 35)) | (values < (_mean - _std * 35))].index
        if not (0 < len(flagged) < 6):
            continue

    outlier_list.extend(flagged.to_list())
    outlier_col.append(col)
    print(col, len(flagged))

outlier_list = list(set(outlier_list))
# outlier_list holds index *labels*. Drop by label: feeding the labels through
# DATA_.index[...] would treat them as positions, which is only correct while
# the index is an untouched 0..n-1 range (e.g. not when this cell is re-run).
DATA_.drop(index=outlier_list, inplace=True)
print(len(outlier_col), len(outlier_list), DATA_.shape)
gc.collect()

In [None]:
# # load data
# features = [f'f_{i}' for i in range(300)] + ['investment_id'] 
# PATH = '../input/ubiquant-parquet-low-mem'
# DATA_ = reduce_mem_usage(pd.read_parquet(f'{PATH}/train_low_mem.parquet', columns=features+['target']))
# #DATA =reduce_mem_usage(pd.read_csv(PATH + '/train.csv', nrows=1141410))
# for col in ['investment_id']:
#     DATA_[col] = DATA_[col].astype(int)
# print ('data loading done ', len(DATA_))
# gc.collect()

In [None]:
# Shrink the dtypes of the loaded frame, then cast the two id columns
# back to the default integer type.
DATA_ = reduce_mem_usage(DATA_)
id_columns = ['investment_id', 'time_id']
for id_col in id_columns:
    DATA_[id_col] = DATA_[id_col].astype(int)
print ('data loading done ', len(DATA_))
gc.collect()

In [None]:
# Keep only the last `samples` rows of the frame; set `samples = None` to keep every row.
samples = 1500000
if samples is not None:
    DATA = DATA_[-samples:].reset_index(drop=True)
else:
    # Without this branch DATA would be undefined and the cells below would fail.
    DATA = DATA_.reset_index(drop=True)
gc.collect()
DATA.shape

In [None]:

# Positional split: the first 95% of the rows are used for training,
# the remaining 5% are held out as the test set.
model_columns = features + add_feat
data_slice = int(0.95 * len(DATA))
head_rows, tail_rows = DATA[:data_slice], DATA[data_slice:]

x_train, y_train = head_rows[model_columns], head_rows['target']
x_test, y_test = tail_rows[model_columns], tail_rows['target']

# Drop every reference to the full frame before collecting.
del DATA, head_rows, tail_rows
gc.collect()



In [None]:
class BasicModel(object):
    """Parent class of the base models used for stacking.

    Subclasses override ``train`` and ``predict``; ``get_oof`` drives them
    through a K-fold loop to build out-of-fold predictions.
    """

    def train(self, x_train, y_train, x_val, y_val, fold=None):
        """Fit a model on one fold and return ``(model, validation_score)``.

        ``fold`` is the fold index that ``get_oof`` passes as fifth argument.
        The base implementation does nothing and returns ``None``;
        subclasses must override it.
        """
        pass

    def predict(self, model, x_test, feature_importance=None):
        """Return ``(predictions, feature_importance)`` for ``x_test``.

        ``feature_importance`` is an optional running accumulator that an
        implementation may update. The base implementation does nothing and
        returns ``None``; subclasses must override it.
        """
        pass

    def get_oof(self, x_train, y_train, x_test, n_folds = 5):
        """K-fold stacking.

        Splits ``x_train`` with a shuffled ``StratifiedKFold`` stratified on
        its ``time_id`` column, trains one model per fold, and collects

        * the prediction for every training row from the fold in which that
          row was held out, and
        * the per-fold predictions for ``x_test``, averaged over the folds.

        A residual scatter plot and the Pearson correlation of each
        validation fold are shown along the way.

        Parameters
        ----------
        x_train, y_train : pandas objects indexable with ``.iloc``;
            ``x_train`` must contain a ``time_id`` column.
        x_test : test features, predicted by every fold model.
        n_folds : number of folds.

        Returns
        -------
        (oof_train, oof_test, feature_importance)
            ``oof_train`` has one value per training row, ``oof_test`` one
            per test row; ``feature_importance`` is the accumulator after it
            has been passed through ``predict`` once per fold.
        """
        num_train, num_test = x_train.shape[0], x_test.shape[0]
        oof_train = np.zeros((num_train,))
        oof_test_all_fold = np.zeros((num_test, n_folds))
        # Whatever score `train` returns for each fold (the log line below
        # still labels them "aucs").
        fold_scores = []

        skf = StratifiedKFold(n_splits = n_folds ,shuffle = True, random_state = 2022)

        # One accumulator slot per input column, so the list follows the
        # actual feature count instead of a hard-coded length.
        feature_importance = [0] * x_train.shape[1]

        for i, (train_index, val_index) in enumerate(skf.split(x_train, x_train['time_id'])):
            print('{0} fold, train {1}, val {2}'.format(i,
                                                        len(train_index),
                                                        len(val_index)))
            x_tra, y_tra = x_train.iloc[train_index], y_train.iloc[train_index]
            x_val, y_val = x_train.iloc[val_index], y_train.iloc[val_index]
            model, score = self.train(x_tra, y_tra, x_val, y_val, i)
            fold_scores.append(score)

            oof_train[val_index], feature_importance = self.predict(model, x_val, feature_importance)

            # Residuals (prediction - target) against the row position in the fold.
            plt.figure(figsize=(20, 7), facecolor='#f6f5f5')
            plt.scatter(oof_train[val_index]-y_val, np.arange(len(y_val)))
            plt.xlabel('prediction - target')
            plt.ylabel('validation row')
            plt.title('fold {0} residuals'.format(i))
            plt.show()

            print ('Pearsonr:',pearsonr(y_val, oof_train[val_index])[0])

            # No accumulator is passed here, so importances are counted once per fold.
            oof_test_all_fold[:, i], _ = self.predict(model, x_test)

            # Release the fold's data and model before the next fold is built.
            del train_index, val_index, x_tra, y_tra, x_val, y_val, model
            gc.collect()

        oof_test = np.mean(oof_test_all_fold, axis=1)
        print('all aucs {0}, average {1}'.format(fold_scores, np.mean(fold_scores)))

        gc.collect()
        return oof_train, oof_test, feature_importance

In [None]:
class LinearRegressor(BasicModel):
    """Linear-regression base model built on the imported ``LinearRegression``."""

    def __init__(self):
        """set parameters"""

    def train(self, x_train, y_train, x_val, y_val, fold):
        """Fit on the training part of a fold, dump the model to
        ``LinearRegression_<fold>.pkl`` and return ``(model, 0)``.

        The validation data is not used; the returned score is a constant 0.
        """
        print('train with linear model')
        estimator = LinearRegression(fit_intercept=True, normalize=False, algorithm="eig")
        estimator.fit(x_train, y_train)

        joblib.dump(estimator, f'LinearRegression_{fold}.pkl')

        # Drop the local references before collecting.
        del x_train, y_train, x_val, y_val
        gc.collect()

        return estimator, 0

    def predict(self, model, x_test, feature_importance=None):
        """Return ``(model.predict(x_test), feature_importance)``.

        ``feature_importance`` is handed back unchanged.
        """
        print('test with linear model')
        predictions = model.predict(x_test)
        return predictions, feature_importance

In [None]:
class RAPIDSModel:
    """Thin wrapper around a cuML random-forest regressor.

    A ``TargetEncoder`` is created as ``self.te`` but is not applied in
    ``fit`` or ``predict``.
    """

    def __init__(self):
        # Import the estimators here: the notebook's import cells only run
        # `from cuml.linear_model import LinearRegression`, which does not
        # bind the name `cuml`, so `cuml.<submodule>` lookups fail with a
        # NameError on a fresh kernel.
        from cuml.ensemble import RandomForestRegressor
        from cuml.preprocessing import TargetEncoder

        self.te = TargetEncoder()
        self.base_model = RandomForestRegressor(n_estimators=256, split_criterion="mse", bootstrap=True,
                                                max_samples=0.6, min_samples_leaf=64, max_features=0.6, n_bins=512)

    def fit(self, x_train, y_train):
        """Fit the forest on ``x_train`` / ``y_train`` and return ``self``."""
        # Target encoding of investment_id is disabled:
        #train_df["investment_te"] = self.te.fit_transform(x_train["investment_id"], y_train).astype("float32")
        self.base_model.fit(x_train, y_train)
        return self

    def predict(self, test_df):
        """Return the forest's predictions for ``test_df``."""
        # Target encoding of investment_id is disabled:
        #test_df["investment_te"] = self.te.transform(test_df["investment_id"]).astype("float32").get()
        return self.base_model.predict(test_df)

In [None]:
class RandomForest(BasicModel):
    """Random-forest base model backed by ``RAPIDSModel``."""

    def __init__(self):
        """set parameters"""

    def train(self, x_train, y_train, x_val, y_val, fold):
        """Fit a ``RAPIDSModel`` on the training part of a fold, dump it to
        ``randomForest_<fold>.pkl`` and return ``(model, 0)``.

        The validation data is not used; the returned score is a constant 0.
        """
        print('train with randomForest model')
        forest = RAPIDSModel()
        forest = forest.fit(x_train, y_train)

        joblib.dump(forest, f'randomForest_{fold}.pkl')

        # Drop the local references before collecting.
        del x_train, y_train, x_val, y_val
        gc.collect()

        return forest, 0

    def predict(self, model, x_test, feature_imprtance=None):
        """Return ``(model.predict(x_test), feature_imprtance)``.

        The third argument is handed back unchanged.
        """
        print('test with randomForest model')
        predictions = model.predict(x_test)
        return predictions, feature_imprtance

In [None]:
from sklearn.metrics import mean_squared_error
def rmse(y_true, y_pred):
    """Root of ``mean_squared_error(y_true, y_pred)``."""
    squared_error = mean_squared_error(y_true, y_pred)
    return np.sqrt(squared_error)

def feval_rmse(y_pred, lgb_train):
    """Custom evaluation function passed to ``lgb.train`` as ``feval``.

    Reads the labels from ``lgb_train`` via ``get_label()`` and returns the
    triple ``('rmse', rmse(labels, y_pred), False)``.
    NOTE(review): the trailing ``False`` is presumably LightGBM's
    "is higher better" flag — confirm against the LightGBM docs.
    """
    labels = lgb_train.get_label()
    score = rmse(labels, y_pred)
    return 'rmse', score, False

class LGBMRegressor(BasicModel):
    """LightGBM base model trained through the native ``lgb.train`` API."""

    def __init__(self):
        """Set the early-stopping patience, the log period and the booster parameters."""
        self.early_stopping_rounds = 50
        self.verbose = 100
        self.params = {
            'learning_rate':0.05,
            "objective": "regression",
            "metric": "rmse",
            'boosting_type': "gbdt",
            'verbosity': -1,
            'n_jobs': -1,
            'seed': 2017,
            'lambda_l1': 0.03627602394442367,
            'lambda_l2': 0.43523855951142926,
            'num_leaves': 114,
            'feature_fraction': 0.9505625064462319,
            'bagging_fraction': 0.9785558707339647,
            'bagging_freq': 7,
            'max_depth': -1,
            'max_bin': 501,
            'min_data_in_leaf': 374,
            'n_estimators': 1000,
            }

    def train(self, x_train, y_train, x_val, y_val,fold):
        """Train on one fold with early stopping on the validation set.

        The booster is dumped to ``LGBMRegressor_<fold>.pkl``.

        Returns
        -------
        (model, score)
            The trained booster and ``model.best_score['valid_1']['rmse']``.
        """
        print('train with lgb model')
        train_dataset = lgb.Dataset(x_train, y_train, categorical_feature=[])
        valid_dataset = lgb.Dataset(x_val, y_val, categorical_feature=[])

        # Early stopping and periodic logging are requested through callbacks:
        # the `early_stopping_rounds` / `verbose_eval` keyword arguments of
        # lgb.train were removed in LightGBM 4.0, while the callbacks are
        # available from LightGBM 3.3 on.
        model = lgb.train(
            self.params,
            train_set = train_dataset,
            valid_sets = [train_dataset, valid_dataset],
            feval = feval_rmse,
            callbacks = [
                lgb.early_stopping(self.early_stopping_rounds, verbose=bool(self.verbose)),
                lgb.log_evaluation(self.verbose),
            ],
        )
        joblib.dump(model, f'LGBMRegressor_{fold}.pkl')

        # Drop the local references (including the Datasets) before collecting.
        del x_train, y_train, x_val, y_val,train_dataset,valid_dataset
        gc.collect()
        return model, model.best_score['valid_1']['rmse']

    def predict(self, model, x_test, feature_importance=None):
        """Predict ``x_test`` with the booster's best iteration.

        When ``feature_importance`` is given, ``model.feature_importance()``
        is added to it element-wise (``zip`` stops at the shorter of the two).

        Returns
        -------
        (predictions, feature_importance)
        """
        print('test with lgb model')

        if feature_importance is not None:
            # Accumulate this model's feature importances.
            importance = model.feature_importance()
            feature_importance = [i + j for i, j in zip(feature_importance, importance)]

        return model.predict(x_test, num_iteration=model.best_iteration),feature_importance

In [None]:
from xgboost import plot_tree

def feat_dic2list (score_dict, feature_names=None):
    """Turn a ``{feature_name: score}`` mapping into a fixed-order list.

    Parameters
    ----------
    score_dict : dict
        Scores keyed by feature name; features may be missing from it.
    feature_names : sequence of str, optional
        Order of the output list. Defaults to ``f_0`` .. ``f_299`` followed
        by ``investment_id``.

    Returns
    -------
    list
        One score per entry of ``feature_names``, 0 for every feature that is
        absent from ``score_dict``. The length therefore never depends on
        which keys happen to be present; keys not listed in
        ``feature_names`` are ignored.
    """
    if feature_names is None:
        feature_names = [f'f_{i}' for i in range(300)] + ['investment_id']
    return [score_dict.get(name, 0) for name in feature_names]
        
    

class XGBRegressor(BasicModel):
    """XGBoost base model trained through the native ``xgb.train`` API."""

    def __init__(self):
        """Set the early-stopping patience, the log period and the parameters."""
        self.early_stopping_rounds = 10
        self.verbose = 1
        self.params = {
            'n_estimators':500,
            'learning_rate':0.05,
            'max_depth': 12,
            'subsample':0.9,
            'colsample_bytree':0.7,
            # colsample_bylevel=0.75,
            # NOTE(review): `missing` looks like a DMatrix / sklearn-wrapper
            # argument rather than a booster parameter — confirm whether it
            # has any effect when passed to xgb.train.
            'missing':-999,
            'random_state':1111,
            'tree_method':"gpu_hist"
         }

    def train(self, x_train, y_train, x_val, y_val, fold):
        """Train on one fold with early stopping on the validation set.

        The booster is dumped to ``XGBRegressor_<fold>.pkl``.

        Returns
        -------
        (model, score)
            The trained booster and the metric value parsed from
            ``model.eval`` on the validation data.
        """
        print('train with xgb model')

        # `n_estimators` belongs to the scikit-learn wrapper; xgb.train takes
        # the number of rounds as `num_boost_round` (default 10), so leaving
        # the value in the params dict caps training at 10 rounds.
        booster_params = dict(self.params)
        num_boost_round = booster_params.pop('n_estimators', 10)

        # Build each DMatrix once and reuse it for training, monitoring and scoring.
        dtrain = xgb.DMatrix(data=x_train, label=y_train)
        dvalid = xgb.DMatrix(data=x_val, label=y_val)

        model = xgb.train(
            booster_params,
            dtrain,
            num_boost_round=num_boost_round,
            evals=[(dtrain, 'train'), (dvalid, 'valid')],
            early_stopping_rounds=self.early_stopping_rounds,
            verbose_eval=self.verbose,
        )

        joblib.dump(model, f'XGBRegressor_{fold}.pkl')

        # model.eval returns a string such as '[0]\teval-rmse:0.123';
        # keep only the number after the colon.
        score = float(model.eval(dvalid).split()[1].split(':')[1])

        del dtrain, dvalid, x_train, y_train, x_val, y_val
        gc.collect()

        return model, score

    def predict(self, model, x_test, feature_importance= None):
        """Predict ``x_test`` with the booster.

        When ``feature_importance`` is given, the booster's ``weight``
        scores (converted to a list by ``feat_dic2list``) are added to it
        element-wise (``zip`` stops at the shorter of the two).

        Returns
        -------
        (predictions, feature_importance)
        """
        print('test with xgb model')
        xgbtest = xgb.DMatrix(x_test)
        ans = model.predict(xgbtest)

        if feature_importance is not None:
            # Accumulate this model's feature importances.
            importance = feat_dic2list(model.get_score(importance_type='weight'))
            feature_importance = [i + j for i, j in zip(feature_importance, importance)]

        return ans, feature_importance

In [None]:
# Base model 1: out-of-fold predictions from XGBoost.
# The regressor is used as a temporary, so no reference to it outlives the call.
xgb_oof_train, xgb_oof_test, feat_importance = XGBRegressor().get_oof(x_train, y_train, x_test)
# feat_importance_df_xgb = cudf.DataFrame({'importance':feat_importance,'var':features})
gc.collect()

In [None]:

# Base model 2: out-of-fold predictions from LightGBM.
# The regressor is used as a temporary, so no reference to it outlives the call.
lgb_oof_train, lgb_oof_test, feat_importance = LGBMRegressor().get_oof(x_train, y_train, x_test)
#feat_importance_df_lgb = cudf.DataFrame({'importance':feat_importance,'var':features})
gc.collect()


In [None]:
# feat_importance_df_lgb=feat_importance_df_lgb.sort_values(by='importance',ascending=True)
# feat_importance_df_xgb=feat_importance_df_xgb.sort_values(by='importance',ascending=True)
# idx1 = feat_importance_df_lgb[:100]['var'].to_arrow().to_pylist()
# idx2 = feat_importance_df_xgb[:100]['var'].to_arrow().to_pylist()
# x_train = x_train[list(set(idx1).union(set(idx2)))]
# x_test = x_test[list(set(idx1).union(set(idx2)))]
# del feat_importance_df_lgb,feat_importance_df_xgb
# gc.collect()

# list(set(idx1).union(set(idx2)))



In [None]:
# import cuml
# lr_LinearRegressor = LinearRegressor()
# lr_oof_train, lr_oof_test,_ = lr_LinearRegressor.get_oof(x_train, y_train, x_test)
# del lr_LinearRegressor
# gc.collect()

In [None]:
# import cuml
# rf_RandomForest = RandomForest()
# rf_oof_train, rf_oof_test,_ = rf_RandomForest.get_oof(x_train.astype('float32'), y_train, x_test.astype('float32'))
# del rf_RandomForest
# gc.collect()

In [None]:
# Inputs of the meta-model: one column per base model (XGBoost, LightGBM).
input_train = [xgb_oof_train, lgb_oof_train]
input_test = [xgb_oof_test, lgb_oof_test]

# Turn every 1-D prediction vector into a column and join the columns side by side.
train_columns = [vector.reshape(-1, 1) for vector in input_train]
test_columns = [vector.reshape(-1, 1) for vector in input_test]
stacked_train = np.concatenate(train_columns, axis=1)
stacked_test = np.concatenate(test_columns, axis=1)
del train_columns, test_columns
del xgb_oof_train, lgb_oof_train, xgb_oof_test, lgb_oof_test
gc.collect()


In [None]:
# Meta-model: a linear regression on the stacked base-model predictions.
final_model = LinearRegression()
final_model.fit(stacked_train, y_train)
test_prediction = final_model.predict(stacked_test)

# Residuals of the stacked prediction on the held-out rows
# (x axis: prediction - target, y axis: row position).
row_positions = np.arange(len(y_test))
plt.figure(figsize=(20, 7), facecolor='#f6f5f5')
plt.scatter(test_prediction-y_test, row_positions)
plt.show()

# NOTE(review): the file name contains an em dash ('—') after 'xgb', not a
# hyphen — confirm this is the name that downstream code loads.
joblib.dump(final_model, 'xgb—lgb-linearReges.pkl')