In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

#import gc
import matplotlib.pyplot as plt
import seaborn as sns



from sklearn.metrics import mean_squared_error
import lightgbm as lgb
import xgboost as xgb

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the competition's train and test tables from the Kaggle input directory.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/santander-value-prediction-challenge/train.csv',
        '/kaggle/input/santander-value-prediction-challenge/test.csv',
    )
)

# Exploratory Data Analysis (EDA)

In [None]:
# Report (rows, columns) for both tables.
train_shape = train_df.shape
test_shape = test_df.shape
print('Shape of train data', train_shape)
print('Shape of test data', test_shape)

In [None]:
# Total number of missing cells in each table (summed over every column).
train_null_total = train_df.isnull().sum().sum()
test_null_total = test_df.isnull().sum().sum()

print('num of null values in train set', train_null_total)
print('num of null values in test set', test_null_total)

In [None]:
# Preview the first ten training rows.
train_df.head(n=10)

In [None]:
# Preview the last ten training rows.
train_df.tail(n=10)

In [None]:
# One row per training column: its name (under "Count") and its dtype.
dtype_df = train_df.dtypes.rename_axis("Count").reset_index(name="Column Type")

# Tally how many columns share each dtype.
dtype_df.groupby("Column Type").count().reset_index()

In [None]:
# Histogram of the raw target values (counts only, no density estimate).
fig, ax = plt.subplots(figsize=(12, 8))
sns.distplot(train_df["target"].values, bins=50, kde=False, ax=ax)
ax.set_xlabel('Target', fontsize=12)
ax.set_title("Target Histogram", fontsize=14)


In [None]:
# Plot log(1 + target) rather than log(target): log1p is defined at zero
# (log1p(0) == 0) and stays accurate for very small values.
log_target = np.log1p(train_df["target"].values)

fig, ax = plt.subplots(figsize=(12, 8))
sns.distplot(log_target, bins=50, kde=False, ax=ax)
ax.set_xlabel('Target', fontsize=12)
ax.set_title("Log of Target Histogram", fontsize=14)


In [None]:
# Identify feature columns whose standard deviation over the training rows is
# exactly zero (i.e. constant columns); 'ID' and 'target' are never considered.
feature_cols = [name for name in train_df.columns if name not in ('ID', 'target')]
colsToRemove = [name for name in feature_cols if train_df[name].std() == 0]

# Drop the same columns from both tables so their feature sets stay aligned.
train_df = train_df.drop(columns=colsToRemove)
test_df = test_df.drop(columns=colsToRemove)

print(f"Removed `{len(colsToRemove)}` Constant Columns\n")


In [None]:
# check for duplicate columns

def duplicate_columns(df):
    """Return the names of columns that have an identical copy further right.

    Each column is compared with every column to its right. As soon as one
    identical column is found, the name of the *left* column is recorded and
    the scan moves on to the next column. Dropping the returned names therefore
    keeps the right-most member of every group of identical columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are compared pairwise.

    Returns
    -------
    list
        Column labels (in column order) that duplicate some later column.
        Empty when there are no duplicates or the frame has no columns.
    """
    dups = []
    columns = df.columns
    n_cols = len(columns)

    # Extract each column once instead of re-slicing inside the inner loop.
    series_by_pos = [df.iloc[:, pos] for pos in range(n_cols)]

    for i in range(n_cols):
        col1 = series_by_pos[i]
        for j in range(i + 1, n_cols):
            col2 = series_by_pos[j]
            # A dtype mismatch only rules out THIS pair. Keep scanning the
            # remaining columns (a `break` here would silently miss duplicates
            # located after a differently-typed column). Compare dtypes by
            # equality, not identity, so equal extension dtypes also match.
            if col1.dtype != col2.dtype:
                continue
            # Same dtype: compare the actual values.
            if col1.equals(col2):
                dups.append(columns[i])
                break
    return dups


# Names of training columns that are exact copies of another column.
train_dups = duplicate_columns(train_df)
n_train_dups = len(train_dups)
print('num of duplicated cols in the train set: ', n_train_dups)



In [None]:
# Remove the duplicated columns from both tables (de-duplicated name list).
useless_features = list(set(train_dups))

train_df = train_df.drop(columns=useless_features)
test_df = test_df.drop(columns=useless_features)

In [None]:
# Feature matrix: every training column except the row identifier and the label.
# Pass the column labels directly rather than a DataFrame slice, which only
# worked through implicit iteration over its column names.
x = train_df.drop(columns=['ID', 'target'])

# Train on log(1 + target); predictions are mapped back with expm1 later.
y = np.log1p(train_df["target"])

# Test features: same layout, minus the identifier column.
X_test = test_df.drop(columns=["ID"])


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the training rows for validation; fixed seed for repeatability.
X_train, X_valid, y_train, y_valid = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=22,
)

# Create and train the XGBoost model

In [None]:
def run_xgb(train_X, train_y, val_X, val_y, test_X):
    params = {'objective': 'reg:linear', 
          'eval_metric': 'rmse',
          'eta': 0.005,
          'max_depth': 15, 
          'subsample': 0.7, 
          'colsample_bytree': 0.5,
          'alpha':0,
          'random_state': 42, 
          'silent': True}
    
    tr_data = xgb.DMatrix(X_train, y_train)
    va_data = xgb.DMatrix(X_valid, y_valid)
    
    watchlist = [(tr_data, 'train'), (va_data, 'valid')]
    
    model_xgb = xgb.train(params, tr_data, 2000, watchlist, maximize=False, early_stopping_rounds = 30, verbose_eval=100)
    
    dtest = xgb.DMatrix(test_X)
    xgb_pred_y = np.expm1(model_xgb.predict(dtest, ntree_limit=model_xgb.best_ntree_limit))
    
    return xgb_pred_y, model_xgb

In [None]:
pred_test_xgb, model_xgb = run_xgb(X_train, X_valid, y_train, y_valid, X_test)
print("XGB Training Completed...")

# Submit to the competition


In [None]:
# Load the sample submission from the same absolute input directory used for
# train.csv / test.csv, so the read does not depend on the working directory.
sub = pd.read_csv('/kaggle/input/santander-value-prediction-challenge/sample_submission.csv')

# Overwrite the placeholder targets with the XGBoost predictions.
# NOTE(review): assumes the sample submission rows are in the same order as
# test.csv. Confirm against the competition data.
sub["target"] = pred_test_xgb
print(sub.head())
sub.to_csv('sub_lgb_xgb.csv', index=False)