In [None]:
import numpy as np 
import pandas as pd
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import mutual_info_regression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
import seaborn as sns
import matplotlib.pyplot as plt
import lightgbm as lgb

# Set Matplotlib defaults
# The seaborn style sheets were renamed with a "seaborn-v0_8-" prefix in newer
# Matplotlib releases and the old names were later removed, so use whichever
# name this installation provides (newest name first). If neither exists the
# default style is kept, since styling is purely cosmetic.
for _style_name in ("seaborn-v0_8-whitegrid", "seaborn-whitegrid"):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break
plt.rc("figure", autolayout=True)
plt.rc(
    "axes",
    labelweight="bold",
    labelsize="large",
    titleweight="bold",
    titlesize=14,
    titlepad=10,
)

In [None]:
def cal_mi_score(X,y):
    """Estimate the mutual information between every feature in ``X`` and ``y``.

    Works on a copy of ``X``: the ``id`` column is dropped when present, every
    object-dtype column is replaced by its integer factorization codes, and all
    integer-dtype columns are flagged as discrete for
    ``mutual_info_regression``.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table. The caller's frame is not modified.
    y : array-like
        Target values, passed straight to ``mutual_info_regression``.

    Returns
    -------
    pandas.Series
        Scores named "MI Scores", indexed by feature name, sorted in
        descending order.
    """
    X = X.copy()
    # Tolerate frames that have no `id` column instead of raising KeyError.
    X = X.drop(columns=['id'], errors='ignore')
    for col in X.select_dtypes(['object']):
        X[col], _ = X[col].factorize()
    # `dtype == int` only matches the platform's default integer width, so
    # factorized or narrower integer columns could be treated as continuous.
    # Check for "any integer dtype" instead.
    discrete_features = np.array(
        [pd.api.types.is_integer_dtype(dtype) for dtype in X.dtypes], dtype=bool
    )
    mi_scores = mutual_info_regression(X,y,discrete_features=discrete_features, random_state=42)
    mi_scores = pd.Series(mi_scores, name="MI Scores", index=X.columns)
    return mi_scores.sort_values(ascending=False)

def plot_mi_scores(scores):
    """Draw a horizontal bar chart of mutual-information scores.

    ``scores`` is a pandas Series indexed by feature name. Bars are drawn in
    ascending score order, so the highest-scoring feature ends up at the top
    of the chart.
    """
    ordered = scores.sort_values(ascending=True)
    bar_positions = np.arange(len(ordered))
    feature_labels = list(ordered.index)

    plt.figure(figsize=(10,7))
    plt.barh(bar_positions, ordered)
    plt.yticks(bar_positions, feature_labels)
    plt.title("Mutual Information Scores")
    
# Load the raw training data and split the target column off the features.
df_eda = pd.read_csv("../input/30-days-of-ml/train.csv")
target = df_eda.pop('target')

# Rank every feature by its mutual information with the target and chart it.
scores = cal_mi_score(X=df_eda, y=target)
plot_mi_scores(scores=scores)

### Why use mutual information to find relationships between the features and the target?
Ans: Mutual information can detect any kind of relationship between a feature and the target, whereas correlation only detects
     linear relationships.
     
 **You can see that `cont12` and `cont10` are more related to target than other features**

# ***Blending + Stacking***
### - **I used 5 models with different hyperparameters and features for blending**
>     - Ordinal encoder + standardization
>     - Ordinal encoder + standardization (diff hyperparameters)
>     - target encoding + standardization 
>     - one hot encoding + standardization
>     - LGBMRegressor

In [None]:
# Fold assignments for the training rows, the raw test set and the submission template.
df = pd.read_csv("../input/30days-folds/train_folds.csv")
df_test = pd.read_csv("../input/30-days-of-ml/test.csv")
sample_submission = pd.read_csv("../input/30-days-of-ml/sample_submission.csv")

all_test_df = []

# Attach the predictions of base models 1..5 as extra columns, joined on `id`:
# the train-side files go onto `df`, the test-side files onto `df_test`.
for n in range(1,6):
    data = pd.read_csv(f"../input/custompreddata/train_pred_{n}.csv")
    df = df.merge(data, on="id", how="left")

    data = pd.read_csv(f'../input/custompreddata/test_pred_{n}.csv')
    df_test = df_test.merge(data, on="id", how="left")


### - **I used 3 different models for stacking**
>     - XGBRegressor
>     - RandomForestRegressor
>     - LGBMRegressor

In [None]:
# Meta model 1: XGBRegressor trained on the base-model prediction columns.
# Writes out-of-fold train predictions and fold-averaged test predictions
# under the column name `pred_4`.

useful_features = [col for col in df.columns if col.startswith('pred')]
df_test = df_test[useful_features]

# The hyperparameters are the same for every fold, so define them once.
params = {'learning_rate': 0.07803392035787837,
          'reg_lambda': 1.7549293092194938e-05,
          'reg_alpha': 20.68267919457715,
          'subsample': 0.8031450486786944,
          'colsample_bytree': 0.170759104940733,
          'max_depth': 3}

final_test_predictions = []
final_valid_predictions = {}
scores = []
for fold in range(5):
    x_train = df[df.kfold != fold].reset_index(drop=True)
    x_valid = df[df.kfold == fold].reset_index(drop=True)
    x_test = df_test.copy()

    valid_ids = x_valid.id.values.tolist()

    y_train = x_train.target
    y_valid = x_valid.target

    x_train = x_train[useful_features]
    x_valid = x_valid[useful_features]

    model = XGBRegressor(random_state = fold,
#                          tree_method="gpu_hist",
#                          gpu_id=0,
#                          predictor="gpu_predictor",
                         n_estimators=5000,
                         n_jobs = -1,
                         **params)
    # NOTE(review): newer xgboost releases expect `early_stopping_rounds` on
    # the constructor rather than on fit() -- confirm against the installed version.
    model.fit(x_train,
              y_train,
              early_stopping_rounds=300,
              eval_set=[(x_valid, y_valid)],
              verbose=1000,
             )

    preds_valid = model.predict(x_valid)
    preds_test = model.predict(x_test)
    final_test_predictions.append(preds_test)
    final_valid_predictions.update(dict(zip(valid_ids,preds_valid)))
    # sqrt(MSE) gives the RMSE without relying on the `squared=` keyword,
    # which is not available in every scikit-learn version.
    rmse = np.sqrt(mean_squared_error(y_valid, preds_valid))
    scores.append(rmse)
    print(fold,rmse)

print(np.mean(scores))

final_valid_predictions = pd.DataFrame.from_dict(final_valid_predictions, orient="index").reset_index()
final_valid_predictions.columns = ["id","pred_4"]
final_valid_predictions.to_csv("level1_train_pred_4.csv", index=False)

# Build the test-prediction frame separately instead of renaming the columns
# of `sample_submission`: once its `target` column is renamed, a later
# `sample_submission.target = ...` only sets a Python attribute and the saved
# column silently keeps stale values.
# Assumes sample_submission rows are in the same order as df_test, as the
# original code did -- TODO confirm.
test_predictions = pd.DataFrame({
    "id": sample_submission["id"].values,
    "pred_4": np.mean(np.column_stack(final_test_predictions), axis=1),
})
test_predictions.to_csv('level1_test_pred_4.csv', index=False)

In [None]:
# Meta model 2: RandomForestRegressor trained on the base-model prediction
# columns. Writes out-of-fold train predictions and fold-averaged test
# predictions under the column name `pred_5`.

useful_features = [col for col in df.columns if col.startswith('pred')]
df_test = df_test[useful_features]

final_test_predictions = []
final_valid_predictions = {}
scores = []
for fold in range(5):
    x_train = df[df.kfold != fold].reset_index(drop=True)
    x_valid = df[df.kfold == fold].reset_index(drop=True)
    x_test = df_test.copy()

    valid_ids = x_valid.id.values.tolist()

    y_train = x_train.target
    y_valid = x_valid.target

    x_train = x_train[useful_features]
    x_valid = x_valid[useful_features]

    model = RandomForestRegressor(n_estimators=500, max_depth=3, n_jobs=-1,random_state=42)
    model.fit(x_train, y_train)

    preds_valid = model.predict(x_valid)
    preds_test = model.predict(x_test)
    final_test_predictions.append(preds_test)
    final_valid_predictions.update(dict(zip(valid_ids,preds_valid)))
    # sqrt(MSE) gives the RMSE without relying on the `squared=` keyword,
    # which is not available in every scikit-learn version.
    rmse = np.sqrt(mean_squared_error(y_valid, preds_valid))
    scores.append(rmse)
    print(fold,rmse)

print(np.mean(scores))

final_valid_predictions = pd.DataFrame.from_dict(final_valid_predictions, orient="index").reset_index()
final_valid_predictions.columns = ["id","pred_5"]
final_valid_predictions.to_csv("level1_train_pred_5.csv", index=False)

# Build the output frame explicitly rather than assigning
# `sample_submission.target`: if the `target` column has already been renamed
# by an earlier cell, attribute assignment only sets a Python attribute and
# the saved column would hold the previous model's predictions.
# Assumes sample_submission rows are in the same order as df_test, as the
# original code did -- TODO confirm.
test_predictions = pd.DataFrame({
    "id": sample_submission["id"].values,
    "pred_5": np.mean(np.column_stack(final_test_predictions), axis=1),
})
test_predictions.to_csv('level1_test_pred_5.csv', index=False)

In [None]:
# Meta model 3: LGBMRegressor trained on the base-model prediction columns.
# Writes out-of-fold train predictions and fold-averaged test predictions
# under the column name `pred_6`.

useful_features = [col for col in df.columns if col.startswith('pred')]
df_test = df_test[useful_features]

# The hyperparameters are the same for every fold, so define them once.
# NOTE(review): LightGBM appears to apply `subsample` only when
# `subsample_freq` > 0, so this value may have no effect -- confirm.
params = {
    'max_depth':3,
    'colsample_bytree': 0.4,
    'learning_rate': 0.1,
    'min_child_weight': 1,
    'reg_alpha': 10.0,
    'reg_lambda': 1.0,
    'subsample': 0.7266579209776919,
    'random_state': 42
}

final_test_predictions = []
final_valid_predictions = {}
scores = []
for fold in range(5):
    x_train = df[df.kfold != fold].reset_index(drop=True)
    x_valid = df[df.kfold == fold].reset_index(drop=True)
    x_test = df_test.copy()

    valid_ids = x_valid.id.values.tolist()

    y_train = x_train.target
    y_valid = x_valid.target

    x_train = x_train[useful_features]
    x_valid = x_valid[useful_features]

    model = lgb.LGBMRegressor(**params)
    model.fit(x_train, y_train)

    preds_valid = model.predict(x_valid)
    preds_test = model.predict(x_test)
    final_test_predictions.append(preds_test)
    final_valid_predictions.update(dict(zip(valid_ids,preds_valid)))
    # sqrt(MSE) gives the RMSE without relying on the `squared=` keyword,
    # which is not available in every scikit-learn version.
    rmse = np.sqrt(mean_squared_error(y_valid, preds_valid))
    scores.append(rmse)
    print(fold,rmse)

print(np.mean(scores))

final_valid_predictions = pd.DataFrame.from_dict(final_valid_predictions, orient="index").reset_index()
final_valid_predictions.columns = ["id","pred_6"]
final_valid_predictions.to_csv("level1_train_pred_6.csv", index=False)

# Build the output frame explicitly rather than assigning
# `sample_submission.target`: if the `target` column has already been renamed
# by an earlier cell, attribute assignment only sets a Python attribute and
# the saved column would hold the previous model's predictions.
# Assumes sample_submission rows are in the same order as df_test, as the
# original code did -- TODO confirm.
test_predictions = pd.DataFrame({
    "id": sample_submission["id"].values,
    "pred_6": np.mean(np.column_stack(final_test_predictions), axis=1),
})
test_predictions.to_csv('level1_test_pred_6.csv', index=False)

> **I uploaded my blending + stacking data into 2 datasets**
> **`level-data` and**
> **`custompreddata`**

In [None]:
# Start again from the fold assignments, the raw test set and a fresh
# submission template.
df = pd.read_csv("../input/30days-folds/train_folds.csv")
df_test = pd.read_csv("../input/30-days-of-ml/test.csv")
sample_submission = pd.read_csv("../input/30-days-of-ml/sample_submission.csv")

# Level-1 predictions 1..3 are read from the `level-data` dataset; each file
# is joined on `id` (train side onto `df`, test side onto `df_test`).
for n in range(1,4):
    d = pd.read_csv(f'../input/level-data/level1_train_pred_{n}.csv')
    df = df.merge(d, on="id", how="left")

    d = pd.read_csv(f'../input/level-data/level1_test_pred_{n}.csv')
    df_test = df_test.merge(d, on="id", how="left")

# Level-1 predictions 4..6 are read from the `custompreddata` dataset.
for n in range(4,7):
    d = pd.read_csv(f'../input/custompreddata/level1_train_pred_{n}.csv')
    df = df.merge(d, on="id", how="left")

    d = pd.read_csv(f'../input/custompreddata/level1_test_pred_{n}.csv')
    df_test = df_test.merge(d, on="id", how="left")

In [None]:
# Final blend: a LinearRegression over the level-1 prediction columns,
# evaluated with the precomputed 5-fold split. `pred_5` is deliberately left
# out of the feature list.
useful_features = [f'pred_{n}' for n in range(1,7) if n not in [5]]
df_test = df_test[useful_features]

final_predictions = []
scores = []
for fold in range(5):
    train_rows = df[df.kfold != fold].reset_index(drop=True)
    valid_rows = df[df.kfold == fold].reset_index(drop=True)

    ytrain = train_rows.target
    yvalid = valid_rows.target

    # Fit the scaler on the training fold only and apply that same transform
    # to the validation fold and the test set. Nothing is written back to
    # `df` or `df_test`, so every fold starts from the same unscaled data and
    # the cell can be re-run safely.
    scaler = preprocessing.StandardScaler()
    xtrain = scaler.fit_transform(train_rows[useful_features])
    xvalid = scaler.transform(valid_rows[useful_features])
    xtest = scaler.transform(df_test[useful_features])

    model = LinearRegression()
    model.fit(xtrain, ytrain)

    preds_valid = model.predict(xvalid)
    test_preds = model.predict(xtest)
    final_predictions.append(test_preds)
    # sqrt(MSE) gives the RMSE without relying on the `squared=` keyword,
    # which is not available in every scikit-learn version.
    rmse = np.sqrt(mean_squared_error(yvalid, preds_valid))
    print(fold, rmse)
    scores.append(rmse)

print(np.mean(scores), np.std(scores))

In [None]:
# Average the test predictions of the per-fold models into the submission.
# Column assignment with [] is used instead of `sample_submission.target = ...`:
# attribute assignment only updates a column that already exists and would
# otherwise silently set a plain Python attribute, leaving the CSV unchanged.
sample_submission["target"] = np.mean(np.column_stack(final_predictions), axis=1)
sample_submission.to_csv("submission.csv", index=False)