In [51]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import make_scorer
from sklearn import svm
from sklearn import tree
from catboost import CatBoostRegressor
import seaborn as sns
from sklearn.naive_bayes import GaussianNB 
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
import pickle
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential, Model
from keras.layers import Activation, Dense, Dropout
from sklearn.preprocessing import LabelBinarizer
import sklearn.datasets as skds
from pathlib import Path
import itertools
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_squared_error as rmse
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline
from joblib import dump, load
from xgboost import plot_importance
from lightgbm import LGBMRegressor

In [83]:
train = pd.read_csv('pps_train.csv', index_col = "id")
test = pd.read_csv('pps_test.csv', index_col = "id")
submission = pd.read_csv('sample_submission.csv', index_col = "id")

In [90]:
cols= train.columns
cols = list(cols)
cols.pop()

'nerdy'

In [173]:
scaler = StandardScaler()
train_scaled = scaler.fit_transform(train[cols])
test_scaled = scaler.transform(test)

In [174]:
train_scaled = pd.DataFrame(data = train_scaled, index = train.index, columns = cols)
test_scaled = pd.DataFrame(data = test_scaled, index = test.index, columns = cols)

In [175]:
train_scaled['nerdy'] = train['nerdy']

In [176]:
def Stacker(model, model_name, train_data, test_data, fold):
    test_preds = np.zeros(test_data.shape[0])
    train_preds = np.zeros(train_data.shape[0])
    
    kf = StratifiedKFold(n_splits=fold,random_state=48,shuffle=True)
    rmse_list=[]
    n=0
    
    for train_index, test_index in kf.split(train_data[cols],train_data['nerdy']):
        
        X_train, X_valid = train_data[cols].iloc[train_index], train_data[cols].iloc[test_index]
        y_train, y_valid = train_data['nerdy'].iloc[train_index], train_data['nerdy'].iloc[test_index]
        
        if model_name == 'catb':
            model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], silent=True)
        elif model_name == 'xgb' or model_name == 'lgbm':
            model.fit(X_train, y_train, eval_set=[(X_valid,y_valid)], early_stopping_rounds=100, eval_metric="rmse", verbose=False)
        else:
            model.fit(X_train, y_train)
    
        test_preds += model.predict(test_data[cols])/kf.n_splits
        train_preds += model.predict(train_data[cols])/kf.n_splits
        
        rmse_list.append(np.sqrt(rmse(y_valid, model.predict(X_valid))))
        
        print(f"fold: {n+1}, rmse: {rmse_list[n]}")
        n+=1  
    return train_preds, test_preds

In [113]:
lr = LinearRegression()
Stacker(lr, 'lr', train_scaled, test_scaled, 5)

fold: 1, rmse: 1.3018837748632561
fold: 2, rmse: 1.284105092123815
fold: 3, rmse: 1.3072366097767893
fold: 4, rmse: 1.3084202778998446
fold: 5, rmse: 1.306513859004915


(array([6.15227345, 4.42164907, 6.12780567, ..., 4.85016154, 5.31437448,
        6.09039454]),
 array([5.76254333, 5.59257076, 4.57904162, ..., 5.35403039, 5.2579529 ,
        5.92994686]))

In [114]:
svmac = svm.SVR()
Stacker(svmac, 'svr', train_scaled, test_scaled, 5)

fold: 1, rmse: 1.3029695640010048
fold: 2, rmse: 1.300390362122223
fold: 3, rmse: 1.3150008889702351
fold: 4, rmse: 1.3106720647572592
fold: 5, rmse: 1.309579875972343


(array([6.25922791, 4.63581668, 5.93756967, ..., 4.76417613, 5.84598915,
        6.65043437]),
 array([5.75991553, 5.57372015, 4.34303516, ..., 5.38504888, 5.44412303,
        5.85185913]))

In [177]:
from sklearn.ensemble import RandomForestRegressor
forest = RandomForestRegressor()
Stacker(forest, 'forest', train_scaled, test_scaled, 5)

fold: 1, rmse: 1.2569461579048724
fold: 2, rmse: 1.267727734462092
fold: 3, rmse: 1.2938154659816132
fold: 4, rmse: 1.2884147588719435
fold: 5, rmse: 1.292330815189734


(array([5.912, 4.158, 3.02 , ..., 4.488, 5.888, 6.596]),
 array([5.798, 5.392, 4.65 , ..., 5.31 , 5.164, 5.888]))

In [115]:
lgbm = LGBMRegressor()
xgb = XGBRegressor(n_estimators=1000)
catb = CatBoostRegressor(use_best_model=True, iterations=1000, eval_metric='RMSE')

In [119]:
lgbm_train, lgbm_test = Stacker(lgbm, 'lgbm', train_scaled, test_scaled, 5)

fold: 1, rmse: 1.2427107945436195
fold: 2, rmse: 1.2264432388846178
fold: 3, rmse: 1.2604149109553822
fold: 4, rmse: 1.2577584863031788
fold: 5, rmse: 1.2528735157788808


In [120]:
xgb_train, xgb_test = Stacker(xgb, 'xgb', train_scaled, test_scaled, 5)

fold: 1, rmse: 1.2588081855399502
fold: 2, rmse: 1.2882251264534668
fold: 3, rmse: 1.293303372029544
fold: 4, rmse: 1.286039226718986
fold: 5, rmse: 1.2721870313357795


In [181]:
catb_train, catb_test = Stacker(catb, 'catb', train_scaled, test_scaled, 20)

fold: 1, rmse: 1.2068344893971403
fold: 2, rmse: 1.232435382755793
fold: 3, rmse: 1.1656800806968797
fold: 4, rmse: 1.2622034371765485
fold: 5, rmse: 1.226614320793999
fold: 6, rmse: 1.2395603508758437
fold: 7, rmse: 1.1621325109593574
fold: 8, rmse: 1.23624437325724
fold: 9, rmse: 1.2776889378913365
fold: 10, rmse: 1.2268162478648206
fold: 11, rmse: 1.187906229958538
fold: 12, rmse: 1.29459231648044
fold: 13, rmse: 1.2543717480890513
fold: 14, rmse: 1.2124717810801233
fold: 15, rmse: 1.2041710863460697
fold: 16, rmse: 1.2594987618235332
fold: 17, rmse: 1.2324887458197482
fold: 18, rmse: 1.2457912979516699
fold: 19, rmse: 1.2484887462401615
fold: 20, rmse: 1.2067156576497486


In [179]:
# submission['nerdy'] = catb_test
# submission.to_csv('submission.csv')

In [122]:
stack_train = np.concatenate((catb_train.reshape(-1,1), lgbm_train.reshape(-1,1), xgb_train.reshape(-1,1)), axis = 1)
stack_test = np.concatenate((catb_test.reshape(-1,1), lgbm_test.reshape(-1,1), xgb_test.reshape(-1,1)), axis = 1)

stack_train = pd.DataFrame(stack_train, columns = ['catb', 'lgbm', 'xgb'])
stack_test = pd.DataFrame(stack_test, columns = ['catb', 'lgbm', 'xgb'])

In [134]:
y = train['nerdy'].copy()

lr = LinearRegression()

train_preds = np.zeros(stack_train.shape[0])
test_preds = np.zeros(stack_test.shape[0])

kf = StratifiedKFold(n_splits=5, random_state=2021, shuffle=True)
rmse_list=[]

n=0
for train_index, test_index in kf.split(stack_train, y):
    
    X_train, X_valid = stack_train.iloc[train_index], stack_train.iloc[test_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[test_index]
    
    lr.fit(X_train, y_train)
    
    train_preds += lr.predict(stack_train)/kf.n_splits
    test_preds += lr.predict(stack_test)/kf.n_splits
    
    rmse_list.append(np.sqrt(rmse(y_valid, lr.predict(X_valid))))
    
    print(f"fold: {n+1}, rmse: {rmse_list[n]}")
    n+=1

fold: 1, rmse: 1.0098029569348246
fold: 2, rmse: 1.0143327101418296
fold: 3, rmse: 0.9849075171583546
fold: 4, rmse: 0.9647737253988553
fold: 5, rmse: 0.9848239380051178


In [135]:
test_preds

array([6.30247207, 6.08720675, 4.30350681, ..., 5.48875416, 4.93500003,
       5.94965323])

In [136]:
# submission['nerdy'] = test_preds
# submission.to_csv('submission.csv')

In [137]:
y = train['nerdy'].copy()

svmac = svm.SVR()

train_preds = np.zeros(stack_train.shape[0])
test_preds = np.zeros(stack_test.shape[0])

kf = StratifiedKFold(n_splits=5, random_state=2021, shuffle=True)
rmse_list=[]

n=0
for train_index, test_index in kf.split(stack_train, y):
    
    X_train, X_valid = stack_train.iloc[train_index], stack_train.iloc[test_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[test_index]
    
    svmac.fit(X_train, y_train)
    
    train_preds += svmac.predict(stack_train)/kf.n_splits
    test_preds += svmac.predict(stack_test)/kf.n_splits
    
    rmse_list.append(np.sqrt(rmse(y_valid, svmac.predict(X_valid))))
    
    print(f"fold: {n+1}, rmse: {rmse_list[n]}")
    n+=1

fold: 1, rmse: 1.0097389917874144
fold: 2, rmse: 1.0200192349962671
fold: 3, rmse: 0.9889883631587247
fold: 4, rmse: 0.9554802758310246
fold: 5, rmse: 0.9738784245319945


In [138]:
test_preds

array([6.49037807, 6.34582771, 4.57864252, ..., 5.61893355, 5.07344166,
       6.0425359 ])

In [143]:
y = train['nerdy'].copy()

catb = CatBoostRegressor(use_best_model=True, iterations=1000, eval_metric='RMSE')

train_preds = np.zeros(stack_train.shape[0])
test_preds = np.zeros(stack_test.shape[0])

kf = StratifiedKFold(n_splits=5, random_state=2021, shuffle=True)
rmse_list=[]

n=0
for train_index, test_index in kf.split(stack_train, y):
    
    X_train, X_valid = stack_train.iloc[train_index], stack_train.iloc[test_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[test_index]
    
    catb.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], silent=True)
    
    train_preds += catb.predict(stack_train)/kf.n_splits
    test_preds += catb.predict(stack_test)/kf.n_splits
    
    rmse_list.append(np.sqrt(rmse(y_valid, catb.predict(X_valid))))
    
    print(f"fold: {n+1}, rmse: {rmse_list[n]}")
    n+=1

fold: 1, rmse: 1.009155937277
fold: 2, rmse: 1.0052914237469608
fold: 3, rmse: 0.9733946263685406
fold: 4, rmse: 0.9497671451618258
fold: 5, rmse: 0.9611712402101159


In [144]:
test_preds

array([6.28694998, 6.19147039, 4.55147074, ..., 5.55276469, 5.05076076,
       5.98456787])

In [145]:
submission['nerdy'] = test_preds
submission.to_csv('submission.csv')

In [141]:
y = train['nerdy'].copy()

lgbm = LGBMRegressor()

train_preds = np.zeros(stack_train.shape[0])
test_preds = np.zeros(stack_test.shape[0])

kf = StratifiedKFold(n_splits=5, random_state=2021, shuffle=True)
rmse_list=[]

n=0
for train_index, test_index in kf.split(stack_train, y):
    
    X_train, X_valid = stack_train.iloc[train_index], stack_train.iloc[test_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[test_index]
    
    lgbm.fit(X_train, y_train, eval_set=[(X_valid,y_valid)], early_stopping_rounds=100, eval_metric="rmse", verbose=False)
    
    train_preds += lgbm.predict(stack_train)/kf.n_splits
    test_preds += lgbm.predict(stack_test)/kf.n_splits
    
    rmse_list.append(np.sqrt(rmse(y_valid, lgbm.predict(X_valid))))
    
    print(f"fold: {n+1}, rmse: {rmse_list[n]}")
    n+=1

fold: 1, rmse: 1.0077732585964805
fold: 2, rmse: 1.0072397668186348
fold: 3, rmse: 0.977059916344051
fold: 4, rmse: 0.9562801016574247
fold: 5, rmse: 0.9761586715295005


In [142]:
test_preds

array([6.27760254, 6.21761237, 4.62057775, ..., 5.6018471 , 5.0140371 ,
       5.94141759])

In [202]:
stack_train.index = train_scaled.index
stack_test.index = test_scaled.index

In [203]:
new_train = pd.concat([train_scaled, stack_train], axis=1, join='inner')
new_test = pd.concat([test_scaled, stack_test], axis=1, join='inner')

In [204]:
new_train.drop(['nerdy'], axis=1, inplace=True)

In [206]:
y = train['nerdy'].copy()

lr = LinearRegression()

train_preds = np.zeros(new_train.shape[0])
test_preds = np.zeros(new_test.shape[0])

kf = StratifiedKFold(n_splits=5, random_state=2021, shuffle=True)
rmse_list=[]

n=0
for train_index, test_index in kf.split(new_train, y):
    
    X_train, X_valid = new_train.iloc[train_index], new_train.iloc[test_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[test_index]
    
    lr.fit(X_train, y_train)
    
    train_preds += lr.predict(new_train)/kf.n_splits
    test_preds += lr.predict(new_test)/kf.n_splits
    
    rmse_list.append(np.sqrt(rmse(y_valid, lr.predict(X_valid))))
    
    print(f"fold: {n+1}, rmse: {rmse_list[n]}")
    n+=1

fold: 1, rmse: 0.9521940156036769
fold: 2, rmse: 0.9466811203697204
fold: 3, rmse: 0.9339957660526332
fold: 4, rmse: 0.9023046052315414
fold: 5, rmse: 0.9267638151297408


In [207]:
test_preds

array([6.42329287, 6.07231259, 4.40870223, ..., 5.44871533, 4.87007096,
       5.84057951])

In [208]:
# submission['nerdy'] = test_preds
# submission.to_csv('submission.csv')

In [211]:
y = train['nerdy'].copy()

catb = CatBoostRegressor(use_best_model=True, iterations=1000, eval_metric='RMSE')

train_preds = np.zeros(new_train.shape[0])
test_preds = np.zeros(new_test.shape[0])

kf = StratifiedKFold(n_splits=5, random_state=2021, shuffle=True)
rmse_list=[]

n=0
for train_index, test_index in kf.split(new_train, y):
    
    X_train, X_valid = new_train.iloc[train_index], new_train.iloc[test_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[test_index]
    
    catb.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], silent=True)
    
    train_preds += catb.predict(new_train)/kf.n_splits
    test_preds += catb.predict(new_test)/kf.n_splits
    
    rmse_list.append(np.sqrt(rmse(y_valid, catb.predict(X_valid))))
    
    print(f"fold: {n+1}, rmse: {rmse_list[n]}")
    n+=1

fold: 1, rmse: 0.8765348050730081
fold: 2, rmse: 0.86700348838446
fold: 3, rmse: 0.8541197211382017
fold: 4, rmse: 0.831473838194112
fold: 5, rmse: 0.8354976885205355


In [210]:
# submission['nerdy'] = test_preds
# submission.to_csv('submission.csv')

In [212]:
y = train['nerdy'].copy()

svmac = svm.SVR()

train_preds = np.zeros(new_train.shape[0])
test_preds = np.zeros(new_test.shape[0])

kf = StratifiedKFold(n_splits=5, random_state=2021, shuffle=True)
rmse_list=[]

n=0
for train_index, test_index in kf.split(new_train, y):
    
    X_train, X_valid = new_train.iloc[train_index], new_train.iloc[test_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[test_index]
    
    svmac.fit(X_train, y_train)
    
    train_preds += svmac.predict(new_train)/kf.n_splits
    test_preds += svmac.predict(new_test)/kf.n_splits
    
    rmse_list.append(np.sqrt(rmse(y_valid, svmac.predict(X_valid))))
    
    print(f"fold: {n+1}, rmse: {rmse_list[n]}")
    n+=1

fold: 1, rmse: 0.9117096177445297
fold: 2, rmse: 0.9046491279171053
fold: 3, rmse: 0.9094484575488158
fold: 4, rmse: 0.8851907369119049
fold: 5, rmse: 0.8737084321311355


In [213]:
submission['nerdy'] = test_preds
submission.to_csv('submission.csv')