# Importing Libraries

In [145]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn import svm
from sklearn import tree
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from xgboost import XGBRegressor
from xgboost import plot_importance
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error as rmse
import warnings
warnings.filterwarnings("ignore")

# Reading the data

In [122]:
train = pd.read_csv('train.csv', index_col = "id")
test = pd.read_csv('test.csv', index_col = "id")
submission = pd.read_csv('sample_submission.csv', index_col = "id")

# Preprocessing

In [123]:
train.drop('major', inplace = True, axis = 1)
test.drop('major', inplace = True, axis = 1)
train.dropna(inplace=True)

In [124]:
train.drop('country', inplace = True, axis = 1)
test.drop('country', inplace = True, axis = 1)
train.dropna(inplace=True)

In [125]:
train['race_arab'] = pd.to_numeric(train['race_arab'], errors='coerce').fillna(-1).astype('int')
train['race_asian'] = pd.to_numeric(train['race_asian'], errors='coerce').fillna(-1).astype('int')
train['race_black'] = pd.to_numeric(train['race_black'], errors='coerce').fillna(-1).astype('int')
train['race_white'] = pd.to_numeric(train['race_white'], errors='coerce').fillna(-1).astype('int')

test['race_arab'] = pd.to_numeric(test['race_arab'], errors='coerce').fillna(-1).astype('int')
test['race_asian'] = pd.to_numeric(test['race_asian'], errors='coerce').fillna(-1).astype('int')
test['race_black'] = pd.to_numeric(test['race_black'], errors='coerce').fillna(-1).astype('int')
test['race_white'] = pd.to_numeric(test['race_white'], errors='coerce').fillna(-1).astype('int')

In [126]:
train["age"] = train["age"].where(train["age"] <= 100)
train["age"].fillna(train["age"].mean(), inplace = True)
train["age"] = train["age"].astype('int')

test["age"] = test["age"].where(test["age"] <= 100)
test["age"].fillna(test["age"].mean(), inplace = True)
test["age"] = test["age"].astype('int')

In [127]:
train["familysize"] = train["familysize"].where(train["familysize"] <= 10)
train["familysize"].fillna(train["familysize"].mean(), inplace = True)
train["familysize"] = train["familysize"].astype('int')

test["familysize"] = test["familysize"].where(test["familysize"] <= 10)
test["familysize"].fillna(test["familysize"].mean(), inplace = True)
test["familysize"] = test["familysize"].astype('int')

In [128]:
train["log_introelapse"] = np.log1p(train["introelapse"])
train["log_testelapse"] = np.log1p(train["testelapse"])
train["log_surveyelapse"] = np.log1p(train["surveyelapse"])

test["log_introelapse"] = np.log1p(test["introelapse"])
test["log_testelapse"] = np.log1p(test["testelapse"])
test["log_surveyelapse"] = np.log1p(test["surveyelapse"])

In [129]:
cols = train.columns
cols = list(cols)
cols.remove('nerdy')

# Scaling the data

In [130]:
scaler = StandardScaler()
train_scaled = scaler.fit_transform(train[cols])
test_scaled = scaler.transform(test)

In [131]:
train_scaled = pd.DataFrame(data = train_scaled, index = train.index, columns = cols)
test_scaled = pd.DataFrame(data = test_scaled, index = test.index, columns = cols)

In [132]:
train_scaled['nerdy'] = train['nerdy']

# Training Models

In [133]:
def Stacker(model, model_name, train_data, test_data, fold):
    test_preds = np.zeros(test_data.shape[0])
    train_preds = np.zeros(train_data.shape[0])
    
    kf = StratifiedKFold(n_splits=fold,random_state=48,shuffle=True)
    rmse_list=[]
    n=0
    
    for train_index, test_index in kf.split(train_data[cols],train_data['nerdy']):
        
        X_train, X_valid = train_data[cols].iloc[train_index], train_data[cols].iloc[test_index]
        y_train, y_valid = train_data['nerdy'].iloc[train_index], train_data['nerdy'].iloc[test_index]
        
        if model_name == 'catb':
            model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], silent=True)
        elif model_name == 'xgb' or model_name == 'lgbm':
            model.fit(X_train, y_train, eval_set=[(X_valid,y_valid)], early_stopping_rounds=100, eval_metric="rmse", verbose=False)
        else:
            model.fit(X_train, y_train)
    
        test_preds += model.predict(test_data[cols])/kf.n_splits
        train_preds += model.predict(train_data[cols])/kf.n_splits
        
        rmse_list.append(np.sqrt(rmse(y_valid, model.predict(X_valid))))
        
        print(f"fold: {n+1}, rmse: {rmse_list[n]}")
        n+=1  
    return train_preds, test_preds

In [137]:
lr = LinearRegression()
linear_train, linear_test = Stacker(lr, 'lr', train_scaled, test_scaled, 5)

fold: 1, rmse: 1.2339166539836521
fold: 2, rmse: 1.2305278768463954
fold: 3, rmse: 1.2368949262945386
fold: 4, rmse: 1.2449839319403155
fold: 5, rmse: 1.238406602106878


In [135]:
lgbm = LGBMRegressor()
lgbm_train, lgbm_test = Stacker(lgbm, 'lgbm', train_scaled, test_scaled, 5)

fold: 1, rmse: 1.1069166396319328
fold: 2, rmse: 1.1114805030856163
fold: 3, rmse: 1.1060851173940727
fold: 4, rmse: 1.1209112975226432
fold: 5, rmse: 1.1149785481471555


In [136]:
catb = CatBoostRegressor(use_best_model=True, iterations=1000, eval_metric='RMSE')
catb_train, catb_test = Stacker(catb, 'catb', train_scaled, test_scaled, 5)

fold: 1, rmse: 1.0988870284646
fold: 2, rmse: 1.1102350347071694
fold: 3, rmse: 1.0989500532513725
fold: 4, rmse: 1.1165941969287367
fold: 5, rmse: 1.1026734363136652


In [152]:
xgb = XGBRegressor(n_estimators=1000)
xgb_train, xgb_test = Stacker(xgb, 'xgb', train_scaled, test_scaled, 5)

fold: 1, rmse: 1.11764668720475
fold: 2, rmse: 1.143263560866686
fold: 3, rmse: 1.1367534771295764
fold: 4, rmse: 1.1526447832280633
fold: 5, rmse: 1.1381122537635717


In [138]:
svmac = svm.SVR()
svmac_train, svmac_test = Stacker(svmac, 'svr', train_scaled, test_scaled, 5)

fold: 1, rmse: 1.1684038316250056
fold: 2, rmse: 1.1717675498589148
fold: 3, rmse: 1.1643791908208367
fold: 4, rmse: 1.1778217716859596
fold: 5, rmse: 1.1579661584814673


In [143]:
forest = RandomForestRegressor()
forest_train, forest_test = Stacker(forest, 'forest', train_scaled, test_scaled, 5)

fold: 1, rmse: 1.112549149257852
fold: 2, rmse: 1.133327401070487
fold: 3, rmse: 1.1323174593800627
fold: 4, rmse: 1.153019226960067
fold: 5, rmse: 1.12936587673908


In [146]:
adaboost = AdaBoostRegressor()
adaboost_train, adaboost_test = Stacker(adaboost, 'adaboost', train_scaled, test_scaled, 5)

fold: 1, rmse: 1.3018681247041817
fold: 2, rmse: 1.379108481399514
fold: 3, rmse: 1.3645429041438752
fold: 4, rmse: 1.400085511576865
fold: 5, rmse: 1.3464796042863911


# Stacking

In [200]:
stack_train = np.concatenate((catb_train.reshape(-1,1), lgbm_train.reshape(-1,1), xgb_train.reshape(-1,1), svmac_train.reshape(-1,1), forest_train.reshape(-1,1)), axis = 1)
stack_test = np.concatenate((catb_test.reshape(-1,1), lgbm_test.reshape(-1,1), xgb_test.reshape(-1,1), svmac_test.reshape(-1,1), forest_test.reshape(-1,1)), axis = 1)

stack_train = pd.DataFrame(stack_train, columns = ['catb', 'lgbm', 'xgb', 'svmac', 'forest'])
stack_test = pd.DataFrame(stack_test, columns = ['catb', 'lgbm', 'xgb', 'svmac', 'forest'])

In [201]:
stack_train.index = train_scaled.index
stack_test.index = test_scaled.index

In [180]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression

In [202]:
fs = SelectKBest(f_regression)
train_scaled_best = fs.fit_transform(train_scaled[cols], train_scaled['nerdy'])
test_scaled_best = fs.transform(test_scaled)

In [203]:
train_scaled_best = pd.DataFrame(train_scaled_best, columns = ['F1', 'F2', 'F3', 'F4', 'F5', 'F6', 'F7', 'F8', 'F9', 'F10'])
test_scaled_best = pd.DataFrame(test_scaled_best, columns = ['F1', 'F2', 'F3', 'F4', 'F5', 'F6', 'F7', 'F8', 'F9', 'F10'])

In [204]:
train_scaled_best.index = train_scaled.index
test_scaled_best.index = test_scaled.index

In [205]:
stack_train = pd.concat([train_scaled_best, stack_train], axis=1, join='inner')
stack_test = pd.concat([test_scaled_best, stack_test], axis=1, join='inner')

In [207]:
y = train['nerdy'].copy()

catb = CatBoostRegressor(use_best_model=True, iterations=1000, eval_metric='RMSE')

train_preds = np.zeros(stack_train.shape[0])
test_preds = np.zeros(stack_test.shape[0])

kf = StratifiedKFold(n_splits=5, random_state=2021, shuffle=True)
rmse_list=[]

n=0
for train_index, test_index in kf.split(stack_train, y):
    
    X_train, X_valid = stack_train.iloc[train_index], stack_train.iloc[test_index]
    y_train, y_valid = y.iloc[train_index], y.iloc[test_index]
    
    catb.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], silent=True)
    
    train_preds += catb.predict(stack_train)/kf.n_splits
    test_preds += catb.predict(stack_test)/kf.n_splits
    
    rmse_list.append(np.sqrt(rmse(y_valid, catb.predict(X_valid))))
    
    print(f"fold: {n+1}, rmse: {rmse_list[n]}")
    n+=1

fold: 1, rmse: 0.1465482535224751
fold: 2, rmse: 0.14912840310213507
fold: 3, rmse: 0.16972436756892342
fold: 4, rmse: 0.1478657566430714
fold: 5, rmse: 0.15527702381714326


In [208]:
submission['nerdy'] = test_preds
submission.to_csv('submission.csv')