In [15]:
# Familiar imports
import numpy as np
import pandas as pd

# For ordinal encoding categorical variables, splitting data
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

# For training the models (random forest, XGBoost) and scoring them with mean squared error
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error

In [45]:
# Baseline experiment: ordinal-encode the categorical columns and fit one
# XGBoost regressor per pre-assigned fold, scoring each on its held-out fold.
df = pd.read_csv("../input/30days-folds/train_folds.csv")
df_test = pd.read_csv("../input/30-days-of-ml/test.csv")
sample_submission = pd.read_csv("../input/30-days-of-ml/sample_submission.csv")

# Everything except the row id, the label and the fold marker is a model input.
useful_features = [name for name in df.columns if name not in {"id", "target", "kfold"}]
object_cols = [name for name in useful_features if 'cat' in name]
numerical_cols = [name for name in useful_features if 'cont' in name]
df_test = df_test[useful_features]

final_predictions, scores = [], []
for fold in range(5):
    # Rows tagged with the current fold are held out; all other rows are fitted on.
    held_out = df.kfold == fold
    xtrain = df.loc[~held_out, useful_features].reset_index(drop=True)
    xvalid = df.loc[held_out, useful_features].reset_index(drop=True)
    ytrain = df.loc[~held_out, "target"].reset_index(drop=True)
    yvalid = df.loc[held_out, "target"].reset_index(drop=True)
    xtest = df_test.copy()

    # The encoder is fitted on this fold's training rows only and then applied
    # to the training, validation and test frames alike.
    encoder = preprocessing.OrdinalEncoder()
    encoder.fit(xtrain[object_cols])
    for frame in (xtrain, xvalid, xtest):
        frame[object_cols] = encoder.transform(frame[object_cols])

    model = XGBRegressor(
        n_estimators=500,
        learning_rate=0.05,
        random_state=fold,
        tree_method='gpu_hist',
        gpu_id=0,
        predictor='gpu_predictor',
    )
    # Early stopping watches the held-out fold supplied through eval_set.
    model.fit(
        xtrain,
        ytrain,
        eval_set=[(xvalid, yvalid)],
        early_stopping_rounds=5,
        verbose=False,
    )

    valid_hat = model.predict(xvalid)
    final_predictions.append(model.predict(xtest))
    fold_rmse = mean_squared_error(yvalid, valid_hat, squared=False)
    print(fold, fold_rmse)
    scores.append(fold_rmse)

# Mean and spread of the per-fold RMSE.
print(np.mean(scores), np.std(scores))

0 0.7212601695648094
1 0.7212717965648511
2 0.7235314596583627
3 0.7232004339051845
4 0.7223953440583651
0.7223318407503145 0.0009454887697211359


In [51]:
#standardization
# Same pipeline as the baseline, with the numerical columns additionally
# standardized inside every fold.

df = pd.read_csv("../input/30days-folds/train_folds.csv")
df_test = pd.read_csv("../input/30-days-of-ml/test.csv")
sample_submission = pd.read_csv("../input/30-days-of-ml/sample_submission.csv")

# Everything except the row id, the label and the fold marker is a model input.
useful_features = [name for name in df.columns if name not in {"id", "target", "kfold"}]
object_cols = [name for name in useful_features if 'cat' in name]
numerical_cols = [name for name in useful_features if 'cont' in name]
df_test = df_test[useful_features]

final_predictions, scores = [], []
for fold in range(5):
    # Rows tagged with the current fold are held out; all other rows are fitted on.
    held_out = df.kfold == fold
    xtrain = df.loc[~held_out, useful_features].reset_index(drop=True)
    xvalid = df.loc[held_out, useful_features].reset_index(drop=True)
    ytrain = df.loc[~held_out, "target"].reset_index(drop=True)
    yvalid = df.loc[held_out, "target"].reset_index(drop=True)
    xtest = df_test.copy()

    # Categorical columns -> ordinal codes, fitted on the training rows only.
    encoder = preprocessing.OrdinalEncoder()
    encoder.fit(xtrain[object_cols])
    for frame in (xtrain, xvalid, xtest):
        frame[object_cols] = encoder.transform(frame[object_cols])

    # Scaling statistics also come from the training rows only.
    scaler = preprocessing.StandardScaler()
    scaler.fit(xtrain[numerical_cols])
    for frame in (xtrain, xvalid, xtest):
        frame[numerical_cols] = scaler.transform(frame[numerical_cols])

    model = XGBRegressor(
        n_estimators=500,
        learning_rate=0.05,
        random_state=fold,
        tree_method='gpu_hist',
        gpu_id=0,
        predictor='gpu_predictor',
    )
    # Early stopping watches the held-out fold supplied through eval_set.
    model.fit(
        xtrain,
        ytrain,
        eval_set=[(xvalid, yvalid)],
        early_stopping_rounds=5,
        verbose=False,
    )

    valid_hat = model.predict(xvalid)
    final_predictions.append(model.predict(xtest))
    fold_rmse = mean_squared_error(yvalid, valid_hat, squared=False)
    print(fold, fold_rmse)
    scores.append(fold_rmse)

# Mean and spread of the per-fold RMSE.
print(np.mean(scores), np.std(scores))

0 0.7213233358864336
1 0.7214287740229792
2 0.7234465366793505
3 0.7230555976561358
4 0.7219399023680014
0.7222388293225801 0.0008613179104731648


In [47]:
#log transformation
# Same pipeline as the baseline, with log(1 + x) applied to every numerical
# column of the train and test frames before the folds are split.

df = pd.read_csv("../input/30days-folds/train_folds.csv")
df_test = pd.read_csv("../input/30-days-of-ml/test.csv")
sample_submission = pd.read_csv("../input/30-days-of-ml/sample_submission.csv")

# Everything except the row id, the label and the fold marker is a model input.
useful_features = [name for name in df.columns if name not in {"id", "target", "kfold"}]
object_cols = [name for name in useful_features if 'cat' in name]
numerical_cols = [name for name in useful_features if 'cont' in name]
df_test = df_test[useful_features]

# NOTE(review): log1p yields NaN for values below -1; assumes the `cont`
# columns never go that low -- TODO confirm against the data.
df[numerical_cols] = np.log1p(df[numerical_cols])
df_test[numerical_cols] = np.log1p(df_test[numerical_cols])

final_predictions, scores = [], []
for fold in range(5):
    # Rows tagged with the current fold are held out; all other rows are fitted on.
    held_out = df.kfold == fold
    xtrain = df.loc[~held_out, useful_features].reset_index(drop=True)
    xvalid = df.loc[held_out, useful_features].reset_index(drop=True)
    ytrain = df.loc[~held_out, "target"].reset_index(drop=True)
    yvalid = df.loc[held_out, "target"].reset_index(drop=True)
    xtest = df_test.copy()

    # Categorical columns -> ordinal codes, fitted on the training rows only.
    encoder = preprocessing.OrdinalEncoder()
    encoder.fit(xtrain[object_cols])
    for frame in (xtrain, xvalid, xtest):
        frame[object_cols] = encoder.transform(frame[object_cols])

    model = XGBRegressor(
        n_estimators=500,
        learning_rate=0.05,
        random_state=fold,
        tree_method='gpu_hist',
        gpu_id=0,
        predictor='gpu_predictor',
    )
    # Early stopping watches the held-out fold supplied through eval_set.
    model.fit(
        xtrain,
        ytrain,
        eval_set=[(xvalid, yvalid)],
        early_stopping_rounds=5,
        verbose=False,
    )

    valid_hat = model.predict(xvalid)
    final_predictions.append(model.predict(xtest))
    fold_rmse = mean_squared_error(yvalid, valid_hat, squared=False)
    print(fold, fold_rmse)
    scores.append(fold_rmse)

# Mean and spread of the per-fold RMSE.
print(np.mean(scores), np.std(scores))

0 0.7214907786064982
1 0.7213750980366883
2 0.723242573004054
3 0.7244023589205874
4 0.7223490847067255
0.7225719786549107 0.0011358861887691555


In [48]:
#polynomial features
# Same pipeline as the baseline, with pairwise interaction terms of the
# numerical columns appended to the train and test frames as poly_<i> columns.

df = pd.read_csv("../input/30days-folds/train_folds.csv")
df_test = pd.read_csv("../input/30-days-of-ml/test.csv")
sample_submission = pd.read_csv("../input/30-days-of-ml/sample_submission.csv")

useful_features = [col for col in df.columns if col not in ("id", "target", "kfold")]
object_cols = [col for col in useful_features if 'cat' in col]
# Defined in this cell instead of being inherited from an earlier one, so the
# cell also runs on a fresh kernel.
numerical_cols = [col for col in useful_features if 'cont' in col]
df_test = df_test[useful_features]

poly = preprocessing.PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
train_poly = poly.fit_transform(df[numerical_cols])
# Reuse the transformer fitted on the training frame rather than refitting it
# on the test frame.
test_poly = poly.transform(df_test[numerical_cols])

# NOTE(review): per the scikit-learn docs the output also contains the
# degree-1 terms, so the first len(numerical_cols) poly_<i> columns presumably
# duplicate the original numerical columns -- confirm and consider dropping them.
poly_names = [f"poly_{i}" for i in range(train_poly.shape[1])]
# The explicit index keeps the axis=1 concat below aligned row-for-row.
df_poly = pd.DataFrame(train_poly, columns=poly_names, index=df.index)
df_test_poly = pd.DataFrame(test_poly, columns=poly_names, index=df_test.index)

df = pd.concat([df, df_poly], axis=1)
df_test = pd.concat([df_test, df_test_poly], axis=1)

# Recompute the feature lists so that they include the new poly_<i> columns.
useful_features = [col for col in df.columns if col not in ("id", "target", "kfold")]
object_cols = [col for col in useful_features if 'cat' in col]
df_test = df_test[useful_features]

final_predictions = []
scores = []
for fold in range(5):
    # Rows tagged with the current fold are held out; all other rows are fitted on.
    xtrain = df[df.kfold != fold].reset_index(drop=True)
    xvalid = df[df.kfold == fold].reset_index(drop=True)
    xtest = df_test.copy()

    ytrain = xtrain.target
    yvalid = xvalid.target

    xtrain = xtrain[useful_features]
    xvalid = xvalid[useful_features]

    # Categorical columns -> ordinal codes, fitted on the training rows only.
    ordinal_encoder = preprocessing.OrdinalEncoder()
    xtrain[object_cols] = ordinal_encoder.fit_transform(xtrain[object_cols])
    xvalid[object_cols] = ordinal_encoder.transform(xvalid[object_cols])
    xtest[object_cols] = ordinal_encoder.transform(xtest[object_cols])

    model = XGBRegressor(n_estimators=500, learning_rate=0.05,
                         random_state=fold, tree_method='gpu_hist',
                         gpu_id=0, predictor='gpu_predictor')
    # Early stopping watches the held-out fold supplied through eval_set.
    model.fit(xtrain, ytrain,
              early_stopping_rounds=5,
              eval_set=[(xvalid, yvalid)],
              verbose=False)

    preds_valid = model.predict(xvalid)
    preds_test = model.predict(xtest)
    final_predictions.append(preds_test)
    rmse = mean_squared_error(yvalid, preds_valid, squared=False)
    print(fold, rmse)
    scores.append(rmse)

# Mean and spread of the per-fold RMSE.
print(np.mean(scores), np.std(scores))

0 0.7227034234148775
1 0.7230842589534672
2 0.7246994389596196
3 0.7241129450127881
4 0.7234450729296387
0.7236090278540781 0.0007162837284940793


In [None]:
# Idea (not implemented yet): bin the numerical features,
# e.g. with pd.cut


In [49]:
#one hot encoding
# Same pipeline as the baseline, except that the categorical columns are
# replaced by one-hot indicator columns named ohe_<i>.
df = pd.read_csv("../input/30days-folds/train_folds.csv")
df_test = pd.read_csv("../input/30-days-of-ml/test.csv")
sample_submission = pd.read_csv("../input/30-days-of-ml/sample_submission.csv")

# Everything except the row id, the label and the fold marker is a model input.
useful_features = [name for name in df.columns if name not in {"id", "target", "kfold"}]
object_cols = [name for name in useful_features if 'cat' in name]
numerical_cols = [name for name in useful_features if 'cont' in name]
df_test = df_test[useful_features]

final_predictions, scores = [], []
for fold in range(5):
    # Rows tagged with the current fold are held out; all other rows are fitted on.
    held_out = df.kfold == fold
    xtrain = df.loc[~held_out, useful_features].reset_index(drop=True)
    xvalid = df.loc[held_out, useful_features].reset_index(drop=True)
    ytrain = df.loc[~held_out, "target"].reset_index(drop=True)
    yvalid = df.loc[held_out, "target"].reset_index(drop=True)
    xtest = df_test.copy()

    # Fit the encoder on the training rows only; handle_unknown="ignore" lets
    # categories unseen during fitting pass through transform without an error.
    ohe = preprocessing.OneHotEncoder(sparse=False, handle_unknown="ignore")
    ohe.fit(xtrain[object_cols])

    # Rebuild each frame as: numerical columns + one-hot indicator columns.
    rebuilt = []
    for frame in (xtrain, xvalid, xtest):
        indicators = ohe.transform(frame[object_cols])
        indicators = pd.DataFrame(
            indicators,
            columns=[f"ohe_{i}" for i in range(indicators.shape[1])],
        )
        rebuilt.append(pd.concat([frame[numerical_cols], indicators], axis=1))
    xtrain, xvalid, xtest = rebuilt

    model = XGBRegressor(
        n_estimators=500,
        learning_rate=0.05,
        random_state=fold,
        tree_method='gpu_hist',
        gpu_id=0,
        predictor='gpu_predictor',
    )
    # Early stopping watches the held-out fold supplied through eval_set.
    model.fit(
        xtrain,
        ytrain,
        eval_set=[(xvalid, yvalid)],
        early_stopping_rounds=5,
        verbose=False,
    )

    valid_hat = model.predict(xvalid)
    final_predictions.append(model.predict(xtest))
    fold_rmse = mean_squared_error(yvalid, valid_hat, squared=False)
    print(fold, fold_rmse)
    scores.append(fold_rmse)

# Mean and spread of the per-fold RMSE.
print(np.mean(scores), np.std(scores))

0 0.7215825356491198
1 0.720850092710109
2 0.7234806320985059
3 0.7231945066438883
4 0.7221716416174069
0.722255881743806 0.0009816072843840935


In [None]:
# Idea (not implemented yet): one-hot encode the categorical variables, then standardize both the one-hot and the numerical columns


In [50]:
# Count (frequency) encoding: same pipeline as the baseline, with one extra
# column per categorical feature holding how often each category occurs.
df = pd.read_csv("../input/30days-folds/train_folds.csv")
df_test = pd.read_csv("../input/30-days-of-ml/test.csv")
sample_submission = pd.read_csv("../input/30-days-of-ml/sample_submission.csv")

useful_features = [col for col in df.columns if col not in ("id", "target", "kfold")]
object_cols = [col for col in useful_features if 'cat' in col]
df_test = df_test[useful_features]

for col in object_cols:
    # Counts are learned from the training frame only and then applied to both
    # frames, so a given category carries the same feature value at training
    # and at prediction time (counting inside the test frame would put the
    # feature on a different scale whenever the two frames differ in size).
    train_counts = df[col].value_counts()
    df[f"cont_{col}"] = df[col].map(train_counts)
    # Categories that never occur in the training frame get a count of 0.
    df_test[f"cont_{col}"] = df_test[col].map(train_counts).fillna(0).astype("int64")

# Recompute the feature lists so that they include the new count columns.
# startswith("cat") keeps the cont_cat* columns out of the ordinal encoder.
useful_features = [col for col in df.columns if col not in ("id", "target", "kfold")]
object_cols = [col for col in useful_features if col.startswith("cat")]
df_test = df_test[useful_features]

final_predictions = []
scores = []
for fold in range(5):
    # Rows tagged with the current fold are held out; all other rows are fitted on.
    xtrain = df[df.kfold != fold].reset_index(drop=True)
    xvalid = df[df.kfold == fold].reset_index(drop=True)
    xtest = df_test.copy()

    ytrain = xtrain.target
    yvalid = xvalid.target

    xtrain = xtrain[useful_features]
    xvalid = xvalid[useful_features]

    # Categorical columns -> ordinal codes, fitted on the training rows only.
    ordinal_encoder = preprocessing.OrdinalEncoder()
    xtrain[object_cols] = ordinal_encoder.fit_transform(xtrain[object_cols])
    xvalid[object_cols] = ordinal_encoder.transform(xvalid[object_cols])
    xtest[object_cols] = ordinal_encoder.transform(xtest[object_cols])

    model = XGBRegressor(n_estimators=500, learning_rate=0.05,
                         random_state=fold, tree_method='gpu_hist',
                         gpu_id=0, predictor='gpu_predictor')
    # Early stopping watches the held-out fold supplied through eval_set.
    model.fit(xtrain, ytrain,
              early_stopping_rounds=5,
              eval_set=[(xvalid, yvalid)],
              verbose=False)

    preds_valid = model.predict(xvalid)
    preds_test = model.predict(xtest)
    final_predictions.append(preds_test)
    rmse = mean_squared_error(yvalid, preds_valid, squared=False)
    print(fold, rmse)
    scores.append(rmse)

# Mean and spread of the per-fold RMSE.
print(np.mean(scores), np.std(scores))

0 0.7219067190579502
1 0.7211845086289989
2 0.7232159275284927
3 0.7236404919052575
4 0.722350346440966
0.7224595987123331 0.0008843086229247444


In [None]:
# Idea (not implemented yet): combine pairs of categorical columns into a new
# feature such as cat1_cat2, built as
# df[cat1] + "_" + df[cat2]

In [None]:
# Idea (not implemented yet): group the numerical columns by a categorical column
# and add the per-group mean, median, max, etc. as new features

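The submission below is built with the second calculation above (standardization). `final_predictions` holds the output of whichever experiment cell ran most recently, so that cell must be the last experiment run before the next two cells.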

In [52]:
# Average the per-fold test predictions: one column per fold, mean across columns.
fold_matrix = np.column_stack(final_predictions)
preds = fold_matrix.mean(axis=1)

In [53]:
# Write the averaged predictions into the submission template and save it.
# Item assignment is used because attribute assignment would only set a plain
# Python attribute (not a column) if the template had no `target` column.
sample_submission["target"] = preds
sample_submission.to_csv('submission.csv', index=False)