In [15]:
# Familiar imports
import numpy as np
import pandas as pd

# For ordinal encoding categorical variables, splitting data
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

# For training tree-based models (random forest, XGBoost) and scoring them
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error

In [23]:
# Baseline experiment: ordinal-encode the categorical columns and fit one
# XGBoost regressor per fold, recording the validation RMSE of each fold and
# the predictions each fold's model makes for the test set.
df = pd.read_csv("../input/30days-folds/train_folds.csv")
df_test = pd.read_csv("../input/30-days-of-ml/test.csv")
sample_submission = pd.read_csv("../input/30-days-of-ml/sample_submission.csv")

# Everything except the id, the target and the fold label is a model input.
useful_features = [col for col in df.columns if col not in ("id", "target", "kfold")]
object_cols = [col for col in useful_features if 'cat' in col]
numerical_cols = [col for col in useful_features if 'cont' in col]
df_test = df_test[useful_features]

final_predictions = []  # test-set predictions, one array per fold
scores = []             # validation RMSE, one value per fold
for fold in range(5):
    # Rows whose `kfold` equals the current fold are held out for validation.
    xtrain = df[df.kfold != fold].reset_index(drop=True)
    xvalid = df[df.kfold == fold].reset_index(drop=True)
    xtest = df_test.copy()

    ytrain = xtrain.target
    yvalid = xvalid.target

    xtrain = xtrain[useful_features]
    xvalid = xvalid[useful_features]

    # Fit the encoder on the training rows only, then reuse it unchanged for
    # the validation and test rows.
    ordinal_encoder = preprocessing.OrdinalEncoder()
    xtrain[object_cols] = ordinal_encoder.fit_transform(xtrain[object_cols])
    xvalid[object_cols] = ordinal_encoder.transform(xvalid[object_cols])
    xtest[object_cols] = ordinal_encoder.transform(xtest[object_cols])

    # NOTE(review): these GPU keyword arguments are the ones this notebook was
    # run with; newer XGBoost releases may expect different spellings - confirm
    # against the installed version.
    model = XGBRegressor(random_state=fold, tree_method='gpu_hist', gpu_id=0, predictor='gpu_predictor')
    model.fit(xtrain, ytrain)

    preds_valid = model.predict(xvalid)
    preds_test = model.predict(xtest)
    final_predictions.append(preds_test)

    # RMSE as sqrt(MSE): gives the same value as `squared=False` but does not
    # depend on that keyword, which newer scikit-learn releases no longer accept.
    rmse = np.sqrt(mean_squared_error(yvalid, preds_valid))
    print(fold, rmse)
    scores.append(rmse)

print(np.mean(scores), np.std(scores))

0 0.7245705537554137
1 0.7242510333821858
2 0.7270667092065692
3 0.7268359229595335
4 0.7257178555909586
0.7256884149789322 0.0011430674400777338


In [24]:
# Standardization experiment: same pipeline as the baseline, plus a
# StandardScaler applied to the numerical columns inside every fold.

df = pd.read_csv("../input/30days-folds/train_folds.csv")
df_test = pd.read_csv("../input/30-days-of-ml/test.csv")
sample_submission = pd.read_csv("../input/30-days-of-ml/sample_submission.csv")

# Everything except the id, the target and the fold label is a model input.
useful_features = [col for col in df.columns if col not in ("id", "target", "kfold")]
object_cols = [col for col in useful_features if 'cat' in col]
numerical_cols = [col for col in useful_features if 'cont' in col]
df_test = df_test[useful_features]

final_predictions = []  # test-set predictions, one array per fold
scores = []             # validation RMSE, one value per fold
for fold in range(5):
    # Rows whose `kfold` equals the current fold are held out for validation.
    xtrain = df[df.kfold != fold].reset_index(drop=True)
    xvalid = df[df.kfold == fold].reset_index(drop=True)
    xtest = df_test.copy()

    ytrain = xtrain.target
    yvalid = xvalid.target

    xtrain = xtrain[useful_features]
    xvalid = xvalid[useful_features]

    # Both transformers are fitted on the training rows only and then applied
    # unchanged to the validation and test rows.
    ordinal_encoder = preprocessing.OrdinalEncoder()
    xtrain[object_cols] = ordinal_encoder.fit_transform(xtrain[object_cols])
    xvalid[object_cols] = ordinal_encoder.transform(xvalid[object_cols])
    xtest[object_cols] = ordinal_encoder.transform(xtest[object_cols])

    standard_scaler = preprocessing.StandardScaler()
    xtrain[numerical_cols] = standard_scaler.fit_transform(xtrain[numerical_cols])
    xvalid[numerical_cols] = standard_scaler.transform(xvalid[numerical_cols])
    xtest[numerical_cols] = standard_scaler.transform(xtest[numerical_cols])

    # NOTE(review): these GPU keyword arguments are the ones this notebook was
    # run with; newer XGBoost releases may expect different spellings - confirm
    # against the installed version.
    model = XGBRegressor(random_state=fold, tree_method='gpu_hist', gpu_id=0, predictor='gpu_predictor')
    model.fit(xtrain, ytrain)

    preds_valid = model.predict(xvalid)
    preds_test = model.predict(xtest)
    final_predictions.append(preds_test)

    # RMSE as sqrt(MSE): gives the same value as `squared=False` but does not
    # depend on that keyword, which newer scikit-learn releases no longer accept.
    rmse = np.sqrt(mean_squared_error(yvalid, preds_valid))
    print(fold, rmse)
    scores.append(rmse)

print(np.mean(scores), np.std(scores))

0 0.7241755479182882
1 0.7241138968948254
2 0.7267386816038165
3 0.7268357864120136
4 0.725667388462628
0.7255062602583143 0.001185068397378747


In [25]:
# Log-transformation experiment: apply log(1 + x) to every numerical column of
# both the training and the test frame, then run the baseline pipeline.

df = pd.read_csv("../input/30days-folds/train_folds.csv")
df_test = pd.read_csv("../input/30-days-of-ml/test.csv")
sample_submission = pd.read_csv("../input/30-days-of-ml/sample_submission.csv")

# Everything except the id, the target and the fold label is a model input.
useful_features = [col for col in df.columns if col not in ("id", "target", "kfold")]
object_cols = [col for col in useful_features if 'cat' in col]
numerical_cols = [col for col in useful_features if 'cont' in col]
# Explicit copy because the numerical columns are overwritten just below.
df_test = df_test[useful_features].copy()

# One vectorized call per frame instead of a Python loop over the columns.
# NOTE(review): log1p is only finite for values > -1 - confirm the numerical
# columns stay in that range.
df[numerical_cols] = np.log1p(df[numerical_cols])
df_test[numerical_cols] = np.log1p(df_test[numerical_cols])

final_predictions = []  # test-set predictions, one array per fold
scores = []             # validation RMSE, one value per fold
for fold in range(5):
    # Rows whose `kfold` equals the current fold are held out for validation.
    xtrain = df[df.kfold != fold].reset_index(drop=True)
    xvalid = df[df.kfold == fold].reset_index(drop=True)
    xtest = df_test.copy()

    ytrain = xtrain.target
    yvalid = xvalid.target

    xtrain = xtrain[useful_features]
    xvalid = xvalid[useful_features]

    # Fit the encoder on the training rows only, then reuse it unchanged for
    # the validation and test rows.
    ordinal_encoder = preprocessing.OrdinalEncoder()
    xtrain[object_cols] = ordinal_encoder.fit_transform(xtrain[object_cols])
    xvalid[object_cols] = ordinal_encoder.transform(xvalid[object_cols])
    xtest[object_cols] = ordinal_encoder.transform(xtest[object_cols])

    # NOTE(review): these GPU keyword arguments are the ones this notebook was
    # run with; newer XGBoost releases may expect different spellings - confirm
    # against the installed version.
    model = XGBRegressor(random_state=fold, tree_method='gpu_hist', gpu_id=0, predictor='gpu_predictor')
    model.fit(xtrain, ytrain)

    preds_valid = model.predict(xvalid)
    preds_test = model.predict(xtest)
    final_predictions.append(preds_test)

    # RMSE as sqrt(MSE): gives the same value as `squared=False` but does not
    # depend on that keyword, which newer scikit-learn releases no longer accept.
    rmse = np.sqrt(mean_squared_error(yvalid, preds_valid))
    print(fold, rmse)
    scores.append(rmse)

print(np.mean(scores), np.std(scores))

0 0.7245867071148808
1 0.7242518770698644
2 0.7269464580617742
3 0.7267203050271116
4 0.7255892005274619
0.7256189095602186 0.001087249680887288


In [34]:
# Polynomial-features experiment: append degree-2 interaction terms of the
# numerical columns to both frames, then run the baseline pipeline.

df = pd.read_csv("../input/30days-folds/train_folds.csv")
df_test = pd.read_csv("../input/30-days-of-ml/test.csv")
sample_submission = pd.read_csv("../input/30-days-of-ml/sample_submission.csv")

# Everything except the id, the target and the fold label is a model input.
useful_features = [col for col in df.columns if col not in ("id", "target", "kfold")]
object_cols = [col for col in useful_features if 'cat' in col]
# Defined here so the cell does not rely on `numerical_cols` left behind by an
# earlier cell.
numerical_cols = [col for col in useful_features if 'cont' in col]
df_test = df_test[useful_features]

# Fit the transformer once on the training frame and only *apply* it to the
# test frame, so both frames are expanded by the same fitted object.
# NOTE: with include_bias=False the output also contains the degree-1 terms,
# i.e. the first len(numerical_cols) poly_* columns repeat the original ones.
poly = preprocessing.PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
train_poly = poly.fit_transform(df[numerical_cols])
test_poly = poly.transform(df_test[numerical_cols])

poly_cols = [f"poly_{i}" for i in range(train_poly.shape[1])]
df_poly = pd.DataFrame(train_poly, columns=poly_cols)
df_test_poly = pd.DataFrame(test_poly, columns=poly_cols)

df = pd.concat([df, df_poly], axis=1)
df_test = pd.concat([df_test, df_test_poly], axis=1)

# Recompute the feature list so it includes the new poly_* columns.
useful_features = [col for col in df.columns if col not in ("id", "target", "kfold")]
object_cols = [col for col in useful_features if 'cat' in col]
df_test = df_test[useful_features]

final_predictions = []  # test-set predictions, one array per fold
scores = []             # validation RMSE, one value per fold
for fold in range(5):
    # Rows whose `kfold` equals the current fold are held out for validation.
    xtrain = df[df.kfold != fold].reset_index(drop=True)
    xvalid = df[df.kfold == fold].reset_index(drop=True)
    xtest = df_test.copy()

    ytrain = xtrain.target
    yvalid = xvalid.target

    xtrain = xtrain[useful_features]
    xvalid = xvalid[useful_features]

    # Fit the encoder on the training rows only, then reuse it unchanged for
    # the validation and test rows.
    ordinal_encoder = preprocessing.OrdinalEncoder()
    xtrain[object_cols] = ordinal_encoder.fit_transform(xtrain[object_cols])
    xvalid[object_cols] = ordinal_encoder.transform(xvalid[object_cols])
    xtest[object_cols] = ordinal_encoder.transform(xtest[object_cols])

    # NOTE(review): these GPU keyword arguments are the ones this notebook was
    # run with; newer XGBoost releases may expect different spellings - confirm
    # against the installed version.
    model = XGBRegressor(random_state=fold, tree_method='gpu_hist', gpu_id=0, predictor='gpu_predictor')
    model.fit(xtrain, ytrain)

    preds_valid = model.predict(xvalid)
    preds_test = model.predict(xtest)
    final_predictions.append(preds_test)

    # RMSE as sqrt(MSE): gives the same value as `squared=False` but does not
    # depend on that keyword, which newer scikit-learn releases no longer accept.
    rmse = np.sqrt(mean_squared_error(yvalid, preds_valid))
    print(fold, rmse)
    scores.append(rmse)

print(np.mean(scores), np.std(scores))

0 0.7281216924936591
1 0.7271284825532909
2 0.7291530577271739
3 0.7286137858619506
4 0.7277160251257038
0.7281466087523557 0.0007005423652217357


In [None]:
#binning the numerical features
#pd.cut


In [38]:
# One-hot-encoding experiment: replace the categorical columns with one-hot
# indicator columns (instead of ordinal codes) and fit one XGBoost model per fold.
df = pd.read_csv("../input/30days-folds/train_folds.csv")
df_test = pd.read_csv("../input/30-days-of-ml/test.csv")
sample_submission = pd.read_csv("../input/30-days-of-ml/sample_submission.csv")

# Everything except the id, the target and the fold label is a model input.
useful_features = [col for col in df.columns if col not in ("id", "target", "kfold")]
object_cols = [col for col in useful_features if 'cat' in col]
numerical_cols = [col for col in useful_features if 'cont' in col]
df_test = df_test[useful_features]

final_predictions = []  # test-set predictions, one array per fold
scores = []             # validation RMSE, one value per fold
for fold in range(5):
    # Rows whose `kfold` equals the current fold are held out for validation.
    xtrain = df[df.kfold != fold].reset_index(drop=True)
    xvalid = df[df.kfold == fold].reset_index(drop=True)
    xtest = df_test.copy()

    ytrain = xtrain.target
    yvalid = xvalid.target

    xtrain = xtrain[useful_features]
    xvalid = xvalid[useful_features]

    # The encoder is left on its default (sparse) output and densified with
    # .toarray(); this avoids the `sparse=` keyword, which was renamed in newer
    # scikit-learn releases, while producing the same dense arrays.
    # handle_unknown="ignore" encodes categories unseen during fit as all zeros.
    ohe = preprocessing.OneHotEncoder(handle_unknown="ignore")
    xtrain_ohe = ohe.fit_transform(xtrain[object_cols]).toarray()
    xvalid_ohe = ohe.transform(xvalid[object_cols]).toarray()
    xtest_ohe = ohe.transform(xtest[object_cols]).toarray()

    # All three arrays come from the same fitted encoder, so they share one
    # set of column names.
    ohe_cols = [f"ohe_{i}" for i in range(xtrain_ohe.shape[1])]
    xtrain_ohe = pd.DataFrame(xtrain_ohe, columns=ohe_cols)
    xvalid_ohe = pd.DataFrame(xvalid_ohe, columns=ohe_cols)
    xtest_ohe = pd.DataFrame(xtest_ohe, columns=ohe_cols)

    # Final design matrix: untouched numerical columns + one-hot columns.
    xtrain = pd.concat([xtrain[numerical_cols], xtrain_ohe], axis=1)
    xvalid = pd.concat([xvalid[numerical_cols], xvalid_ohe], axis=1)
    xtest = pd.concat([xtest[numerical_cols], xtest_ohe], axis=1)

    # NOTE(review): these GPU keyword arguments are the ones this notebook was
    # run with; newer XGBoost releases may expect different spellings - confirm
    # against the installed version.
    model = XGBRegressor(random_state=fold, tree_method='gpu_hist', gpu_id=0, predictor='gpu_predictor')
    model.fit(xtrain, ytrain)

    preds_valid = model.predict(xvalid)
    preds_test = model.predict(xtest)
    final_predictions.append(preds_test)

    # RMSE as sqrt(MSE): gives the same value as `squared=False` but does not
    # depend on that keyword, which newer scikit-learn releases no longer accept.
    rmse = np.sqrt(mean_squared_error(yvalid, preds_valid))
    print(fold, rmse)
    scores.append(rmse)

print(np.mean(scores), np.std(scores))

0 0.7244255014738967
1 0.7245139958781214
2 0.7264465446086561
3 0.7264028943362871
4 0.7257096926265366
0.7254997257846996 0.0008811227736574191


In [None]:
#one hot encoding of categorical variables + standardization of ohe & numerical


In [43]:
# Count-encoding experiment: for every categorical column add a numeric column
# holding how often the row's category occurs, then run the baseline pipeline.
df = pd.read_csv("../input/30days-folds/train_folds.csv")
df_test = pd.read_csv("../input/30-days-of-ml/test.csv")
sample_submission = pd.read_csv("../input/30-days-of-ml/sample_submission.csv")

# Everything except the id, the target and the fold label is a model input.
useful_features = [col for col in df.columns if col not in ("id", "target", "kfold")]
object_cols = [col for col in useful_features if col.startswith("cat")]
# Explicit copy because new columns are added to the test frame just below.
df_test = df_test[useful_features].copy()

for col in object_cols:
    # Category frequencies are learned from the training frame and mapped onto
    # BOTH frames. Counting inside the test frame separately would put the same
    # category on a different scale in train and test, so the model would see
    # test values that do not mean what it learned.
    train_counts = df[col].value_counts()
    df[f"cont_{col}"] = df[col].map(train_counts)
    # A category that never occurs in the training frame gets a count of 0.
    df_test[f"cont_{col}"] = df_test[col].map(train_counts).fillna(0).astype("int64")

# Recompute the feature list so it includes the new cont_cat* count columns;
# `object_cols` is unchanged because those names do not start with "cat".
useful_features = [col for col in df.columns if col not in ("id", "target", "kfold")]
df_test = df_test[useful_features]

final_predictions = []  # test-set predictions, one array per fold
scores = []             # validation RMSE, one value per fold
for fold in range(5):
    # Rows whose `kfold` equals the current fold are held out for validation.
    xtrain = df[df.kfold != fold].reset_index(drop=True)
    xvalid = df[df.kfold == fold].reset_index(drop=True)
    xtest = df_test.copy()

    ytrain = xtrain.target
    yvalid = xvalid.target

    xtrain = xtrain[useful_features]
    xvalid = xvalid[useful_features]

    # Fit the encoder on the training rows only, then reuse it unchanged for
    # the validation and test rows.
    ordinal_encoder = preprocessing.OrdinalEncoder()
    xtrain[object_cols] = ordinal_encoder.fit_transform(xtrain[object_cols])
    xvalid[object_cols] = ordinal_encoder.transform(xvalid[object_cols])
    xtest[object_cols] = ordinal_encoder.transform(xtest[object_cols])

    # NOTE(review): these GPU keyword arguments are the ones this notebook was
    # run with; newer XGBoost releases may expect different spellings - confirm
    # against the installed version.
    model = XGBRegressor(random_state=fold, tree_method='gpu_hist', gpu_id=0, predictor='gpu_predictor')
    model.fit(xtrain, ytrain)

    preds_valid = model.predict(xvalid)
    preds_test = model.predict(xtest)
    final_predictions.append(preds_test)

    # RMSE as sqrt(MSE): gives the same value as `squared=False` but does not
    # depend on that keyword, which newer scikit-learn releases no longer accept.
    rmse = np.sqrt(mean_squared_error(yvalid, preds_valid))
    print(fold, rmse)
    scores.append(rmse)

print(np.mean(scores), np.std(scores))

0 0.725003918437552
1 0.7243182079706946
2 0.7268538937829859
3 0.7269844614923947
4 0.7257976217671542
0.7257916206901563 0.0010337068440122927


In [None]:
#combine categorical columns
#cat1_cat2
#df[cat1] + "_" + df[cat2]

In [None]:
#combine categorical columns with numerical using groupby 
#and calculate mean, median, max, etc as new features

Unnamed: 0,id,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,...,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,target,kfold
0,1,B,B,B,C,B,B,A,E,C,...,0.160266,0.310921,0.389470,0.267559,0.237281,0.377873,0.322401,0.869850,8.113634,0
1,2,B,B,A,A,B,D,A,F,A,...,0.558922,0.516294,0.594928,0.341439,0.906013,0.921701,0.261975,0.465083,8.481233,2
2,3,A,A,A,C,B,D,A,D,A,...,0.375348,0.902567,0.555205,0.843531,0.748809,0.620126,0.541474,0.763846,8.364351,4
3,4,B,B,A,C,B,D,A,E,C,...,0.239061,0.732948,0.679618,0.574844,0.346010,0.714610,0.540150,0.280682,8.049253,3
4,6,A,A,A,C,B,D,A,E,A,...,0.420667,0.648182,0.684501,0.956692,1.000773,0.776742,0.625849,0.250823,7.972260,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
299995,499993,B,B,A,A,B,D,A,E,A,...,0.450538,0.934360,1.005077,0.853726,0.422541,1.063463,0.697685,0.506404,7.945605,4
299996,499996,A,B,A,C,B,B,A,E,E,...,0.508502,0.358247,0.257825,0.433525,0.301015,0.268447,0.577055,0.823611,7.326118,3
299997,499997,B,B,A,C,B,C,A,E,G,...,0.372425,0.364936,0.383224,0.551825,0.661007,0.629606,0.714139,0.245732,8.706755,1
299998,499998,A,B,A,C,B,B,A,E,E,...,0.424243,0.382028,0.468819,0.351036,0.288768,0.611169,0.380254,0.332030,7.229569,3


In [None]:
# Blend the folds: put each fold's test predictions side by side (one column
# per entry of `final_predictions`) and average across the columns, giving a
# single prediction per test row. The stack is built once and reused; the
# original's stand-alone `.shape` expression was discarded without being shown.
stacked_predictions = np.column_stack(final_predictions)
preds = stacked_predictions.mean(axis=1)

In [None]:
# Put the blended predictions into the submission template and save it.
# Item assignment is used instead of attribute assignment so the value always
# lands in a real "target" column, even if the template did not have one.
sample_submission["target"] = preds
sample_submission.to_csv('submission.csv', index=False)