In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import lightgbm as lgb # model
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

In [2]:
train_df = pd.read_csv(r'C:\Users\miyay\Desktop\kaggle\Binary_Prediction_with_a_Rainfall_Dataset\Dataset\train.csv')
test_df = pd.read_csv(r'C:\Users\miyay\Desktop\kaggle\Binary_Prediction_with_a_Rainfall_Dataset\Dataset\test.csv')

In [3]:
test_df.head()

Unnamed: 0,id,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,windspeed
0,2190,1,1019.5,17.5,15.8,12.7,14.9,96.0,99.0,0.0,50.0,24.3
1,2191,2,1016.5,17.5,16.5,15.8,15.1,97.0,99.0,0.0,50.0,35.3
2,2192,3,1023.9,11.2,10.4,9.4,8.9,86.0,96.0,0.0,40.0,16.9
3,2193,4,1022.9,20.6,17.3,15.2,9.5,75.0,45.0,7.1,20.0,50.6
4,2194,5,1022.2,16.1,13.8,6.4,4.3,68.0,49.0,9.2,20.0,19.4


In [4]:
train_df.head()

# data distribution
# 1: 1650	0.7534246575342466
# 0: 540	0.2465753424657534

Unnamed: 0,id,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,windspeed,rainfall
0,0,1,1017.4,21.2,20.6,19.9,19.4,87.0,88.0,1.1,60.0,17.2,1
1,1,2,1019.5,16.2,16.9,15.8,15.4,95.0,91.0,0.0,50.0,21.9,1
2,2,3,1024.1,19.4,16.1,14.6,9.3,75.0,47.0,8.3,70.0,18.1,1
3,3,4,1013.4,18.1,17.8,16.9,16.8,95.0,95.0,0.0,60.0,35.6,1
4,4,5,1021.8,21.3,18.4,15.2,9.6,52.0,45.0,3.6,40.0,24.8,0


In [5]:
train_x = train_df.loc[:,['pressure', 'maxtemp', 'temparature', 'mintemp', 'dewpoint', 'humidity', 'cloud', 'sunshine', 'winddirection', 'windspeed']]
train_y = train_df.loc[:,'rainfall']
test_x = test_df.loc[:,['pressure', 'maxtemp', 'temparature', 'mintemp', 'dewpoint', 'humidity', 'cloud', 'sunshine', 'winddirection', 'windspeed']]

# add_feature
## diff_temp = maxtemp - mintemp
train_x["diff_temp"] = (train_x["maxtemp"] - train_x["mintemp"])
test_x["diff_temp"]  = (test_x["maxtemp"] - test_x["mintemp"])
## hpd = humidity / dewpoint
train_x["hpd"]        = train_x["humidity"]/(train_x["dewpoint"] + 1e-9)
test_x["hpd"]         = test_x["humidity"]/(test_x["dewpoint"] + 1e-9)
## var_temp = ((maxtemp-temp)**2 + (mintemp-temp)**2) / 2
train_x["var_temp"]  = ((train_x["maxtemp"]-train_x["temparature"])**2 + \
                        (train_x["mintemp"]-train_x["temparature"])**2)/2
test_x["var_temp"]   = ((test_x["maxtemp"]-test_x["temparature"])**2 + \
                        (test_x["mintemp"]-test_x["temparature"])**2)/2
## cps = cloud / sunshine
train_x["cps"] = train_x["cloud"]/(train_x["sunshine"] + 1e-9)
test_x["cps"] = test_x["cloud"]/(test_x["sunshine"] + 1e-9)

train_x.describe()

Unnamed: 0,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,windspeed,diff_temp,hpd,var_temp,cps
count,2190.0,2190.0,2190.0,2190.0,2190.0,2190.0,2190.0,2190.0,2190.0,2190.0,2190.0,2190.0,2190.0,2190.0
mean,1013.602146,26.365799,23.953059,22.170091,20.454566,82.03653,75.721918,3.744429,104.863151,21.804703,4.195708,4.655273,5.44984,13978540000.0
std,5.655366,5.65433,5.22241,5.05912,5.288406,7.800654,18.026498,3.626327,80.002416,9.898659,1.525268,9.296103,3.723848,32853080000.0
min,999.0,10.4,7.4,4.0,-0.3,39.0,2.0,0.0,10.0,4.4,-0.1,-150.000001,0.125,0.1960784
25%,1008.6,21.3,19.3,17.7,16.8,77.0,69.0,0.4,40.0,14.125,3.1,3.294118,2.75125,10.0
50%,1013.0,27.8,25.5,23.85,22.15,82.0,83.0,2.4,70.0,20.5,4.2,3.787234,4.88,34.49167
75%,1017.775,31.2,28.4,26.4,25.0,88.0,88.0,6.8,200.0,27.9,5.2,4.9375,7.3,220.0
max,1034.6,36.0,31.5,29.8,26.7,98.0,100.0,12.1,300.0,59.5,10.8,379.999998,38.77,100000000000.0


In [6]:
scaler = StandardScaler()
train_x = scaler.fit_transform(train_x)
test_x = scaler.transform(test_x)

# KFolds

In [7]:
# setting
FOLDS = 5
kf = KFold(n_splits=FOLDS, shuffle=True, random_state=42)
lgb_params = {"boosting_type":"gbdt",
              "objective": "binary",
              "metric": "auc", 
              "varbose": -1,
              "random_state": 42}
pred_train = np.zeros(len(train_df))
pred_val = np.zeros(len(test_df))

# fit_pred
for i, (train_idx, val_idx) in enumerate(kf.split(train_df)):
    print("#" * 20)
    print(f"# Fold {i}")
    print("#" * 20)

    X_train = train_x[train_idx,:]
    y_train = train_y[train_idx]
    X_val   = train_x[val_idx,:]
    y_val   = train_y[val_idx]

    # setting
    lgb_train = lgb.Dataset(X_train, y_train)
    lgb_val   = lgb.Dataset(X_val, y_val)
    
    # model train
    model = lgb.train(params=lgb_params, 
                      train_set=lgb_train, 
                      valid_sets=[lgb_train, lgb_val])

    # predict
    pred_train[val_idx] = model.predict(X_val)
    pred_val += model.predict(test_x)

# avg
pred_val /= FOLDS

# results
roc_auc_score(train_y, pred_train)

####################
# Fold 0
####################
[LightGBM] [Info] Number of positive: 1331, number of negative: 421
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000588 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2198
[LightGBM] [Info] Number of data points in the train set: 1752, number of used features: 14
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.759703 -> initscore=1.151053
[LightGBM] [Info] Start training from score 1.151053
####################
# Fold 1
####################
[LightGBM] [Info] Number of positive: 1306, number of negative: 446
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000156 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2224
[LightGBM] [Info] Number of data points in the train set: 1752, number of used fea

0.8765263748597082

In [8]:
submit = pd.DataFrame(test_df['id'])
submit['rainfall'] = pred_val

In [9]:
submit

Unnamed: 0,id,rainfall
0,2190,0.999270
1,2191,0.999100
2,2192,0.950919
3,2193,0.105055
4,2194,0.011691
...,...,...
725,2915,0.997989
726,2916,0.937842
727,2917,0.998670
728,2918,0.999394


In [10]:
submit.to_csv(r"./submission.csv", index=False)