# 1. Problem definition

Predicting the amount of rain based on data from radars and gauges. The results are measured by Mean Absolute Error.

# 2. Data

Data is taken from https://www.kaggle.com/c/how-much-did-it-rain-ii/data

# 3. Evaluation

The model will be considered good if it reaches a Mean Absolute Error of 25.5 or lower.

# 4. Features

The columns in the datasets are:

* Id:  A unique number for the set of observations over an hour at a gauge.
* minutes_past:  For each set of radar observations, the minutes past the top of the hour that the radar observations were carried out.  Radar observations are snapshots at that point in time.
* radardist_km:  Distance of gauge from the radar whose observations are being reported.
* Ref:  Radar reflectivity in dBZ
* Ref_5x5_10th:   10th percentile of reflectivity values in 5x5 neighborhood around the gauge.
* Ref_5x5_50th:   50th percentile
* Ref_5x5_90th:   90th percentile
* RefComposite:  Maximum reflectivity in the vertical column above gauge.  In dBZ.
* RefComposite_5x5_10th
* RefComposite_5x5_50th
* RefComposite_5x5_90th
* RhoHV:  Correlation coefficient (unitless)
* RhoHV_5x5_10th
* RhoHV_5x5_50th
* RhoHV_5x5_90th
* Zdr:    Differential reflectivity in dB
* Zdr_5x5_10th
* Zdr_5x5_50th
* Zdr_5x5_90th
* Kdp:  Specific differential phase (deg/km)
* Kdp_5x5_10th
* Kdp_5x5_50th
* Kdp_5x5_90th
* Expected:  Actual gauge observation in mm at the end of the hour.

# 5. Modelling

For this regression problem, we will use SGD Regressor, RandomForestRegressor, XGBRegressor and Lasso Regression.

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pprint import pprint
%matplotlib inline

# Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Models
from sklearn.linear_model import SGDRegressor, Lasso
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# Metrics
from sklearn.metrics import mean_absolute_error

In [None]:
# Load the competition training set straight from the zipped CSV.
df = pd.read_csv('../input/how-much-did-it-rain-ii/train.zip')

In [None]:
# Preview the first rows.
df.head()

In [None]:
# (rows, columns) of the raw frame.
df.shape

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Per-Id mean of every column, previewed for the first few Ids.
df.groupby(['Id']).mean().head()

In [None]:
# Bar chart of the ten features with the highest share of missing values
fig, ax = plt.subplots(figsize=(15, 5))
missing_share = df.isnull().sum() / len(df)
# Keep only features that actually have gaps, largest share first, top ten
df_na = missing_share[missing_share != 0].sort_values(ascending=False).head(10)
bar_positions = range(df_na.size)
ax.bar(bar_positions, df_na, width=0.5)
plt.xticks(bar_positions, df_na.index, rotation=0)
plt.ylim([0, 1])
plt.title('Top ten features with the most missing values')
plt.ylabel('Missing ratio')
plt.show()

## NaNs stand for very small values, so we can fill them with 0

In [None]:
# Replace every NaN in the frame with 0, modifying df in place.
df.fillna(0, inplace = True)

In [None]:
# Sanity check: per-column count of remaining missing values.
df.isna().sum()

In [None]:
# Histogram of the Ref_5x5_10th column after filling.
df['Ref_5x5_10th'].hist()

In [None]:
def _three_decimals(value):
    """Render *value* with exactly three digits after the decimal point."""
    return '%.3f' % value


# Show floats with three decimals in all subsequent pandas output
pd.set_option('display.float_format', _three_decimals)

### Let's look at the correlation between variables

In [None]:
# Pairwise correlation matrix of the columns, rendered with a coolwarm gradient.
corr_mat = df.corr()
corr_mat.style.background_gradient(cmap='coolwarm')

In [None]:
# The same matrix as a heat map, drawn on the 10x10 figure created here
# (fignum points matshow at that figure).
f = plt.figure(figsize=(10, 10))
plt.matshow(corr_mat, fignum=f.number)
plt.colorbar()

In [None]:
# Summary statistics of the target column.
df[["Expected"]].describe()

### We can see that the mean is 108.626, but the 75th percentile is only 3.810. Let's check for outliers.

In [None]:
# Scatter of the distinct target values; the points at the top are outliers
# that would distort our scores
plt.figure(figsize=(10, 10))
distinct_expected = df["Expected"].unique()
plt.scatter(np.arange(distinct_expected.size), distinct_expected)

In [None]:
from scipy import stats

# Percentile rank of the value 106 within the Expected column, i.e. how much
# of the data lies below the cut-off used for outlier removal.
print(stats.percentileofscore(df["Expected"], 106))

In [None]:
# Remove, in place, every row whose target is at or above the 106 cut-off
outlier_index = df.index[df["Expected"] >= 106]
df.drop(index=outlier_index, inplace=True)
df.head()

In [None]:
# Same scatter after the cut: the outliers are gone and it looks much better
plt.figure(figsize=(15, 10))
remaining_expected = df["Expected"].unique()
plt.scatter(np.arange(remaining_expected.size), remaining_expected)

### We have to work on grouped data (by 'Id' column)

In [None]:
# Collapse the radar readings to one row per Id by averaging every column.
df_grouped = pd.DataFrame(df.groupby(['Id']).mean())

In [None]:
# (number of Ids, number of columns) after grouping.
df_grouped.shape

In [None]:
df_grouped.head()

In [None]:
# Per-Id target; only inspected in the next cell — the modelling cells
# re-select 'Expected' themselves.
target = df_grouped['Expected']

In [None]:
target.head(10)

# Prepare the data

We can use a sample of the whole dataset to see which operations on the data are most successful.

In [None]:
# Reproducible random subset of 200,000 grouped rows for quick experiments.
small_df = df_grouped.sample(n = 200000, random_state = 42)

In [None]:
small_df.shape

In [None]:
# Per-column count of missing values in the sample.
small_df.isna().sum()

In [None]:
# Scaler used for the sample experiments (fitted on the sample's train split).
std_scaler = StandardScaler()

In [None]:
# Features: every grouped column except minutes_past and the target.
small_X = small_df.drop(['minutes_past', 'Expected'], axis = 1)
small_y = small_df['Expected']

In [None]:
# Hold out 20% of the sample for evaluation.
s_X_train, s_X_test, s_y_train, s_y_test = train_test_split(small_X, small_y, test_size = 0.2, random_state = 42)

In [None]:
# Fit the scaler on the training part only ...
s_X_train_sc = std_scaler.fit_transform(s_X_train)

In [None]:
# ... and reuse the fitted statistics for the held-out part.
s_X_test_sc = std_scaler.transform(s_X_test)

In [None]:
s_X_train_sc.shape, s_X_test_sc.shape

In [None]:
s_y_train.shape, s_y_test.shape

# Modelling

 We will use SGD Regressor, RandomForestRegressor, XGBRegressor and Lasso Regression.

## SGD

In [None]:
# Baseline 1: SGD regressor with default settings and a fixed seed.
sgd = SGDRegressor(random_state = 42)

In [None]:
sgd.fit(s_X_train_sc, s_y_train)

In [None]:
# Estimator's built-in score on the held-out sample
# (NOTE(review): presumably R^2 for scikit-learn regressors — confirm).
sgd.score(s_X_test_sc, s_y_test)

In [None]:
sgd_preds = sgd.predict(s_X_test_sc)

In [None]:
# MAE on the held-out sample (the project's evaluation metric).
mean_absolute_error(s_y_test, sgd_preds)

## Random Forest Regressor

In [None]:
# Baseline 2: random forest with default settings and a fixed seed.
rf = RandomForestRegressor(random_state = 42)

In [None]:
rf.fit(s_X_train_sc, s_y_train)

In [None]:
rf_preds = rf.predict(s_X_test_sc)

In [None]:
# MAE on the held-out sample.
mean_absolute_error(s_y_test, rf_preds)

## XGB Regressor

In [None]:
# Baseline 3: XGBoost regressor with default settings and a fixed seed.
# This estimator is also the base model of the randomized search below.
xgb = XGBRegressor(random_state = 42)

In [None]:
xgb.fit(s_X_train_sc, s_y_train)

In [None]:
xgb_preds = xgb.predict(s_X_test_sc)

In [None]:
# MAE on the held-out sample.
mean_absolute_error(s_y_test, xgb_preds)

## Lasso

In [None]:
# Baseline 4: Lasso regression with default settings and a fixed seed.
lasso = Lasso(random_state = 42)

In [None]:
lasso.fit(s_X_train_sc, s_y_train)

In [None]:
lasso_preds = lasso.predict(s_X_test_sc)

In [None]:
# MAE on the held-out sample.
mean_absolute_error(s_y_test, lasso_preds)

## Random Forest Regressor and XGB Regressor provided the best results, but let's focus on XGB Regressor, as Random Forest Regressor runs into memory issues when used with RandomizedSearchCV or when trained on the full data.

### XGB Regressor Randomized Search CV

In [None]:
# Candidate hyper-parameter values sampled by the randomized search
xgb_param_grid = dict(
    learning_rate=[0.05, 0.10, 0.15, 0.20, 0.25, 0.30],
    max_depth=[3, 4, 5, 6, 8, 10, 12, 15],
    min_child_weight=[1, 3, 5, 7],
    gamma=[0.0, 0.1, 0.2, 0.3, 0.4],
    colsample_bytree=[0.3, 0.4, 0.5, 0.7],
    n_estimators=[100, 400, 700, 1000],
)

In [None]:
# Randomized search over xgb_param_grid: 8 sampled candidates, 3-fold CV,
# ranked by negated MAE, seeded for reproducibility.
xgb_rscv = RandomizedSearchCV(xgb, param_distributions = xgb_param_grid, n_iter = 8, cv = 3, scoring = 'neg_mean_absolute_error', random_state = 42)

In [None]:
xgb_rscv.fit(s_X_train_sc, s_y_train)

In [None]:
# Search score on the held-out sample
# (NOTE(review): with this scoring it should be the negated MAE — confirm).
xgb_rscv.score(s_X_test_sc, s_y_test)

In [None]:
# Same score on the training sample, for comparison with the held-out score.
xgb_rscv.score(s_X_train_sc, s_y_train)

In [None]:
xgb_rscv_preds = xgb_rscv.predict(s_X_test_sc)

In [None]:
# MAE on the held-out sample.
mean_absolute_error(s_y_test, xgb_rscv_preds)

In [None]:
# Parameter combination selected by the search.
xgb_rscv.best_params_

### Let's build new models with learned parameters on full data

In [None]:
# Rebuild the per-Id averages from df (same expression as the earlier
# grouping cell).
df_grouped = pd.DataFrame(df.groupby(['Id']).mean())

In [None]:
# Features: every grouped column except minutes_past and the target.
X = df_grouped.drop(['minutes_past', 'Expected'], axis = 1)
y = df_grouped['Expected']

In [None]:
X.shape, y.shape

In [None]:
# Separate scaler for the full data; the final model is trained on its output.
std_scaler_full = StandardScaler()

In [None]:
# Hold out 10% of the full grouped data for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state = 42)

In [None]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
# Fit the scaler on the training part only ...
X_train_sc = std_scaler_full.fit_transform(X_train)

In [None]:
# ... and reuse the fitted statistics for the held-out part.
X_test_sc = std_scaler_full.transform(X_test)

### XGB Regressor on full data with parameters learned during Randomized Search CV

In [None]:
# Final model with hard-coded hyper-parameters.
# NOTE(review): presumably copied from an earlier xgb_rscv.best_params_
# output — re-check them if the search is re-run.
xgb_new_params = XGBRegressor(n_estimators = 100,
                              min_child_weight = 7,
                              max_depth = 10,
                              learning_rate = 0.05,
                              gamma = 0.1,
                              colsample_bytree = 0.3,
                              random_state = 42)

In [None]:
# Train on the full-data training split, scaled by std_scaler_full.
xgb_new_params.fit(X_train_sc, y_train)

In [None]:
xgb_new_params_preds = xgb_new_params.predict(X_test_sc)

In [None]:
# MAE on the 10% hold-out of the full data.
mean_absolute_error(y_test, xgb_new_params_preds)

## Make predictions on test data

In [None]:
# Load the competition test set from the zipped CSV.
test = pd.read_csv('../input/how-much-did-it-rain-ii/test.zip')

In [None]:
test.isna().sum()

In [None]:
# Same missing-value treatment as the training data: NaN -> 0, in place.
test.fillna(0, inplace = True)

In [None]:
# Compare the training feature columns with the test columns.
small_X.columns

In [None]:
test.columns

In [None]:
# One row per Id (mean of every column); Id becomes the index.
test = pd.DataFrame(test.groupby(['Id']).mean())

In [None]:
test.shape

In [None]:
# Drop minutes_past, which was also excluded from the training features.
test.drop(['minutes_past'], axis = 1, inplace = True)

In [None]:
# Scale with the scaler fitted on the full training split: xgb_new_params was
# trained on features produced by std_scaler_full, so the submission features
# must go through the same transformation (not the sample-fitted std_scaler).
X_sub = std_scaler_full.transform(test)

In [None]:
X_sub.shape

In [None]:
# Predict the rainfall for every test Id with the final model.
predictions = xgb_new_params.predict(X_sub)

In [None]:
submission = pd.DataFrame(predictions, columns = ['Expected'])

In [None]:
submission.index.name = 'Id'

In [None]:
submission.index = submission.index + 1

In [None]:
submission.to_csv('rain_submission.csv')