# July Tabular Playground - Baseline Models

#### Import required libraries and training data

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from scipy import stats
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [None]:
# Read the competition's training and test tables from the Kaggle input folder,
# then preview the first training rows.
train_path = '../input/tabular-playground-series-jul-2021/train.csv'
test_path = '../input/tabular-playground-series-jul-2021/test.csv'
df = pd.read_csv(train_path)
test = pd.read_csv(test_path)
df.head()


In [None]:
# Summary statistics of the training frame.
df.describe()

#### Import Supervised Learning Models

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error as MSE

### Create new Features based on Dates

Borrowed from https://www.kaggle.com/junhyeok99/automl-pycaret

In [None]:
# Numeric timestamp feature: parse the raw `date_time` values to nanosecond
# datetimes, view them as integer nanoseconds, and scale down by 10**9.
epoch_ns = df['date_time'].astype('datetime64[ns]').astype(np.int64)
df['date_time_2'] = epoch_ns / 10**9
#test['date_time'] = test['date_time'].astype('datetime64[ns]').astype(np.int64)/10**9

In [None]:
# Expand the timestamp into calendar features, then drop the raw column.
# The insertion order (year, month, week, day, dayofweek, time, hour) is
# significant: the test frame is engineered in the same order so the fitted
# models receive identically ordered feature columns.
df['date_time'] = pd.to_datetime(df['date_time'])
df['year'] = df['date_time'].dt.year
df['month'] = df['date_time'].dt.month
# `Series.dt.week` was deprecated in pandas 1.1 and removed in 2.0.
# `isocalendar().week` is its replacement; it yields a nullable UInt32 column,
# so cast back to the plain int64 that the old accessor produced.
df['week'] = df['date_time'].dt.isocalendar().week.astype(np.int64)
df['day'] = df['date_time'].dt.day
df['dayofweek'] = df['date_time'].dt.dayofweek
# Whole days elapsed since the earliest calendar date in this frame,
# computed vectorised on midnight-normalised timestamps.
day_stamp = df['date_time'].dt.normalize()
df['time'] = (day_stamp - day_stamp.min()).dt.days
df['hour'] = df['date_time'].dt.hour
df.drop(columns='date_time', inplace=True)

In [None]:
# Preview the first five rows after the date features were added.
df.head(5)

In [None]:
# Separate the three target columns from the feature matrix.
target_cols = ['target_benzene', 'target_nitrogen_oxides', 'target_carbon_monoxide']
x = df.drop(columns=target_cols)
# One Series per target, unpacked in the same order as `target_cols`.
y_bz, y_no, y_co = (df[col] for col in target_cols)

### Benzene

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# # Number of trees in random forest
# n_estimators = [int(x) for x in np.linspace(start = 300, stop = 5000, num = 10)]
# # Number of features to consider at every split
# max_features = ['auto', 'sqrt']
# # Maximum number of levels in tree
# max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
# max_depth.append(None)
# # Minimum number of samples required to split a node
# min_samples_split = [2, 3, 6, 10, 12]
# # Minimum number of samples required at each leaf node
# min_samples_leaf = [1, 3, 2, 4, 10, 7]
# # Method of selecting samples for training each tree
# bootstrap = [True, False]
# # Create the random grid
# random_grid = {'n_estimators': n_estimators,
#                'max_features': max_features,
#                'max_depth': max_depth,
#                'min_samples_split': min_samples_split,
#                'min_samples_leaf': min_samples_leaf,
#                'bootstrap': bootstrap}


# Single-candidate "grid" for the benzene model: the hyperparameters recorded
# from an earlier randomized search (the full search space is kept, commented
# out, above).
# `max_features=1.0` means "consider every feature at each split", which is
# what the string 'auto' meant for RandomForestRegressor. 'auto' was deprecated
# in scikit-learn 1.1 and removed in 1.3; the float form is accepted by both
# old and new releases.
random_grid = {
    'n_estimators': [2911],
    'min_samples_split': [3],
    'min_samples_leaf': [2],
    'max_features': [1.0],
    'max_depth': [60],
    'bootstrap': [True],
}


print(random_grid)


In [None]:
# Benzene model: wrap a default random forest in a randomized hyperparameter
# search over `random_grid`, using 10-fold cross-validation and negative MSE
# as the score.
rf = RandomForestRegressor()
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    n_iter=500,
    cv=10,
    verbose=2,
    random_state=42,
    n_jobs=-1,
    scoring='neg_mean_squared_error',
)
# The search itself is fitted in a later cell.


In [None]:
import time

# Fit the benzene search on the base features and report the wall-clock time.
t_start = time.time()
rf_random.fit(x, y_bz)
t_stop = time.time()

elapsed_seconds = t_stop - t_start
print('Time elapsed: {:.3f} seconds'.format(elapsed_seconds))

##### Look into the Random Search's Best Model

In [None]:
# Best mean cross-validated score of the benzene search (negative MSE).
rf_random.best_score_

# Recorded result without date features: best negative MSE -1.728175972337029

# Parameters for that run:
#{'n_estimators': 2000,
# 'min_samples_split': 5,
# 'min_samples_leaf': 2,
# 'max_features': 'auto',
# 'max_depth': 80,
# 'bootstrap': True}

# Recorded benzene result with date features: negative MSE about -1.55
# {'n_estimators': 2911,
#  'min_samples_split': 3,
#  'min_samples_leaf': 2,
#  'max_features': 'auto',
#  'max_depth': 60,
#  'bootstrap': True}


In [None]:
# Hyperparameter combination selected by the benzene search.
rf_random.best_params_

In [None]:
# Feature importances of the best benzene forest (plotted against `x.columns` below).
rf_random.best_estimator_.feature_importances_

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Horizontal bar chart of the benzene model's feature importances, sorted in
# ascending order and labelled with the feature column names.
best_bz_forest = rf_random.best_estimator_
importances_rf = pd.Series(best_bz_forest.feature_importances_, index=x.columns)
sorted_importances_rf = importances_rf.sort_values()
sorted_importances_rf.plot(kind='barh', color='blue')
plt.show()

In [None]:
# Keep the best benzene forest; it is used later to predict on the test set.
final_rf_bz = rf_random.best_estimator_

### Carbon Monoxide

In [None]:
# # Number of trees in random forest
# n_estimators = [int(x) for x in np.linspace(start = 500, stop = 4000, num = 6)]
# # Number of features to consider at every split
# max_features = ['auto', 'sqrt']
# # Maximum number of levels in tree
# max_depth = [int(x) for x in np.linspace(5, 110, num = 8)]
# max_depth.append(None)
# # Minimum number of samples required to split a node
# min_samples_split = [2, 3, 6, 10, 12]
# # Minimum number of samples required at each leaf node
# min_samples_leaf = [1, 3, 2, 4, 10, 7]
# # Method of selecting samples for training each tree
# bootstrap = [True, False]
# # Create the random grid
# random_grid_co = {'n_estimators': n_estimators,
#                'max_features': max_features,
#                'max_depth': max_depth,
#                'min_samples_split': min_samples_split,
#                'min_samples_leaf': min_samples_leaf,
#                'bootstrap': bootstrap}
# print(random_grid_co)


# Single-candidate "grid" for the carbon monoxide model: the hyperparameters
# recorded from an earlier randomized search (full search space kept,
# commented out, above).
random_grid_co = dict(
    n_estimators=[500],
    min_samples_split=[6],
    min_samples_leaf=[1],
    max_features=['sqrt'],
    max_depth=[50],
    bootstrap=[True],
)


print(random_grid_co)





In [None]:
# Carbon monoxide model: a default random forest wrapped in a randomized
# search over `random_grid_co`, using 5-fold cross-validation and negative MSE
# as the score.
rf_co = RandomForestRegressor()

rf_random_co = RandomizedSearchCV(
    estimator=rf_co,
    param_distributions=random_grid_co,
    n_iter=400,
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1,
    scoring='neg_mean_squared_error',
)
# The search itself is fitted in a later cell.


#### Add the observed Benzene target to the training features X

In [None]:
# Extend the feature matrix with the benzene target column (index-aligned) so
# the carbon monoxide model can use it as an input, then display the result.
x_bz = x.join(y_bz)
x_bz

In [None]:
import time

# Fit the carbon monoxide search on the benzene-augmented features and report
# the wall-clock time.
t_start = time.time()
rf_random_co.fit(x_bz, y_co)
t_stop = time.time()

elapsed_seconds = t_stop - t_start
print('Time elapsed: {:.3f} seconds'.format(elapsed_seconds))

##### Look into the Random Search's Best Model

In [None]:
# Best mean cross-validated score of the carbon monoxide search (negative MSE).
rf_random_co.best_score_

# Recorded result with the benzene feature, tuned parameters, no date features:
# negative MSE -0.3161621132509786
# {'n_estimators': 2000,
#  'min_samples_split': 3,
#  'min_samples_leaf': 2,
#  'max_features': 'sqrt',
#  'max_depth': 95,
#  'bootstrap': True}

# Recorded result with the benzene feature, tuned parameters, and date features:
# negative MSE -0.280873

# {'n_estimators': 500,
#  'min_samples_split': 6,
#  'min_samples_leaf': 1,
#  'max_features': 'sqrt',
#  'max_depth': 50,
#  'bootstrap': True}

In [None]:
# Hyperparameter combination selected by the carbon monoxide search.
rf_random_co.best_params_

In [None]:
# Feature importances of the best carbon monoxide forest (plotted against `x_bz.columns` below).
rf_random_co.best_estimator_.feature_importances_

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Horizontal bar chart of the carbon monoxide model's feature importances,
# sorted in ascending order and labelled with the `x_bz` column names.
best_co_forest = rf_random_co.best_estimator_
importances_rf = pd.Series(best_co_forest.feature_importances_, index=x_bz.columns)
sorted_importances_rf = importances_rf.sort_values()
sorted_importances_rf.plot(kind='barh', color='blue')
plt.show()

In [None]:
# Keep the best carbon monoxide forest; it is used later to predict on the test set.
final_rf_co = rf_random_co.best_estimator_

### Nitrogen Oxides

In [None]:
# from sklearn.model_selection import RandomizedSearchCV
# # Number of trees in random forest
# n_estimators = [int(x) for x in np.linspace(start = 200, stop = 3500, num = 10)]
# # Number of features to consider at every split
# max_features = ['auto', 'sqrt']
# # Maximum number of levels in tree
# max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
# max_depth.append(None)
# # Minimum number of samples required to split a node
# min_samples_split = [2, 5, 10]
# # Minimum number of samples required at each leaf node
# min_samples_leaf = [1, 2, 4]
# # Method of selecting samples for training each tree
# bootstrap = [True, False]
# # Create the random grid
# random_grid_no = {'n_estimators': n_estimators,
#                'max_features': max_features,
#                'max_depth': max_depth,
#                'min_samples_split': min_samples_split,
#                'min_samples_leaf': min_samples_leaf,
#                'bootstrap': bootstrap}
# print(random_grid_no)


# Single-candidate "grid" for the nitrogen oxides model: the hyperparameters
# recorded from an earlier randomized search (full search space kept,
# commented out, above).
# `max_features=1.0` means "consider every feature at each split", which is
# what the string 'auto' meant for RandomForestRegressor. 'auto' was deprecated
# in scikit-learn 1.1 and removed in 1.3; the float form is accepted by both
# old and new releases.
random_grid_no = {
    'n_estimators': [200],
    'min_samples_split': [2],
    'min_samples_leaf': [1],
    'max_features': [1.0],
    'max_depth': [80],
    'bootstrap': [True],
}

print(random_grid_no)

In [None]:
# Nitrogen oxides model: a default random forest wrapped in a randomized
# search over `random_grid_no`, using 5-fold cross-validation, n_iter=400,
# negative MSE as the score, and n_jobs=-1 for parallelism.
rf_no = RandomForestRegressor()
rf_random_no = RandomizedSearchCV(
    estimator=rf_no,
    param_distributions=random_grid_no,
    n_iter=400,
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1,
    scoring='neg_mean_squared_error',
)
# The search itself is fitted in a later cell.


#### Add the observed Carbon Monoxide target to the training features X

In [None]:
# Extend the benzene-augmented features with the carbon monoxide target column
# (index-aligned) for the nitrogen oxides model, then display the result.
x_bz_co = x_bz.join(y_co)
x_bz_co

In [None]:
import time

# Fit the nitrogen oxides search on the features augmented with benzene and
# carbon monoxide, and report the wall-clock time.
t_start = time.time()
rf_random_no.fit(x_bz_co, y_no)
t_stop = time.time()

elapsed_seconds = t_stop - t_start
print('Time elapsed: {:.3f} seconds'.format(elapsed_seconds))

In [None]:
# Best mean cross-validated score of the nitrogen oxides search (negative MSE).
rf_random_no.best_score_



# Recorded result with benzene/CO features, tuned parameters, no date features:
# negative MSE -8843.46
# {'n_estimators': 3500,
#  'min_samples_split': 2,
#  'min_samples_leaf': 2,
#  'max_features': 'auto',
#  'max_depth': 70,
#  'bootstrap': True}


# Recorded result with benzene/CO features, tuned parameters, and date features:
# negative MSE -8079.27
# {'n_estimators': 200,
#  'min_samples_split': 2,
#  'min_samples_leaf': 1,
#  'max_features': 'auto',
#  'max_depth': 80,
#  'bootstrap': True}

In [None]:
# Hyperparameter combination selected by the nitrogen oxides search.
rf_random_no.best_params_

In [None]:
# Feature importances of the best nitrogen oxides forest (plotted against `x_bz_co.columns` below).
rf_random_no.best_estimator_.feature_importances_

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Horizontal bar chart of the nitrogen oxides model's feature importances,
# sorted in ascending order and labelled with the `x_bz_co` column names.
best_no_forest = rf_random_no.best_estimator_
importances_rf = pd.Series(best_no_forest.feature_importances_, index=x_bz_co.columns)
sorted_importances_rf = importances_rf.sort_values()
sorted_importances_rf.plot(kind='barh', color='blue')
plt.show()

In [None]:
# Keep the best nitrogen oxides forest; it is used later to predict on the test set.
final_rf_no = rf_random_no.best_estimator_

## Final Predictions for test data

* The data frame names are misleading (for example, `test_no_dates` ends up holding the date-derived features), but the code works

In [None]:
# Display the test frame as loaded from the CSV.
test

In [None]:
# Work on an independent copy: a plain assignment would only alias `test`, so
# the in-place feature engineering applied to `test_no_dates` below would also
# rewrite (and drop `date_time` from) the raw test frame.
test_no_dates = test.copy()

In [None]:
# Preview the first five rows before the date features are added.
test_no_dates.head(5)

In [None]:
# Engineer the same date features, in the same column order, as were built for
# the training frame, so the fitted models receive identically ordered inputs.
# Numeric timestamp: integer nanoseconds scaled down by 10**9.
test_no_dates['date_time_2'] = test_no_dates['date_time'].astype('datetime64[ns]').astype(np.int64) / 10**9

test_no_dates['date_time'] = pd.to_datetime(test_no_dates['date_time'])
test_no_dates['year'] = test_no_dates['date_time'].dt.year
test_no_dates['month'] = test_no_dates['date_time'].dt.month
# `Series.dt.week` was deprecated in pandas 1.1 and removed in 2.0.
# `isocalendar().week` is its replacement; it yields a nullable UInt32 column,
# so cast back to the plain int64 that the old accessor produced.
test_no_dates['week'] = test_no_dates['date_time'].dt.isocalendar().week.astype(np.int64)
test_no_dates['day'] = test_no_dates['date_time'].dt.day
test_no_dates['dayofweek'] = test_no_dates['date_time'].dt.dayofweek
# Whole days elapsed since the earliest calendar date in THIS frame.
# NOTE(review): the training `time` feature is measured from the training
# frame's own first date, so the two offsets use different baselines —
# confirm this is intended.
test_day_stamp = test_no_dates['date_time'].dt.normalize()
test_no_dates['time'] = (test_day_stamp - test_day_stamp.min()).dt.days
test_no_dates['hour'] = test_no_dates['date_time'].dt.hour
test_no_dates.drop(columns='date_time', inplace=True)

In [None]:
# Summary statistics of the engineered test features.
test_no_dates.describe()

### Predict Using trained models

#### Predict Benzene first

In [None]:
# Predict benzene for the test rows from the engineered base features.
test_y_bz=final_rf_bz.predict(test_no_dates)

In [None]:
# Wrap the benzene predictions in a one-column frame so they can be joined
# onto the test features. The column is named `target_benzene`, exactly like
# the benzene column in the features the carbon monoxide model was fitted on
# (`x_bz`): scikit-learn >= 1.2 raises a ValueError at predict time when the
# feature names differ from those seen during fit (1.0 and 1.1 only warn).
test_y_bz = pd.DataFrame(test_y_bz, columns=['target_benzene'])
test_y_bz

In [None]:
# Append the predicted benzene column (index-aligned join) to the test features.
test_no_dates_bz=test_no_dates.join(test_y_bz)

In [None]:
# Display the test features extended with predicted benzene.
test_no_dates_bz

#### Predict Carbon Monoxide next

In [None]:
# Predict carbon monoxide from the test features plus predicted benzene.
test_y_co=final_rf_co.predict(test_no_dates_bz)

In [None]:
# Wrap the carbon monoxide predictions in a one-column frame so they can be
# joined onto the test features. The column is named `target_carbon_monoxide`,
# exactly like the corresponding column in the features the nitrogen oxides
# model was fitted on (`x_bz_co`): scikit-learn >= 1.2 raises a ValueError at
# predict time when the feature names differ from those seen during fit
# (1.0 and 1.1 only warn).
test_y_co = pd.DataFrame(test_y_co, columns=['target_carbon_monoxide'])
test_y_co

In [None]:
# Append the predicted carbon monoxide column (index-aligned join) as well.
test_no_dates_co_bz=test_no_dates_bz.join(test_y_co)

In [None]:
# Display the test features extended with both predicted columns.
test_no_dates_co_bz

#### Finally Predict Nitrogen Oxides

In [None]:
# Predict nitrogen oxides from the test features plus predicted benzene and carbon monoxide.
test_y_no=final_rf_no.predict(test_no_dates_co_bz)

#### Join all Predicted columns together

In [None]:
# Display the carbon monoxide predictions.
test_y_co

In [None]:
# Display the nitrogen oxides predictions (wrapped in a DataFrame only in the next cell).
test_y_no

In [None]:
# Assemble the prediction columns with index-aligned inner joins, in the order
# carbon monoxide, benzene, nitrogen oxides.
no_frame = pd.DataFrame(test_y_no)
submit = test_y_co.join(test_y_bz, how='inner').join(no_frame, how='inner')
# Rename positionally to the names of the training target columns.
submit.columns = ['target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides']
submit

In [None]:
# Re-read the raw test file; its `date_time` column is used for the submission below.
test2 = pd.read_csv('../input/tabular-playground-series-jul-2021/test.csv')

In [None]:
# Display the freshly loaded test frame.
test2

In [None]:
# Attach the original timestamps to the predictions via an index-aligned inner join.
submission=test2['date_time'].to_frame().join(submit, how='inner')

In [None]:
# Display the final submission frame.
submission

In [None]:
# Write the submission CSV without the index column.
submission.to_csv('20210711_RF.csv',index=False)