## Step 1: Data Understanding and Exploration

Let's first have a look at the dataset and understand the size, attribute names etc.

This is my first Kaggle competition, my first Code notebook, etc.

Let me know if I'm doing something wrong.
I'm just starting out with vanilla models and now building up from that.

Happy Kaggling


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import linear_model, metrics
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import GridSearchCV

import os

# hide warnings
import warnings
warnings.filterwarnings('ignore')


In [None]:
# Load the training split of the competition data and preview its first rows
train = pd.read_csv(
    filepath_or_buffer="/kaggle/input/tabular-playground-series-jul-2021/train.csv"
)
train.head(n=5)

In [None]:
# Load the test split of the competition data and preview its first rows
test = pd.read_csv(
    filepath_or_buffer="/kaggle/input/tabular-playground-series-jul-2021/test.csv"
)
test.head(n=5)

target_carbon_monoxide, target_benzene, and target_nitrogen_oxides are the target columns

In [None]:
# Summary of the training frame: column names, dtypes and non-null counts
train.info()

In [None]:
# (rows, columns) of the training frame
train.shape

In [None]:
# (rows, columns) of the test frame
test.shape

In [None]:
# Column labels of the training frame (features plus the three targets)
train.columns

In [None]:
# Column labels of the test frame
test.columns

## Step 2: Visualising the Data

Let's now spend some time doing what is arguably the most important step - **understanding the data**.


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Scatter-plot matrix (seaborn pairplot) over the training frame
sns.pairplot(train)
plt.show()

In [None]:
# Pairwise correlations between the numeric columns of the training frame.
# `date_time` is still a string column at this point, so restrict the frame to
# numeric dtypes explicitly: pandas >= 2.0 raises on non-numeric columns in
# DataFrame.corr() instead of silently dropping them.
# (The empty plt.figure() call was removed; this cell only displays a table.)
corrmat = train.select_dtypes(include="number").corr()
corrmat

In [None]:
# Annotated heatmap of the correlation matrix.
# Only numeric columns are correlated: `date_time` is still a string column here
# and DataFrame.corr() raises on non-numeric data in pandas >= 2.0.
fig, ax = plt.subplots(figsize=(20, 20))
sns.heatmap(
    train.select_dtypes(include="number").corr(),
    annot=True,
    cmap="RdYlGn",
    ax=ax,
)
plt.show()

For each target, let's note the most correlated features:

1. `target_carbon_monoxide` => sensor1, sensor5
2. `target_benzene` => sensor2, sensor5
3. `target_nitrogen_oxides` => sensor5

In [None]:
# Benzene target over the training period
fig, axs = plt.subplots(figsize=(20, 6), ncols=1, nrows=1, sharex=False)

# `date_time` comes out of the CSV as strings. Parse it so matplotlib draws a
# real time axis rather than a categorical axis with one tick label per row.
axs.plot(pd.to_datetime(train["date_time"]), train['target_benzene'])
axs.set_title("Benzene")
axs.set_ylabel("target_benzene")
axs.set_xlabel("Date")
axs.grid(axis="both")

plt.show();

In [None]:
# Carbon-monoxide target over the training period
fig, axs = plt.subplots(figsize=(20, 6), ncols=1, nrows=1, sharex=False)

# `date_time` comes out of the CSV as strings. Parse it so matplotlib draws a
# real time axis rather than a categorical axis with one tick label per row.
axs.plot(pd.to_datetime(train["date_time"]), train['target_carbon_monoxide'])
axs.set_title("Carbon_Monoxide")
axs.set_ylabel("target_carbon_monoxide")
axs.set_xlabel("Date")
axs.grid(axis="both")

plt.show();

In [None]:
# Nitrogen-oxides target over the training period
fig, axs = plt.subplots(figsize=(20, 6), ncols=1, nrows=1, sharex=False)

# `date_time` comes out of the CSV as strings. Parse it so matplotlib draws a
# real time axis rather than a categorical axis with one tick label per row.
axs.plot(pd.to_datetime(train["date_time"]), train['target_nitrogen_oxides'])
axs.set_title("Nitrogen_Oxides")
axs.set_ylabel("target_nitrogen_oxides")
axs.set_xlabel("Date")
axs.grid(axis="both")

plt.show();

Plotting Histogram

In [None]:
# Distribution of the nitrogen-oxides target, in 50 bins
fig, axs = plt.subplots(nrows=1, ncols=1, sharex=False, figsize=(10, 6))

axs.hist(train['target_nitrogen_oxides'], bins=50)
axs.set(title="Nitrogen_Oxides", xlabel="Nitrogen Oxide Levels")
axs.grid(axis="both")

plt.show();

In [None]:
# Distribution of the benzene target, in 50 bins
fig, axs = plt.subplots(nrows=1, ncols=1, sharex=False, figsize=(10, 6))

axs.hist(train['target_benzene'], bins=50)
axs.set(title="Benzene", xlabel="Benzene Levels")
axs.grid(axis="both")

plt.show();

In [None]:
# Distribution of the carbon-monoxide target, in 50 bins
fig, axs = plt.subplots(nrows=1, ncols=1, sharex=False, figsize=(10, 6))

axs.hist(train['target_carbon_monoxide'], bins=50)
axs.set(title="Carbon Monoxide", xlabel="Carbon Monoxide Levels")
axs.grid(axis="both")

plt.show();

In [None]:
# Sub-frame holding only the three target columns
targetdf = train.loc[
    :, ['target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides']
]
targetdf.shape

In [None]:
# Scatter-plot matrix of the three targets against each other
sns.pairplot(targetdf)
plt.show()

## Step 3:-  Handling Data/ Preprocessing

In [None]:
# Peek at the raw `date_time` values to see how the timestamps are formatted
train['date_time'].head(5)

In [None]:
# Creating datetime variables
def timeFeatures(df):
    """Return a copy of ``df`` with ``date_time`` replaced by calendar features.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``date_time`` column that ``pd.to_datetime`` can parse.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``date_time`` and with four integer columns appended,
        in this order: ``day_of_week`` (Monday=0), ``day_of_year``, ``month``
        and ``hour``. The frame passed in is left untouched.

    Raises
    ------
    KeyError
        If ``df`` has no ``date_time`` column (e.g. it was already converted).
    """
    # Parse once into a local Series rather than a temporary column, so the
    # caller's frame is never mutated.
    timestamps = pd.to_datetime(df['date_time'])

    # assign() works on a copy; keyword order fixes the resulting column order.
    return df.drop(columns=['date_time']).assign(
        day_of_week=timestamps.dt.dayofweek,
        day_of_year=timestamps.dt.dayofyear,
        month=timestamps.dt.month,
        hour=timestamps.dt.hour,
    )

In [None]:
# Swap the raw `date_time` column for the derived calendar features in both frames.
# NOTE(review): timeFeatures drops `date_time`, so running this cell a second time
# on the already-converted frames raises a KeyError; re-run the loading cells first.
train = timeFeatures(train)
test = timeFeatures(test)

In [None]:
# Preview the training frame after the date/time feature engineering
train.head(5)

In [None]:
# Each target as an (n_samples, 1) array for the linear models
target_carbon_monoxide = train['target_carbon_monoxide'].to_numpy().reshape(-1, 1)
target_benzene = train['target_benzene'].to_numpy().reshape(-1, 1)
target_nitrogen_oxides = train['target_nitrogen_oxides'].to_numpy().reshape(-1, 1)

# Keep a handle on the full frame (features + targets) before stripping the targets
train_df = train
train = train.drop(
    columns=['target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides']
)

## Step 4:- Model Building and Evaluation

#### 1. Ridge Regression

In [None]:
# Regularisation strengths to search over
params = {
    'alpha': [
        0.0001, 0.001, 0.01, 0.05, 0.1,
        0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0,
        2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0,
        20, 50, 100, 500, 1000,
    ]
}

folds = 5  # number of cross-validation folds
ridge = Ridge()

# Grid search over alpha for the carbon-monoxide target, scored by negative MAE;
# train scores are kept so they can be plotted next to the validation scores.
model_cv_carbon_monoxide = GridSearchCV(
    ridge,
    params,
    scoring='neg_mean_absolute_error',
    cv=folds,
    return_train_score=True,
    verbose=1,
)
model_cv_carbon_monoxide.fit(train, target_carbon_monoxide)

In [None]:
# Regularisation strengths to search over
params = {
    'alpha': [
        0.0001, 0.001, 0.01, 0.05, 0.1,
        0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0,
        2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0,
        20, 50, 100, 500, 1000,
    ]
}

folds = 5  # number of cross-validation folds
ridge = Ridge()

# Grid search over alpha for the benzene target, scored by negative MAE;
# train scores are kept so they can be plotted next to the validation scores.
model_cv_benzene = GridSearchCV(
    ridge,
    params,
    scoring='neg_mean_absolute_error',
    cv=folds,
    return_train_score=True,
    verbose=1,
)
model_cv_benzene.fit(train, target_benzene)

In [None]:
# Regularisation strengths to search over
params = {
    'alpha': [
        0.0001, 0.001, 0.01, 0.05, 0.1,
        0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0,
        2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0,
        20, 50, 100, 500, 1000,
    ]
}

folds = 5  # number of cross-validation folds
ridge = Ridge()

# Grid search over alpha for the nitrogen-oxides target, scored by negative MAE;
# train scores are kept so they can be plotted next to the validation scores.
model_cv_nitrogen_oxide = GridSearchCV(
    ridge,
    params,
    scoring='neg_mean_absolute_error',
    cv=folds,
    return_train_score=True,
    verbose=1,
)
model_cv_nitrogen_oxide.fit(train, target_nitrogen_oxides)

In [None]:
# Short labels for the three pollutants being predicted.
# NOTE(review): this list does not appear to be referenced by the other cells shown here.
target_names = ['carbon_monoxide','benzene','nitrogen_oxides']

In [None]:
# Grid-search results of the nitrogen-oxides model, restricted to alpha <= 200
cv_results = pd.DataFrame(model_cv_nitrogen_oxide.cv_results_).loc[
    lambda results: results['param_alpha'] <= 200
]
cv_results.head()

In [None]:
def display_results(model_cv, max_alpha=200):
    """Plot mean train and validation scores against alpha for one grid search.

    The results are read from the ``model_cv`` argument itself, so each call
    shows the curve of the search that was passed in. No notebook-level
    variable is read or modified.

    Parameters
    ----------
    model_cv : fitted search object
        Must expose ``cv_results_`` with ``param_alpha``, ``mean_train_score``
        and ``mean_test_score`` entries (the searches in this notebook are
        created with ``return_train_score=True``).
    max_alpha : float or None, default 200
        Largest alpha to include in the plot; ``None`` plots every alpha.
    """
    results = pd.DataFrame(model_cv.cv_results_)

    # Keep alpha as float: an integer cast would collapse every alpha below 1
    # to 0 and stack those points on top of each other.
    results['param_alpha'] = results['param_alpha'].astype(float)
    if max_alpha is not None:
        results = results[results['param_alpha'] <= max_alpha]
    results = results.sort_values('param_alpha')

    # plotting mean test and train scores against alpha
    plt.plot(results['param_alpha'], results['mean_train_score'])
    plt.plot(results['param_alpha'], results['mean_test_score'])
    plt.xlabel('alpha')
    plt.ylabel('Negative Mean Absolute Error')
    plt.title("Negative Mean Absolute Error and alpha")
    plt.legend(['train score', 'test score'], loc='upper left')
    plt.show()

In [None]:
# Call the plotting helper with the nitrogen-oxides grid search
display_results(model_cv_nitrogen_oxide)

In [None]:
# Call the plotting helper with the benzene grid search
display_results(model_cv_benzene)

In [None]:
# Call the plotting helper with the carbon-monoxide grid search
display_results(model_cv_carbon_monoxide)

In [None]:
# Regularisation strength used for all three final Ridge models
alpha = 100

# One estimator per target. fit() returns the estimator itself, so fitting a
# single shared Ridge three times would leave every name bound to the same
# object, holding only the coefficients of whichever target was fitted last.
model_co = Ridge(alpha=alpha).fit(train, target_carbon_monoxide)
model_no = Ridge(alpha=alpha).fit(train, target_nitrogen_oxides)
model_benz = Ridge(alpha=alpha).fit(train, target_benzene)

In [None]:
# Load the competition's sample submission file
submission_csv = pd.read_csv("/kaggle/input/tabular-playground-series-jul-2021/sample_submission.csv")

In [None]:
# Preview the first rows of the sample submission
submission_csv.head(5)

In [None]:
#submission_csv["target_carbon_monoxide"] = model_co.predict(test)

In [None]:
#submission_csv["target_nitrogen_oxides"] = model_no.predict(test)

In [None]:
#submission_csv["target_benzene"] = model_benz.predict(test)

In [None]:
#submission_csv.head(5)

In [None]:
#outputting csv
#submission_csv.to_csv("submission_csv_ridge.csv", index=False)

We got a 2.08 Score with Ridge Regression

#### 2. Decision Tree

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split

In [None]:
# Decision-tree regressor with a fixed seed.
# NOTE(review): `dt` does not appear to be used by the later cells shown here,
# which build one separate tree per target.
dt = DecisionTreeRegressor(random_state=42)

In [None]:
# Split the full training frame (features + targets) 70/30.
# random_state pins the split itself; np.random.seed additionally sets NumPy's global seed.
np.random.seed(0)
df_train, df_test = train_test_split(train_df, train_size=0.7, test_size = 0.3, random_state = 100)

In [None]:
# (rows, columns) of the two partitions produced by the split
df_train.shape, df_test.shape

In [None]:
# Summary statistics of the 70% training partition
df_train.describe()

In [None]:
# Features: every column of the training partition except the three targets
X_train = df_train.drop(
    columns=['target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides']
)

# One target Series per pollutant, row-aligned with X_train
y_co, y_benz, y_no = (
    df_train[column]
    for column in ('target_carbon_monoxide', 'target_benzene', 'target_nitrogen_oxides')
)

In [None]:
# One independent tree per target, all seeded identically
dt_co, dt_no, dt_benz = (DecisionTreeRegressor(random_state=42) for _ in range(3))

In [None]:
# Fit each tree on its own target; fit() hands back the estimator it was called on
dt_benz = dt_benz.fit(X=X_train, y=y_benz)
dt_no = dt_no.fit(X=X_train, y=y_no)
dt_co = dt_co.fit(X=X_train, y=y_co)

In [None]:
# Predictions of each fitted tree on the rows it was trained on
y_train_pred_co = dt_co.predict(X=X_train)
y_train_pred_no = dt_no.predict(X=X_train)
y_train_pred_benz = dt_benz.predict(X=X_train)

In [None]:
# R² of the carbon-monoxide tree on its own training rows.
# NOTE(review): this is a training-set score. The held-out `df_test` partition
# is not evaluated in any cell shown here, so this says nothing about generalisation.
from sklearn.metrics import r2_score
r2_score(y_co, y_train_pred_co)

In [None]:
# R² of the nitrogen-oxides tree on its own training rows (training-set score)
r2_score(y_no, y_train_pred_no)

In [None]:
# R² of the benzene tree on its own training rows (training-set score)
r2_score(y_benz, y_train_pred_benz)

In [None]:
# Predict all three targets for the competition test frame.
# NOTE(review): the submission cells further down call predict() again rather
# than reusing these arrays.
y_test_pred_benz = dt_benz.predict(test)
y_test_pred_co = dt_co.predict(test)
y_test_pred_no = dt_no.predict(test)

In [None]:
# Reload the sample submission file as the template for the tree predictions
submission_csv = pd.read_csv("/kaggle/input/tabular-playground-series-jul-2021/sample_submission.csv")

In [None]:
# Fill the carbon-monoxide column with that tree's predictions on the test frame
submission_csv["target_carbon_monoxide"] = dt_co.predict(test)

In [None]:
# Fill the nitrogen-oxides column with that tree's predictions on the test frame
submission_csv["target_nitrogen_oxides"] = dt_no.predict(test)

In [None]:
# Fill the benzene column with that tree's predictions on the test frame
submission_csv["target_benzene"] = dt_benz.predict(test)

In [None]:
# Preview the filled-in submission frame
submission_csv.head(5)

In [None]:
#outputting csv
#submission_csv.to_csv("submission_csv_dt.csv", index=False)

The vanilla decision tree gave us a score of 0.48022.