In [None]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
import pandas as pd

In [None]:
import sys

assert sys.version_info >= (3, 7)

In [None]:
from packaging import version
import sklearn

assert version.parse(sklearn.__version__) >= version.parse("1.0.1")

In [None]:
import tensorflow as tf

assert version.parse(tf.__version__) >= version.parse("2.8.0")

In [None]:
# Is this notebook running on Colab or Kaggle?
IS_COLAB = "google.colab" in sys.modules
IS_KAGGLE = "kaggle_secrets" in sys.modules

if not tf.config.list_physical_devices('GPU'):
    print("No GPU was detected. Neural nets can be very slow without a GPU.")
    if IS_COLAB:
        print("Go to Runtime > Change runtime and select a GPU hardware "
              "accelerator.")
    if IS_KAGGLE:
        print("Go to Settings > Accelerator and select GPU.")

In [None]:
def wrangle(filepath):
    # Read CSV file
    df = pd.read_csv(filepath)
    median = df["LST"].median()
    median = df["AAI"].median()
    median = df["CloudFraction"].median()
    median = df["NO2_strat"].median()
    median = df["NO2_total"].median()
    median = df["NO2_trop"].median()
    median = df["TropopausePressure"].median()
    #median = df["GT_NO2"].median()

    df["LST"].fillna(median, inplace=True)  # option 3
    df["AAI"].fillna(median, inplace=True)
    df["CloudFraction"].fillna(median, inplace=True)
    df["NO2_strat"].fillna(median, inplace=True)  # option 3
    df["NO2_total"].fillna(median, inplace=True)
    df["NO2_trop"].fillna(median, inplace=True)
    df["TropopausePressure"].fillna(median, inplace=True)  # option 3
    #df["GT_NO2"].fillna(median, inplace=True)




    return df

In [None]:
dfTin = wrangle(r"/content/Train (2).csv")
dfTst = wrangle(r"/content/Test (2).csv")
median = dfTin["GT_NO2"].median()
dfTin["GT_NO2"].fillna(median, inplace=True)
print(dfTin.info(), dfTst.info())
dfTin.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86584 entries, 0 to 86583
Data columns (total 14 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   ID_Zindi            86584 non-null  object 
 1   Date                86584 non-null  object 
 2   ID                  86584 non-null  object 
 3   LAT                 86584 non-null  float64
 4   LON                 86584 non-null  float64
 5   Precipitation       86584 non-null  float64
 6   LST                 86584 non-null  float64
 7   AAI                 86584 non-null  float64
 8   CloudFraction       86584 non-null  float64
 9   NO2_strat           86584 non-null  float64
 10  NO2_total           86584 non-null  float64
 11  NO2_trop            86584 non-null  float64
 12  TropopausePressure  86584 non-null  float64
 13  GT_NO2              86584 non-null  float64
dtypes: float64(11), object(3)
memory usage: 9.2+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6576

Unnamed: 0,ID_Zindi,Date,ID,LAT,LON,Precipitation,LST,AAI,CloudFraction,NO2_strat,NO2_total,NO2_trop,TropopausePressure,GT_NO2
0,ID_ENTGC7,1/1/19,PD01,45.601585,11.903551,0.0,16711.19057,0.230527,0.559117,2.4e-05,0.000117,16711.19057,14440.82126,31.0
1,ID_8JCCXC,1/1/19,PD04,45.371005,11.84083,3.047342,16711.19057,-0.074006,0.869309,2.4e-05,0.000127,16711.19057,14441.79815,42.0
2,ID_V3136Z,1/1/19,RO01,45.045825,12.060869,0.0,16711.19057,0.02447,0.67416,2.4e-05,8.6e-05,16711.19057,14437.38294,31.0
3,ID_KRVZDJ,1/1/19,RO02,45.104075,11.553241,1.200467,16711.19057,-0.010442,0.920054,2.4e-05,0.000124,16711.19057,14440.83831,30.0
4,ID_PR351A,1/1/19,RO03,45.038758,11.790152,1.274564,16711.19057,-0.176178,0.747464,2.4e-05,0.000116,16711.19057,14438.79037,58.0


In [None]:
dfTin.isnull().sum()

ID_Zindi              0
Date                  0
ID                    0
LAT                   0
LON                   0
Precipitation         0
LST                   0
AAI                   0
CloudFraction         0
NO2_strat             0
NO2_total             0
NO2_trop              0
TropopausePressure    0
GT_NO2                0
dtype: int64

In [None]:
dfTst.isnull().sum()

ID_Zindi              0
Date                  0
ID                    0
LAT                   0
LON                   0
Precipitation         0
LST                   0
AAI                   0
CloudFraction         0
NO2_strat             0
NO2_total             0
NO2_trop              0
TropopausePressure    0
dtype: int64

In [None]:
dfTin.shape

(86584, 14)

In [None]:
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(dfTin, test_size=0.2, random_state=42)

In [None]:
target =  "GT_NO2"
X = dfTin.drop(columns=target)
y = dfTin[target]

In [None]:
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

num_pipeline = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler())
    ])

In [None]:
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
cat_pipeline = make_pipeline(
    SimpleImputer(strategy="most_frequent"),
    OneHotEncoder(handle_unknown="ignore"))

In [None]:
from sklearn.compose import ColumnTransformer

num_attribs = ["LAT","LON", "Precipitation","LST","AAI","CloudFraction","NO2_strat","NO2_total","NO2_trop","TropopausePressure"]
cat_attribs = ["ID_Zindi","Date","ID"]

preprocess_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
        ("cat", cat_pipeline, cat_attribs),
    ])

In [None]:
X.shape

(86584, 13)

In [None]:
import numpy as np
from sklearn.compose import make_column_selector, make_column_transformer

preprocessing = make_column_transformer(
    (num_pipeline, make_column_selector(dtype_include=np.number)),
    (cat_pipeline, make_column_selector(dtype_include=object)),
)

In [None]:
X_train = preprocessing.fit_transform(X)
X_train.shape

(86584, 87767)

In [None]:
from sklearn.linear_model import LinearRegression

lin_reg = make_pipeline(preprocessing, LinearRegression())
lin_reg.fit(X, y)

In [None]:
predictions = lin_reg.predict(X)
predictions[:5].round(-2)  # -2 = rounded to the nearest hundred

array([  0.,   0.,   0.,   0., 100.])

In [None]:
y.iloc[:5].values

array([31., 42., 31., 30., 58.])

In [None]:
# extra code – computes the error ratios discussed in the book
error_ratios = predictions[:5].round(-2) / y.iloc[:5].values - 1
print(", ".join([f"{100 * ratio:.1f}%" for ratio in error_ratios]))

-100.0%, -100.0%, -100.0%, -100.0%, 72.4%


In [None]:
from sklearn.metrics import mean_squared_error

lin_rmse = mean_squared_error(y, predictions,
                              squared=False)
lin_rmse

0.005871785406565399

In [None]:
# extra code – computes the error stats for the linear model
from sklearn.model_selection import cross_val_score
lin_rmses = -cross_val_score(lin_reg, X, y,
                              scoring="neg_root_mean_squared_error", cv=10)
pd.Series(lin_rmses).describe()

count    10.000000
mean     14.911975
std       4.301218
min      10.376154
25%      11.968359
50%      13.548760
75%      15.885150
max      22.405740
dtype: float64

In [None]:
from sklearn.tree import DecisionTreeRegressor

tree_reg = make_pipeline(preprocessing, DecisionTreeRegressor(random_state=42))
tree_reg.fit(X, y)

In [None]:
tree_predictions = tree_reg.predict(X)
tree_rmse = mean_squared_error(y, tree_predictions,
                              squared=False)
tree_rmse

0.0

In [None]:
tree_rmses = -cross_val_score(tree_reg, X, y,
                              scoring="neg_root_mean_squared_error", cv=10)

In [None]:
pd.Series(tree_rmses).describe()

count    10.000000
mean     15.436670
std       4.573534
min      10.671189
25%      11.780460
50%      13.827006
75%      18.039793
max      22.853842
dtype: float64

In [None]:
#model makes a prediction
preds = lin_reg.predict(dfTst)
# Convert it to a dataframe
preds = pd.DataFrame(preds)
#get a sample of the preds dataframe
preds.head()

Unnamed: 0,0
0,30.096329
1,35.513155
2,37.605949
3,30.572238
4,32.520586


In [None]:
# Here we are converting the submission data to a dataframe
submission = pd.DataFrame({
        "ID_Zindi": dfTst["ID_Zindi"],
        "GT_NO2": preds[0]
    })

In [None]:
#here we are converting to a csv file
submission.to_csv('MY_submission27T.csv', index=False)

In [None]:
submission.sample(10)

Unnamed: 0,ID_Zindi,GT_NO2
3165,ID_KQX9JD,23.881475
2434,ID_K5HJPX,16.364914
3018,ID_JXLF1F,20.161276
458,ID_MCS1KG,27.902749
611,ID_M30885,31.585219
4884,ID_C3I091,19.952976
1183,ID_RRZKD6,25.794033
3016,ID_DGE4QY,23.669867
2361,ID_M79OLQ,20.425897
5386,ID_D8OLU5,22.701859


In [None]:
from sklearn.ensemble import RandomForestRegressor

forest_reg = make_pipeline(preprocessing,
                           RandomForestRegressor(random_state=42))

In [None]:
forest_reg.fit(X, y)
forest_predictions = forest_reg.predict(X)
forest_rmse = mean_squared_error(y, forest_predictions,
                                 squared=False)
forest_rmse

KeyboardInterrupt: 