In [1]:
!pip install xgboost yellowbrick
import joblib 
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns

from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.compose import ColumnTransformer
from sklearn.dummy import DummyClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from yellowbrick.classifier import ClassificationReport

mpl.rcParams["figure.dpi"] = 300
%matplotlib inline
%config InlineBackend.figure_format ='retina'

Collecting xgboost
  Downloading xgboost-1.3.3-py3-none-manylinux2010_x86_64.whl (157.5 MB)
[K     |████████████████████████████████| 157.5 MB 32 kB/s s eta 0:00:01    |████████████████████            | 98.1 MB 78.6 MB/s eta 0:00:01
[?25hCollecting yellowbrick
  Downloading yellowbrick-1.3.post1-py3-none-any.whl (271 kB)
[K     |████████████████████████████████| 271 kB 78.5 MB/s eta 0:00:01
Installing collected packages: yellowbrick, xgboost
Successfully installed xgboost-1.3.3 yellowbrick-1.3.post1


In [2]:
dailydata = pd.read_csv('/home/ec2-user/SageMaker/DA-assignment/cleanedata.csv')

In [3]:
dailydata.head(10)

Unnamed: 0.1,Unnamed: 0,date,cbl,maxtp,mintp,rain,wdsp,Rain_Or_Not
0,0,2003-08-16,1013.7,20.1,7.5,0.0,10.163227,0
1,1,2003-08-17,1007.5,21.3,11.6,1.1,10.163227,1
2,2,2003-08-18,1008.8,20.3,8.5,0.0,10.163227,0
3,3,2003-08-19,1014.3,19.9,11.3,0.0,10.163227,0
4,4,2003-08-20,1013.6,21.5,10.8,0.0,10.163227,0
5,5,2003-08-21,1007.0,24.0,13.0,0.1,10.163227,1
6,6,2003-08-22,1009.0,20.6,14.0,4.9,10.163227,1
7,7,2003-08-23,1014.7,25.1,16.2,0.0,10.163227,0
8,8,2003-08-24,1016.9,22.6,15.1,0.0,10.163227,0
9,9,2003-08-25,1015.6,18.9,15.4,0.0,10.163227,0


In [4]:
numeric_features = [
        "cbl",
        "maxtp",
        "mintp",
        "rain",
        "wdsp",
]

numeric_transformer = Pipeline(
    steps=[("scaler", StandardScaler())]
)

preprocessor = ColumnTransformer(
    transformers=[("num", numeric_transformer, numeric_features)]
)

In [5]:
X = dailydata[
        [ 
          "cbl",
        "maxtp",
        "mintp",
        "rain",
        "wdsp",
         
        ]
]

y = dailydata['Rain_Or_Not']

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.25, stratify=y)

In [7]:
print(X_train.shape)
print(X_test.shape)

(14722, 5)
(4908, 5)


In [8]:
print(y_train.shape)
print(y_test.shape)

(14722,)
(4908,)


In [9]:
dummy = DummyClassifier(strategy='stratified', random_state=42)

In [10]:
dummy.fit(X_train, y_train)
dummy_pred = dummy.predict(X_test)
print("Accuracy Score:", accuracy_score(y_test, dummy_pred))

Accuracy Score: 0.5209861450692747


In [11]:
models = {
    "LogReg": (
        Pipeline(
            steps=[
                ("preprocessor", preprocessor),
                (
                    "clf",
                    LogisticRegression(
                        penalty="l2",
                        class_weight="balanced",
                        random_state=42,
                        verbose=1,
                        n_jobs=-1,
                    ),
                ),
            ]
        ),
        {
            "clf__penalty": ["l2"],
            "clf__C": np.logspace(1, 5, 10),
            "clf__solver": ["liblinear", "newton-cg", "sag", "saga", "lbfgs"],
            "clf__max_iter": [100, 500, 1000, 1500],
        },
    ),
    "XGB": (
        Pipeline(
            steps=[
                ("preprocessor", preprocessor),
                (
                    "clf",
                    XGBClassifier(
                        base_score=0.5,
                        objective="binary:logistic",
                        reg_lambda=0.1,
                        n_estimators=1000,
                        max_delta_step=0,
                        scale_pos_weight=1,
                        subsample=0.5,
                        random_state=42,
                        n_jobs=-1,
                        verbosity=1,
                    ),
                ),
            ]
        ),
        {
            "clf__gamma": [i / 10.0 for i in range(1, 10)],
            "clf__reg_alpha": [1e-5, 1e-2, 0.1, 1, 10],
            "clf__learning_rate": [0.01, 0.05, 0.1],
            "clf__min_child_weight": [i for i in range(2, 10)],
            "clf__max_depth": [10, 15, 20]
        },
    ),
}


In [None]:
for name, (model, parameters) in models.items():
    print("----- {} -----".format(name))

    gs = GridSearchCV(
        model,
        parameters,
        cv=5,
        n_jobs=-1,
        scoring="roc_auc",
        verbose=1,
        return_train_score=True,
    )

    gs.fit(X_train, y_train)
    print("Best Parameters:", gs.best_params_)
    print("")
    print("Best Score:", gs.best_score_)

    joblib.dump(gs.best_estimator_, f"{name}.pkl", compress=1)
    joblib.dump(gs.cv_results_, f"{name}_results.pkl", compress=1)

    y_pred = gs.predict(X_test)

    print("")
    print("Precision Score :", precision_score(y_test, y_pred))
    print("Recall Score:", recall_score(y_test, y_pred))
    print("f1 Score:", f1_score(y_test, y_pred))
    print("")