In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# for visualizations
import matplotlib.pyplot as plt
import seaborn as sns
from pylab import rcParams

# for modeling 
import sklearn
from xgboost.sklearn import XGBClassifier
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import classification_report, confusion_matrix
try:
    # The plot_* helpers were removed in scikit-learn 1.2. Import them only
    # where they still exist so this cell keeps loading on newer versions.
    from sklearn.metrics import plot_precision_recall_curve, plot_confusion_matrix
except ImportError:
    pass
from sklearn.metrics import roc_curve, roc_auc_score, f1_score, recall_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, cross_val_score, train_test_split, KFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import datasets, metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.decomposition import PCA

import imblearn
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE

import missingno as msno
import warnings
warnings.filterwarnings("ignore")
warnings.warn("this will not show")

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

We want you to set the 'RainToday' variable as the dependent variable and set up a model that predicts whether there will be rain or not. 
The models will be built using:

- 'Decision Tree',
- 'Random Forest',
- 'LightGBM',
- 'GBM',
- 'XGBoost',
- 'CatBoost' models.

## DATA READING AND EXPLORING

In [None]:
# Load the full dataset once, then work on a fixed-size random sample to keep
# the experiments fast.
d = pd.read_csv('../input/weather-dataset-rattle-package/weatherAUS.csv')
# `sample` already returns a new DataFrame, so no extra copy of `d` is needed.
# The seed makes the sample (and every score derived from it) reproducible.
df = d.sample(n=min(10000, len(d)), random_state=42)
df.head()

In [None]:
df.info()

In [None]:
pd.DataFrame({"No. of unique values": list(df.nunique())}, index=df.columns)

In [None]:
df.duplicated().sum()

In [None]:
# Number of sampled rows where it rained today ('Yes') and where it did not ('No').
rain_today_counts = df['RainToday'].value_counts()
y = int(rain_today_counts.get('Yes', 0))
n = int(rain_today_counts.get('No', 0))
print(y,n)

In [None]:
df.describe().T

In [None]:
# Pairwise correlations of the numeric columns only: the categorical (object)
# columns cannot be correlated and make a bare `df.corr()` raise on pandas >= 2.0.
# A first look at which variables could go into the model.
df.select_dtypes(include="number").corr()

## SOME VISUALIZATIONS

In [None]:
#VISUALIZATION OF NAN  VALUES
msno.matrix(df)

In [None]:
# Keep only the rows where both RainToday and RainTomorrow are present.
df = df.dropna(subset=['RainToday', 'RainTomorrow'])

In [None]:
ax = df['RainTomorrow'].value_counts(normalize=True).plot.bar(color=["blue", "red"])
def labels(ax):
    """Write each bar's height, formatted as a percentage, just above the bar.

    Parameters
    ----------
    ax : object exposing ``patches`` and ``annotate``
        Every patch must provide ``get_x()`` and ``get_height()``.
    """
    for bar in ax.patches:
        height = bar.get_height()
        text = f"%{height*100:.2f}"
        # Shift slightly right of the bar's left edge and just above its top.
        position = (bar.get_x() + 0.15, height * 1.005)
        ax.annotate(text, position, size=11)
labels(ax)

In [None]:
def summary(df):
    """Display a per-column overview of ``df`` and print its dtype counts.

    For every column the overview lists the dtype, the non-null count, the
    number of distinct values (NaN counts as one value), the number of nulls
    and the minimum / maximum of the non-null values. Rows are shown sorted by
    null count, highest first.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise; it is not modified.
    """
    cols = ['Types', 'Counts', 'Uniques', 'Nulls', 'Min', 'Max']

    types = df.dtypes
    counts = df.count()
    uniques = df.nunique(dropna=False)
    nulls = df.isnull().sum()
    # Reduce each column after dropping its NaNs: comparing strings with NaN
    # raises TypeError, which breaks DataFrame.min()/max() on object columns.
    minimum = df.apply(lambda col: col.dropna().min())
    maximum = df.apply(lambda col: col.dropna().max())

    # Named `overview` rather than `str` so the builtin is not shadowed.
    overview = pd.concat([types, counts, uniques, nulls, minimum, maximum], axis=1, sort=True)
    overview.columns = cols

    display(overview.sort_values(by='Nulls', ascending=False))
    print('__________Data Types__________\n')
    print(overview.Types.value_counts())
summary(df)

In [None]:
# Encode the two Yes/No rain columns as numeric labels in new *_label columns.
from sklearn.preprocessing import LabelEncoder

lbe = LabelEncoder()
for binary_col in ("RainToday", "RainTomorrow"):
    # The encoder is re-fitted on each column before transforming it.
    df[binary_col + "_label"] = lbe.fit_transform(df[binary_col])

In [None]:
# Variables with more than 2 categories would need one-hot encoding.
# Instead, drop the date and the columns with many categories for the sake of
# time and memory consumption.
df = df.drop(columns=['Date', 'Location', 'WindDir9am', 'WindDir3pm', 'WindGustDir'])

In [None]:
summary(df)

In [None]:
# Fill the missing feature values with an IterativeImputer that uses a
# DecisionTreeRegressor as its estimator.

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.tree import DecisionTreeRegressor

target_col = 'RainTomorrow_label'

# Drop the raw Yes/No columns (their encoded *_label versions stay) and keep
# the target out of the imputer, so filled-in feature values cannot be derived
# from the label the models will later predict.
df_imputation = df.drop(['RainToday', 'RainTomorrow', target_col], axis=1)

#define variables to keep the index and the columns
index = df_imputation.index
columns = df_imputation.columns

#imputation steps (the estimator is seeded too, so results are reproducible)
imp_tree = IterativeImputer(random_state=0, estimator=DecisionTreeRegressor(random_state=0))
imp_tree.fit(df_imputation)
df_imputed = imp_tree.transform(df_imputation)

#transform imputed data in array format to dataframe
df_imputed_tree = pd.DataFrame(df_imputed, index=index, columns=columns)

# Re-attach the untouched target as the last column, as float like the
# imputed feature columns.
df_imputed_tree[target_col] = df[target_col].astype(float)

df_imputed_tree.isnull().sum()

In [None]:
df_imputed_tree.info()

## 1-Decision Tree Classifier

In [None]:
# Work on a copy of the imputed frame: `y` is the RainTomorrow label as an
# array, `x_dat` is every other column.
df2 = df_imputed_tree.copy()
y = df2["RainTomorrow_label"].to_numpy()
x_dat = df2.drop(columns=['RainTomorrow_label'])

In [None]:
# Min-max scale every feature column to the [0, 1] range.
# Use the column-wise DataFrame reductions explicitly: np.min / np.max on a
# DataFrame forward axis=None, which pandas 2.x reduces over the whole frame
# (one scalar) instead of per column.
col_min = x_dat.min(axis=0)
col_range = x_dat.max(axis=0) - col_min
# A constant column has zero range; divide by 1 there so it maps to 0, not NaN.
X = (x_dat - col_min) / col_range.replace(0, 1)

In [None]:
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score

In [None]:
dtc = DecisionTreeClassifier()

In [None]:
# Hold out 30% of the rows for testing, then fit a baseline decision tree on
# the remaining rows and predict the held-out ones.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

cart_model = DecisionTreeClassifier(random_state=42)
cart_model.fit(X_train, y_train)
y_pred = cart_model.predict(X_test)
y_pred

In [None]:
dtc_score=accuracy_score(y_test,y_pred)
dtc_score

In [None]:
c_dtc=confusion_matrix(y_test,y_pred)
c_dtc

In [None]:
print(classification_report(y_test,y_pred))

## Decision Tree Model tuning

In [None]:
tree_grid = {"max_depth": range(1,10),
            "min_samples_split" : list(range(2,50)) }

In [None]:
tree1 = DecisionTreeClassifier()
tree_cv = GridSearchCV(tree1, tree_grid, cv = 10, n_jobs = -1, verbose = 2)
tree_cv_model = tree_cv.fit(X_train, y_train)

In [None]:
print("Best Parameters: " + str(tree_cv_model.best_params_))

In [None]:
# Refit the tree with the hyperparameters the grid search actually selected,
# instead of values hard-coded from one particular run.
tree1 = DecisionTreeClassifier(**tree_cv_model.best_params_)
tree_tuned1 = tree1.fit(X_train, y_train)

In [None]:
y_pred = tree_tuned1.predict(X_test)
dtc_finalscore=accuracy_score(y_test, y_pred)
dtc_finalscore

In [None]:
c_dtc2=confusion_matrix(y_test,y_pred)
c_dtc2

In [None]:
print(classification_report(y_test,y_pred))

## 2-Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
rf_model=RandomForestClassifier()
rf_model.fit(X_train,y_train)

In [None]:
y_pred = rf_model.predict(X_test)

In [None]:
rf_score=accuracy_score(y_test,y_pred)
rf_score

In [None]:
c_rf=confusion_matrix(y_test,y_pred)
c_rf

In [None]:
print(classification_report(y_test,y_pred))

In [None]:
rf_model.predict(X_test)[0:10] # predictions for the first 10 rows of the test data.

In [None]:
rf_model.predict_proba(X_test)[0:10] # first value: probability of class 0, second value: probability of class 1.

In [None]:
# Test-set accuracy of a random forest as a function of the number of trees.
from sklearn.ensemble import RandomForestClassifier
n_estimators_grid = range(1, 75)
score_list=[]
for each in n_estimators_grid:
    rf2=RandomForestClassifier(n_estimators=each, random_state=42)
    rf2.fit(X_train, y_train)
    # Score once per model: every `score` call re-predicts the whole test set.
    accuracy_pct = 100*rf2.score(X_test, y_test)
    score_list.append(accuracy_pct)
    print("n_estimators=", each, "--> Accuracy:", accuracy_pct, "%")

# After the loop `rf2` is the last model fitted (the largest n_estimators).
plt.plot(list(n_estimators_grid), score_list)
plt.xlabel("n_estimators Value")
plt.ylabel("Accuracy %")
plt.show()

In [None]:
# Horizontal bar chart of the random forest's feature importances, in percent.
Importance = pd.DataFrame(
    {"Importance": rf_model.feature_importances_*100},
    index=X_train.columns,
)
importance_sorted = Importance.sort_values(by="Importance", axis=0, ascending=True)
importance_sorted.plot(kind="barh", color="b")

plt.xlabel("Variable Importance Levels");

In [None]:
# Test accuracy recorded as the random forest's final score.
# NOTE(review): `rf2` is whatever the n_estimators loop left behind, i.e. the
# last model it fitted (n_estimators=74), not the best-scoring one. Confirm
# this is the intended "final" random forest.
y_pred = rf2.predict(X_test)
rf_finalscore=accuracy_score(y_test, y_pred)
rf_finalscore

## 3-LightGBM

In [None]:
from lightgbm import LGBMClassifier

In [None]:
lgbm_model = LGBMClassifier().fit(X_train, y_train)

In [None]:
y_pred = lgbm_model.predict(X_test)

In [None]:
accuracy_score(y_test, y_pred)

## Model Tuning of LightGBM

In [None]:
# Candidate hyperparameter values for LightGBM.
# NOTE(review): no visible cell passes this grid to a search; the model below
# is built from hand-set values. Confirm whether a search was meant to run here.
lgbm_params = {
        'n_estimators': [100, 500, 1000, 2000],
        'subsample': [0.6, 0.8, 1.0],
        'max_depth': [3, 4, 5,6],
        'learning_rate': [0.1,0.01,0.02,0.05],
        "min_child_samples": [5,10,20]}

In [None]:
lgbm = LGBMClassifier(learning_rate = 0.1, 
                       max_depth = 4,
                       subsample = 0.6,
                       n_estimators = 500,
                       min_child_samples = 10)
lgbm_tuned = lgbm.fit(X_train,y_train)

In [None]:
y_pred = lgbm_tuned.predict(X_test)
lgbm_finalscore=accuracy_score(y_test, y_pred)
lgbm_finalscore

In [None]:
c_lgbm=confusion_matrix(y_test,y_pred)
c_lgbm

In [None]:
print(classification_report(y_test,y_pred))

## 4-Gradient Boosting Classifier

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
gbm_model = GradientBoostingClassifier().fit(X_train, y_train)

In [None]:
y_pred = gbm_model.predict(X_test)

In [None]:
accuracy_score(y_test, y_pred)

## Model Tuning of GBM

In [None]:
gbm = GradientBoostingClassifier(learning_rate = 0.05, 
                                 max_depth = 10,
                                min_samples_split = 10,
                                n_estimators = 100)

In [None]:
gbm_tuned =  gbm.fit(X_train,y_train)

In [None]:
y_pred = gbm_tuned.predict(X_test)
gbm_finalscore=accuracy_score(y_test,y_pred)
gbm_finalscore

In [None]:
c_gbm=confusion_matrix(y_test,y_pred)
c_gbm

In [None]:
print(classification_report(y_test,y_pred))

## 5-XGBOOST

In [None]:
!pip install xgboost

In [None]:
from xgboost import XGBClassifier

In [None]:
xgb_model = XGBClassifier().fit(X_train, y_train)

In [None]:
y_pred = xgb_model.predict(X_test)

In [None]:
accuracy_score(y_test, y_pred)

## Model Tuning of XGBoost

In [None]:
# XGBoost with hand-set hyperparameters; its test accuracy is kept for the
# final model comparison.
xgb_settings = {
    "learning_rate": 0.01,
    "max_depth": 6,
    "n_estimators": 100,
    "subsample": 0.8,
}
xgb = XGBClassifier(**xgb_settings)
xgb_tuned = xgb.fit(X_train, y_train)
y_pred = xgb_tuned.predict(X_test)
XGBoost_finalscore = accuracy_score(y_test, y_pred)
XGBoost_finalscore

In [None]:
c_xgb=confusion_matrix(y_test,y_pred)
c_xgb

In [None]:
print(classification_report(y_test,y_pred))

## 6-Catboost

In [None]:
!pip install catboost

In [None]:
from catboost import CatBoostClassifier, Pool

In [None]:
# verbose=0 silences CatBoost's per-iteration training log, which otherwise
# floods the cell output with one line per boosting round.
cat = CatBoostClassifier(verbose=0)

In [None]:
cat.fit(X_train, y_train)
y_pred = cat.predict(X_test)

In [None]:
cat_finalscore = accuracy_score(y_test, y_pred)
cat_finalscore

In [None]:
c_cat=confusion_matrix(y_test,y_pred)
c_cat

In [None]:
print(classification_report(y_test,y_pred))

In [None]:
print(dtc_finalscore,rf_finalscore,lgbm_finalscore, gbm_finalscore, XGBoost_finalscore, cat_finalscore)

### ----> Best Model is CatBOOST <----

In [None]:
# Bar chart comparing the final test accuracy of every model.
# Labels and scores are kept in one mapping so each bar carries the name of
# the model its score belongs to; two separately ordered lists can drift apart.
model_scores = {
    "DTM": dtc_finalscore,
    "RFM": rf_finalscore,
    "LGBM": lgbm_finalscore,
    "GBM": gbm_finalscore,
    "XGBM": XGBoost_finalscore,
    "CATBM": cat_finalscore,
}
idx = list(model_scores)
regressions = list(model_scores.values())

plt.figure(figsize=(6,4))
sns.barplot(x=idx,y=regressions)
plt.title('Model Comparision',color = 'orange',fontsize=20);