In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

# For data visualization
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns; 
from sklearn.model_selection import train_test_split
from pylab import rcParams

from scipy import stats
from collections import Counter

# for modeling 
import sklearn
from xgboost.sklearn import XGBClassifier
from sklearn.metrics import confusion_matrix, classification_report, plot_precision_recall_curve, precision_recall_curve
from sklearn.metrics import roc_curve, roc_auc_score,  accuracy_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, cross_val_score, train_test_split, KFold
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import datasets, metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.metrics import plot_confusion_matrix, plot_roc_curve, classification_report

import imblearn
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE


# Plotly for interactive graphics 
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot

# to avoid warnings
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation and convergence warnings raised by the models below.
warnings.filterwarnings('ignore')
# Smoke test for the filter above: this warning is expected to produce no output.
warnings.warn("this will not show")

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the raw weatherAUS CSV from the Kaggle input directory.
data = pd.read_csv('/kaggle/input/weather-dataset-rattle-package/weatherAUS.csv')

In [None]:
# Work on a copy so the later in-place edits to df leave the loaded frame `data` untouched.
df =data.copy()

In [None]:
# Preview the first rows of the working frame.
df.head()

In [None]:
# (rows, columns) of the working frame.
df.shape

In [None]:
# Per-column missing-value count and percentage; only columns with gaps are shown,
# ordered from most to fewest missing records.
nv = (
    df.isnull()
    .sum()
    .to_frame('Missing_Records')
    .assign(**{'Percentage (%)': lambda t: 100 * t['Missing_Records'] / df.shape[0]})
)
nv[nv.Missing_Records>0].sort_values('Missing_Records', ascending=False)

In [None]:
# Remove rows where either rain label is missing, then confirm none remain.
df.dropna(axis=0, how='any', subset=['RainToday', 'RainTomorrow'], inplace=True)
df[['RainToday', 'RainTomorrow']].isna().sum()

In [None]:
def summary(df, pred=None):
    """Print and display a per-column overview of ``df``.

    For every column the overview lists its dtype, non-null count, number of
    distinct values (NaN counted as one value), null count, minimum and maximum.
    The table is shown with ``display`` (the IPython built-in), sorted by null
    count in descending order.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarize.
    pred : str, optional
        Name of a numeric column. When given, an extra ``'corr <pred>'`` column
        holds each numeric column's correlation with it.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If ``pred`` is given but is not a numeric column of ``df``.
    """
    types = df.dtypes
    counts = df.count()
    uniques = df.nunique(dropna=False)
    nulls = df.isnull().sum()
    # Drop missing values per column before taking the extremes: on object
    # columns, comparing NaN with strings raises TypeError in recent pandas.
    minimum = df.apply(lambda col: col.dropna().min())
    maximum = df.apply(lambda col: col.dropna().max())
    print('Data shape:', df.shape)

    parts = [types, counts, uniques, nulls, minimum, maximum]
    cols = ['Types', 'Counts', 'Uniques', 'Nulls', 'Min', 'Max']
    if pred is not None:
        # Correlation is only defined for numeric columns; other rows get NaN.
        parts.append(df.select_dtypes(include='number').corr()[pred])
        cols.append('corr ' + pred)

    # Named `overview` rather than `str` so the builtin is not shadowed.
    overview = pd.concat(parts, axis=1, sort=True)
    overview.columns = cols
    print('___________________________\nData Types:')
    print(overview.Types.value_counts())
    print('___________________________')
    display(overview.sort_values(by='Nulls', ascending=False))

# Summarize the working frame (no pred column is passed, so only the basic overview is shown).
summary(df)

In [None]:
# Encode the two rain flags as integers: 'Yes' -> 1, 'No' -> 0.
df[['RainToday', 'RainTomorrow']] = (
    df[['RainToday', 'RainTomorrow']].replace(['Yes', 'No'], [1, 0])
)

In [None]:
# Drop every row that still has a missing value in any column.
# NOTE(review): this keeps complete cases only; check how many rows survive before modelling.
df.dropna(inplace=True)

In [None]:
# Parse the Date strings, then derive compact integer Year and Month features from them.
df = df.assign(
    Date=lambda frame: pd.to_datetime(frame['Date'], format='%Y-%m-%d'),
).assign(
    Year=lambda frame: frame['Date'].dt.year.astype('int16'),
    Month=lambda frame: frame['Date'].dt.month.astype('int16'),
)
df.head()

In [None]:
# Correlation heatmap for all numeric variables.
# The frame still holds string columns (e.g. Location, wind directions) and the
# datetime Date column here; recent pandas raises if corr() is asked to process
# them, so restrict the input to numeric columns explicitly.
plt.figure(figsize=(20,10))
sns.heatmap(df.select_dtypes(include=np.number).corr(), annot=True, cmap="coolwarm");

In [None]:
# Sanity check: remaining null count per column after the row drops above.
df.isnull().sum()

In [None]:
# Year and Month have been extracted, so the raw Date column is removed.
# (`columns=` already identifies the axis, so no separate axis argument is needed.)
df.drop(columns=["Date"], inplace=True)

In [None]:
# Random peek at 10 rows; no random_state is passed, so the rows shown differ between runs.
df.sample(10)

In [None]:
# One boxplot per numeric column, each drawn on its own figure.
for col in df.select_dtypes(include=np.number).columns:
    plt.figure()
    df.boxplot(column=[col])

In [None]:
# One-hot encode the location and wind-direction columns; drop_first drops the first
# level of each so the dummy columns of one variable are not perfectly collinear.
df = pd.get_dummies(
    data=df,
    columns=['Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm'],
    drop_first=True,
)

# Outliers

In [None]:
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split

# Separate the target so the outlier detector is fitted on the features only.
y = df['RainTomorrow']
X = df.drop(['RainTomorrow'], axis=1)

# contamination=0.10 is the assumed share of outliers; random_state fixes the forest.
clf = IsolationForest(n_estimators=100, max_samples='auto', contamination=0.10, random_state=42)
clf.fit(X)

# the model will predict an inlier with a label of +1 and an outlier with a label of -1
y_pred = clf.predict(X)

# Reuse the labels computed above instead of scoring the whole frame a second time.
outliers_values = X[y_pred == -1]
outliers_values

In [None]:
# Rows the fitted forest labels as outliers (-1), and how many there are.
outliers_values = X.loc[clf.predict(X) == -1]
f"{outliers_values.shape[0]} rows are outliers"

In [None]:
# Keep only the rows labelled as inliers (+1), with the target column re-attached.
df = X.join(y).loc[clf.predict(X) == 1]
df

## Creating Models

In [None]:
# Split the frame into features (X) and the target (y).
X = df.drop(columns=["RainTomorrow"])
y = df["RainTomorrow"]

# Stratified random split with the default 75/25 proportions; the seed keeps it repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
)

# Report the shape of each piece, one per line.
print(
    f"shape of X_train: {X_train.shape}",
    f"shape of X_test\t: {X_test.shape}",
    f"shape of y_train: {y_train.shape}",
    f"shape of y_test\t: {y_test.shape}",
    sep="\n",
)

## 1) Logistic Regression

In [None]:
# Baseline logistic regression fitted on the training split.
logreg = LogisticRegression(random_state=42, solver='liblinear').fit(X_train, y_train)
# Class predictions for the held-out test split.
y_pred = logreg.predict(X_test)
y_pred

In [None]:
def conf_matrix(model, X_test, y_test, cmap='Blues'):
    """Plot the confusion matrix of ``model`` on ``X_test`` / ``y_test``.

    ``cmap`` is the matplotlib colormap name passed to the plot. The figure is
    shown immediately and nothing is returned.
    """
    plot_confusion_matrix(estimator=model, X=X_test, y_true=y_test, cmap=cmap)
    # Argument-less grid() toggles the grid state of the current axes.
    plt.grid()
    plt.show()

def roc_curve_custom(model, X_test, y_test):
    """Plot the ROC curve of ``model`` on ``X_test`` / ``y_test``.

    A dashed black diagonal from (0, 0) to (1, 1) is drawn on top of the curve
    as a reference line. The figure is shown immediately; nothing is returned.
    """
    plot_roc_curve(estimator=model, X=X_test, y=y_test)
    plt.plot([0, 1], [0, 1], linestyle='--', color='black')
    plt.show()
    
def evaluate(model, X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test, y_pred=None):
    """Print a standard evaluation report for a fitted classifier.

    Shows, in order: the confusion matrix, the classification report, the ROC
    curve, and the train/test scores returned by ``model.score``.

    Parameters
    ----------
    model : fitted estimator
        Must provide ``predict`` and ``score``.
    X_train, X_test, y_train, y_test : optional
        Data to evaluate on. The defaults are the objects these global names
        referred to when this cell was executed; re-run this cell after
        re-splitting the data, or pass the splits explicitly.
    y_pred : array-like, optional
        Predictions for ``X_test`` used in the classification report. When
        omitted they are computed from ``model`` itself, so the report always
        belongs to the model being evaluated rather than to predictions of
        whichever model ran before this function was defined.

    Returns
    -------
    None
    """
    if y_pred is None:
        y_pred = model.predict(X_test)

    # Confusion Matrix
    print('Confusion Matrix')
    print('-'*53)
    conf_matrix(model, X_test, y_test)
    print('\n') 
    
    # Classification Report
    print('Classification Report') 
    print('-'*53)
    print(classification_report(y_test, y_pred))
    print('\n')
    
    # ROC Curve
    print('ROC Curve')
    print('-'*53)
    roc_curve_custom(model, X_test, y_test)
    print('\n')
    
    # Checking model fitness: a large train/test gap hints at overfitting
    print('Checking model fitness') 
    print('-'*53)
    print('Train score:', round(model.score(X_train, y_train), 4))
    print('Test score: ', round(model.score(X_test, y_test), 4))
    print('\n')
    
# Evaluate the baseline logistic regression on the default train/test split.
evaluate(logreg)

In [None]:
# Resample the training split only; the test split is left as it is.
# A fixed random_state makes the synthetic samples, and every result derived
# from them, reproducible across runs.
X_train_resampled, y_train_resampled = SMOTE(random_state=42).fit_resample(X_train, y_train)

# Class counts before and after resampling.
print('Original')
print('-'*20)
print(y_train.value_counts())
print('\n')
print('SMOTE')
print('-'*20)
print(pd.Series(y_train_resampled).value_counts())

In [None]:
# Same logistic regression setup, fitted on the SMOTE-resampled training data.
logreg_smote = LogisticRegression(random_state=42, solver='liblinear').fit(
    X_train_resampled, y_train_resampled
)
# Predictions are still made on the original test split.
y_pred_smote = logreg_smote.predict(X_test)
y_pred_smote

In [None]:
# The train score is measured on the resampled training data; the test data keeps its default.
evaluate(logreg_smote, X_train=X_train_resampled, y_train=y_train_resampled, y_pred=y_pred_smote)

### Tuning

In [None]:
# Hyper-parameter grid for the logistic regression search.
logreg_params = dict(
    C=[1, 1e8, 1e16],
    fit_intercept=[True, False],
    max_iter=[50, 100, 150],
    random_state=[42],
)

# Exhaustive 3-fold search over the grid, scored by accuracy, using all CPU cores.
logreg_tuned = GridSearchCV(
    estimator=logreg,
    param_grid=logreg_params,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
)
logreg_tuned.fit(X_train, y_train)

In [None]:
# Best cross-validated score (rounded to 4 decimals) and the parameters that achieved it.
print(f"The best score:{round(logreg_tuned.best_score_, 4)!s}")
print(f"The best parameters: {logreg_tuned.best_params_!s}")

In [None]:
# Test-split predictions from the grid-searched logistic regression.
y_pred_logreg_tuned = logreg_tuned.predict(X_test)
y_pred_logreg_tuned

In [None]:
# Evaluate the grid-searched logistic regression on the default train/test split.
evaluate(logreg_tuned, y_pred=y_pred_logreg_tuned)

## 2) Gaussian Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB
# Gaussian Naive Bayes with default settings, fitted on the training split.
nb_model = GaussianNB().fit(X_train, y_train)
nb_model

In [None]:
# Overwrites the global y_pred with the Naive Bayes predictions for the test split.
y_pred = nb_model.predict(X_test)

In [None]:
# Evaluate Naive Bayes, passing its own predictions explicitly.
evaluate(nb_model, y_pred=y_pred)

In [None]:
# Mean score over a 10-fold cross-validation of the Naive Bayes estimator.
# NOTE(review): the folds are drawn from the test split (X_test / y_test), not the
# training split - confirm this is intended for a "final" score.
nb_finalscore = cross_val_score(estimator=nb_model, X=X_test, y=y_test, cv=10).mean()
nb_finalscore

## 3) KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# K-nearest-neighbours classifier with default settings, fitted on the training split.
knn_model = KNeighborsClassifier().fit(X_train, y_train)

In [None]:
# Overwrites the global y_pred with the default-KNN predictions for the test split.
y_pred = knn_model.predict(X_test)

In [None]:
# Share of test rows whose predicted label matches the true label.
accuracy_score(y_test, y_pred)

In [None]:
# Evaluate the default KNN model, passing its own predictions explicitly.
evaluate(knn_model, y_pred=y_pred)

In [None]:
# Candidate neighbour counts 1..49, searched exhaustively with 10-fold cross-validation.
knn_params = dict(n_neighbors=np.arange(1, 50))
knn = KNeighborsClassifier()
knn_cv = GridSearchCV(estimator=knn, param_grid=knn_params, cv=10)
knn_cv.fit(X_train, y_train)

In [None]:
# Best cross-validated score and the neighbour count that achieved it.
print(f"The best score:{knn_cv.best_score_!s}")
print(f"The best parameters: {knn_cv.best_params_!s}")

In [None]:
# Refit KNN with the neighbour count selected by the grid search (knn_cv) rather
# than a hard-coded value, so this cell stays in sync if the search result changes.
knn_tuned = KNeighborsClassifier(n_neighbors=knn_cv.best_params_["n_neighbors"])
knn_tuned = knn_tuned.fit(X_train, y_train)
# Overwrites the global y_pred with the tuned-KNN predictions for the test split.
y_pred = knn_tuned.predict(X_test)

knn_tuned_score = accuracy_score(y_test, y_pred)
knn_tuned_score

In [None]:
# Evaluate the tuned KNN model, passing its own predictions explicitly.
evaluate(knn_tuned, y_pred=y_pred)

## 4) SVM (Support Vector Machines)

In [None]:
# Support vector classifier with a degree-9 polynomial kernel and C=5.
# (The stray `?svm` help lookup was removed: it is an interactive IPython aid,
# not valid Python, and interrupts a Restart & Run All.)
svm = SVC(C=5,degree=9,kernel = 'poly')
svm.fit(X_train,y_train)
# Overwrites the global y_pred with the SVM predictions for the test split.
y_pred = svm.predict(X_test)

In [None]:
# Evaluate the SVM, passing its own predictions explicitly.
evaluate(svm, y_pred=y_pred)