In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Simple Logistic Regression using scikit-learn
For a detailed walkthrough, please read the full article: https://medium.com/p/86bf984f61f1

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from pandas.api.types import is_string_dtype, is_numeric_dtype

# Read the weather observations CSV into a DataFrame and preview the first rows
DATA_PATH = "../input/weather-dataset-rattle-package/weatherAUS.csv"
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
# A notebook cell only renders its last expression, so the summary statistics
# are printed explicitly; otherwise the describe() result is silently discarded.
print(df.describe())
# (rows, columns) of the raw dataset
df.shape

# Handle Missing Values

In [None]:
# Summarise missing data per column: absolute count and percentage of rows.
# The boolean null mask is computed once and reused for both aggregates.
null_mask = df.isnull()
missing_count = null_mask.sum()    # the count of missing values per column
value_count = null_mask.count()    # the count of all values (the mask itself has no NaN)
missing_percentage = (missing_count / value_count * 100).round(1)  # the percentage of missing values
missing_df = pd.DataFrame({'count': missing_count, 'percentage': missing_percentage})  # one row per column
print(missing_df)

In [None]:
# Remove the columns with a large amount of missing values
sparse_columns = ['Evaporation', 'Sunshine', 'Cloud3pm', 'Cloud9am']
df = df.drop(columns=sparse_columns)

# Remove the rows whose label (RainTomorrow) is missing
df = df.dropna(subset=["RainTomorrow"])

In [None]:
# Display the current (rows, columns) of df
df.shape

In [None]:
# Partition the feature columns by dtype; the label column RainTomorrow is excluded.
feature_columns = [name for name in df.columns if name != 'RainTomorrow']

# numeric columns
num_list = [name for name in feature_columns if is_numeric_dtype(df[name])]

# string columns (only considered when the column is not numeric)
cat_list = [
    name for name in feature_columns
    if not is_numeric_dtype(df[name]) and is_string_dtype(df[name])
]

print(num_list)
print(cat_list)

In [None]:
# Numerical Variables: impute missing values with the column mean.
# numeric_only=True restricts the mean to numeric columns; without it pandas >= 2.0
# raises a TypeError because the frame still contains string columns.
# Re-assigning (instead of inplace=True) keeps the cell free of in-place mutation.
df = df.fillna(df.mean(numeric_only=True))

In [None]:
# Categorical Variables: replace missing values with "Unknown".
# The filled column is assigned back to the frame: calling fillna(inplace=True) on a
# column selection (df[i]) emits a chained-assignment FutureWarning in pandas 2.x and
# does not update df at all once Copy-on-Write is enabled.
for i in cat_list:
    if df[i].isnull().any():
        df[i] = df[i].fillna("Unknown")

In [None]:
# Summary statistics for every column (include='all' covers non-numeric columns too)
df.describe(include = 'all')

# Feature Engineering & EDA
* exploratory data analysis
* handle outliers
* date manipulation
* encoding categorical data

In [None]:
# Univariate overview: one 5x5 figure per column, titled with the column name.
for col_name in df.columns:
    series = df[col_name]
    plt.figure(col_name, figsize=(5, 5))
    plt.title(col_name)
    if is_numeric_dtype(series):
        # numeric column -> histogram
        series.plot(kind='hist')
    elif is_string_dtype(series):
        # string column -> bar chart of the first 10 entries of value_counts()
        series.value_counts().head(10).plot(kind='bar')

In [None]:
# address outliers in "Rainfall": keep only rows strictly below the 90th percentile
RAINFALL_QUANTILE = 0.9
maximum = df['Rainfall'].quantile(RAINFALL_QUANTILE)
# .copy() makes the filtered frame independent of the original, so later column
# assignments on df do not raise SettingWithCopyWarning
df = df[df["Rainfall"] < maximum].copy()
df["Rainfall"].plot(kind = 'hist')
df.shape

In [None]:
# date manipulation: parse the Date column and store the month number as a string
observation_dates = pd.to_datetime(df['Date'])
df['Month'] = observation_dates.dt.month.map(str)

# bar chart of the number of rows per month
month_counts = df['Month'].value_counts()
month_counts.plot(kind='bar')

In [None]:
# encoding categorical data with label encoding: each listed column is replaced
# by the integer codes produced by a LabelEncoder fitted on that column
from sklearn.preprocessing import LabelEncoder

# the label column RainTomorrow is encoded together with the categorical features
categorical_features = [
    'Location', 'WindGustDir', 'WindDir9am', 'WindDir3pm',
    'RainToday', 'Month', 'RainTomorrow',
]

for feature in categorical_features:
    # a fresh encoder per column, fitted and applied in one step
    encoder = LabelEncoder()
    df[feature] = encoder.fit_transform(df[feature])

In [None]:
# multivariate analysis: pairwise correlation of the numeric columns
plt.figure(1, figsize = (15,15))
# Restrict to numeric columns explicitly: df still holds the string Date column,
# and pandas >= 2.0 raises on DataFrame.corr() when non-numeric columns are present
# (older versions dropped them silently, which this reproduces).
correlation = df.select_dtypes(include="number").corr()
sns.heatmap(correlation, cmap = "GnBu", annot = True)

In [None]:
# Keep only the modelling columns, in this order; the label RainTomorrow comes last
selected_columns = [
    'Month', 'Location', 'MinTemp', 'MaxTemp',
    'WindGustDir', 'WindGustSpeed', 'WindDir9am', 'WindDir3pm',
    'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm',
    'Pressure3pm', 'RainToday', 'RainTomorrow',
]
df = df[selected_columns]

# Split Data

In [None]:
# X - input features matrix: every column except the label.
# Selecting by name (rather than "all columns before the last one") keeps X and y
# consistent even if the column order of df changes.
X = df.drop(columns="RainTomorrow")

# y - output target vector: the label column RainTomorrow
y = df["RainTomorrow"]

In [None]:
# Hold out 33% of the rows as a test set; random_state fixes the shuffle
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

split_parts = train_test_split(X, y, test_size=0.33, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# shapes in the order: X_train, X_test, y_train, y_test
print(*(part.shape for part in split_parts))

# Model Building

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression (iteration cap of 300) on the training split;
# fit() returns the estimator itself, so reg is the fitted model
reg = LogisticRegression(max_iter=300).fit(X_train, y_train)

# Predicted class labels for the held-out test split
y_pred = reg.predict(X_test)

# Model Evaluation

In [None]:
from sklearn import metrics

# confusion matrix
# metrics.plot_confusion_matrix was deprecated in scikit-learn 1.0 and removed in 1.2;
# ConfusionMatrixDisplay.from_estimator is its replacement and likewise returns a
# ConfusionMatrixDisplay after drawing the plot.
confusion_matrix = metrics.ConfusionMatrixDisplay.from_estimator(reg, X_test, y_test, cmap = "GnBu")
# print the underlying matrix of counts rather than the display object's repr
print(confusion_matrix.confusion_matrix)

In [None]:
# Share of test rows whose predicted label equals the true label
accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

In [None]:
# ROC curve and AUC
# column 1 of predict_proba is the probability of the second class in reg.classes_
y_pred_proba = reg.predict_proba(X_test)[:,1]
fpr, tpr, threshold = metrics.roc_curve(y_test,  y_pred_proba)
auc = metrics.roc_auc_score(y_test, y_pred_proba)

# Labelled figure so the plot can be read on its own; the dashed diagonal is the
# reference line of a classifier with no discriminative power.
fig, ax = plt.subplots()
ax.plot(fpr, tpr, label=f"Logistic regression (AUC = {auc:.2f})")
ax.plot([0, 1], [0, 1], linestyle="--", color="grey", label="Chance")
ax.set(xlabel="False positive rate", ylabel="True positive rate", title="ROC curve")
ax.legend(loc="lower right")

print("AUC:", round(auc,2))