## If you find this kernel helpful, please upvote.

## Problem Definition
- Given weather measurements from Australia, can we predict whether or not it will rain tomorrow?

## We will use the following CLASSIFICATION METHODS for Prediction
- Logistic regression
- Naive Bayes
- K-Nearest Neighbor (KNN)
- Support Vector Machine (SVM)

We get predictions from these machine learning models and compare their scores.

## The data contains:
- Date 
- Location : Cities of Australia
- MinTemp 
- MaxTemp
- Rainfall
- Evaporation
- Sunshine
- WindGustDir : Wind Directions (East:E, West:W, North:N, South:S etc.)
- WindGustSpeed
- WindDir9am : Wind Directions (East:E, West:W, North:N, South:S etc.)
- WindDir3pm : Wind Directions (East:E, West:W, North:N, South:S etc.)
- WindSpeed9am
- WindSpeed3pm
- Humidity9am
- Humidity3pm
- Pressure9am
- Pressure3pm
- Cloud9am
- Cloud3pm
- Temp9am
- Temp3pm
- RainToday : 'No' 'Yes'
- RainTomorrow : 'No' 'Yes'

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# For data visualization
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns; sns.set()
# Plotly for interactive graphics 
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot

import missingno as msno
import warnings
warnings.filterwarnings("ignore")

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file in it.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## DATA READING AND EXPLORING

In [None]:
# Load the full weather data set; `data` itself is left untouched.
data = pd.read_csv("../input/weather-dataset-rattle-package/weatherAUS.csv")

# Work on a 10,000-row random subset. `sample` already returns a new DataFrame,
# so copying the whole frame first is unnecessary, and a fixed `random_state`
# makes every score computed further down reproducible between runs.
df = data.sample(10000, random_state=42)
df

In [None]:
df.info()

In [None]:
# Number of distinct values per column, shown as a one-column table.
df.nunique().to_frame(name="No. of unique values")

In [None]:
# Print the distinct values of every object-typed column.
for column in df.select_dtypes(include='object').columns:
    print(column, '-->', df[column].unique())

In [None]:
# Count the rows where it rained today ('Yes') and where it did not ('No').
rain_today = df['RainToday']
y = int((rain_today == 'Yes').sum())
n = int((rain_today == 'No').sum())
print(y, n)

In [None]:
df.duplicated().sum()

In [None]:
df.describe().T 

In [None]:
# Basic dimensions of the frame: its shape, its number of axes and its total number of cells.
print(df.shape)
print(df.ndim)
print(df.size)

In [None]:
# Pairwise correlations; they show which variables are worth putting into the
# model (for example, variables can be removed by looking at their importance).
# Only the numeric columns are passed in: the frame still holds text columns
# (Date, Location, wind directions, ...), which recent pandas versions refuse
# to correlate instead of silently skipping them.
df.select_dtypes(include='number').corr()

## SOME VISUALIZATIONS

In [None]:
# Annotated heatmap of the correlations between the numeric columns.
plt.figure(figsize=(16, 6))
# Text columns are excluded first; recent pandas versions raise on them in corr().
numeric_corr = df.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr, robust=True, fmt='.1g', linewidths=1.3, linecolor='gold', annot=True);

In [None]:
# Visualisation of the NaN values in the frame (missingno matrix plot).
msno.matrix(df)

In [None]:
# Remove, in place, every row whose RainToday or RainTomorrow value is missing.
label_columns = ['RainToday', 'RainTomorrow']
df.dropna(subset=label_columns, axis=0, inplace=True)

In [None]:
sns.countplot(x="RainToday",data=df)

## Multivariate imputation
- In multivariate imputation, we use ML Algorithms and before that we need to encode the categorical variables. 

In [None]:
def summary(df):
    """Show a per-column overview of ``df`` followed by its dtype counts.

    For every column the overview lists the dtype, the number of non-null
    values, the number of distinct values (NaN counts as one value), the number
    of nulls, and the minimum and maximum of the non-null values. Rows are
    ordered by descending null count.

    Args:
        df: pandas DataFrame to describe.

    Returns:
        None. The overview is rendered with IPython's ``display`` when IPython
        is importable and printed otherwise; the dtype counts are printed.
    """
    # `display` is only injected as a builtin inside IPython sessions, so it is
    # imported explicitly and replaced by `print` when IPython is unavailable.
    try:
        from IPython.display import display as show
    except ImportError:
        show = print

    overview = pd.concat(
        [
            df.dtypes,
            df.count(),
            # Without .shape[0] this would return the lists of unique values.
            df.apply(lambda col: col.unique().shape[0]),
            df.isnull().sum(),
            # Min/Max are taken per column on the non-null values: a frame-wide
            # df.min()/df.max() cannot compare text with NaN in object columns.
            df.apply(lambda col: col.dropna().min()),
            df.apply(lambda col: col.dropna().max()),
        ],
        axis=1,
        sort=True,
    )
    overview.columns = ['Types', 'Counts', 'Uniques', 'Nulls', 'Min', 'Max']

    show(overview.sort_values(by='Nulls', ascending=False))
    print('__________Data Types__________\n')
    print(overview.Types.value_counts())
summary(df)

In [None]:
# Label-encode the two rain columns into new integer "<name>_label" columns.
from sklearn.preprocessing import LabelEncoder

lbe = LabelEncoder()
for source_column in ("RainToday", "RainTomorrow"):
    df[source_column + "_label"] = lbe.fit_transform(df[source_column])

In [None]:
# One-hot encoding for variables with more than 2 categories.

# Location and the 9am/3pm wind directions have many categories, so they are
# dropped for the sake of time and memory consumption.
df.drop(columns=['Location', 'WindDir9am', 'WindDir3pm'], inplace=True)

# Encode the gust direction; the first dummy column is dropped.
df = pd.get_dummies(
    df,
    columns=['WindGustDir'],
    prefix=['WindGustDir'],
    drop_first=True,
)

In [None]:
# Show the overview again now that the categorical columns have been encoded.
# `summary` is already defined in an earlier cell, so it is simply called here
# instead of being defined a second time with an identical body.
summary(df)

In [None]:
# Multivariate imputation with a DecisionTreeRegressor as the estimator

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.tree import DecisionTreeRegressor

target_column = 'RainTomorrow_label'

# Drop the date and the raw text label columns. The encoded target is kept out
# of the imputer as well, so that the value being predicted is never used to
# fill in missing feature values.
df_imputation = df.drop(['Date', 'RainToday', 'RainTomorrow', target_column], axis=1)

# Keep the index and the columns: the imputer returns a plain array
index = df_imputation.index
columns = df_imputation.columns

# Imputation steps
imp_tree = IterativeImputer(random_state=0, estimator=DecisionTreeRegressor())
imp_tree.fit(df_imputation)
df_imputed = imp_tree.transform(df_imputation)

# Turn the imputed array back into a DataFrame and re-attach the untouched
# target, cast to float like the imputed columns.
df_imputed_tree = pd.DataFrame(df_imputed, index=index, columns=columns)
df_imputed_tree[target_column] = df[target_column].astype(float)

df_imputed_tree.isnull().sum()

In [None]:
df_imputed_tree.info()

In [None]:
df_imputed_tree.isnull().sum()

## PREDICTION WITH CLASSIFICATION METHODS
### Preparing the dependent and independent variables

In [None]:
# Work on a copy of the imputed frame: the target goes into `y` (as an array),
# every other column into `x_dat`.
df2 = df_imputed_tree.copy()
y = df2["RainTomorrow_label"].values
x_dat = df2.drop(columns=['RainTomorrow_label'])

In [None]:
summary(df2)

### Normalization of variables

In [None]:
# Min-max normalisation: rescale every feature column to the 0-1 range.
# If there are outlier values, they must be handled before coming here.
# The column-wise DataFrame.min()/max() are called explicitly: np.min/np.max on
# a whole DataFrame may, depending on the pandas version, reduce over both axes
# to a single scalar, which would scale all columns by one global range.
col_min = x_dat.min()
col_range = x_dat.max() - col_min
# A constant column has a zero range; divide it by 1 so it becomes 0, not NaN.
x = (x_dat - col_min) / col_range.replace(0, 1)

### Test-Train splitting

In [None]:
from sklearn.model_selection import (
    train_test_split,
    cross_val_score,
    ShuffleSplit,
    GridSearchCV,
)
from sklearn.metrics import (
    accuracy_score,
    mean_squared_error,
    roc_curve,
    roc_auc_score,
    classification_report,
    r2_score,
    confusion_matrix,
)

# Hold out 30% of the rows as the test split; the seed keeps the split fixed.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
)

# 1) Modeling of Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression with its default settings (including the default C).
lr_model = LogisticRegression().fit(x_train, y_train)

In [None]:
# Print the intercept and the coefficients of the fitted logistic regression.
print(lr_model.intercept_)
print(lr_model.coef_)# These are the coefficients of the model equation (as in ax+b); for example, the negative ones act in the opposite direction.

## Prediction of Logistic Regression

In [None]:
y_pred=lr_model.predict(x_test)

In [None]:
y_pred[0:10]# look at the first 10 predicted values

## Accuracy Test(for default) of Logistic regression

In [None]:
LR = accuracy_score(y_test,y_pred)
LR

## Proba values - probability

In [None]:
y_probs = lr_model.predict_proba(x_test)[:,1]
y_probs

In [None]:
# Turn the probabilities into class labels: 1 only when the probability exceeds 0.70.
y_pred = [int(probability > 0.70) for probability in y_probs]
y_pred[:10]

In [None]:
# Accuracy of the thresholded predictions.
log_score = accuracy_score(y_test, y_pred)
print(f"log score= {log_score}")

In [None]:
confusion_matrix(y_test,y_pred)

## Model tuning of Logistic regression

In [None]:
# Refit the logistic regression with the liblinear solver and show the estimator.
lr_model = LogisticRegression(solver="liblinear").fit(x_train, y_train)
lr_model

In [None]:
accuracy_score(y_test, lr_model.predict(x_test))

In [None]:
# 10-fold cross-validation of the logistic regression.
# The folds are drawn from the training split, so the held-out test split is
# never used to fit the model.
lr_finalscore = cross_val_score(lr_model, x_train, y_train, cv=10).mean()
lr_finalscore

# 2) Modeling of Gaussian Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

# Fit a Gaussian Naive Bayes classifier on the training split and show it.
nb_model = GaussianNB().fit(x_train, y_train)
nb_model

## Prediction of NB

In [None]:
y_pred = nb_model.predict(x_test)

## Accuracy score of NB

In [None]:
NB = accuracy_score(y_test,y_pred)
NB

In [None]:
confusion_matrix(y_test,y_pred)

## Model Tuning of NB

In [None]:
# 10-fold cross-validation of the Naive Bayes model.
# The folds are drawn from the training split, so the held-out test split is
# never used to fit the model.
nb_finalscore = cross_val_score(nb_model, x_train, y_train, cv=10).mean()
nb_finalscore

# 3) Modeling of KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Fit a K-Nearest Neighbours classifier with its default settings.
knn_model = KNeighborsClassifier()
knn_model = knn_model.fit(x_train, y_train)
# Show the fitted estimator, as the cells for the other models do. The previous
# `?knn_model` help lookup is IPython-only syntax and not valid Python.
knn_model

## Prediction of KNN

In [None]:
y_pred = knn_model.predict(x_test)

## Accuracy score of KNN

In [None]:
KNN = accuracy_score(y_test, y_pred)
KNN

In [None]:
confusion_matrix(y_test,y_pred)

## Model Tuning of KNN

In [None]:
# Candidate values for n_neighbors: the integers 1 through 49.
knn_params = {"n_neighbors": np.arange(1,50)}

In [None]:
knn = KNeighborsClassifier()
# Evaluate every candidate in `knn_params` with 10-fold cross-validation.
# n_jobs=-1 runs the fits in parallel, matching the SVC grid searches in this
# notebook; the selected parameters are unaffected.
knn_cv = GridSearchCV(knn, knn_params, cv=10, n_jobs=-1)
knn_cv.fit(x_train, y_train)

In [None]:
# Report the best cross-validated score and the parameters that produced it.
print(f"The best score:{knn_cv.best_score_}")
print(f"The best parameters: {knn_cv.best_params_}")

In [None]:
# 3 neighbours are chosen by hand (the author's judgement is that 1 is not a good choice).
knn = KNeighborsClassifier(n_neighbors=3)
knn_tuned = knn.fit(x_train, y_train)

In [None]:
knn_finalscore=knn_tuned.score(x_test, y_test)
knn_finalscore

# 4) Modeling of SVC

In [None]:
from sklearn.svm import SVC

# Fit a support vector classifier with the defaults (C: 1, kernel: 'rbf', degree: 3, ...).
svm_model = SVC()
svm_model = svm_model.fit(x_train, y_train)

## Prediction of SVC

In [None]:
y_pred = svm_model.predict(x_test)

## Accuracy score of SVC

In [None]:
# NOTE(review): this rebinds the name `SVC` -- until here the classifier class
# imported from sklearn.svm -- to an accuracy score. The model-comparison cell
# reads the score under this name; any code after this line that needs the
# class has to re-import it or reach it through its module.
SVC = accuracy_score(y_test,y_pred)
SVC

## Model Tuning of SVC
- For kernel : rbf

In [None]:
# The name `SVC` holds an accuracy score by this point (it is reassigned in the
# "Accuracy score of SVC" cell), so the classifier class is reached through its
# module instead.
from sklearn import svm

# Candidate C values 1..9 for the RBF kernel, evaluated with 10-fold CV.
svc_params = {"C": np.arange(1, 10)}

svc = svm.SVC(kernel="rbf")

svc_cv_model = GridSearchCV(svc, svc_params,
                            cv=10,
                            n_jobs=-1,
                            verbose=2)

svc_cv_model.fit(x_train, y_train)

In [None]:
print("The best parameters: " + str(svc_cv_model.best_params_))

In [None]:
# The name `SVC` holds an accuracy score by this point, so the classifier class
# is reached through its module.
from sklearn import svm

# Refit the RBF model with the C selected by the grid search above, rather
# than a value hard-coded from one particular run, and score it on the test split.
svc_tuned1 = svm.SVC(kernel="rbf", **svc_cv_model.best_params_).fit(x_train, y_train)
y_pred = svc_tuned1.predict(x_test)
accuracy_score(y_test, y_pred)

- For kernel : linear

In [None]:
# The name `SVC` holds an accuracy score by this point (it is reassigned in the
# "Accuracy score of SVC" cell), so the classifier class is reached through its
# module instead.
from sklearn import svm

# Candidate C values 1..9 for the linear kernel, evaluated with 10-fold CV.
svc_params = {"C": np.arange(1, 10)}

svc = svm.SVC(kernel="linear")

svc_cv_model = GridSearchCV(svc, svc_params,
                            cv=10,
                            n_jobs=-1,
                            verbose=2)

svc_cv_model.fit(x_train, y_train)

In [None]:
print("The best parameters: " + str(svc_cv_model.best_params_))

In [None]:
# The name `SVC` holds an accuracy score by this point, so the classifier class
# is reached through its module.
from sklearn import svm

# Refit the linear model with the C selected by the grid search above, rather
# than a value hard-coded from one particular run, and score it on the test split.
svc_tuned2 = svm.SVC(kernel="linear", **svc_cv_model.best_params_).fit(x_train, y_train)
y_pred = svc_tuned2.predict(x_test)
accuracy_score(y_test, y_pred)

- For kernel:rbf , C and gamma

In [None]:
# The name `SVC` holds an accuracy score by this point, so the classifier class
# is reached through its module.
from sklearn import svm

# Baseline RBF-kernel model fitted on the training split.
svc_model = svm.SVC(kernel="rbf").fit(x_train, y_train)

In [None]:
# Candidate values for the joint search over C and gamma (same eight values for both).
svc_params = {"C": [0.0001, 0.001, 0.1, 1, 5, 10 ,50 ,100],
             "gamma": [0.0001, 0.001, 0.1, 1, 5, 10 ,50 ,100]}

In [None]:
# The name `SVC` holds an accuracy score by this point (it is reassigned in the
# "Accuracy score of SVC" cell), so the classifier class is reached through its
# module instead.
from sklearn import svm

# Joint grid search over the values in `svc_params`, with 10-fold CV.
svc = svm.SVC()
svc_cv_model = GridSearchCV(svc, svc_params,
                            cv=10,
                            n_jobs=-1,
                            verbose=2)

svc_cv_model.fit(x_train, y_train)

In [None]:
print("The best parameters: " + str(svc_cv_model.best_params_))

In [None]:
# The name `SVC` holds an accuracy score by this point, so the classifier class
# is reached through its module.
from sklearn import svm

# Refit with the C and gamma selected by the grid search above, rather than
# values hard-coded from one particular run, and score it on the test split.
svc_tuned3 = svm.SVC(**svc_cv_model.best_params_).fit(x_train, y_train)
y_pred = svc_tuned3.predict(x_test)
svc_finalscore = accuracy_score(y_test, y_pred)
svc_finalscore

In [None]:
# Bar chart of the accuracy scores stored under LR, NB, KNN and SVC
# (at this point `SVC` is the score assigned earlier, not the classifier class).
scores_by_model = {"Log": LR, "NB": NB, "KNN": KNN, "SVC": SVC}
indexx = list(scores_by_model.keys())
regressions = list(scores_by_model.values())

plt.figure(figsize=(8, 6))
sns.barplot(x=indexx, y=regressions)

plt.xticks()
plt.title('Model Comparision', color='orange', fontsize=20);