# **Importing Libraries and Data**

In [None]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import SelectPercentile
from sklearn.feature_selection import chi2 , f_classif 
from sklearn.model_selection import train_test_split


In [None]:
# Location of the weatherAUS CSV inside the Kaggle input directory.
DATA_PATH = '../input/weather-dataset-rattle-package/weatherAUS.csv'
df = pd.read_csv(filepath_or_buffer=DATA_PATH)

# **Data Exploration**

In [None]:
df.columns.values

In [None]:
df.head()

In [None]:
df.describe()

In [None]:
df.info()

# **View our target variable RainTomorrow**

In [None]:
df["RainTomorrow"].isnull().sum()

In [None]:
df["RainTomorrow"].value_counts()

# **Dropping rows with any empty cell in RainTomorrow**

In [None]:
# Remove every row whose target label (RainTomorrow) is missing.
# The drop mutates df in place, so re-running earlier display cells will show the reduced frame.
df.dropna(subset=['RainTomorrow'], inplace=True)

In [None]:
# Sanity check: count of missing target labels after the drop.
df["RainTomorrow"].isnull().sum()

# **Feature Engineering of Date variable.**

In [None]:
df['Date']=pd.to_datetime(df['Date'],format='%Y-%m-%d')


In [None]:
df['Year']=df['Date'].dt.year
df['Month']=df['Date'].dt.month
df['day']=df['Date'].dt.day


In [None]:
df.drop('Date', axis = 1, inplace = True)


In [None]:
df.head()

# **Number of Nan values in each column**

In [None]:
# missingno bar chart of df, used here to eyeball which columns have many missing values.
msno.bar(df)

# **Getting rid of the columns with nulls more than 10% which will not be used in our model**


In [None]:
total = df.isnull().sum().sort_values(ascending=False)
percent = (df.isnull().sum() / df.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])


In [None]:

List = list(missing_data[missing_data['Percent'] > 0.10].index)
df.drop(List, axis=1, inplace=True)

In [None]:
df.shape

In [None]:

df.head(5)

# **Cleaning Data**

In [None]:
df_object = df.select_dtypes(include="O")
df_number = df.select_dtypes(exclude="O")


In [None]:
df_object.isnull().sum().sort_values(ascending=False)

In [None]:
# Fill missing categorical values with each column's most frequent value.
# DataFrame.mode() returns one row per tied mode; row 0 holds the first mode of
# every column, matching the per-column `mode()[0]` used before.
# The result is assigned back instead of calling `fillna(..., inplace=True)` on
# `df_object[col]`: that chained in-place form warns on recent pandas and does
# nothing at all under Copy-on-Write, silently leaving the NaNs in place.
df_object = df_object.fillna(df_object.mode().iloc[0])

In [None]:
df_object.isnull().sum().sort_values(ascending=False)

In [None]:
df_number.isnull().sum().sort_values(ascending=False)

In [None]:
# Fill missing numeric values with the column mean.
# DataFrame.mean() yields one mean per column and fillna aligns it by column
# label, so this is the vectorised equivalent of the former per-column loop.
# The result is assigned back instead of calling `fillna(..., inplace=True)` on
# `df_number[col]`: that chained in-place form warns on recent pandas and is a
# no-op under Copy-on-Write, which would leave NaNs for the later sklearn steps.
df_number = df_number.fillna(df_number.mean())

In [None]:
df_number.isnull().sum().sort_values(ascending=False)

In [None]:
df.head(5)

In [None]:
df_object

# **Data Transformations**

In [None]:
# LabelEncoder is already imported in the notebook's first cell.
# One encoder instance is re-fitted for every column, so after this cell
# `label` only remembers the classes of the last column it processed.
label = LabelEncoder()
as_text = df_object.astype(str)
df_object = as_text.apply(lambda column: label.fit_transform(column))
df_object

In [None]:
data = pd.concat([df_object, df_number], axis = 1)


In [None]:
data.head(5)

In [None]:
df.columns

In [None]:

# Continuous measurements to compare pairwise.
cols = [
    'MinTemp', 'MaxTemp', 'Temp9am', 'Temp3pm',
    'WindGustSpeed', 'WindSpeed3pm', 'Pressure9am', 'Pressure3pm',
]
# Scatter plots off the diagonal, histograms on it; plotted from df, not from `data`.
sns.pairplot(data=df.loc[:, cols], kind='scatter', diag_kind='hist')

# **Splitting Data to X,y**

In [None]:
# Target is the RainTomorrow column; the features are every other column.
y = data['RainTomorrow']
X = data.drop(columns=['RainTomorrow'])


# **Correlation Heatmap**


In [None]:
# Annotated correlation matrix of the feature columns on an explicit 15x10 figure.
fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(X.corr(), annot=True, linewidth=1, linecolor='black', ax=ax)

# **Feature Selection**

In [None]:
print('Original X Shape is ' , X.shape)

# Capture the column labels now: fit_transform returns a bare NumPy array, so
# the names would otherwise be lost. If X is already an array (for example the
# cell was re-run), fall back to reporting the boolean mask only.
feature_names = getattr(X, 'columns', None)

# Keep the top 50% of features ranked by the f_classif score against y.
# NOTE(review): the selector is fitted on all rows, i.e. before the train/test
# split performed later in the notebook, so test rows influence the selection.
FeatureSelection = SelectPercentile(score_func=f_classif, percentile=50)
X = FeatureSelection.fit_transform(X, y)

# Showing X dimension and which features survived.
print('X Shape is ' , X.shape)
selected_mask = FeatureSelection.get_support()
if feature_names is not None:
    print('Selected Features are : ' , list(feature_names[selected_mask]))
else:
    print('Selected Features are : ' , selected_mask)


# **MinMaxScaler for Data**

In [None]:
# Rescale every feature to the [0, 1] range.
# NOTE(review): the scaler is fitted on all rows, i.e. before the train/test split below.
scaler = MinMaxScaler(feature_range=(0, 1), copy=True)
X = scaler.fit(X).transform(X)

# Show the first three scaled rows.
print('X \n' , X[:3])

# **Splitting data**

In [None]:
# Hold out 33% of the rows for testing; the fixed seed makes the shuffle repeatable.
split_parts = train_test_split(X, y, test_size=0.33, random_state=44, shuffle=True)
X_train, X_test, y_train, y_test = split_parts


# **Applying LogisticRegression Model**

In [None]:

from sklearn.linear_model import LogisticRegression

# L2-penalised logistic regression using the 'sag' solver.
LogisticRegressionModel = LogisticRegression(
    penalty='l2',
    solver='sag',
    C=1.0,
    random_state=33,
).fit(X_train, y_train)

# Accuracy on the training and held-out partitions.
train_accuracy = LogisticRegressionModel.score(X_train, y_train)
test_accuracy = LogisticRegressionModel.score(X_test, y_test)
print('LogisticRegressionModel Train Score is : ' , train_accuracy)
print('LogisticRegressionModel Test Score is : ' , test_accuracy)

# Hard labels and class probabilities for the held-out rows (first ten shown).
y_pred = LogisticRegressionModel.predict(X_test)
y_pred_prob = LogisticRegressionModel.predict_proba(X_test)
print('Predicted Value for LogisticRegressionModel is : ' , y_pred[:10])
print('Prediction Probabilities Value for LogisticRegressionModel is : ' , y_pred_prob[:10])


# **Applying SGDClassifier Model**

In [None]:
from sklearn.linear_model import SGDClassifier

# 'squared_error' is the current name of the loss formerly spelled
# 'squared_loss'; recent scikit-learn releases reject the old spelling.
SGDClassifierModel = SGDClassifier(penalty='l2',loss='squared_error',learning_rate='optimal',random_state=33)
SGDClassifierModel.fit(X_train, y_train)

# Accuracy on the training and held-out partitions.
print('SGDClassifierModel Train Score is : ' , SGDClassifierModel.score(X_train, y_train))
print('SGDClassifierModel Test Score is : ' , SGDClassifierModel.score(X_test, y_test))

# Hard labels for the held-out rows.
y_pred = SGDClassifierModel.predict(X_test)
# This loss provides no predict_proba, and y_pred_prob still holds another
# model's probabilities, so report this model's own signed decision scores.
y_score = SGDClassifierModel.decision_function(X_test)
print('Predicted Value for SGDClassifierModel is : ' , y_pred[:10])
print('Decision Function Value for SGDClassifierModel is : ' , y_score[:10])

# **Applying SVC Model**

In [None]:
from sklearn.svm import SVC

# Sigmoid-kernel SVM; other kernel choices are linear, poly, rbf and precomputed.
SVCModel = SVC(kernel='sigmoid',
               max_iter=10000,C=100,gamma='auto')
SVCModel.fit(X_train, y_train)

# Accuracy on the training and held-out partitions.
print('SVCModel Train Score is : ' , SVCModel.score(X_train, y_train))
print('SVCModel Test Score is : ' , SVCModel.score(X_test, y_test))

# Hard labels for the held-out rows.
y_pred = SVCModel.predict(X_test)
# The model is built without probability=True, so predict_proba is unavailable,
# and y_pred_prob still belongs to a different model. Report this model's own
# decision-function scores under the correct label instead.
y_score = SVCModel.decision_function(X_test)
print('Predicted Value for SVCModel is : ' , y_pred[:10])
print('Decision Function Value for SVCModel is : ' , y_score[:10])


# **Applying QDA Model** 

In [None]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

# Quadratic discriminant analysis, fitted on the training partition.
QDAModel = QuadraticDiscriminantAnalysis(tol=0.0001).fit(X_train, y_train)

# Accuracy on both partitions, plus the fitted per-class feature means.
qda_train_accuracy = QDAModel.score(X_train, y_train)
qda_test_accuracy = QDAModel.score(X_test, y_test)
print('QDAModel Train Score is : ' , qda_train_accuracy)
print('QDAModel Test Score is : ' , qda_test_accuracy)
print('QDAModel means are : ' , QDAModel.means_)

# Hard labels and class probabilities for the held-out rows (first ten shown).
y_pred = QDAModel.predict(X_test)
y_pred_prob = QDAModel.predict_proba(X_test)
print('Predicted Value for QDAModel is : ' , y_pred[:10])
print('Prediction Probabilities Value for QDAModel is : ' , y_pred_prob[:10])

# **Applying MultinomialNB Model**

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial naive Bayes with smoothing alpha=1.0, fitted on the training partition.
MultinomialNBModel = MultinomialNB(alpha=1.0).fit(X_train, y_train)

# Accuracy on the training and held-out partitions.
nb_train_accuracy = MultinomialNBModel.score(X_train, y_train)
nb_test_accuracy = MultinomialNBModel.score(X_test, y_test)
print('MultinomialNBModel Train Score is : ' , nb_train_accuracy)
print('MultinomialNBModel Test Score is : ' , nb_test_accuracy)

# Hard labels and class probabilities for the held-out rows (first ten shown).
y_pred = MultinomialNBModel.predict(X_test)
y_pred_prob = MultinomialNBModel.predict_proba(X_test)
print('Predicted Value for MultinomialNBModel is : ' , y_pred[:10])
print('Prediction Probabilities Value for MultinomialNBModel is : ' , y_pred_prob[:10])

# **Applying DecisionTreeClassifier Model**

In [None]:
#Import Libraries
from sklearn.tree import DecisionTreeClassifier

# Depth-3 decision tree split on Gini impurity ('entropy' is the alternative criterion).
DecisionTreeClassifierModel = DecisionTreeClassifier(
    criterion='gini',
    max_depth=3,
    random_state=33,
).fit(X_train, y_train)

# Accuracy on the training and held-out partitions.
tree_train_accuracy = DecisionTreeClassifierModel.score(X_train, y_train)
tree_test_accuracy = DecisionTreeClassifierModel.score(X_test, y_test)
print('DecisionTreeClassifierModel Train Score is : ' , tree_train_accuracy)
print('DecisionTreeClassifierModel Test Score is : ' , tree_test_accuracy)

# Hard labels and class probabilities for the held-out rows (first ten shown).
y_pred = DecisionTreeClassifierModel.predict(X_test)
y_pred_prob = DecisionTreeClassifierModel.predict_proba(X_test)
print('Predicted Value for DecisionTreeClassifierModel is : ' , y_pred[:10])
print('Prediction Probabilities Value for DecisionTreeClassifierModel is : ' , y_pred_prob[:10])