In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Locate the dataset explicitly instead of relying on whatever values the
# directory-listing loop left behind in `dirname`/`filename`: that depends on
# os.walk ordering and raises NameError when the input folder is empty.
csv_paths = sorted(
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
    if name.lower().endswith('.csv')
)
if not csv_paths:
    raise FileNotFoundError("No CSV file found under /kaggle/input")
if len(csv_paths) > 1:
    # Sorted order makes the choice deterministic; tell the reader which file won.
    print("Multiple CSV files found, loading the first of:", csv_paths)
df_aus = pd.read_csv(csv_paths[0])

In [None]:
df_aus.head()

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set()

# Basic Infos

In [None]:
def basic_info(data):
    """Print a structural summary of ``data`` and split its columns by dtype.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to describe.

    Returns
    -------
    tuple[list, list]
        ``(categorical, numerical)``: the labels of the columns whose dtype is
        ``object``, and the labels of every other column (this includes
        datetime columns), both in column order.
    """
    print("Dataset shape is: ", data.shape)
    print("Dataset size is: ", data.size)
    print("Dataset columns are: ", data.columns)
    # DataFrame.info() writes its report itself and returns None, so it must not
    # be wrapped in print() (that emitted a trailing "None").
    print("Dataset info is: ")
    data.info()
    categorical = []
    numerical = []
    # Iterating over dtypes (instead of data[label].dtype) also works when the
    # frame carries duplicate column labels.
    for label, dtype in data.dtypes.items():
        if dtype == object:
            categorical.append(label)
        else:
            numerical.append(label)
    print("Categorical variables are:\n ", categorical)
    print("Numerical variables are:\n ", numerical)
    return categorical, numerical

In [None]:
categorical, numerical = basic_info(df_aus)

#### Lets make some changes which will help us further...

We drop RISK_MM as specified and change the datatype of Date from object to datetime. We do this on a deep copy so that the original frame remains intact in case we need it later.

In [None]:
# Independent working copy (DataFrame.copy is deep by default), so later edits
# cannot touch df_aus.
df_aus2 = df_aus.copy()

In [None]:
# Drop RISK_MM from the working copy rather than from df_aus again (which
# silently discarded the deep copy made just before). errors='ignore' keeps the
# cell re-runnable and tolerates a dataset that does not contain the column.
df_aus2 = df_aus2.drop(columns=['RISK_MM'], errors='ignore')

In [None]:
# Convert the Date column to datetime so it can be used as a time axis below.
df_aus2['Date'] = pd.to_datetime(df_aus2['Date'])

In [None]:
df_aus2.head()

In [None]:
categorical2, numerical2 = basic_info(df_aus2)

#### Let's see about Null/ NA values

In [None]:
df_aus2.isnull().sum()

We are not going to remove the rows containing null values, because that would discard a large part of the dataset.

# General Exploration

In [None]:
# Min/max temperature for the first 1000 rows, with the band between them shaded.
temp_window = df_aus2.iloc[:1000]
fig, ax = plt.subplots(figsize=(30, 8))
ax.plot(temp_window['Date'], temp_window['MinTemp'], color="#DC143C", label='Minimum Temperature')
ax.plot(temp_window['Date'], temp_window['MaxTemp'], color="#104E8B", label='Maximum Temperature')
ax.fill_between(temp_window['Date'], temp_window['MinTemp'], temp_window['MaxTemp'], facecolor="#EEE685")
# A figure should be readable on its own when the notebook is skimmed.
ax.set_title('Minimum and maximum temperature (rows 0-999)')
ax.set_xlabel('Date')
ax.set_ylabel('Temperature')
ax.legend()
plt.show()

In [None]:
# Min/max temperature for rows 1000-1999, with the band between them shaded.
temp_window = df_aus2.iloc[1000:2000]
fig, ax = plt.subplots(figsize=(30, 8))
ax.plot(temp_window['Date'], temp_window['MinTemp'], color="#DC143C", label='Minimum Temperature')
ax.plot(temp_window['Date'], temp_window['MaxTemp'], color="#104E8B", label='Maximum Temperature')
ax.fill_between(temp_window['Date'], temp_window['MinTemp'], temp_window['MaxTemp'], facecolor="#EEE685")
# A figure should be readable on its own when the notebook is skimmed.
ax.set_title('Minimum and maximum temperature (rows 1000-1999)')
ax.set_xlabel('Date')
ax.set_ylabel('Temperature')
ax.legend()
plt.show()

### Lets Explore all Categorical columns

In [None]:
categorical2

Lets see about Locations

In [None]:
df_aus2['Location'].value_counts()

In [None]:
# Name the column via x=: seaborn >= 0.12 binds the first positional argument
# to `data`, so passing the Series positionally no longer plots its counts on x.
plt.figure(figsize=(50, 8))
sns.countplot(x='Location', data=df_aus2)
plt.xticks(rotation=-45)
plt.show()

Lets see about WindGustDir

In [None]:
df_aus2['WindGustDir'].value_counts()

In [None]:
# Name the column via x=: seaborn >= 0.12 binds the first positional argument
# to `data`, so passing the Series positionally no longer plots its counts on x.
plt.figure(figsize=(30, 8))
sns.countplot(x='WindGustDir', data=df_aus2)
plt.xticks(rotation=-45)
plt.show()

Let's see about 'WindDir9am'

In [None]:
df_aus2['WindDir9am'].value_counts()

In [None]:
# Name the column via x=: seaborn >= 0.12 binds the first positional argument
# to `data`, so passing the Series positionally no longer plots its counts on x.
plt.figure(figsize=(30, 8))
sns.countplot(x='WindDir9am', data=df_aus2)
plt.xticks(rotation=-45)
plt.show()

The plot shows that N (north) is a very common wind direction at 9am.

In [None]:
df_aus2['WindDir3pm'].value_counts()

In [None]:
# Name the column via x=: seaborn >= 0.12 binds the first positional argument
# to `data`, so passing the Series positionally no longer plots its counts on x.
plt.figure(figsize=(30, 8))
sns.countplot(x='WindDir3pm', data=df_aus2)
plt.xticks(rotation=-45)
plt.show()

We see that at 3pm, most of the time the wind direction is in South East (SE)

## Lets Explore all the numerical values

In [None]:
numerical2

In [None]:
numerical3 = numerical2[:]

In [None]:
# Remove Date by name instead of slicing off position 0: the result no longer
# depends on column order, and re-running the cell cannot strip a second column.
numerical3 = [col for col in numerical2 if col != 'Date']

In [None]:
numerical3

In [None]:
numerical_hist = df_aus2[numerical3]

In [None]:
numerical_hist

In [None]:
# One histogram per column of numerical_hist, 50 bins each, on a 20x20 canvas.
numerical_hist.hist(figsize = [20,20], bins = 50)
plt.show()

# Answers to the questions

#### The lowest minimum temperature

In [None]:
# Series.min() skips NaN by default, so there is no need to build a frequency
# table and scan its index in Python.
df_aus2['MinTemp'].min()

#### The highest maximum temperature

In [None]:
# Series.max() skips NaN by default, so there is no need to build a frequency
# table and scan its index in Python.
df_aus2['MaxTemp'].max()

#### Largest amount of rainfall

In [None]:
# Series.max() skips NaN by default, so there is no need to build a frequency
# table and scan its index in Python.
df_aus2['Rainfall'].max()

# Model Making

For making the model, we do need to convert categorical values into integer/float values so as to work with them

We already have a list of the categorical columns. We will write a function that takes each of these columns, builds its dummy (one-hot) columns, drops the last dummy column (it is implied by the others), and appends the remaining ones to the dataset. When every column has been processed, the function returns the new dataset.

In [None]:
def making_new_df(data, columnlist, prefix_columns=False):
    """Append one-hot indicator columns for each column named in ``columnlist``.

    For every listed column the indicators are built with ``pd.get_dummies`` and
    the last indicator is dropped (it is implied by the others). The source
    columns themselves are kept; ``data`` is not modified.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the columns to encode.
    columnlist : iterable of column labels
        Columns of ``data`` to encode.
    prefix_columns : bool, default False
        When True, indicator names are prefixed with the source column name
        (``<column>_<value>``). This avoids duplicate labels when several
        columns share the same values. The default keeps the bare value names.

    Returns
    -------
    pandas.DataFrame
        ``data`` followed by the indicator columns, in ``columnlist`` order.
        If ``columnlist`` is empty, ``data`` itself is returned.
    """
    encoded = []
    for column in columnlist:
        # Always read from the original frame, so indicator columns added for an
        # earlier column can never shadow a later source column.
        dummy = pd.get_dummies(data[column], prefix=column if prefix_columns else None)
        # Slicing (rather than `del`) also copes with a column that produced no
        # indicators at all, e.g. one that is entirely NaN.
        encoded.append(dummy.iloc[:, :-1])
    if not encoded:
        return data
    # A single concat instead of re-copying the growing frame on every iteration.
    return pd.concat([data, *encoded], axis=1)

In [None]:
df_aus3 = making_new_df(df_aus2, categorical2)

In [None]:
df_aus3

Now we will remove the columns that we won't need for model making: Date and all the categorical variables, since these have already been converted to one-hot encoded form.

In [None]:
# Use categorical2, the list computed from df_aus2 (the frame that was actually
# encoded). `categorical` came from the raw frame, where Date was presumably
# still an object column and would therefore be listed twice here.
df_aus3 = df_aus3.drop(columns=['Date'] + categorical2)

In [None]:
df_aus3.head()

##### Note: because of the dummy encoding, No = 1 and Yes = 0. This is a point to remember.

#### The models I will be making for this will be:
#### Logistic Regression model
#### Random Forest Classifier
#### Naive Bayes

In [None]:
df_aus3.isnull().sum()

We will be dealing with nan, later. Continue reading the notebook

## Let's split our data into training and testing sets

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Features: every column except the last one (selected by position).
X = df_aus3.iloc[:, :-1]

In [None]:
X

In [None]:
# Target: the last column, selected by position.
# NOTE(review): presumably this is the "No" indicator of RainTomorrow (1 = no
# rain); that relies on RainTomorrow being the last encoded column, so verify.
# NOTE(review): get_dummies yields all-zero indicators for NaN, so rows with a
# missing source value would read as 0 here; confirm the target has no NaN.
Y = df_aus3.iloc[:, -1]

In [None]:
Y

In [None]:
# Hold out 20% of the rows for testing; random_state pins the shuffle.
train_x, test_x, train_y, test_y = train_test_split(X, Y, test_size = 0.20, random_state=42)

We know that we have many nan values in X which will transfer in train_x and test_x, so we will use SimpleImputer so as to fill those Nan values with mean

In [None]:
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

In [None]:
# Learn the column means from the training split only. Each fit() call replaces
# the previous statistics, so fitting again on test_x meant both splits were
# imputed with test-set means, which leaks test information into training.
imputer.fit(train_x)

In [None]:
# Fill the NaNs in both splits using the fitted imputer.
# NOTE: with scikit-learn's default output settings transform() returns NumPy
# arrays, so train_x/test_x stop being DataFrames after this cell.
train_x = imputer.transform(train_x)
test_x = imputer.transform(test_x)

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
LR = LogisticRegression(max_iter=5000)

In [None]:
model_lr = LR.fit(train_x, train_y)

In [None]:
y_lr_predict = model_lr.predict(test_x)

In [None]:
LR_df = pd.DataFrame(data = {"Actual": test_y, "Predicted": y_lr_predict})

In [None]:
LR_df

In [None]:
model_lr.score(test_x, test_y)

### Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Fixed seed so the forest, and therefore the reported score, is reproducible
# between runs; 42 matches the seed used for the train/test split.
rfc = RandomForestClassifier(random_state=42)

In [None]:
model_rfr = rfc.fit(train_x, train_y)
y_rfr_predict = model_rfr.predict(test_x)

In [None]:
RFR_df = pd.DataFrame(data = {"Actual": test_y, "Predicted": y_rfr_predict})

In [None]:
RFR_df

In [None]:
model_rfr.score(test_x, test_y)

### Let's go for Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()

In [None]:
model_gnb = gnb.fit(train_x, train_y)
y_gnb_predict = model_gnb.predict(test_x)

In [None]:
GNB_df = pd.DataFrame(data = {"Actual":test_y, "Predicted": y_gnb_predict})

In [None]:
GNB_df

In [None]:
model_gnb.score(test_x, test_y)

The scores of all the models are:

In [None]:
# Print the test-set score of each fitted model, one line per model.
for label, fitted_model in (
    ("Logistic Regression Score: ", model_lr),
    ("Random Forest Classifier Score: ", model_rfr),
    ("Naive Bayes Score: ", model_gnb),
):
    print(label, fitted_model.score(test_x, test_y))

As for the 3 models I made, **Random Forest Classifier** had the best Score. So now this is the end of the notebook. Feel free to give an upvote. If there's any question, please feel free to ask. I will try to answer with the best of my knowledge. And if there's any wrong step or anything, please feel free to point out. I will learn from them and rectify them in this and the upcoming notebooks as well.