In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import missingno as mn
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

## Reading and basic data info

In [None]:
# Load the raw weather observations and preview the first rows.
csv_path = '../input/weather-dataset-rattle-package/weatherAUS.csv'
data = pd.read_csv(csv_path)
data.head(10)

In [None]:
# Summarise the frame: column names, dtypes, non-null counts and memory usage.
data.info()

## Checking null values in table columns using missingno

In [None]:
# Draw missingno's nullity matrix for the whole frame so that columns with
# many missing values stand out before any feature selection is done.
mn.matrix(data)
plt.show()

In [None]:
def scat(pos, x, y, title, rows=8, cols=2):
    """Draw one scatter panel inside the current figure's subplot grid.

    Note the axis order: ``y`` is drawn on the horizontal axis and ``x`` on
    the vertical axis (the arguments are passed to ``scatter`` as ``(y, x)``).

    Parameters
    ----------
    pos : int
        1-based index of the panel within the ``rows`` x ``cols`` grid.
    x : array-like
        Values plotted on the vertical axis.
    y : array-like
        Values plotted on the horizontal axis.
    title : str
        Title shown above the panel.
    rows, cols : int, optional
        Shape of the subplot grid. Defaults to the original 8 x 2 layout;
        pass larger values when more than 16 panels are needed.
    """
    ax = plt.subplot(rows, cols, pos)
    # Small, translucent markers keep dense regions readable.
    ax.scatter(y, x, s=5, alpha=0.3)
    ax.set_title(title)

## Plotting current day rainfall vs all the features to know which features to select

In [None]:
# High-resolution figure with a small font so 16 panels fit on one canvas.
mpl.rcParams.update({'figure.dpi': 300, 'font.size': 5})
plt.figure(figsize=(8, 16))

# Candidate features: everything except the first two and the last two columns,
# minus the wind-direction columns, which are dropped from the scatter plots.
to_plot = data.columns.tolist()[2:-2]
for direction_col in ('WindGustDir', 'WindDir9am', 'WindDir3pm'):
    to_plot.remove(direction_col)

# One panel per feature, Rainfall on the vertical axis.
for panel, feature in enumerate(to_plot, start=1):
    scat(panel, data['Rainfall'], data[feature], f'Rainfall(Y) vs {feature}')

plt.tight_layout()
plt.show()

In [None]:
# Columns kept for modelling; the last entry is the prediction target.
useful_data = [
    'Location',
    'MinTemp',
    'MaxTemp',
    'Rainfall',
    'Evaporation',
    'WindSpeed9am',
    'WindSpeed3pm',
    'Humidity9am',
    'Humidity3pm',
    'Pressure9am',
    'Pressure3pm',
    'Temp9am',
    'Temp3pm',
    'RainTomorrow',
]

In [None]:
# Keep only the selected columns (in the order listed in useful_data)
# and preview the first ten rows of the reduced frame.
chosen_data = data[useful_data]
chosen_data.head(10)

## Checking and dropping null values from the table

In [None]:
# Per-column count of missing values, followed by the total row count for scale.
null_counts = chosen_data.isna().sum()
print(null_counts)
len(chosen_data)

In [None]:
# Discard every row that has a missing value in any selected column,
# then confirm the remaining row count and dtypes.
chosen_data = chosen_data.dropna(axis=0, how='any')
chosen_data.info()

## Converting Location and RainTomorrow from text to integer class labels using LabelEncoder

In [None]:
# Encode the two text columns as integer class ids.
# Each column gets its own encoder so that both mappings (classes_) remain
# available afterwards; refitting a single encoder discards the first mapping.
location_le = preprocessing.LabelEncoder()
le = preprocessing.LabelEncoder()  # encoder for the RainTomorrow target

# .assign returns a new frame (column order preserved) instead of writing
# into the result of dropna(), which avoids pandas' SettingWithCopyWarning.
# NOTE(review): integer-coding Location gives the stations an arbitrary order
# when the column is later used as a numeric feature - confirm this is intended.
chosen_data = chosen_data.assign(
    Location=location_le.fit_transform(chosen_data['Location'].values),
    RainTomorrow=le.fit_transform(chosen_data['RainTomorrow'].values),
)
chosen_data.head(10)

In [None]:
# Target: the last column. Features: every other column.
y = chosen_data.iloc[:, -1]
feature_frame = chosen_data.iloc[:, :-1]

# Prepend a column of ones so theta[0] acts as the intercept term.
bias_column = np.ones((feature_frame.shape[0], 1))
X = np.hstack([bias_column, feature_frame.to_numpy()])

## Feature normalization for gradient descent

In [None]:
# Standardise every feature column to zero mean and unit variance.
# Column 0 is the bias column of ones and is deliberately left untouched.
mu = np.mean(X, axis=0)
sigma = np.std(X, axis=0)

# Each column must be scaled with its OWN statistics (mu[1:], sigma[1:]);
# indexing with mu[1]/sigma[1] would apply the first feature's mean and
# standard deviation to every column.
# A constant column has sigma == 0; divide by 1 there to avoid NaN/inf.
safe_sigma = np.where(sigma[1:] == 0, 1.0, sigma[1:])
X[:, 1:] = (X[:, 1:] - mu[1:]) / safe_sigma
# NOTE(review): the statistics are computed on the full dataset, before the
# train/test split - consider fitting them on the training rows only.

In [None]:
def sigmoid(x):
    """Return the logistic function 1 / (1 + exp(-x)), element-wise.

    Computed as ``exp(-log(1 + exp(-x)))`` via ``np.logaddexp`` so that large
    negative inputs do not overflow inside ``exp`` (which would emit a
    RuntimeWarning); the result is mathematically the same.

    Parameters
    ----------
    x : scalar or array-like
        Input value(s).

    Returns
    -------
    Same shape as ``x``, with values in [0, 1].
    """
    return np.exp(-np.logaddexp(0, -x))

def hypothesis(theta, x):
    """Logistic-regression prediction: sigmoid of the linear score x . theta.

    Parameters
    ----------
    theta : array-like
        Parameter vector (or column vector).
    x : array-like
        Design matrix, or a single feature row.

    Returns
    -------
    The sigmoid applied to ``np.dot(x, theta)``.
    """
    linear_score = np.dot(x, theta)
    return sigmoid(linear_score)

In [None]:
# Gradient-descent settings.
alpha = 0.01        # learning rate
iterations = 3000   # number of update steps

# Start from an all-zero parameter column vector, one entry per column of X.
theta = np.zeros((X.shape[1], 1))

## Splitting into test and train datasets

In [None]:
# Hold out 33% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

# Number of training examples, used to average the gradient.
m = X_train.shape[0]

# Turn both targets into (n, 1) column vectors so they line up with the
# (n, 1) output of hypothesis().
y_train = y_train.to_numpy().reshape(-1, 1)
y_test = y_test.to_numpy().reshape(-1, 1)

## Running gradient descent manually

In [None]:
# Batch gradient descent for logistic regression.
# theta is re-initialised here so that re-running this cell always starts
# from zeros instead of continuing from a previously trained theta.
theta = np.zeros((X_train.shape[1], 1))

# cost[i] holds the mean log-loss measured just before update step i.
cost = np.zeros((iterations, 1))
eps = 1e-12  # keeps log() away from 0 when predictions saturate

for i in range(iterations):
    predictions_train = hypothesis(theta, X_train)

    # Record the cross-entropy so convergence can be inspected or plotted.
    clipped = np.clip(predictions_train, eps, 1 - eps)
    cost[i] = -np.mean(
        y_train * np.log(clipped) + (1 - y_train) * np.log(1 - clipped)
    )

    # Gradient step: theta <- theta - alpha * X^T (h - y) / m
    theta = theta - (alpha * np.dot(X_train.T, predictions_train - y_train)) / m
theta

## Checking accuracy on the test set

In [None]:
# Predicted probabilities for the held-out rows.
y_pred = hypothesis(theta, X_test)
total = len(y_pred)

# Threshold at 0.5 to get hard 0/1 labels (masks are taken before overwriting).
negative_mask = y_pred < 0.5
positive_mask = y_pred >= 0.5
y_pred[negative_mask] = 0
y_pred[positive_mask] = 1

# Share of test rows whose predicted label matches the true label, in percent.
accurate = (y_pred == y_test).sum()
accuracy = accurate / total
accuracy * 100

In [None]:
# Wrap the training matrix in a DataFrame purely for a readable tabular display.
X_train_frame = pd.DataFrame(X_train)
X_train_frame

## Logistic regression using sklearn

In [None]:
# Reference model: scikit-learn's LogisticRegression on the same split,
# to compare against the hand-written gradient descent above.
# (LogisticRegression is imported in the notebook's import cell.)
logisticRegr = LogisticRegression(max_iter=3000)

# Both targets are flattened to 1-D, the shape scikit-learn expects for labels.
logisticRegr.fit(X_train, y_train.ravel())
predictions = logisticRegr.predict(X_test)

# Mean accuracy on the held-out rows, as a fraction in [0, 1].
score = logisticRegr.score(X_test, y_test.ravel())
print(score)