In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

In [None]:
# Load the weatherAUS CSV, decoding the file as ISO-8859-1.
df = pd.read_csv(
    "../input/weather-dataset-rattle-package/weatherAUS.csv",
    encoding="ISO-8859-1",
)

In [None]:
# Show the loaded frame as the cell output.
df

# Exploratory Data Analysis (EDA)

In [None]:
# First rows of the raw data (head() defaults to five rows).
df.head()

In [None]:
# Last rows of the raw data (tail() defaults to five rows).
df.tail()

In [None]:
# (number of rows, number of columns) of the raw data.
df.shape

In [None]:
# Column labels present in the raw data.
df.columns

In [None]:
# Count of distinct values per column (missing values are not counted by default).
df.nunique()

In [None]:
# Per-column dtype and non-null count, plus overall memory usage.
df.info()

In [None]:
# Number of missing values in each column.
df.isna().sum() 

# Feature Selection:

In [None]:
# Column labels, listed again ahead of the column selection in the next cell.
df.columns

In [None]:
# Keep only the columns named here, in this order; any other column is dropped.
df = df.loc[
    :,
    [
        "Location", "MinTemp", "MaxTemp", "Rainfall", "Evaporation",
        "Sunshine", "WindGustDir", "WindGustSpeed", "WindDir9am", "WindDir3pm",
        "WindSpeed9am", "WindSpeed3pm", "Humidity9am", "Humidity3pm",
        "Pressure9am", "Pressure3pm", "Cloud9am", "Cloud3pm", "Temp9am",
        "Temp3pm", "RainToday", "RainTomorrow",
    ],
]


# Data Cleaning

In [None]:
# Fill missing numeric values with their column mean. numeric_only=True is
# required on pandas >= 2.0, where DataFrame.mean() no longer skips non-numeric
# columns silently and raises TypeError when the frame contains strings.
# NOTE(review): the means are computed on the full data set, before any
# train/test split — confirm this leakage is acceptable.
df = df.fillna(df.mean(numeric_only=True))
# Non-numeric columns are not imputed above, so any row that still has a
# missing value is removed here.
df = df.dropna()
# Reassign instead of using inplace=True, consistent with the two lines above.
df = df.drop_duplicates()

In [None]:
# Sanity check after cleaning: remaining missing values per column, then the shape.
print(df.isna().sum(), df.shape, sep="\n")

In [None]:
# Summary statistics of the cleaned data.
df.describe()

# What is the average rainfall in Cairns?

In [None]:
# Restrict to the rows recorded at Cairns and report their mean Rainfall.
# NOTE(review): missing Rainfall values were filled with the overall column
# mean during cleaning, so this average includes imputed values.
n1 = df.loc[df["Location"] == "Cairns"]
print("Average rainfall in Cairns: {:.2f}".format(n1["Rainfall"].mean()))

# Which places recorded rainfall above 200 mm?

In [None]:
# Order the rows by Rainfall (ascending), keep those above 200, and print
# only the Location and Rainfall columns without the index.
n2 = df.sort_values(by="Rainfall")
n3 = n2.loc[n2["Rainfall"].gt(200)]
print(n3.loc[:, ["Location", "Rainfall"]].to_string(index=False))

In [None]:
# Columns converted from text labels to integer codes.
categorical_columns = [
    "Location",
    "WindGustDir",
    "WindDir9am",
    "WindDir3pm",
    "RainToday",
    "RainTomorrow",
]

# One fitted encoder is kept per column so the integer codes can be mapped
# back to the original labels later with
# label_encoders[column].inverse_transform(...). Refitting a single shared
# encoder would discard every mapping except the last one.
label_encoders = {}
for column in categorical_columns:
    lae = LabelEncoder()
    # Values are cast to str before encoding so all labels are compared as text.
    df[column] = lae.fit_transform(df[column].astype(str))
    label_encoders[column] = lae
# After the loop, `lae` refers to the encoder fitted on RainTomorrow.

In [None]:
# Labels of the columns that describe() includes in its summary.
df.describe().columns

In [None]:
# Box plot of MaxTemp to inspect its spread and outliers.
_ = df.boxplot(column=['MaxTemp'])

In [None]:
# Keep rows whose MaxTemp lies strictly between 2 and 43, then show the new shape.
df = df.loc[df["MaxTemp"].gt(2) & df["MaxTemp"].lt(43)]
df.shape

In [None]:
# Box plot of Rainfall to inspect its spread and outliers.
_ = df.boxplot(column=['Rainfall'])

In [None]:
# Keep rows whose Rainfall is strictly below 5, then show the new shape.
# NOTE(review): this removes every row with Rainfall of 5 or more, not only
# extreme values — confirm that is intended for a rain-prediction model.
df = df.loc[df["Rainfall"].lt(5)]
df.shape

In [None]:
# Box plot of Evaporation to inspect its spread and outliers.
_ = df.boxplot(column=['Evaporation'])

In [None]:
# Keep rows whose Evaporation is strictly below 8, then show the new shape.
df = df.loc[df["Evaporation"].lt(8)]
df.shape

In [None]:
# Box plot of WindGustSpeed to inspect its spread and outliers.
_ = df.boxplot(column=['WindGustSpeed'])

In [None]:
# Keep rows whose WindGustSpeed is strictly below 78, then show the new shape.
df = df.loc[df["WindGustSpeed"].lt(78)]
df.shape

In [None]:
# Box plot of WindSpeed9am to inspect its spread and outliers.
_ = df.boxplot(column=['WindSpeed9am'])

In [None]:
# Keep rows whose WindSpeed9am is strictly below 41, then show the new shape.
df = df.loc[df["WindSpeed9am"].lt(41)]
df.shape

In [None]:
# Box plot of WindSpeed3pm to inspect its spread and outliers.
_ = df.boxplot(column=['WindSpeed3pm'])

In [None]:
# Keep rows whose WindSpeed3pm is strictly below 41, then show the new shape.
df = df.loc[df["WindSpeed3pm"].lt(41)]
df.shape

In [None]:
# Box plot of Pressure9am to inspect its spread and outliers.
_ = df.boxplot(column=['Pressure9am'])

In [None]:
# Keep rows whose Pressure9am lies strictly between 1003 and 1035, then show the new shape.
df = df.loc[df["Pressure9am"].gt(1003) & df["Pressure9am"].lt(1035)]
df.shape

In [None]:
# Box plot of Temp9am to inspect its spread and outliers.
_ = df.boxplot(column=['Temp9am'])

In [None]:
# Keep rows whose Temp9am is strictly below 34, then show the new shape.
df = df.loc[df["Temp9am"].lt(34)]
df.shape

In [None]:
# Summary statistics after the outlier filters above.
df.describe()

In [None]:
# Pairwise correlation between all columns, drawn as an annotated heatmap
# on a 20x15 figure.
corelation = df.corr()
fig, ax = plt.subplots(figsize=(20, 15))
_ = sns.heatmap(
    corelation,
    annot=True,
    xticklabels=corelation.columns,
    yticklabels=corelation.columns,
    ax=ax,
)

In [None]:
# Scatter of MinTemp against MaxTemp, with points coloured by Rainfall.
_ = sns.relplot(
    data=df,
    x="MinTemp",
    y="MaxTemp",
    hue="Rainfall",
    palette="viridis",
    s=100,
)

In [None]:
# Scatter of Sunshine against Evaporation, with points coloured by Rainfall.
_ = sns.relplot(
    data=df,
    x="Sunshine",
    y="Evaporation",
    hue="Rainfall",
    palette="viridis",
    s=200,
)

In [None]:
# Model inputs: every selected column except the target, in a fixed order.
feature_columns = [
    'Location', 'MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine',
    'WindGustDir', 'WindGustSpeed', 'WindDir9am', 'WindDir3pm',
    'WindSpeed9am', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm',
    'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am',
    'Temp3pm', 'RainToday',
]
X = df[feature_columns].values

# Target as a 1-D array of shape (n_samples,), which is what scikit-learn
# classifiers expect. Selecting with a list (df[['RainTomorrow']]) would
# produce an (n_samples, 1) column vector and a DataConversionWarning on fit.
Y = df['RainTomorrow'].values

In [None]:
# train_test_split is already imported in the notebook's import cell.
# random_state pins the shuffle so the reported accuracies are reproducible
# across runs; stratify keeps the RainTomorrow class proportions the same in
# both partitions. test_size=0.25 is the library default, written out.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=42,
    stratify=Y,
)

# Logistic Regression

In [None]:
# max_iter is raised from the default of 100, which is frequently too few for
# the solver to converge when the features are not standardised.
# NOTE(review): the features are passed in unscaled — consider standardising
# them before fitting.
lr = LogisticRegression(max_iter=1000)
# np.ravel gives fit() a 1-D target even if Y_train is an (n, 1) column
# vector, avoiding scikit-learn's DataConversionWarning.
lr.fit(X_train, np.ravel(Y_train))
Y_pred1 = lr.predict(X_test)
# Fraction of test rows whose predicted label matches the true label.
accuracy_score(Y_test, Y_pred1)

# K-Nearest Neighbors (KNN)

In [None]:
# k-nearest-neighbours classifier voting over the 8 closest training rows.
# NOTE(review): distances are computed on unscaled features — consider
# standardising them before fitting.
knn = KNeighborsClassifier(n_neighbors=8)
# np.ravel gives fit() a 1-D target even if Y_train is an (n, 1) column
# vector, avoiding scikit-learn's DataConversionWarning.
knn.fit(X_train, np.ravel(Y_train))
Y_pred2 = knn.predict(X_test)
# Fraction of test rows whose predicted label matches the true label.
accuracy_score(Y_test, Y_pred2)

# Support Vector Machine (SVM)

In [None]:
# Support vector classifier with the library's default settings.
# NOTE(review): the features are passed in unscaled — consider standardising
# them before fitting.
svc = SVC()
# np.ravel gives fit() a 1-D target even if Y_train is an (n, 1) column
# vector, avoiding scikit-learn's DataConversionWarning.
svc.fit(X_train, np.ravel(Y_train))
Y_pred3 = svc.predict(X_test)
# Fraction of test rows whose predicted label matches the true label.
accuracy_score(Y_test, Y_pred3)