In [None]:
import matplotlib.gridspec as gridspec
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from scipy import stats
# Let rendered DataFrames show up to 24 columns before pandas elides the middle ones.
pd.options.display.max_columns = 24

In [None]:
# Read the dataset into `wh` and preview its first ten rows.
csv_path = "data/wh.csv"
wh = pd.read_csv(csv_path)
wh.head(n=10)

In [None]:
# Count of missing values in every column (isna is the canonical spelling of isnull).
wh.isna().sum()

In [None]:
# Summary statistics plus one extra row, labelled 'isnull', that holds the
# per-column count of missing values.
# DataFrame.append was removed in pandas 2.0, so the row is attached with
# pd.concat; the Series is turned into a one-row frame so it lands as a row.
missing_row = wh.isnull().sum().rename('isnull').to_frame().T
pd.concat([wh.describe(), missing_row])

In [None]:
# Distribution overview of a single numeric column: histogram with KDE on top,
# QQ-plot and boxplot underneath.
# The previous version read "sale_price" from an undefined `br_df` and called
# `stats` without importing it, so it failed with NameError on a fresh kernel.
# NOTE(review): "MaxTemp" is used because the imputation cell further down
# takes its median, so it is a numeric column of `wh`; change `dist_col` to
# inspect a different column.
dist_col = "MaxTemp"
# This cell runs before the imputation cell, so drop missing values here to
# keep them out of the QQ-plot fit.
dist_values = wh[dist_col].dropna()

fig = plt.figure(constrained_layout=True, figsize=(12, 8))
grid = gridspec.GridSpec(ncols=3, nrows=4, figure=fig)
# Histogram (histplot replaces the deprecated distplot; density scale matches it)
ax1 = fig.add_subplot(grid[0, :])
sns.histplot(dist_values, kde=True, stat="density", ax=ax1)
ax1.set_title(f"Histogram of {dist_col}")
# QQplot
ax2 = fig.add_subplot(grid[2:, :2])
stats.probplot(dist_values, plot=ax2)
ax2.set_title(f"QQplot of {dist_col}")
# Boxplot (passing the data as y= draws it vertically)
ax3 = fig.add_subplot(grid[2:, 2])
sns.boxplot(y=dist_values, ax=ax3)
ax3.set_title(f"Boxplot of {dist_col}")
plt.show()

In [None]:
# Draw one histogram per numeric column of `wh`.
hist_axes = wh.hist()
hist_axes

In [None]:
# Derive the month number from the Date column as a new feature.
wh['Month'] = pd.to_datetime(wh['Date']).dt.month

# We check the target distribution across our new feature.
# countplot infers its orientation from which of x/y is supplied, so the
# former orient='h' argument had no effect and has been removed.
fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(x='Month', hue='RainTomorrow', data=wh, ax=ax)
ax.set_title("RainTomorrow counts by Month")
plt.show()

In [None]:
# Scatter plot of MinTemp against MaxTemp.
# NOTE(review): the previous version plotted 'Gr Liv Area' against 'SalePrice'.
# No other cell references those columns and they are not among the columns
# the imputation cell handles, so they look like leftovers from a different
# notebook. MinTemp/MaxTemp are substituted as an assumption; confirm against
# the columns actually present in data/wh.csv.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(x=wh['MinTemp'], y=wh['MaxTemp'])
ax.set_xlabel("MinTemp")
ax.set_ylabel("MaxTemp")
ax.set_title("MaxTemp vs MinTemp")

plt.show()

In [None]:
# Fill missing values column by column: the first group receives each
# column's own median, the second group each column's first mode.
median_filled = [
    'MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'Sunshine',
    'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm',
    'Humidity9am', 'Humidity3pm', 'Pressure9am', 'Pressure3pm',
    'Cloud9am', 'Cloud3pm', 'Temp9am', 'Temp3pm',
]
mode_filled = ['RainToday', 'WindDir3pm', 'WindDir9am', 'WindGustDir']

for col in median_filled:
    wh[col] = wh[col].fillna(wh[col].median())

for col in mode_filled:
    # mode() returns a Series of the most frequent values; [0] takes the first.
    wh[col] = wh[col].fillna(wh[col].mode()[0])

In [None]:
# Re-count missing values per column to see what is still unfilled.
wh.isna().sum()

In [None]:
# Replace every distinct Date value with its integer category code,
# then list the codes that occur.
wh['Date'] = wh['Date'].astype('category').cat.codes
wh['Date'].unique()

In [None]:
# Integer-encode Location via its category codes and show the distinct codes.
location_cat = wh['Location'].astype('category')
wh['Location'] = location_cat.cat.codes
wh['Location'].unique()

In [None]:
# Encode RainToday as 0/1. Series.map yields NaN for any value that is not a
# key of `mapping`, so running this cell a second time on the already-encoded
# column would turn every entry into NaN.
mapping = dict(No=0, Yes=1)
wh['RainToday'] = wh['RainToday'].map(mapping)
wh['RainToday']

In [None]:
#wh['RainToday'].astype('bool')
#mapping = {False : 0, True : 1}
#wh['RainToday'] = wh['RainToday'].map(mapping)
#wh['RainToday']

In [None]:
# Integer-encode the target column via its category codes and list them.
# cat.codes assigns -1 to missing values, so a -1 in the output below would
# indicate rows with no RainTomorrow label.
wh['RainTomorrow'] = wh['RainTomorrow'].astype('category').cat.codes
wh['RainTomorrow'].unique()

In [None]:
# Integer-encode WindDir3pm via its category codes and show the distinct codes.
wind_dir_3pm_cat = wh['WindDir3pm'].astype('category')
wh['WindDir3pm'] = wind_dir_3pm_cat.cat.codes
wh['WindDir3pm'].unique()

In [None]:
# Integer-encode WindDir9am via its category codes and show the distinct codes.
wh['WindDir9am'] = wh['WindDir9am'].astype('category').cat.codes
wh['WindDir9am'].unique()

In [None]:
# Integer-encode WindGustDir via its category codes and show the distinct codes.
wind_gust_dir_cat = wh['WindGustDir'].astype('category')
wh['WindGustDir'] = wind_gust_dir_cat.cat.codes
wh['WindGustDir'].unique()

In [None]:
# Show the first 283 rows of the encoded frame (positional slice, same rows as head(283)).
wh.iloc[:283]

In [None]:
# Feature matrix: every column except the target, as a NumPy array.
x = wh.drop(columns='RainTomorrow').to_numpy()
# Target vector.
Y = wh['RainTomorrow'].to_numpy()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

# Hold out 20% of the rows for evaluation. Without a fixed random_state the
# shuffle differs on every run, so the metrics reported further down could not
# be reproduced after Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    x, Y, test_size=0.20, random_state=42
)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier with default settings. fit() returns the
# estimator itself, so the chained call leaves the fitted model in `clf`.
clf = KNeighborsClassifier().fit(X_train, y_train)
clf

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Predict on the held-out split with the model currently bound to `clf`,
# then print the confusion matrix followed by the per-class report.
y_pred = clf.predict(X_test)
for report in (confusion_matrix(y_test, y_pred),
               classification_report(y_test, y_pred)):
    print(report)

In [None]:
from sklearn import tree

# Decision tree with default hyper-parameters. random_state is pinned because
# the tree builder permutes features at each split, so equally good splits can
# otherwise be resolved differently and the fitted tree varies between runs.
clfT = tree.DecisionTreeClassifier(random_state=42)
clfT = clfT.fit(X_train, y_train)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Evaluate the decision tree (`clfT`) on the held-out split.
y_pred = clfT.predict(X_test)
tree_confusion = confusion_matrix(y_test, y_pred)
tree_report = classification_report(y_test, y_pred)
print(tree_confusion)
print(tree_report)

In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian naive Bayes with default settings; rebinds `clf` to the new model.
clf = GaussianNB().fit(X_train, y_train)
clf

In [None]:
# Evaluate the model currently bound to `clf` on the held-out split.
y_pred = clf.predict(X_test)
print(confusion_matrix(y_test, y_pred), classification_report(y_test, y_pred), sep="\n")

In [None]:
from sklearn import svm

# Support-vector classifier with default settings; rebinds `clf` once more.
svc_model = svm.SVC()
clf = svc_model.fit(X_train, y_train)
clf

In [None]:
# Evaluate the model currently bound to `clf` on the held-out split:
# confusion matrix first, then the per-class report.
y_pred = clf.predict(X_test)
for metric_fn in (confusion_matrix, classification_report):
    print(metric_fn(y_test, y_pred))

In [None]:
# Remove Date and Location, then draw pairwise plots of the first four
# remaining columns, coloured by the RainTomorrow value.
wh = wh.drop(columns=['Date', 'Location'])
pairplot_vars = wh.columns[:4]
sns.pairplot(wh, vars=pairplot_vars, hue="RainTomorrow")
plt.show()