In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os

# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

In [None]:
# Load the weatherAUS CSV into a DataFrame.
# NOTE(review): the path is an absolute Kaggle location; the notebook will not run elsewhere without changing it.
data = pd.read_csv('/kaggle/input/weather-dataset-rattle-package/weatherAUS.csv')

In [None]:
# Preview the first 10 rows of the loaded frame.
data.head(10)

In [None]:
# List the column labels of the loaded frame.
data.columns

In [None]:
# Show summary statistics for `data`.
data.describe()

In [None]:
# Remove the Date column, which is not used as a feature below.
# errors='ignore' keeps the cell idempotent: re-running it after the column
# is already gone no longer raises KeyError.
data = data.drop(columns=['Date'], errors='ignore')

In [None]:
# Preview the frame again after dropping the Date column.
data.head()

In [None]:
# Print the per-column overview (dtypes and non-null counts) of `data`.
data.info()

In [None]:
# Display the RainToday column using attribute access.
data.RainToday

In [None]:
# Display the same RainToday column using bracket access.
data['RainToday']

In [None]:
# Columns that the rest of the notebook treats as numeric model features.
numerical_features = [
    'MinTemp', 'MaxTemp',
    'Rainfall', 'Evaporation', 'Sunshine',
    'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm',
    'Humidity9am', 'Humidity3pm',
    'Pressure9am', 'Pressure3pm',
    'Cloud9am', 'Cloud3pm',
    'Temp9am', 'Temp3pm',
]

In [None]:
# Number of numeric feature columns selected above.
len(numerical_features)

In [None]:
# Numeric feature matrix.
# .copy() makes this an independent frame: the cells below assign into its
# columns, which on a plain slice of `data` triggers SettingWithCopyWarning
# and makes it unclear whether `data` itself is modified.
numeric_data = data[numerical_features].copy()
numeric_data.info()

In [None]:
# Display the numeric feature frame.
numeric_data

In [None]:
# Mean-impute missing MinTemp values.
# The mean is computed from the data (Series.mean skips NaN) instead of the
# previously hard-coded 12.186, which was presumably a rounded copy of it.
# assign() builds a new frame, so no write goes into a slice of `data`.
min_temp_mean = numeric_data['MinTemp'].mean()
numeric_data = numeric_data.assign(
    MinTemp=numeric_data['MinTemp'].fillna(min_temp_mean)
)

In [None]:
# Arithmetic mean of MinTemp computed by hand (sum divided by count),
# as a cross-check against the np.mean result in the next cell.
sum(numeric_data['MinTemp'])/len(numeric_data['MinTemp'])

In [None]:
# Mean of the MinTemp column.
numeric_data['MinTemp'].mean()

In [None]:
import seaborn as sns
# `plt` must be the pyplot module; the previous
# `from matplotlib.pyplot import plot as plt` bound the plot() function instead,
# so calls such as plt.subplots() or plt.show() would fail.
import matplotlib.pyplot as plt

In [None]:
# Line plot of MinTemp against the row index.
# Uses the `.plot.line` accessor; the previous `.plot(...).line()` called a
# non-existent `line` method on the returned Axes and raised AttributeError.
numeric_data['MinTemp'].plot.line(figsize=(20, 12))

In [None]:
# Show the MinTemp values in ascending order.
# np.sort returns a sorted copy. The previous in-place `.values.sort()` could
# reorder the column's underlying buffer, breaking the row alignment between
# MinTemp, the other features and the target.
np.sort(numeric_data['MinTemp'].values)

In [None]:
# Distribution plot of MinTemp.
# NOTE(review): seaborn documents distplot as deprecated in favour of
# histplot/displot — confirm against the installed seaborn version.
sns.distplot(numeric_data['MinTemp'])

In [None]:
# Mean-impute every numeric feature in one vectorised step.
# DataFrame.mean skips NaN, and fillna with a Series fills each column with
# the value stored under that column's label. A new frame is produced, so
# nothing is written into a possible slice of `data`.
feature_means = numeric_data[numerical_features].mean()
numeric_data = numeric_data.fillna(feature_means)

In [None]:
# Show each numeric feature name together with its positional index.
list(enumerate(numerical_features))

In [None]:
# Import the pyplot submodule explicitly. A bare `import matplotlib` only
# exposes `matplotlib.pyplot` if some other library happened to load it first.
import matplotlib.pyplot

# One distribution plot per numeric feature, stacked vertically.
# The grid size follows the feature list instead of a hard-coded 16;
# the height stays at 4 units per panel (16 panels -> 64, as before).
n_panels = len(numerical_features)
fig, axes = matplotlib.pyplot.subplots(n_panels, figsize=(12, 4 * n_panels))

# atleast_1d covers the single-feature case, where subplots returns one Axes
# object rather than an array.
for ax, feature in zip(np.atleast_1d(axes), numerical_features):
    sns.distplot(numeric_data[feature], ax=ax)

In [None]:
# Bar chart of how often each Cloud3pm value occurs.
# NOTE(review): this runs after mean imputation, so the imputed mean shows up
# as its own bar if the column had missing values.
numeric_data['Cloud3pm'].value_counts().plot.bar()

In [None]:
# Preview the full frame again before selecting the target column.
data.head()

In [None]:
# Target series: the RainTomorrow column.
# NOTE(review): rows with a missing target are not dropped here — confirm the
# column has no NaN before fitting the classifiers below.
y = data['RainTomorrow']

In [None]:
# Preview the first target values.
y.head()

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic-regression classifier with scikit-learn's default settings.
# NOTE(review): the features are not scaled anywhere above; the default solver
# may emit a convergence warning — confirm and raise max_iter or scale if so.
logres = LogisticRegression()

In [None]:
# Fit on the imputed numeric features.
# NOTE(review): no train/test split is made, so the scores computed below on
# numeric_data are training-set scores.
logres.fit(X=numeric_data, y=y)

In [None]:
# Display the fitted coefficient array.
logres.coef_

In [None]:
# Print each feature name next to its logistic-regression coefficient.
for feature_name, weight in zip(numerical_features, logres.coef_[0]):
    print(feature_name, weight)

In [None]:
# Predicted labels for the same rows the model was fitted on.
y_pred = logres.predict(numeric_data)

In [None]:
# Display the true labels as an array.
y.values

In [None]:
# Display the predicted labels.
y_pred

In [None]:
# Sanity check: one prediction per true label.
assert len(y.values) == len(y_pred)

In [None]:
# Count the mismatches between true and predicted labels in one vectorised
# comparison, then print the accuracy as a percentage.
errors = int(np.count_nonzero(y.values != y_pred))
total = len(y_pred)

print((1-errors/total)*100)

In [None]:
# Accuracy from the estimator's own score method, on the data it was fitted on.
logres.score(numeric_data, y)

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Decision tree with default settings (no depth limit is passed).
tree = DecisionTreeClassifier()

In [None]:
# Fit the tree on the same numeric features and target as the logistic model.
tree.fit(numeric_data, y)

In [None]:
# Print each feature name next to the importance the fitted tree assigns it.
for feature_name, importance in zip(numerical_features, tree.feature_importances_):
    print(feature_name, importance)

In [None]:
# Accuracy of the tree on the data it was fitted on (training accuracy).
tree.score(numeric_data, y)

In [None]:
# Refit a depth-limited tree (max_depth=8) and report its accuracy on the
# same data; fit() returns the estimator, so construction and fitting chain.
tree = DecisionTreeClassifier(max_depth=8).fit(numeric_data, y)
tree.score(numeric_data, y)

In [None]:
# Display the depth-limited tree's predicted labels.
tree.predict(numeric_data)

In [None]:
# Per-class probability estimates from the tree.
tree_proba = tree.predict_proba(numeric_data)

In [None]:
# Per-class probability estimates from the logistic regression.
logres_proba = logres.predict_proba(numeric_data)

In [None]:
# Display the logistic-regression probabilities.
logres_proba

In [None]:
# Simple two-model ensemble: element-wise average of both probability arrays.
mixed_proba = 0.5 * (logres_proba + tree_proba)

In [None]:
# Turn the averaged probabilities into labels: 'No' when column 0 is strictly
# larger than column 1, otherwise 'Yes' (ties go to 'Yes').
first_col_wins = mixed_proba[:, 0] > mixed_proba[:, 1]
y_pred_mixed = np.where(first_col_wins, 'No', 'Yes').tolist()

In [None]:
# One-line version of the same labelling rule: 'No' when the first probability
# is strictly larger, otherwise 'Yes'.
# NOTE(review): the labels assume column 0 is 'No' and column 1 is 'Yes' —
# confirm against the fitted models' classes_.
y_pred_mixed = ['No' if i[0] > i[1] else 'Yes' for i in mixed_proba]

In [None]:
# Convert the list of labels to a NumPy array.
y_pred_mixed = np.array(y_pred_mixed)

In [None]:
# Count the mismatches between true labels and the ensemble's labels in one
# vectorised comparison, then print the accuracy as a percentage.
errors = int(np.count_nonzero(y.values != y_pred_mixed))
total = len(y_pred_mixed)

print((1-errors/total)*100)

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest with 100 trees.
# NOTE(review): no random_state is passed, so results may differ between runs.
rf = RandomForestClassifier(n_estimators=100)

In [None]:
# Fit the forest on the numeric features and target.
rf.fit(numeric_data, y)

In [None]:
# Accuracy of the forest on the data it was fitted on (training accuracy).
rf.score(numeric_data, y)

In [None]:
# Columns that the rest of the notebook treats as categorical features.
cat_features = [
    'Location',
    'WindGustDir',
    'WindDir9am',
    'WindDir3pm',
]

In [None]:
# Categorical feature frame.
# .copy() makes it independent of `data`: the cells below add columns to it
# and drop columns from it in place, which on a plain slice triggers
# SettingWithCopyWarning.
cat_data = data[cat_features].copy()

In [None]:
# Number of rows per Location value.
cat_data['Location'].value_counts()

In [None]:
# Distribution of the per-location row counts (one observation per location).
sns.distplot(cat_data['Location'].value_counts())

In [None]:
# Distinct Location values.
# NOTE(review): a set has no guaranteed iteration order, so anything that
# loops over it may produce a different order from run to run.
locations = set(cat_data['Location'].values)

In [None]:
# Display the distinct locations.
locations

In [None]:
# Create an 'Adelaide' column initialised to 0; the next cell overwrites it
# with the actual indicator values.
cat_data['Adelaide'] = 0

In [None]:
# 1 where the row's Location is 'Adelaide', 0 everywhere else.
is_adelaide = cat_data['Location'].eq('Adelaide')
cat_data['Adelaide'] = is_adelaide.astype('int64')

In [None]:
# Show the rows flagged as Adelaide to check the indicator.
cat_data[cat_data['Adelaide'] == 1]

In [None]:
# One 0/1 indicator column per location, named after the location.
# Iterating in sorted order makes the resulting column order reproducible
# (plain set iteration order can differ between runs). The vectorised
# comparison replaces a Python-level lambda call per row.
for location in sorted(locations):
    cat_data[location] = cat_data['Location'].eq(location).astype('int64')

In [None]:
# Display the categorical frame with the new location indicator columns.
cat_data

In [None]:
# The raw Location column is now encoded by the indicator columns; remove it.
# Rebinding instead of inplace=True, with errors='ignore', lets the cell be
# re-run without raising KeyError once the column is already gone.
cat_data = cat_data.drop(columns=['Location'], errors='ignore')

In [None]:
# Indicator columns for WindGustDir, with names prefixed 'WindGustDir'.
WindGustDir_dummies = pd.get_dummies(cat_data['WindGustDir'], prefix='WindGustDir')

In [None]:
# Indicator columns for the 9am and 3pm wind directions.
WindDir9am_dummies = pd.get_dummies(cat_data['WindDir9am'], prefix='WindDir9am')
WindDir3pm_dummies = pd.get_dummies(cat_data['WindDir3pm'], prefix='WindDir3pm')
# The original name said "3am" although the data is the 3pm direction; it is
# kept as an alias so cells that still use the old name keep working.
WindDir3am_dummies = WindDir3pm_dummies

In [None]:
# Remove the raw wind-direction columns now that their dummies exist.
# Rebinding instead of inplace=True, with errors='ignore', lets the cell be
# re-run without raising KeyError once the columns are already gone.
cat_data = cat_data.drop(
    columns=['WindGustDir', 'WindDir9am', 'WindDir3pm'], errors='ignore'
)
cat_data.head()

In [None]:
# Append the three wind-direction dummy frames column-wise to cat_data.
dummy_frames = [WindGustDir_dummies, WindDir9am_dummies, WindDir3am_dummies]
cat_data = pd.concat([cat_data, *dummy_frames], axis=1)

In [None]:
# Preview the fully encoded categorical frame.
cat_data.head()

In [None]:
# Full feature matrix: numeric features followed by the encoded categorical
# columns, joined column-wise.
X = pd.concat([numeric_data, cat_data], axis=1)
X.head()

In [None]:
# Print the per-column overview of the combined feature matrix.
X.info()

In [None]:
# Logistic regression on the combined numeric + categorical features;
# fit() returns the estimator, so it is chained onto the constructor.
# The score is computed on the same data used for fitting.
logres_full = LogisticRegression().fit(X, y)
logres_full.score(X, y)