In [26]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [27]:
dataset = pd.read_csv('Dataset/weatherAUS.csv',nrows=4000)
X = dataset.iloc[:,[1,2,3,4,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21]].values
Y = dataset.iloc[:,-1].values
print(X)

[['Albury' 13.4 22.9 ... 16.9 21.8 'No']
 ['Albury' 7.4 25.1 ... 17.2 24.3 'No']
 ['Albury' 12.9 25.7 ... 21.0 23.2 'No']
 ...
 ['BadgerysCreek' 10.0 22.4 ... 17.7 20.7 'No']
 ['BadgerysCreek' 4.6 28.7 ... 14.5 25.3 'No']
 ['BadgerysCreek' 7.8 27.8 ... 18.7 27.4 'No']]


In [28]:
print(Y)

['No' 'No' 'No' ... 'No' 'No' 'No']


In [29]:
Y = Y.reshape(-1,1)
#Dealing with invalid Data
imputer = SimpleImputer(missing_values=np.nan,strategy='most_frequent')
X = imputer.fit_transform(X)
Y = imputer.fit_transform(Y)
print(X)

[['Albury' 13.4 22.9 ... 16.9 21.8 'No']
 ['Albury' 7.4 25.1 ... 17.2 24.3 'No']
 ['Albury' 12.9 25.7 ... 21.0 23.2 'No']
 ...
 ['BadgerysCreek' 10.0 22.4 ... 17.7 20.7 'No']
 ['BadgerysCreek' 4.6 28.7 ... 14.5 25.3 'No']
 ['BadgerysCreek' 7.8 27.8 ... 18.7 27.4 'No']]


In [30]:
#Encoding Dataset
le1 = LabelEncoder()
X[:,0] = le1.fit_transform(X[:,0])
le2 = LabelEncoder()
X[:,4] = le2.fit_transform(X[:,4])
le3 = LabelEncoder()
X[:,6] = le3.fit_transform(X[:,6])
le4 = LabelEncoder()
X[:,7] = le4.fit_transform(X[:,7])
le5 = LabelEncoder()
X[:,-1] = le5.fit_transform(X[:,-1])
le6 = LabelEncoder()
Y[:,-1] = le6.fit_transform(Y[:,-1])
print(X)

[[0 13.4 22.9 ... 16.9 21.8 0]
 [0 7.4 25.1 ... 17.2 24.3 0]
 [0 12.9 25.7 ... 21.0 23.2 0]
 ...
 [1 10.0 22.4 ... 17.7 20.7 0]
 [1 4.6 28.7 ... 14.5 25.3 0]
 [1 7.8 27.8 ... 18.7 27.4 0]]


In [31]:
print(Y)

[[0]
 [0]
 [0]
 ...
 [0]
 [0]
 [0]]


In [32]:
Y = np.array(Y,dtype=float)
print(Y)

[[0.]
 [0.]
 [0.]
 ...
 [0.]
 [0.]
 [0.]]


In [33]:
#Feature Scaling
sc = StandardScaler()
X = sc.fit_transform(X)
print(X)

[[-5.61951487e-01  5.87241678e-01  1.54909661e-03 ...  3.42299439e-01
   3.88963551e-02 -4.99609344e-01]
 [-5.61951487e-01 -4.05810961e-01  2.96615118e-01 ...  3.91129021e-01
   3.84764709e-01 -4.99609344e-01]
 [-5.61951487e-01  5.04487292e-01  3.77087669e-01 ...  1.00963706e+00
   2.32582633e-01 -4.99609344e-01]
 ...
 [ 1.77951304e+00  2.45118493e-02 -6.55113628e-02 ...  4.72511657e-01
  -1.13285721e-01 -4.99609344e-01]
 [ 1.77951304e+00 -8.69235526e-01  7.79450426e-01 ... -4.83372170e-02
   5.23112051e-01 -4.99609344e-01]
 [ 1.77951304e+00 -3.39607452e-01  6.58741599e-01 ...  6.35276931e-01
   8.13641468e-01 -4.99609344e-01]]


In [34]:
#Splitting Dataset into Training set and Test set
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.2,random_state=0)
print(X_train)

[[-0.56195149  0.42173291  0.44414813 ...  0.30974638  0.45393838
  -0.49960934]
 [-0.56195149 -1.13404956 -1.1653029  ... -1.18769413 -1.04021291
   2.00156384]
 [-0.56195149  1.62994695  1.154989   ...  1.18867886  1.11800562
  -0.49960934]
 ...
 [-0.56195149 -0.83613377 -0.54834667 ... -0.4552504  -0.43148461
  -0.49960934]
 [-0.56195149 -0.05824254  0.43073604 ...  0.2609168   0.45393838
  -0.49960934]
 [-0.56195149 -0.24030219 -1.20553917 ... -0.86216358 -1.51059387
  -0.49960934]]


In [35]:
print(Y_train)

[[0.]
 [0.]
 [0.]
 ...
 [0.]
 [0.]
 [1.]]


In [138]:
#Training Model
classifier = RandomForestClassifier(n_estimators=100,random_state=0)
classifier.fit(X_train,Y_train)
print(classifier.score(X_train,Y_train))
y_pred = le6.inverse_transform(np.array(classifier.predict(X_test),dtype=int))
Y_test1 = le6.inverse_transform(np.array(Y_test,dtype=int))
print(y_pred)

  This is separate from the ipykernel package so we can avoid doing imports until


0.9996875
['No' 'No' 'No' 'No' 'No' 'No' 'Yes' 'No' 'No' 'Yes' 'No' 'Yes' 'No' 'No'
 'No' 'No' 'No' 'No' 'No' 'No' 'Yes' 'No' 'No' 'No' 'No' 'No' 'No' 'No'
 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'Yes' 'No' 'No' 'No'
 'No' 'No' 'No' 'Yes' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No'
 'No' 'No' 'No' 'No' 'Yes' 'No' 'No' 'No' 'Yes' 'No' 'No' 'No' 'No' 'No'
 'No' 'No' 'Yes' 'Yes' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No'
 'Yes' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'Yes' 'No' 'No' 'No' 'Yes'
 'No' 'No' 'No' 'No' 'No' 'Yes' 'No' 'No' 'Yes' 'No' 'No' 'No' 'No' 'Yes'
 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No'
 'No' 'No' 'No' 'No' 'Yes' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No'
 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'Yes' 'No' 'No' 'No'
 'No' 'No' 'No' 'No' 'No' 'Yes' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No'
 'Yes' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No'
 'Yes' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes' 'No' '

  y = column_or_1d(y, warn=True)


In [139]:
print(Y_test1)

['No' 'No' 'No' 'No' 'No' 'No' 'Yes' 'No' 'No' 'No' 'No' 'Yes' 'No' 'No'
 'No' 'No' 'Yes' 'No' 'No' 'No' 'Yes' 'No' 'No' 'No' 'No' 'Yes' 'No' 'No'
 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No'
 'No' 'No' 'No' 'No' 'Yes' 'No' 'No' 'No' 'No' 'No' 'Yes' 'No' 'No' 'Yes'
 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'Yes' 'No' 'No' 'Yes' 'No' 'Yes'
 'No' 'No' 'Yes' 'Yes' 'No' 'No' 'No' 'No' 'Yes' 'No' 'No' 'No' 'No' 'No'
 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'Yes' 'No' 'No' 'No' 'No' 'Yes'
 'Yes' 'No' 'No' 'Yes' 'No' 'No' 'Yes' 'No' 'Yes' 'No' 'No' 'No' 'No'
 'Yes' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No'
 'Yes' 'No' 'No' 'No' 'No' 'Yes' 'No' 'No' 'No' 'No' 'No' 'No' 'No' 'No'
 'Yes' 'No' 'No' 'No' 'No' 'No' 'No' 'Yes' 'No' 'No' 'Yes' 'No' 'No' 'No'
 'No' 'No' 'No' 'No' 'No' 'No' 'Yes' 'No' 'Yes' 'No' 'No' 'No' 'No' 'No'
 'No' 'Yes' 'No' 'No' 'No' 'No' 'No' 'Yes' 'No' 'Yes' 'Yes' 'Yes' 'No'
 'No' 'No' 'Yes' 'Yes' 'Yes' 'No' 'Yes' 'No' 'Yes' 'No

In [140]:
y_pred = y_pred.reshape(-1,1)
Y_test1 = Y_test1.reshape(-1,1)
df = np.concatenate((Y_test1,y_pred),axis=1)
dataframe = pd.DataFrame(df,columns=['Rain on Tommorrow','Prediction of Rain'])
print(dataframe)

    Rain on Tommorrow Prediction of Rain
0                  No                 No
1                  No                 No
2                  No                 No
3                  No                 No
4                  No                 No
..                ...                ...
795                No                 No
796                No                 No
797                No                 No
798               Yes                Yes
799               Yes                Yes

[800 rows x 2 columns]


In [141]:
rf_accuracy = accuracy_score(Y_test1,y_pred)
print("\nRandom Forest Accuracy: "+str(rf_accuracy))


Random Forest Accuracy: 0.87375


In [142]:
from sklearn.ensemble import BaggingClassifier
dt = BaggingClassifier(n_estimators=250,max_features=12)
dt.fit(X_train,Y_train)
print(dt.score(X_train,Y_train))
y_pred = le6.inverse_transform(np.array(dt.predict(X_test),dtype=int))
Y_test2 = le6.inverse_transform(np.array(Y_test,dtype=int))
y_pred = y_pred.reshape(-1,1)
Y_test2 = Y_test2.reshape(-1,1)
df = np.concatenate((Y_test2,y_pred),axis=1)
dataframe = pd.DataFrame(df,columns=['Rain on Tommorrow','Prediction of Rain'])
print(dataframe)
dt_accuracy = accuracy_score(Y_test1,y_pred)
print("\nBagging Classifier Accuracy: "+str(dt_accuracy))
#print(y_pred)
#print(Y_test)

  y = column_or_1d(y, warn=True)


1.0
    Rain on Tommorrow Prediction of Rain
0                  No                 No
1                  No                 No
2                  No                 No
3                  No                 No
4                  No                 No
..                ...                ...
795                No                 No
796                No                Yes
797                No                 No
798               Yes                Yes
799               Yes                Yes

[800 rows x 2 columns]

Bagging Classifier Accuracy: 0.8775


  y = column_or_1d(y, warn=True)


In [143]:
from sklearn.ensemble import GradientBoostingClassifier
dt = GradientBoostingClassifier(n_estimators=170,max_depth=1)
dt.fit(X_train,Y_train)
print(dt.score(X_train,Y_train))
y_pred = le6.inverse_transform(np.array(dt.predict(X_test),dtype=int))
Y_test3 = le6.inverse_transform(np.array(Y_test,dtype=int))
y_pred = y_pred.reshape(-1,1)
Y_test3 = Y_test3.reshape(-1,1)
df = np.concatenate((Y_test3,y_pred),axis=1)
dataframe = pd.DataFrame(df,columns=['Rain on Tommorrow','Prediction of Rain'])
print(dataframe)
dt_accuracy = accuracy_score(Y_test1,y_pred)
print("\nGradient Boosting Accuracy: "+str(dt_accuracy))
#print(y_pred)
#print(Y_test)

  y = column_or_1d(y, warn=True)


0.878125
    Rain on Tommorrow Prediction of Rain
0                  No                 No
1                  No                 No
2                  No                 No
3                  No                 No
4                  No                 No
..                ...                ...
795                No                 No
796                No                Yes
797                No                 No
798               Yes                Yes
799               Yes                Yes

[800 rows x 2 columns]

Gradient Boosting Accuracy: 0.88


  y = column_or_1d(y, warn=True)


In [144]:
import xgboost as xgb
xg = xgb.XGBClassifier(n_estimators=140, max_depth=12)
xg.fit(X_train,Y_train)
print(xg.score(X_train,Y_train))
y_pred = le6.inverse_transform(np.array(xg.predict(X_test),dtype=int))
Y_test4 = le6.inverse_transform(np.array(Y_test,dtype=int))
#print(y_pred)
#print(Y_test)
y_pred = y_pred.reshape(-1,1)
Y_test4 = Y_test4.reshape(-1,1)
df = np.concatenate((Y_test4,y_pred),axis=1)
dataframe = pd.DataFrame(df,columns=['Rain on Tommorrow','Prediction of Rain'])
print(dataframe)
dt_accuracy = accuracy_score(Y_test1,y_pred)
print("\nXGBoost Accuracy: "+str(dt_accuracy))

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


1.0
    Rain on Tommorrow Prediction of Rain
0                  No                 No
1                  No                 No
2                  No                 No
3                  No                 No
4                  No                 No
..                ...                ...
795                No                 No
796                No                 No
797                No                 No
798               Yes                Yes
799               Yes                 No

[800 rows x 2 columns]

XGBoost Accuracy: 0.875
