# Problem statement:- predicting the rainfall - it will rain or not

#Rainfall forms the primary input to the river basin, affecting the water capacity a
#stream, particularly during the torrential rainfall event. Moreover, one of the major focuses of
#climate change study is to understand whether there are extreme changes in the occurrence and
#frequency of heavy rainfall events. The accuracy level of the ML models used in predicting
#rainfall based on historical data has been one of the most critical concerns in hydrological
#studies. An accurate ML model could give early alerts of severe weather to help prevent natural
#disasters and destruction. Hence, there is needs to develop ML algorithms capable in predicting
#rainfall with acceptable level of precision and in reducing the error in the dataset of the projected
#rainfall from climate change model with the expected observable rainfall.

In [1]:
# libraries required
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn import model_selection
from sklearn import metrics
from sklearn import linear_model
from sklearn import ensemble
from sklearn import tree
from sklearn import svm
import xgboost

In [2]:
data = pd.read_csv(r"C:\Users\prana\Documents\Rainfall ML Project\Rainfall.csv")

In [3]:
data.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RainTomorrow
0,2008-12-01,Delhi,13.4,22.9,0.6,,,W,44.0,W,...,71.0,22.0,1007.7,1007.1,8.0,,16.9,21.8,No,No
1,2008-12-02,Delhi,7.4,25.1,0.0,,,WNW,44.0,NNW,...,44.0,25.0,1010.6,1007.8,,,17.2,24.3,No,No
2,2008-12-03,Delhi,12.9,25.7,0.0,,,WSW,46.0,W,...,38.0,30.0,1007.6,1008.7,,2.0,21.0,23.2,No,No
3,2008-12-04,Delhi,9.2,28.0,0.0,,,NE,24.0,SE,...,45.0,16.0,1017.6,1012.8,,,18.1,26.5,No,No
4,2008-12-05,Delhi,17.5,32.3,1.0,,,W,41.0,ENE,...,82.0,33.0,1010.8,1006.0,7.0,8.0,17.8,29.7,No,No


In [4]:
data.isnull().mean() #Evaporation    Sunshine         Cloud9am           Cloud3pm         

Date             0.000000
Location         0.000000
MinTemp          0.010209
MaxTemp          0.008669
Rainfall         0.022419
Evaporation      0.431665
Sunshine         0.480098
WindGustDir      0.070989
WindGustSpeed    0.070555
WindDir9am       0.072639
WindDir3pm       0.029066
WindSpeed9am     0.012148
WindSpeed3pm     0.021050
Humidity9am      0.018246
Humidity3pm      0.030984
Pressure9am      0.103568
Pressure3pm      0.103314
Cloud9am         0.384216
Cloud3pm         0.408071
Temp9am          0.012148
Temp3pm          0.024811
RainToday        0.022419
RainTomorrow     0.022460
dtype: float64

In [5]:
# removing columns with more than 20% missing values and segregatting cat and num variables
data_cat = data[['RainToday','WindGustDir','WindDir9am','WindDir3pm']]
data.drop(columns=['Evaporation','Sunshine','Cloud9am','Cloud3pm'],axis=1,inplace=True)
data.drop(columns=['RainToday','WindGustDir','WindDir9am','WindDir3pm'],axis=1,inplace=True)

In [6]:
# filling the missing data of numeric variables with mean
data['MinTemp'].fillna(data['MinTemp'].mean(),inplace=True)
data['MaxTemp'].fillna(data['MaxTemp'].mean(),inplace=True)
data['Rainfall'].fillna(data['Rainfall'].mean(),inplace=True)
data['WindGustSpeed'].fillna(data['WindGustSpeed'].mean(),inplace=True)
data['WindSpeed9am'].fillna(data['WindSpeed9am'].mean(),inplace=True)
data['WindSpeed3pm'].fillna(data['WindSpeed3pm'].mean(),inplace=True)
data['Humidity9am'].fillna(data['Humidity9am'].mean(),inplace=True)
data['Humidity3pm'].fillna(data['Humidity3pm'].mean(),inplace=True)
data['Pressure9am'].fillna(data['Pressure9am'].mean(),inplace=True)
data['Pressure3pm'].fillna(data['Pressure3pm'].mean(),inplace=True)
data['Temp9am'].fillna(data['Temp9am'].mean(),inplace=True)
data['Temp3pm'].fillna(data['Temp3pm'].mean(),inplace=True)

In [7]:
# loading the names of categorical columns
cat_names = data_cat.columns

In [8]:
# intializing the simple imputer for missing categorical values
import numpy as np
from sklearn.impute import SimpleImputer
imp_mode = SimpleImputer(missing_values=np.nan, strategy='most_frequent')

In [9]:
# fitting and transforming the missing data
data_cat = imp_mode.fit_transform(data_cat)

In [10]:
# converting array to dataframe
data_cat = pd.DataFrame(data_cat,columns=cat_names)

In [11]:
# concatinating the categorical and numeric data
data = pd.concat([data,data_cat],axis=1)

In [12]:
data.shape

(145460, 19)

In [13]:
data.isnull().sum()

Date                0
Location            0
MinTemp             0
MaxTemp             0
Rainfall            0
WindGustSpeed       0
WindSpeed9am        0
WindSpeed3pm        0
Humidity9am         0
Humidity3pm         0
Pressure9am         0
Pressure3pm         0
Temp9am             0
Temp3pm             0
RainTomorrow     3267
RainToday           0
WindGustDir         0
WindDir9am          0
WindDir3pm          0
dtype: int64

In [14]:
data.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,Temp9am,Temp3pm,RainTomorrow,RainToday,WindGustDir,WindDir9am,WindDir3pm
0,2008-12-01,Delhi,13.4,22.9,0.6,44.0,20.0,24.0,71.0,22.0,1007.7,1007.1,16.9,21.8,No,No,W,W,WNW
1,2008-12-02,Delhi,7.4,25.1,0.0,44.0,4.0,22.0,44.0,25.0,1010.6,1007.8,17.2,24.3,No,No,WNW,NNW,WSW
2,2008-12-03,Delhi,12.9,25.7,0.0,46.0,19.0,26.0,38.0,30.0,1007.6,1008.7,21.0,23.2,No,No,WSW,W,WSW
3,2008-12-04,Delhi,9.2,28.0,0.0,24.0,11.0,9.0,45.0,16.0,1017.6,1012.8,18.1,26.5,No,No,NE,SE,E
4,2008-12-05,Delhi,17.5,32.3,1.0,41.0,7.0,20.0,82.0,33.0,1010.8,1006.0,17.8,29.7,No,No,W,ENE,NW


In [15]:
#importing the labelencoder
from sklearn.preprocessing import LabelEncoder

In [16]:
le = LabelEncoder()

In [17]:
#fitting and transforming the categorical data
data['Location'] = le.fit_transform(data['Location'])
data['RainToday'] = le.fit_transform(data['RainToday'])
data['WindGustDir'] = le.fit_transform(data['WindGustDir'])
data['WindDir9am'] = le.fit_transform(data['WindDir9am'])
data['WindDir3pm'] = le.fit_transform(data['WindDir3pm'])

In [18]:
# spliiting the date column into year,month,day
data[["year", "month", "day"]] = data["Date"].str.split("-", expand = True)

In [19]:
# removing the main column
data.drop(['Date'],axis=1,inplace=True)

In [20]:
data.head()

Unnamed: 0,Location,MinTemp,MaxTemp,Rainfall,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,...,Temp9am,Temp3pm,RainTomorrow,RainToday,WindGustDir,WindDir9am,WindDir3pm,year,month,day
0,14,13.4,22.9,0.6,44.0,20.0,24.0,71.0,22.0,1007.7,...,16.9,21.8,No,0,13,13,14,2008,12,1
1,14,7.4,25.1,0.0,44.0,4.0,22.0,44.0,25.0,1010.6,...,17.2,24.3,No,0,14,6,15,2008,12,2
2,14,12.9,25.7,0.0,46.0,19.0,26.0,38.0,30.0,1007.6,...,21.0,23.2,No,0,15,13,15,2008,12,3
3,14,9.2,28.0,0.0,24.0,11.0,9.0,45.0,16.0,1017.6,...,18.1,26.5,No,0,4,9,0,2008,12,4
4,14,17.5,32.3,1.0,41.0,7.0,20.0,82.0,33.0,1010.8,...,17.8,29.7,No,0,13,1,7,2008,12,5


In [21]:
data.dropna(axis=0,how='any',inplace=True)

In [22]:
data.shape

(142193, 21)

In [23]:
y = data['RainTomorrow']
x = data.drop('RainTomorrow',axis=1)

In [24]:
from sklearn.preprocessing import StandardScaler

In [25]:
names = x.columns

In [26]:
sc = StandardScaler()

In [27]:
x = sc.fit_transform(x)

In [28]:
x = pd.DataFrame(x,columns=names)

In [29]:
from sklearn import model_selection

In [30]:
x_train,x_test,y_train,y_test = model_selection.train_test_split(x,y,test_size =0.2,random_state =0)

In [33]:
#Models intilization of the models
XGBoost = xgboost.XGBRFClassifier()
Rand_forest = ensemble.RandomForestClassifier()
svm = svm.SVC()
Dtree = tree.DecisionTreeClassifier()
GBM = ensemble.GradientBoostingClassifier()
log = linear_model.LogisticRegression()

In [34]:
# fitting the model
XGBoost.fit(x_train,y_train)
Rand_forest.fit(x_train,y_train)
svm.fit(x_train,y_train)
Dtree.fit(x_train,y_train)
GBM.fit(x_train,y_train)
log.fit(x_train,y_train)

ValueError: Invalid classes inferred from unique values of `y`.  Expected: [0 1], got ['No' 'Yes']

In [54]:
# predicting the unknown values
p1 = XGBoost.predict(x_train)
p2 = Rand_forest.predict(x_train)
p3 = svm.predict(x_train)
p4 = Dtree.predict(x_train)
p5 = GBM.predict(x_train)
p6 = log.predict(x_train)

In [55]:
#checking the accuraccy score
print("xgboost:",metrics.accuracy_score(y_train,p1))
print("Rand_forest:",metrics.accuracy_score(y_train,p2))
print("svm:",metrics.accuracy_score(y_train,p3))
print("Dtree:",metrics.accuracy_score(y_train,p4))
print("GBM:",metrics.accuracy_score(y_train,p5))
print("log:",metrics.accuracy_score(y_train,p6))

xgboost: 0.8428802503648223
Rand_forest: 0.999973627301018
svm: 0.8571302987147704
Dtree: 1.0
GBM: 0.849069043725935
log: 0.8386606185277001


In [56]:
# predicting the test unknown values
t1 = XGBoost.predict(x_test)
t2 = Rand_forest.predict(x_test)
t3 = svm.predict(x_test)
t4 = Dtree.predict(x_test)
t5 = GBM.predict(x_test)
t6 = log.predict(x_test)

In [57]:
print("xgboost:",metrics.accuracy_score(y_test,t1))
print("Rand_forest:",metrics.accuracy_score(y_test,t2))
print("svm:",metrics.accuracy_score(y_test,t3))
print("Dtree:",metrics.accuracy_score(y_test,t4))
print("GBM:",metrics.accuracy_score(y_test,t5))
print("log:",metrics.accuracy_score(y_test,t6))

xgboost: 0.8420478919793242
Rand_forest: 0.8573437884595099
svm: 0.8525967861035901
Dtree: 0.7827982699813636
GBM: 0.8499947255529379
log: 0.8418369140968388


# Identifying the Best parameters combination using Gridsearchcv

In [58]:
model = ensemble.RandomForestClassifier()

In [63]:
parameters = {"max_features":["log2","sqrt"],
              'n_estimators': [5,10,15,20,25],
               'criterion':['gini','entropy']}

In [64]:
Grid_search = sklearn.model_selection.GridSearchCV(estimator=model,param_grid=parameters,cv=10)

In [65]:
Grid_search.fit(x_train,y_train)

GridSearchCV(cv=10, estimator=RandomForestClassifier(),
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_features': ['log2', 'sqrt'],
                         'n_estimators': [5, 10, 15, 20, 25]})

In [66]:
Grid_search.best_params_  # best combination,which gives better accuracy

{'criterion': 'entropy', 'max_features': 'sqrt', 'n_estimators': 25}

# Hyper-parameter tuned Model 

In [71]:
model = ensemble.RandomForestClassifier(criterion='entropy',max_features='sqrt',n_estimators = 25)

In [74]:
model.fit(x_train,y_train)
y_tp= model.predict(x_train)
sklearn.metrics.accuracy_score(y_train,y_tp)

0.9983648926631151

In [77]:
metrics.confusion_matrix(y_test,y_pred)

array([[20996,  1071],
       [ 3103,  3269]], dtype=int64)

In [76]:
y_pred= model.predict(x_test)
sklearn.metrics.accuracy_score(y_test,y_pred)

0.8532297197510461

In [78]:
import pickle

In [79]:
pickle.dump(model,open('rainfall.pkl','wb'))
pickle.dump(le,open('encoder.pkl','wb'))
pickle.dump(imp_mode,open('impter.pkl','wb'))