In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the weather CSV and preview the first and last ten rows.
WEATHER_CSV = "/kaggle/input/weather-dataset-rattle-package/weatherAUS.csv"
data = pd.read_csv(WEATHER_CSV)

for preview in (data.head(10), data.tail(10)):
    display(preview)

**Implementation: Data Exploration**
We need to predict whether it will rain tomorrow. The answer is "yes" when 1 mm or more of rain is recorded for that day. To get a first feel for the data, we compute the following:

* The total number of days, 'n_days'
* The number of days with at least 1 mm of rain, 'n_atleast_1mm'
* The number of days with less than 1 mm of rain, 'n_lessthan_1mm'
* The percentage of days with 1 mm or more of rain, 'rainy_days_percentage'

In [None]:
# Summary counts for the exploration step.
#
# The rain counts are taken directly from the 'Rainfall' column by summing a
# boolean mask. Looking the column up by name (rather than by position in the
# result of DataFrame.count()) keeps the numbers correct even if columns are
# added, removed or reordered before this cell is run.

# Total number of days (rows) in the data set
n_days = len(data)

rainfall = data['Rainfall']

# The number of days with at least 1mm of rain.
# Rows with a missing Rainfall value compare as False and are not counted.
n_atleast_1mm = int((rainfall >= 1.0).sum())

# The number of days with less than 1mm of rain (missing values excluded too)
n_lessthan_1mm = int((rainfall < 1.0).sum())

# Percentage of all recorded days with 1mm or more of rain;
# guarded so an empty frame yields 0.0 instead of a ZeroDivisionError.
rainy_days_percentage = (n_atleast_1mm * 100 / n_days) if n_days else 0.0

print(n_days,n_atleast_1mm,n_lessthan_1mm,rainy_days_percentage)

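* Total number of days recorded = 145460
* Number of days with at least 1 mm of rain = 33639
* Number of days with less than 1 mm of rain = 108560
* rainy_days_percentage = 23.1259

The two rain counts add up to less than the total (by 3261 days) because days with a missing Rainfall value match neither condition.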

Handle missing values by dropping the columns that are mostly null: Evaporation, Sunshine, Cloud9am and Cloud3pm. The Location column is dropped in the same step.

In [None]:
# Remove the columns that are not used for modelling: the four mostly-null
# measurement columns plus Location.
#
# The frame is rebound instead of mutated in place, and errors="ignore" lets
# the cell be re-run safely: on a second run the columns are already gone and
# an in-place drop would raise KeyError.
COLUMNS_TO_DROP = ["Evaporation","Sunshine","Cloud9am","Cloud3pm","Location"]
data = data.drop(columns=COLUMNS_TO_DROP, errors="ignore")
display(data.head(n=10))

In [None]:
# Delete every row that contains at least one null value.
#
# DataFrame.dropna() with its defaults already means "axis=0, how='any',
# all columns, return a new frame". Passing `how` and `thresh` together is
# rejected with a TypeError by recent pandas releases, even when thresh is
# None, so the redundant arguments are left out.
data1 = data.dropna()
data1_len = len(data1)
display(data1.head(n=100))

A good amount of data is left, so we can afford to remove the rows with null values.
TODO: Check for skewed columns in the data and transform them if the accuracy is not good.

In [None]:
# Rescale the numeric columns with scikit-learn's MinMaxScaler (default settings)
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
numerical = [
    'MinTemp', 'MaxTemp', 'Rainfall',
    'WindGustSpeed', 'WindSpeed9am', 'WindSpeed3pm',
    'Humidity9am', 'Humidity3pm',
    'Pressure9am', 'Pressure3pm',
    'Temp9am', 'Temp3pm',
]

# Despite the variable name, no log transform is applied in this cell; only
# min-max scaling.
# NOTE(review): pd.DataFrame(data=data1) does not request a copy, so whether
# data1 itself also ends up holding the scaled values depends on the pandas
# version in use -- confirm before relying on data1 downstream.
features_log_minmax_transform = pd.DataFrame(data=data1)
scaled_values = scaler.fit_transform(data1[numerical])
features_log_minmax_transform[numerical] = scaled_values

# Preview a few records after scaling
display(features_log_minmax_transform.head(5))

In [None]:
# Split the data into the feature columns and the target label.
#
# Both are taken from features_log_minmax_transform, the frame that received
# the min-max scaled numeric columns. Reading from data1 only picks up the
# scaled values when the two frames happen to share storage, which is not
# guaranteed across pandas versions; using the scaled frame directly makes
# the scaling step take effect regardless.
rain_tomorrow_raw = features_log_minmax_transform['RainTomorrow']
features_raw = features_log_minmax_transform.drop(columns='RainTomorrow')

In [None]:
# Turn non-numeric data into numbers: one-hot encode the feature columns and
# label-encode the target.
from sklearn.preprocessing import LabelEncoder

# NOTE(review): get_dummies expands every non-numeric column into one indicator
# column per distinct value; if a high-cardinality text column (for example a
# date string) is still present in features_raw, the result becomes very wide
# -- confirm which columns reach this point.
features_final = pd.get_dummies(features_raw)

# Fit the encoder on the target and encode it in a single step
le = LabelEncoder()
rain_tomorrow_final = le.fit_transform(rain_tomorrow_raw)

# Names of all columns after one-hot encoding
encoded = features_final.columns.tolist()
display(features_final.head(n=100))


In [None]:
# Safety net: remove any feature rows that still contain a null value.
#
# The encoded target array must be filtered with the same row mask. Dropping
# rows from features_final alone would leave rain_tomorrow_final with a
# different length and break the pairing between features and labels.
complete_rows = features_final.notna().all(axis=1).to_numpy()
features_final = features_final[complete_rows]
rain_tomorrow_final = rain_tomorrow_final[complete_rows]

# Total number of remaining null cells (expected to be 0)
features_final.isnull().sum().sum()
#print(len(features_final))

In [None]:
# Shuffle the data and hold out 20% of it for testing
from sklearn.model_selection import train_test_split

# Split the encoded features and the encoded rain-tomorrow labels into
# training and testing sets; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features_final,
    rain_tomorrow_final,
    test_size=0.2,
    random_state=0,
)

# Report how many samples ended up in each part
print("Training set has {} samples.".format(len(X_train)))
print("Testing set has {} samples.".format(len(X_test)))

In [None]:
# Random forest classifier: fit on the training split, score on the test split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

# A fixed random_state makes the forest (and therefore the reported accuracy)
# reproducible between runs; 0 matches the seed used for the train/test split.
model_rf = RandomForestClassifier(random_state=0)
model_rf.fit(X_train, y_train)

# Make predictions for the test set
y_pred_test = model_rf.predict(X_test)

# Accuracy on the test set (shown as the cell output)
metrics.accuracy_score(y_test, y_pred_test)

Implementing an AdaBoost classifier and inspecting its feature importances

In [None]:
# AdaBoost classifier: fit on the training split and plot the ten features
# with the highest importance.
from sklearn.ensemble import AdaBoostClassifier

# Fixed random_state so the fitted model and the importances are reproducible
model = AdaBoostClassifier(random_state=0).fit(X_train, y_train)

# Extract the feature importances using .feature_importances_
importances = model.feature_importances_

# Label each importance with the column name of the data the model was fit on
column_importances = pd.Series(importances, index=X_train.columns)

# Horizontal bar chart of the ten largest importances, with labels so the
# figure can be read on its own
ax = column_importances.nlargest(10).plot(kind='barh')
ax.set(
    title='AdaBoost: 10 most important features',
    xlabel='Feature importance',
    ylabel='Feature',
);

In [None]:
# Accuracy of the AdaBoost classifier on the held-out test set
y_pred = model.predict(X_test)
metrics.accuracy_score(y_true=y_test, y_pred=y_pred)


Implementing model tuning using GridSearchCV. The classifier tuned here is XGBoost's XGBClassifier.

In [None]:
# Grid search over XGBoost hyper-parameters, scored with F-beta (beta = 0.5).
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.metrics import fbeta_score
from xgboost import XGBClassifier

#Initialize the classifier
clf= XGBClassifier()

#Parameters
# Every entry lists a single value, so the grid contains exactly one
# parameter combination.
# NOTE(review): 'silent', 'nthread' and 'seed' are older XGBoost parameter
# names; newer releases use 'verbosity', 'n_jobs' and 'random_state' --
# confirm against the installed xgboost version.
parameters =  {'nthread':[4], #when use hyperthread, xgboost may become slower
              'objective':['binary:logistic'],
              'learning_rate': [0.05], #so called `eta` value
              'max_depth': [6],
              'min_child_weight': [11],
              'silent': [1],
              'subsample': [0.8],
              'colsample_bytree': [0.7],
              'n_estimators': [5], #number of trees, change it to 1000 for better results
              'missing':[-999],
              'seed': [1337]}

# Scorer that GridSearchCV uses to rank parameter combinations
scorer = make_scorer(fbeta_score,beta=0.5)

# `scoring` has to be passed by keyword: current scikit-learn releases accept
# only the estimator and the parameter grid positionally, so a third
# positional argument raises a TypeError.
grid_obj = GridSearchCV(clf, parameters, scoring=scorer)

# Run the search on the training split
grid_fit = grid_obj.fit(X_train,y_train)



In [None]:

# Best estimator found by the grid search
best_clf = grid_fit.best_estimator_

# Baseline: fit the classifier with its default settings on the training
# split, then predict the test split
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)

# Tuned model: predict the same test split with the grid-search winner
best_predictions = best_clf.predict(X_test)





In [None]:
from sklearn.metrics import accuracy_score

# Compute test-set accuracy and F-score (beta = 0.5) for both models first ...
unoptimized_accuracy = accuracy_score(y_test, predictions)
unoptimized_fscore = fbeta_score(y_test, predictions, beta=0.5)
optimized_accuracy = accuracy_score(y_test, best_predictions)
optimized_fscore = fbeta_score(y_test, best_predictions, beta=0.5)

# ... then report the before-and-after scores
print("Unoptimized model\n------")
print("Accuracy score on testing data: {:.4f}".format(unoptimized_accuracy))
print("F-score on testing data: {:.4f}".format(unoptimized_fscore))
print("\nOptimized Model\n------")
print("Final accuracy score on the testing data: {:.4f}".format(optimized_accuracy))
print("Final F-score on the testing data: {:.4f}".format(optimized_fscore))

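**Created a model to predict whether it will rain tomorrow. The highest accuracy is achieved by the XGBoost classifier: 0.8576 for the unoptimized model and 0.8449 for the model selected with GridSearchCV.** Note that the grid-searched model does not beat the default one here; the grid contains a single parameter combination with only 5 trees.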