In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# For z-scores (used for outlier detection)
from scipy import stats

# For machine learning models
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.ensemble import  AdaBoostClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest, chi2
from sklearn import preprocessing

# For the validation of models
from sklearn.metrics import accuracy_score, precision_score, recall_score
from numpy import mean

In [2]:
# Load the weather observations from a CSV file in the working directory
df = pd.read_csv('Weather.csv')
# Preview the first rows
df.head()

Unnamed: 0,Date,Location,MinTemp,MaxTemp,Rainfall,Evaporation,Sunshine,WindGustDir,WindGustSpeed,WindDir9am,...,Humidity3pm,Pressure9am,Pressure3pm,Cloud9am,Cloud3pm,Temp9am,Temp3pm,RainToday,RISK_MM,RainTomorrow
0,2010-12-25,NorahHead,17.7,25.2,0.0,,,NNE,39.0,S,...,76.0,1017.7,1012.1,,,22.3,24.3,No,0.0,No
1,2013-06-08,Darwin,22.9,31.6,0.0,7.4,10.9,E,57.0,ESE,...,40.0,1012.6,1009.9,1.0,1.0,26.2,31.2,No,0.0,No
2,2009-04-27,GoldCoast,14.7,27.6,0.0,,,W,31.0,WNW,...,32.0,1014.9,1012.3,,,23.8,26.4,No,0.0,No
3,2011-09-04,GoldCoast,13.4,22.7,0.0,,,SSE,44.0,SSE,...,59.0,1029.0,1026.1,,,19.1,20.7,No,5.0,Yes
4,2015-01-06,Nhil,15.5,37.9,0.0,,,ESE,46.0,ESE,...,20.0,1018.0,1015.2,,,22.1,36.4,No,0.0,No


In [3]:
# Remove the RISK_MM column before any modelling.
# NOTE(review): presumably dropped because it describes next-day rainfall and
# would leak the RainTomorrow label - confirm against the dataset description.
df = df.drop(columns='RISK_MM')
df.shape

(142133, 23)

In [4]:
# Column dtypes and non-null counts, used to spot sparsely populated columns
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 142133 entries, 0 to 142132
Data columns (total 23 columns):
Date             142133 non-null object
Location         142133 non-null object
MinTemp          141496 non-null float64
MaxTemp          141811 non-null float64
Rainfall         140727 non-null float64
Evaporation      81350 non-null float64
Sunshine         74377 non-null float64
WindGustDir      132803 non-null object
WindGustSpeed    132863 non-null float64
WindDir9am       132122 non-null object
WindDir3pm       138355 non-null object
WindSpeed9am     140785 non-null float64
WindSpeed3pm     139503 non-null float64
Humidity9am      140359 non-null float64
Humidity3pm      138523 non-null float64
Pressure9am      128119 non-null float64
Pressure3pm      128152 non-null float64
Cloud9am         88528 non-null float64
Cloud3pm         85091 non-null float64
Temp9am          141229 non-null float64
Temp3pm          139407 non-null float64
RainToday        140727 non-null obje

In [5]:
# Evaporation, Sunshine, Cloud9am and Cloud3pm have the fewest non-null values
# in the info() output; Location and Date are removed along with them.
columns_to_drop = ['Location', 'Date', 'Evaporation', 'Sunshine', 'Cloud9am', 'Cloud3pm']
# `columns=` already selects the axis, so no separate `axis` argument is needed
df = df.drop(columns=columns_to_drop)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 142133 entries, 0 to 142132
Data columns (total 17 columns):
MinTemp          141496 non-null float64
MaxTemp          141811 non-null float64
Rainfall         140727 non-null float64
WindGustDir      132803 non-null object
WindGustSpeed    132863 non-null float64
WindDir9am       132122 non-null object
WindDir3pm       138355 non-null object
WindSpeed9am     140785 non-null float64
WindSpeed3pm     139503 non-null float64
Humidity9am      140359 non-null float64
Humidity3pm      138523 non-null float64
Pressure9am      128119 non-null float64
Pressure3pm      128152 non-null float64
Temp9am          141229 non-null float64
Temp3pm          139407 non-null float64
RainToday        140727 non-null object
RainTomorrow     142133 non-null object
dtypes: float64(12), object(5)
memory usage: 18.4+ MB


In [6]:
# Row/column count after removing the columns above
df.shape

(142133, 17)

In [7]:
# Discard every row that has at least one missing value
df = df.dropna(how='any')
df.shape

(112867, 17)

In [8]:
# Absolute z-score of every numeric column (one row of z per row of df).
# select_dtypes is the public pandas API; the previous private
# _get_numeric_data() call is not part of the documented interface.
z = np.abs(stats.zscore(df.select_dtypes(include=np.number)))
# Print a table with z-scores
print(z)
# Removing outliers: keep only rows where every numeric feature has |z| < 3
df = df[(z < 3).all(axis=1)]
# Looking at the new shape of the dataframe
print(df.shape)

[[0.80465763 0.22111745 0.27648139 ... 0.42344658 0.76085566 0.31775201]
 [1.63615501 1.13748266 0.27648139 ... 0.73963492 1.37450989 1.32671114]
 [0.32494761 0.5647544  0.27648139 ... 0.39470219 0.99687652 0.62482653]
 ...
 [0.35692828 0.25138337 0.16026949 ... 2.37806536 0.33601812 0.4133778 ]
 [0.69843377 0.7668388  0.27648139 ... 0.3659578  1.14304591 0.63271675]
 [1.18842565 2.69816717 0.27648139 ... 1.01270666 1.65773491 2.81821596]]
(107810, 17)


In [9]:
# Check dtypes and non-null counts again after the row filtering
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 107810 entries, 0 to 142132
Data columns (total 17 columns):
MinTemp          107810 non-null float64
MaxTemp          107810 non-null float64
Rainfall         107810 non-null float64
WindGustDir      107810 non-null object
WindGustSpeed    107810 non-null float64
WindDir9am       107810 non-null object
WindDir3pm       107810 non-null object
WindSpeed9am     107810 non-null float64
WindSpeed3pm     107810 non-null float64
Humidity9am      107810 non-null float64
Humidity3pm      107810 non-null float64
Pressure9am      107810 non-null float64
Pressure3pm      107810 non-null float64
Temp9am          107810 non-null float64
Temp3pm          107810 non-null float64
RainToday        107810 non-null object
RainTomorrow     107810 non-null object
dtypes: float64(12), object(5)
memory usage: 14.8+ MB


In [10]:
# Number of distinct values taken by WindGustDir
len(df.WindGustDir.value_counts())

16

In [11]:
# Categorical wind-direction features that will be one-hot encoded
winds = ['WindGustDir', 'WindDir3pm', 'WindDir9am']

# Doing the transformation with "get_dummies": each listed column is replaced
# by one indicator column per category
df = pd.get_dummies(df, columns=winds)

# Checking the new shape
df.shape

(107810, 62)

In [12]:
# Drop one indicator column from each of the three wind-direction groups.
# NOTE(review): presumably to remove the redundant (implied) category per
# group - confirm why these particular directions were chosen.
df = df.drop(['WindGustDir_WSW', 'WindDir3pm_SSW', 'WindDir9am_NNE'], axis =1)
df.shape

(107810, 59)

In [13]:
# Encode the Yes/No columns as 1/0.
# The result is assigned back to the frame: calling replace(..., inplace=True)
# on df[col] is chained assignment, which under pandas Copy-on-Write only
# changes a temporary object and leaves df untouched.
yes_no_codes = {'No': 0, 'Yes': 1}
for rain_col in ('RainToday', 'RainTomorrow'):
    # astype('int64') fails loudly if a value other than Yes/No is present
    # (map would turn it into NaN) instead of passing it on silently.
    df[rain_col] = df[rain_col].map(yes_no_codes).astype('int64')

In [14]:
# Class balance of the encoded RainToday feature (0 = No, 1 = Yes)
df.RainToday.value_counts()

0    85373
1    22437
Name: RainToday, dtype: int64

In [15]:
# Class balance of the encoded RainTomorrow label (0 = No, 1 = Yes)
df.RainTomorrow.value_counts()

0    85417
1    22393
Name: RainTomorrow, dtype: int64

In [16]:
# Rescale every column with a min-max scaler.
# NOTE(review): the scaler is fitted on the whole frame (label column
# included) before any train/test split, so test rows influence the scaling -
# confirm this is acceptable.
scaler = preprocessing.MinMaxScaler()
# Learn the per-column ranges and apply them in one step
scaled_values = scaler.fit_transform(df)
# Rebuild a DataFrame with the original index and column names
df = pd.DataFrame(scaled_values, index=df.index, columns=df.columns)
# Show the first rows of the scaled data frame
df.head()

  return self.partial_fit(X, y)


Unnamed: 0,MinTemp,MaxTemp,Rainfall,WindGustSpeed,WindSpeed9am,WindSpeed3pm,Humidity9am,Humidity3pm,Pressure9am,Pressure3pm,...,WindDir9am_NNW,WindDir9am_NW,WindDir9am_S,WindDir9am_SE,WindDir9am_SSE,WindDir9am_SSW,WindDir9am_SW,WindDir9am_W,WindDir9am_WNW,WindDir9am_WSW
0,0.63369,0.520988,0.0,0.438356,0.135135,0.52381,0.696629,0.76,0.505938,0.429257,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.772727,0.679012,0.0,0.684932,0.594595,0.666667,0.483146,0.4,0.384798,0.376499,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.553476,0.580247,0.0,0.328767,0.297297,0.309524,0.404494,0.32,0.43943,0.434053,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.518717,0.459259,0.0,0.506849,0.945946,0.666667,0.573034,0.59,0.774347,0.764988,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.574866,0.834568,0.0,0.534247,0.594595,0.261905,0.269663,0.2,0.513064,0.503597,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
# Splitting the data into features (X) and labels (y)
X = df.loc[:, df.columns != 'RainTomorrow']
y = df[['RainTomorrow']]
# Score every feature against the label with the chi-squared test.
# k='all' keeps all features, so nothing here depends on a hard-coded column
# count that would silently go stale when the feature set changes.
selector = SelectKBest(chi2, k='all')
# Training; the label is passed as a 1-D array, which is the shape the
# selector expects
selector.fit(X, y.values.ravel())
# Returning scores (one per column of X, in column order)
scores = selector.scores_
# Creating a list of feature names, in the same order as the scores
lista = list(X.columns)
# Creating a dictionary of feature name -> score, highest score first.
# reversed(sorted(...)) is kept as-is so that features with equal scores stay
# in the same relative order as before.
unsorted_pairs = zip(lista, scores)
sorted_pairs = list(reversed(sorted(unsorted_pairs, key=lambda x: x[1])))
k_best_features = dict(sorted_pairs)

In [18]:
# Keep the features whose score reaches at least 1% of RainToday's score,
# preserving the highest-score-first order of k_best_features
min_score = float(0.01 * k_best_features['RainToday'])
K_values = [name for name, score in k_best_features.items() if float(score) >= min_score]

In [19]:
# Selected features together with the label
df_predi = df[K_values + ['RainTomorrow']]
# Feature matrix restricted to the selected features
X = df[K_values]
# Label as a single column selected by name (a Series, not a one-column frame)
y = df['RainTomorrow']

In [20]:
# Candidate feature counts to try: 2 up to the number of selected features
n_features_list = list(range(2,len(K_values)+1))

In [21]:
# Accuracy of the logistic regression for each candidate feature count
accuracy_LR=[]

In [22]:
# Start from an empty list so that re-running this cell does not append a
# second set of results to the previous ones.
accuracy_LR = []

# Split once, leaving 20% of the data for test and the rest for training.
# With a fixed random_state the row partition depends only on the number of
# rows, so one split over all selected features yields the same train/test
# rows as re-splitting each column subset inside the loop.
features_train, features_test, labels_train, labels_test = train_test_split(
    df[K_values], y, test_size=0.2, random_state=42)

for n in n_features_list:
    # Use the n highest-scoring features
    top_n = K_values[:n]

    # Applying Logistic Regression model
    l_clf = LogisticRegression()
    l_clf.fit(features_train[top_n], labels_train)
    # Doing the prediction
    prediction_lr = l_clf.predict(features_test[top_n])
    # Append the accuracy for this feature count
    accuracy_LR.append(accuracy_score(labels_test, prediction_lr))



In [23]:
# Rebuild X from the three highest-scoring features and split 80/20.
# NOTE(review): the 3 is hard-coded; presumably chosen from the accuracy_LR
# results - confirm.
X = df[K_values[:3]]
y = df['RainTomorrow']
features_train, features_test, labels_train, labels_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [24]:
# Hyper-parameter grid: every combination of solver, C and tol is tried
parameters = {
    'solver': ('newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'),
    'C': [0.01, 0.1, 10, 10**5, 10**10, 10**15, 10**20],
    'tol': [10**-20, 10**-15, 10**-10, 10**-5, 0.01, 0.1, 10],
}
# Exhaustive cross-validated search over the grid on the training split
l_clf = LogisticRegression()
clf = GridSearchCV(l_clf, parameters)
clf.fit(features_train, labels_train)
# Keep the best estimator found and display it as the cell output
best_l_clf = clf.best_estimator_
best_l_clf









LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='sag',
          tol=0.01, verbose=0, warm_start=False)

In [25]:
# Logistic regression with every constructor argument spelled out.
# NOTE(review): solver='saga' and tol=0.1 differ from the grid-search result
# displayed above (solver='sag', tol=0.01) - confirm which is intended.
lr_settings = dict(
    C=10,
    class_weight=None,
    dual=False,
    fit_intercept=True,
    intercept_scaling=1,
    max_iter=100,
    multi_class='ovr',
    n_jobs=1,
    penalty='l2',
    random_state=None,
    solver='saga',
    tol=0.1,
    verbose=0,
    warm_start=False,
)
l_clf = LogisticRegression(**lr_settings)

In [26]:
def avaliacao_clf(clf, features, labels, n_iters=1000, test_size=0.3, random_state=None):
    """Estimate precision, recall and accuracy of a classifier over repeated random splits.

    On each iteration the data is split at random into train and test parts,
    ``clf`` is refitted on the train part and scored on the test part. The
    three metrics are averaged over all iterations, printed and returned.

    Parameters
    ----------
    clf : estimator with ``fit`` and ``predict``
        Refitted in place on every iteration.
    features, labels :
        Feature matrix and label vector to evaluate on. These arguments are
        what gets split (previously the notebook globals ``X`` and ``y`` were
        used regardless of what was passed in).
    n_iters : int, default 1000
        Number of random train/test splits.
    test_size : float, default 0.3
        Test share handed to ``train_test_split``.
    random_state : int or None, default None
        When given, iteration ``i`` is split with seed ``random_state + i``,
        so a run can be reproduced while each iteration still gets a
        different split. ``None`` keeps the splits unseeded.

    Returns
    -------
    tuple
        ``(mean precision, mean recall, mean accuracy)``.
    """
    print (clf)

    # One entry per iteration for each metric
    accuracy = []
    precision = []
    recall = []

    for tentativa in range(n_iters):
        # A distinct seed per iteration; None leaves the split unseeded
        seed = None if random_state is None else random_state + tentativa

        # Splitting the data into training and test parts
        features_train, features_test, labels_train, labels_test = train_test_split(
            features, labels, test_size=test_size, random_state=seed)

        # Fit on the training part, predict on the held-out part
        clf.fit(features_train, labels_train)
        predictions = clf.predict(features_test)

        accuracy.append(accuracy_score(labels_test, predictions))
        precision.append(precision_score(labels_test, predictions))
        recall.append(recall_score(labels_test, predictions))

    # Average each metric over all iterations
    mean_precision = mean(precision)
    mean_recall = mean(recall)
    mean_accuracy = mean(accuracy)

    print ("precision: {}".format(mean_precision))
    print ("recall:    {}".format(mean_recall))
    print ("accuracy:  {}".format(mean_accuracy))

    return mean_precision, mean_recall, mean_accuracy

In [27]:
# Evaluate the configured logistic regression on X and y; the displayed tuple
# is (mean precision, mean recall, mean accuracy)
avaliacao_clf(l_clf, X, y)

LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='saga', tol=0.1,
          verbose=0, warm_start=False)
precision: 0.7101899926952426
recall:    0.38167845371125464
accuracy:  0.8370619608570634


(0.7101899926952426, 0.38167845371125464, 0.8370619608570634)