# Automated Machine Learning

Today I want to learn about and try out automated machine learning with TPOT. TPOT (Tree-based Pipeline Optimization Tool) is a Python library for automated machine learning. It uses a tree-based structure to represent a model pipeline for a predictive modeling problem, including data preparation steps, modeling algorithms, and model hyperparameters.

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
import io
from sklearn.preprocessing import LabelEncoder

In [None]:
# Load the weatherAUS CSV from the Kaggle input directory into a DataFrame.
weather_csv_path = '/kaggle/input/weather-dataset-rattle-package/weatherAUS.csv'
data = pd.read_csv(weather_csv_path)

In [None]:
# Preview the first five rows of the raw frame.
data.head(n=5)

# Data exploration

In [None]:
# Report the frame's (rows, columns) dimensions.
print("The size of the dataframe is:", data.shape)

# Keep describe() as the final expression of the cell: a notebook only renders
# the value of the last expression, so calling it before print() discarded the
# summary statistics.
data.describe()

In [None]:
# Count the missing entries in each column.
data_missing = data.isnull().sum()

# Express each count as a percentage of the row count, rounded to two decimals.
perc_missing = data_missing.div(len(data)).mul(100).round(2)
perc_missing

In [None]:
# Remove the columns judged to have too large a share of missing values.
# drop() returns a new frame, so `data` itself is left untouched.
sparse_columns = ['Evaporation', 'Sunshine', 'Cloud9am', 'Cloud3pm']
data_dropped = data.drop(columns=sparse_columns)

In [None]:
# Fill missing numeric readings with the mean of their column.
# numeric_only=True restricts the mean to numeric columns: the frame also holds
# text columns (Date, Location, wind directions, RainToday/RainTomorrow), for
# which a mean is undefined and which make DataFrame.mean() raise on
# pandas >= 2.0.
data_dropped = data_dropped.fillna(data_dropped.mean(numeric_only=True))

# Text columns cannot be mean-imputed, so drop the rows that still contain NaN.
data_dropped = data_dropped.dropna()

# Sanity check: per-column count of remaining missing values.
data_dropped.isna().sum()

In [None]:
# Compare the cleaned frame with the raw one to report what was removed.
rows_removed = data.shape[0] - data_dropped.shape[0]
columns_removed = data.shape[1] - data_dropped.shape[1]

print("The new size of the dataframe is:", data_dropped.shape)
print("We deleted", rows_removed, "rows and", columns_removed, "columns.")

# Show the dtype of every remaining column.
data_dropped.dtypes

In [None]:
# Parse the Date column into datetime values.
data_dropped['Date'] = pd.to_datetime(data_dropped['Date'])

# Derive Year and Month columns from the parsed dates.
parsed_dates = data_dropped['Date'].dt
data_dropped['Year'] = parsed_dates.year
data_dropped['Month'] = parsed_dates.month

# Use the date as the row index, then preview the result.
data_dropped.set_index('Date', inplace=True)
data_dropped.head()

## Data Visualisation 

In [None]:
# Line plot of Rainfall against the frame's index on a wide, frameless axes.
rain_fig = plt.figure(figsize=(20, 5))
rain_ax = rain_fig.gca()
data_dropped['Rainfall'].plot(ax=rain_ax)
rain_ax.set_frame_on(False)
rain_ax.set_title('Rainfall throughout the Years', fontweight="bold", fontsize=15)

In [None]:
# Bar plot of Rainfall grouped by Month on a frameless axes.
month_fig, month_ax = plt.subplots(figsize=(8, 5))
sns.barplot(data=data_dropped, x='Month', y='Rainfall', color='skyblue', ax=month_ax)
month_ax.set_frame_on(False)
month_ax.set_title('Rainfall throughout Months', fontweight="bold", fontsize=15)

In [None]:
# Mean Rainfall per Location, ordered from wettest to driest.
mean_rainfall = data_dropped.groupby('Location')[['Rainfall']].mean()
data_loc = mean_rainfall.sort_values(by='Rainfall', ascending=False)

# Draw the ranking as a bar chart on a frameless axes.
location_ax = data_loc.plot(kind='bar', figsize=(20, 5))
location_ax.set_frame_on(False)
location_ax.set_title('Average Rainfall by Location', fontsize=15, fontweight="bold")
plt.show()

In [None]:
# Two side-by-side scatter plots: Rainfall against MinTemp and against MaxTemp.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(15, 5))
ax1, ax2 = axes
sns.despine(left=True)

# Left panel: daily minimum temperature.
sns.scatterplot(data=data_dropped, x='MinTemp', y='Rainfall', ax=ax1)
ax1.set_title("Lowest Temperature and Amount of Rainfall", fontweight="bold")

# Right panel: daily maximum temperature, drawn in a contrasting colour.
sns.scatterplot(data=data_dropped, x='MaxTemp', y='Rainfall', color="tomato", ax=ax2)
ax2.set_title("Highest Temperature and Amount of Rainfall", fontweight="bold")

## Data Preparation 

In [None]:
# Bind a second name, data_ML, to the cleaned frame for the modelling steps.
# This is an alias, not a copy: both names refer to the same DataFrame object.
data_ML = data_dropped

In [None]:
# Remove the columns that are not used as model inputs.
unused_columns = ['Location', 'Year']
data_ML = data_ML.drop(columns=unused_columns)

In [None]:
# Encode the Yes/No flags in RainTomorrow and RainToday as 1/0.
# The dict passed to replace() maps each column to the value to look for in it.
yes_no_columns = ('RainTomorrow', 'RainToday')
for text_label, numeric_code in (('Yes', 1), ('No', 0)):
    data_ML = data_ML.replace(dict.fromkeys(yes_no_columns, text_label), numeric_code)

In [None]:
# Convert the wind-direction text columns to integer codes.
# The encoder is re-fitted for each column, so codes are assigned per column.
le = LabelEncoder()
for wind_column in ('WindGustDir', 'WindDir9am', 'WindDir3pm'):
    data_ML[wind_column] = le.fit_transform(data_ML[wind_column])
data_ML.head()

In [None]:
# Pairwise correlation between all columns of data_ML.
data_ML_corr = data_ML.corr()

# Boolean mask marking pairs whose absolute correlation exceeds 0.5.
# Built from the matrix computed above rather than recomputing it.
condition = data_ML_corr.abs() > 0.5

# To inspect only the strongly correlated entries, evaluate: data_ML_corr[condition]

In [None]:
# Heat map of the column-to-column correlation matrix on a large square figure.
plt.figure(figsize=(20, 20))
correlation_for_plot = data_ML.corr()
sns.heatmap(correlation_for_plot, cmap='Wistia')

In [None]:
# Remove the columns judged to be highly correlated with other features.
correlated_columns = ['WindGustSpeed', 'Humidity9am']
data_ML = data_ML.drop(columns=correlated_columns)

## Feature Scaling 

In [None]:
# Min-max scaling: rescale every column of data_ML to the [0, 1] range.
# NOTE(review): the scaler is fitted on the full dataset before any train/test
# split, so test rows influence the scaling - confirm this is acceptable.

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler(feature_range=(0, 1))
scaled_values = scaler.fit_transform(data_ML)

# Rebuild a DataFrame around the scaled values, reusing the original column labels.
data_scaled = pd.DataFrame(scaled_values, columns=data_ML.columns)

data_scaled.head()

## Feature Selection

In [None]:
# Selection of the most important features using SelectKBest
from sklearn.feature_selection import SelectKBest, chi2

# Features are every scaled column except the target; the target is kept as a
# single-column DataFrame.
target_column = 'RainTomorrow'
X = data_scaled.drop(columns=target_column)
y = data_scaled[[target_column]]

# Score each feature against the target with the chi-squared statistic and
# keep the five highest-scoring ones.
selector = SelectKBest(score_func=chi2, k=5)
selector.fit(X, y)

X_new = selector.transform(X)
top_features = X.columns[selector.get_support(indices=True)]
print("The 5 most important features are:", top_features)

In [None]:
# Keep five hand-listed feature columns plus the RainTomorrow target.
# NOTE(review): the list is hard-coded, presumably copied from the SelectKBest
# output - confirm it still matches if the data changes.
selected_columns = [
    'Rainfall', 'Humidity3pm', 'Pressure9am', 'Temp3pm', 'RainToday',
    'RainTomorrow',
]
data_new = data_scaled[selected_columns]

## Checking the Target Variable's Distribution

In [None]:
# Number of rows whose RainTomorrow value is 0.
rain_tomorrow_counts = data_new['RainTomorrow'].value_counts()
rain_tomorrow_counts[0]

In [None]:
# Share of each target class, expressed as a percentage of all rows.
class_counts = data_new['RainTomorrow'].value_counts()
total_rows = len(data_new['RainTomorrow'])
Percentage_No = class_counts[0] / total_rows * 100
Percentage_Yes = class_counts[1] / total_rows * 100

In [None]:
# Print the class counts of the target followed by each class's percentage share.
print(data_new['RainTomorrow'].value_counts())

print("Percentage Occurences of No Rain on the following day:", round(Percentage_No,2),"%")
print("Percentage Occurences of Rain on the following day:", round(Percentage_Yes,2),"%")

# Name the column via x= and pass the frame via data=, as the other seaborn
# calls in this notebook do. On seaborn >= 0.12 the first positional parameter
# of countplot is `data`, so a bare Series is not treated as the x variable.
sns.countplot(x='RainTomorrow', data=data_new)
plt.title('Balance target',fontsize=15, fontweight='bold')
plt.box(False)

In [None]:
# Show the (rows, columns) dimensions of the reduced feature frame.
data_new.shape

## Train Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Separate the target column from the feature columns.
y = data_new['RainTomorrow']
X = data_new.drop(columns=['RainTomorrow'])

# 80/20 train-test split, stratified on the target so both parts keep the same
# class proportions. random_state fixes the shuffle so that the split, and the
# metrics reported on it, are reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [None]:
# Print the column names, non-null counts and dtypes of data_new.
# info is a method: without the call parentheses the cell only shows the bound
# method object instead of the summary.
data_new.info()

# TPOT Training

In [None]:
# Install the TPOT package. The leading "!" is IPython shell-escape syntax, so
# this line is only valid inside a notebook cell, not in a plain Python script.
!pip install tpot

In [None]:
import time
from tpot import TPOTClassifier


# Construct and fit the TPOT classifier, timing the whole search.
start_time = time.time()
tpot = TPOTClassifier(
    generations=5,
    population_size=50,
    scoring='accuracy',
    verbosity=2,
    max_eval_time_mins=2,
    # Checkpoints go under /kaggle/working, the directory this notebook's
    # header describes as preserved output; the previous value was a Google
    # Colab Drive path, while this notebook reads its data from /kaggle/input.
    periodic_checkpoint_folder='/kaggle/working/tpot_checkpoints',
)

tpot.fit(X_train, y_train)
end_time = time.time()

# Report the elapsed wall-clock time and the held-out accuracy of the best pipeline.
print('TPOT classifier finished in %s seconds' % (end_time - start_time))
print('Best pipeline test accuracy: %.3f' % tpot.score(X_test, y_test))

## Result

In [None]:
# Score the fitted TPOT pipeline on the held-out test set and print it to
# three decimal places.
test_accuracy = tpot.score(X_test, y_test)
print('Best pipeline test accuracy: %.3f' % test_accuracy)

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Predict the test set once, then report each error metric in turn.
# NOTE(review): these are regression metrics applied to a binary target; with
# 0/1 labels MAE and MSE presumably both equal the misclassification rate.
y_pred = tpot.predict(X_test)

for metric_label, metric_fn in (
    ('MAE', mean_absolute_error),
    ('MSE', mean_squared_error),
    ('R2', r2_score),
):
    print(metric_label, metric_fn(y_true=y_test, y_pred=y_pred))

In [None]:
import sklearn.metrics
# Predict the test set and compute the fraction of correctly classified rows.
y_predictions = tpot.predict(X_test)
acc = sklearn.metrics.accuracy_score(y_test, y_predictions)
print("Accuracy:", acc)

In [None]:
from sklearn.metrics import classification_report
# Build the text classification report for the test predictions and print it.
report_text = classification_report(y_true=y_test, y_pred=y_predictions)
print(report_text)

In [None]:
# Read the weatherAUS CSV from the Kaggle input directory into a separate frame.
test_csv_path = '/kaggle/input/weather-dataset-rattle-package/weatherAUS.csv'
data_test = pd.read_csv(test_csv_path)

In [None]:
import itertools

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """Draw a confusion matrix as an annotated heat map on the current figure.

    Args:
        cm: 2-D array-like confusion matrix. The axes are labelled
            'True label' (rows) and 'Predicted label' (columns), so rows are
            expected to index the true classes.
        classes: Sequence of tick labels, one per class, used on both axes.
        normalize: When True, every row is divided by its sum before plotting,
            so both the colours and the cell text show per-row fractions.
        title: Title placed above the plot.
        cmap: Matplotlib colormap used for the cell colours.

    Returns:
        None. The plot is drawn through the pyplot state machine; the caller
        decides when to call ``plt.show()``.
    """
    cm = np.asarray(cm)

    # Normalise BEFORE drawing so the image, the colour bar and the text
    # threshold all describe the same values as the cell annotations.
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # A row with no samples would give 0/0 = NaN; divide by 1 instead so
        # that such a row is shown as zeros.
        row_totals[row_totals == 0] = 1
        cm = cm.astype('float') / row_totals

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=0)
    plt.yticks(tick_marks, classes)

    # Cells above half of the maximum get white text, the rest black text.
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        value = cm[i, j]
        # Fractions are shown with two decimals; raw values are shown as-is.
        label = format(value, '.2f') if normalize else str(value)
        plt.text(j, i, label,
                 horizontalalignment="center",
                 color="white" if value > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [None]:
from sklearn.metrics import confusion_matrix,precision_recall_curve,auc,roc_auc_score,roc_curve,recall_score,classification_report 

# Compute the confusion matrix. scikit-learn's signature is
# confusion_matrix(y_true, y_pred): rows index the true class and columns the
# predicted class. Passing the predictions first transposed the matrix, which
# made the "recall" below a precision value and mislabelled the plot axes.
cnf_matrix = confusion_matrix(y_test, y_predictions)
np.set_printoptions(precision=2)

# Recall of class 1 = TP / (FN + TP), read from the row of true class 1.
print("Recall metric in the testing dataset: ", cnf_matrix[1,1]/(cnf_matrix[1,0]+cnf_matrix[1,1]))

# Plot the non-normalized confusion matrix.
class_names = [0,1]
plt.figure()
plot_confusion_matrix(cnf_matrix
                      , classes=class_names
                      , title='Confusion matrix')
plt.show()