In [None]:
import copy
import seaborn
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
import matplotlib.transforms as mtransforms
from math import pi
from mpl_toolkits.mplot3d import Axes3D
from sklearn.decomposition import PCA
import warnings

import os
# Print the full path of every file found below the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_paths:
    print(input_path)

## Data Analysis of "Rain in Australia" dataset

The data was automatically collected between 01-12-2008 and 25-06-2017 from 49 weather stations by Australian Bureau of Meteorology. Overall, there are 23 features and 145,460 observations. The features include 17 continuous variables and 6 discrete variables. 

In [None]:
# 1. Load the raw observations and normalise the label name and date dtype
target = 'RainTomorrow'
data = pd.read_csv("../input/weather-dataset-rattle-package/weatherAUS.csv")
# The label column is referred to as 'Target' in the rest of the notebook.
data = data.rename(columns={target: 'Target'})
# Convert the date strings to datetimes so the .dt accessor can be used later.
data["Date"] = data["Date"].astype('datetime64[ns]')
peek = data.head(5)
display(peek)

The dataset is imbalanced with regards to the target variable “Rain Tomorrow”: there are 31,877 observations of ‘Yes’ and 110,316 observations of ‘No’. The class distribution is skewed towards “No rain”. Assuming that the data was collected consistently and without measurement errors, the skew signifies that rainy days are approximately 3 times less frequent than days without rain in Australia. 

In [None]:
# Observations per target class; rows without a label are not counted by groupby.
class_counts = data.groupby('Target').size().to_frame("Observations")
display(class_counts)

###  1. Data quality and integrity

By grouping the data by each weather station (“Location” variable), we can compare the timeframes when the data was collected. As it can be seen, most stations started collecting the data from 2009. Canberra has collected data for the longest period – 3,524 days, while Katherine, Uluru and Nhil collected the data for the shortest period – 1,577 days, starting only in Q1 of 2013.

In [None]:
def timeline(data,start,end,y,color="grey",ascending=False):
    """Draw one horizontal bar per row, spanning ``data[start]`` to ``data[end]``.

    Rows are ordered by their ``"count"`` column and each bar is annotated
    with the length of its period in days.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns named by ``start``, ``end`` and ``y`` as
        well as a ``"count"`` column used for sorting.
    start, end : str
        Names of the datetime columns delimiting each bar.
    y : str
        Name of the column providing the bar labels.
    color : str
        Colour of the day-count annotations.
    ascending : bool
        Sort direction applied to the ``"count"`` column.
    """
    data = data.sort_values("count", axis=0, ascending=ascending)
    fig, ax = plt.subplots(figsize=(10, data.shape[0] * .25))
    durations = data[end] - data[start]

    ax.hlines(data[y], data[start], data[end], linewidth=9, color='lightgrey')
    # Walk the rows positionally: a label-based lookup returns a Series (and
    # breaks the '%d' formatting) as soon as the index holds duplicate labels.
    for row, (begin, duration) in enumerate(zip(data[start], durations)):
        # The label is placed 200 days to the left of the bar's start date.
        ax.text(mdates.date2num(begin) - 200, row - .3, '%d' % duration.days,
                color=color, fontweight="bold")
    for side in ("right", "top", "left", "bottom"):
        ax.spines[side].set_visible(False)
    ax.xaxis.grid(True, linestyle=":", color='black')
    ax.set_axisbelow(False)
    
# Per-station summary: collection period, number of observations and the
# missing-value count of the station's most incomplete feature.
data_by_station = data.groupby('Location').size()
station_stats = []
# A single groupby pass replaces re-filtering the whole frame once per station.
for station_name, station_data in data.groupby('Location'):
    datapoints_count = len(station_data)
    # Largest number of missing values found in any single column.
    datapoints_missing = station_data.isnull().sum().max()
    missing_to_all_ratio = round(datapoints_missing / datapoints_count * 100)
    station_stats.append([
        station_name,
        # First / last row of the station; assumes the file lists each
        # station's observations in chronological order.
        station_data["Date"].iloc[0],
        station_data["Date"].iloc[-1],
        datapoints_count,
        datapoints_missing,
        missing_to_all_ratio,
    ])
station_stats = pd.DataFrame(station_stats, columns=["name","start","end","count","missing","missing_prct"])

timeline(station_stats,'start','end','name')

Then, we can investigate the amount of missing information in each weather station. The plot shows, for every station, the share of observations in which its most incomplete feature is missing. As can be seen, approximately half of the weather stations have at least one feature that is missing in 100% of their observations. Melbourne Airport has the smallest share of missing values - only 1%.

In [None]:
def plotBarH(data,x,y,title,color="grey",ascending=False):
    """Draw a horizontal bar chart of ``data[x]`` labelled by ``data[y]``.

    The rows are sorted by ``x`` and every bar is annotated with its value,
    formatted as ``'%.2f %%'``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the columns named by ``x`` and ``y``.
    x : str
        Name of the column with the bar lengths.
    y : str
        Name of the column with the bar labels.
    title : str
        Plot title.
    color : str
        Colour of the bars and of the value annotations.
    ascending : bool
        Sort direction applied to ``x``.
    """
    data = data.sort_values(x, axis=0, ascending=ascending)
    fig, ax = plt.subplots(figsize=(10, data.shape[0] * .25))

    # Positional access: a label-based lookup returns a Series instead of a
    # scalar whenever the frame's index contains duplicate labels.
    values = data[x].tolist()
    # After sorting, the largest value is last (ascending) or first (descending).
    largest = values[-1] if ascending else values[0]
    padding = 0.01 * largest

    for row, value in enumerate(values):
        ax.text(value + padding, row - .3, '%.2f %%' % value,
                color=color, fontweight="bold", fontsize=10)

    ax.barh(data[y], data[x], color=color)
    for side in ("right", "top", "left", "bottom"):
        ax.spines[side].set_visible(False)
    ax.set_title(title, size=12, weight="bold")

plotBarH(station_stats,'missing_prct','name',"Missing Values by Station")

The plot below shows the proportion of missing values in each feature across all weather stations. ‘Sunshine’ and ‘Evaporation’ features have the largest proportion of missing values - 48% and 43% respectively.

In [None]:
# Missing values per feature: absolute count and share of all observations (%).
missing_counts = data.isnull().sum()
missing_data = pd.DataFrame({
    # Feature name
    "variable": missing_counts.index,
    # Sum of missing values
    "missing": missing_counts.values,
    # Percentage of missing values
    "missing_prct": (missing_counts / data.shape[0] * 100).round(2).values,
})

plotBarH(missing_data,'missing_prct','variable', "Missing Values by Feature")

### 2. Correlation and feature selection

To select the most important features, correlation matrix can be analysed. As it was expected, the same measurements taken on different times of the day were correlated between each other. The features can be broken into the following correlated groups:
* Humidity9am and Humidity3pm
* MinTemp, MaxTemp, Temp9am and Temp3pm
* Pressure 9am and Pressure 3pm,
* Cloud 9am and Cloud 3pm,
* RainToday and Rainfall,
* WindSpeed 9am and WindGustSpeed

As the features within each group are correlated, it is reasonable to assume that each additional feature within the group does not provide new information for the purpose of discriminating the target class.

In [None]:
warnings.filterwarnings('ignore')
def factorize(data):
    """Replace every text column of ``data`` with integer category codes.

    Codes are assigned in order of first appearance (see ``pandas.factorize``).
    Missing values stay missing (NaN) instead of receiving the ``-1`` sentinel
    that ``pandas.factorize`` uses, so that a later ``dropna`` still removes
    them and they are not mistaken for a real category.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to encode. It is modified in place; columns that are neither
        object nor string typed are left untouched.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, for convenient chaining.
    """
    for column in data.columns:
        dtype = data[column].dtype
        is_text = pd.api.types.is_object_dtype(dtype) or pd.api.types.is_string_dtype(dtype)
        if not is_text:
            continue
        codes, _ = pd.factorize(data[column])
        # Assign by column name so the column is replaced (and gets a numeric
        # dtype) rather than being written into the existing object array.
        data[column] = pd.Series(codes, index=data.index).where(codes != -1)
    return data

# Numerically encoded copy of the data set, used for correlation analysis.
categorical_columns = ["WindGustDir","WindDir9am","WindDir3pm","RainToday","Target"]
fac_data = data.copy()
# pd.to_numeric guarantees numeric dtypes for the encoded columns regardless
# of how the codes were stored by the helper.
fac_data[categorical_columns] = factorize(data[categorical_columns]).apply(pd.to_numeric)

fig,ax = plt.subplots(figsize=(15,8))
# Restrict the matrix to numeric columns explicitly: 'Location' (text) and
# 'Date' cannot be correlated, and pandas >= 2.0 raises instead of silently
# dropping them.
correlations = fac_data.select_dtypes(include="number").corr().round(2)
seaborn.heatmap(ax=ax,data=correlations,annot=True,cmap=seaborn.diverging_palette(220,20),linewidth=2)
plt.title("Correlation Matrix", size=12, weight="bold")
plt.show()

The features that had the strongest positive correlation with the target variable are: “Cloud3pm”, “Humidity3pm”, “RainToday” and “WindGustSpeed”; and the features with strongest negative correlation with the target are: “Sunshine”, “Pressure9am” and “Temp3pm”. In order to verify whether those are the most important features, I have used Decision Tree, Random Forest and Adaptive Boosting algorithms to get importance scoring. Before running the above algorithms, I have removed all observations with missing values and performed random undersampling to balance the dataset. For tree-based algorithms the scaling of variables is not required, because the trees are not sensitive to range differences between features. The classification algorithms were performed on 12,427 data points.

Decision Tree is a greedy algorithm: for each subsequent branch it selects the feature that contributes to the highest reduction of entropy.

In [None]:
# Removing observations with missing values
fac_data = fac_data.dropna()
# pd.factorize marks a missing category with -1 rather than NaN, so dropna()
# alone would keep rows with an unknown wind direction, RainToday or Target
# (the latter would show up as a third class). Remove those rows as well.
encoded_columns = ["WindGustDir","WindDir9am","WindDir3pm","RainToday","Target"]
fac_data = fac_data[(fac_data[encoded_columns] != -1).all(axis=1)]

# Feature importance
def importance(X,y,fun):
    """Fit an estimator and tabulate its per-feature importance scores.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; its column names become the feature labels.
    y : array-like
        Target values passed to ``fit``.
    fun : estimator instance
        Object providing ``fit(X, y)`` and, once fitted, a
        ``feature_importances_`` attribute. It is fitted in place.

    Returns
    -------
    pandas.DataFrame
        Columns ``"Feature Names"`` and ``"Importance"`` (float), one row per
        column of ``X`` in the original column order.
    """
    model = fun
    model.fit(X, y)
    # Build the frame column-wise so that the scores keep a float dtype;
    # transposing a mixed list of names and numbers yields object columns.
    return pd.DataFrame({
        "Feature Names": list(X.columns),
        "Importance": np.asarray(model.feature_importances_, dtype=float),
    })

X = fac_data.drop(['Location','Date','Target'], axis=1)
y = fac_data['Target']

# Random undersampling of the majority class. The sampler and the tree are
# seeded (as the sampler in svm() is) so that the ranking is reproducible.
undersample = RandomUnderSampler(sampling_strategy='majority', random_state=0)
X, y = undersample.fit_resample(X,y)

topFeatures = importance(X,y,DecisionTreeClassifier(random_state=0))
plotBarH(topFeatures,'Importance','Feature Names',"Feature Importance Score (Decision Tree)",ascending=True)

The Random Forest algorithm uses an ensemble of uncorrelated Decision Trees. Each of those trees is built using the square root of p randomly selected features (where p is the total number of features). The scoring shows the extent to which the selected features reduce impurity across the trees in the ensemble. The advantage of Random Forest is that by selecting a small sample of predictors in each iteration, the weaker predictors may be allowed at the higher levels of a tree, thus reducing the variance.

In [None]:
# Seeded so that the importance ranking is reproducible between runs.
topFeatures = importance(X,y,RandomForestClassifier(random_state=0))
plotBarH(topFeatures,'Importance','Feature Names',"Feature Importance Score (Random Forest)",ascending=True)

Adaptive Boosting iteratively assigns the weights to each stump that splits on each feature. The weights are calculated by evaluating proportion of misclassified samples in each stump and assigning higher penalty on misclassified samples in the next iteration. The resulting weights of stumps are then used to give scoring for each feature.

In [None]:
# Seeded so that the importance ranking is reproducible between runs.
topFeatures = importance(X,y,AdaBoostClassifier(random_state=0))
plotBarH(topFeatures,'Importance','Feature Names',"Feature Importance Score (Adaptive Boosting)",ascending=True)

### 3. Key dimensions

In order to achieve a comprehensible and efficient visualisation, while showing the distribution of all datapoints, a representative sample can be used instead of the main dataset. With the least amount of missing values, the sample collected from Melbourne Airport is a good candidate for being representative of the whole ‘Rain in Australia’ dataset. To confirm this choice, the distribution of the data in each feature should be compared between the Melbourne Airport subset and the full ‘Rain in Australia’ dataset. From the plot below, we can see that apart from three features representing the direction of wind (“WindGustDir”, “WindDir9am”, “WindDir3pm”), the distributions of both datasets are similar. 

In [None]:
def distPlot(data1,data2):
    """Overlay the kernel density estimates of two frames, one panel per feature.

    Every column of ``data1`` except ``"Date"``, ``"Location"`` and
    ``"RainToday"`` gets its own panel showing ``data1`` in red and ``data2``
    in gray. The panels are laid out five per row; the number of rows follows
    from the number of features, and unused cells of the last row are hidden.

    Parameters
    ----------
    data1, data2 : pandas.DataFrame
        Frames to compare. ``data2`` must contain every plotted column of
        ``data1``.
    """
    ncols = 5
    features = data1.columns.drop(["Date","Location","RainToday"])
    # Ceiling division; at least one row so that plt.subplots stays valid.
    nrows = max(1, -(-len(features) // ncols))
    # 2.4 inches per row keeps the panel size of the former 12x12, 5x5 layout.
    fig, axes = plt.subplots(nrows, ncols, figsize=(12, 2.4 * nrows), squeeze=False)
    panels = axes.ravel()
    for panel, feature in zip(panels, features):
        seaborn.kdeplot(data1[feature], fill=False, ax=panel, color='red')
        seaborn.kdeplot(data2[feature], fill=False, ax=panel, color='gray')
        panel.set_title(feature)
        panel.set_ylabel("")
        panel.set_xlabel("")
    # Hide the grid cells that received no feature.
    for panel in panels[len(features):]:
        panel.set_visible(False)
    fig.tight_layout(pad=2.0)

fac_melbourne = fac_data.loc[fac_data["Location"] == "MelbourneAirport"]

distPlot(fac_melbourne,fac_data)

#### *A. Sunshine*

Given that the dataset is a time series, it is important to include the time dimension in the feature analysis. The swarmplot distributions below show the number of hours of sunshine per day on a yearly basis. The colours represent the target class, with orange indicating ‘No rain tomorrow’ and blue – ‘Rain tomorrow’. As can be seen, the feature space can be split at a threshold of approximately 7 hours: if there are fewer than 7 hours of sunshine on a given day, there is a higher likelihood of rain the next day. The dimension has a range of 0 to 14.5 hours with a mean of 7.6 hours and a median of 8.4 hours. The standard deviation is 3.78 h. To get meaningful insight into changes throughout the year, looking at the same datapoints on a monthly basis is more useful. It can be observed that in winter time there is a considerably higher number of sunny days, while the rainy days are mainly concentrated in the summer period.

In [None]:
def timeSeriesPrep(data):
    """Add ``Year``, ``Month`` and ``Day`` columns derived from ``data["Date"]``.

    ``Month`` is stored as an ordered categorical (January .. December) so
    that plots and sorts follow the calendar instead of the alphabet.
    The frame is modified in place and also returned.
    """
    calendar_order = [
        'January', 'February', 'March', 'April', 'May', 'June',
        'July', 'August', 'September', 'October', 'November', 'December',
    ]
    date_parts = data["Date"].dt
    data["Year"] = date_parts.year
    data["Month"] = pd.Categorical(
        date_parts.month_name(), categories=calendar_order, ordered=True
    )
    data["Day"] = date_parts.day
    return data

def swarmPlot(data,x,y,hue,size):
    """Draw a seaborn swarm plot of ``y`` against ``x``, coloured by ``hue``.

    ``size`` is the marker size handed to seaborn. The legend is removed and
    the name of the ``y`` column is used as the title.
    """
    figure, axis = plt.subplots(figsize=(20, 5))
    for side in ("right", "top", "left", "bottom"):
        axis.spines[side].set_visible(False)
    axis.yaxis.grid(True, linestyle=":", color='black')
    # Only the three columns involved are handed to seaborn.
    subset = data[[x, y, hue]]
    axis = seaborn.swarmplot(data=subset, x=x, y=y, hue=hue, size=size)
    axis.get_legend().remove()
    plt.title(y, size=12, weight="bold")

# Take an explicit copy: timeSeriesPrep() adds columns in place, and writing
# into a filtered selection of `data` triggers pandas' SettingWithCopyWarning.
melbourne = data.loc[data["Location"] == "MelbourneAirport"].copy()
melbourne = timeSeriesPrep(melbourne)
swarmPlot(melbourne,"Year","Sunshine","Target",4.5)

In [None]:
swarmPlot(melbourne,"Month","Sunshine","Target",3.5)

#### *B. Humidity at 3pm*

From the swarmplot of “Humidity3pm”, it can be seen that the classes can be separated at a threshold of approximately 70%: higher humidity increases the likelihood of rain the next day. It can also be seen that the range of humidity is narrower in summer than in winter, with summer being more humid. The overall range is from 0 to 100% with a mean of 51.5% and a median of 52%. The standard deviation is 21%.

In [None]:
swarmPlot(melbourne,"Month","Humidity3pm","Target",3)

#### *C. Pressure at 3pm*

The swarmplot of “Pressure3pm” shows that the pressure is higher in the summer period than in the winter period. The threshold that separates the two classes can be seen at approximately 1010 hPa in winter and 1020 hPa in summer. The dimension has a range of 977 to 1039 hPa with a mean and median of 1015 hPa. The standard deviation is 7 hPa.

In [None]:
swarmPlot(melbourne,"Month","Pressure3pm","Target",3.5)

#### *D. Temperature at 9am*

Feature ‘Temp9am’ scored significantly lower in tree-based algorithms in the previous section. As expected, from the swarmplot we cannot see a clear target class separation. It can also be observed that temperature swings are significantly wider in the winter time than in the summer time. The dimension has a range of -7.2 to 40.2 degrees Celsius with a mean of 17 degrees and a median of 16.7 degrees Celsius. The hottest temperature at 9am was recorded in Pearce on 12th January 2014 and the coldest temperature was recorded in Mount Ginini on 13th July 2016. The standard deviation is 6.5 degrees Celsius.

In [None]:
swarmPlot(melbourne,"Month","Temp9am","Target",3)

#### *E. Wind Gust Direction*

‘WindGustDir’ has high entropy across all weather stations and it scored poorly with tree-based methods. It can be observed that the frequency of wind gust directions is evenly distributed in Australia, with the North-East direction being slightly less frequent and the West direction being the dominant one with 7.5% of all occurrences. Given the high entropy, it is no surprise that the probability of rain on the following day is similar in all wind gust directions, with a marginally higher probability of rain after Northwest wind gusts. As has been noted in the distribution analysis in the previous section, WindGustDir in Melbourne is less equally distributed than in the whole of Australia. It can be seen that North is the most frequent wind gust direction in Melbourne and the East direction is almost absent. Accordingly, the information gain from the East direction is larger and the probability of rain the day after wind gusts from the East is much higher.

In [None]:
def polar_data(data,variable):
    """Summarise observations per compass direction for a closed polar plot.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the column named by ``variable`` (compass directions
        such as ``'N'`` or ``'WSW'``) and a ``"Target"`` column whose value
        ``'Yes'`` marks rain on the following day.
    variable : str
        Name of the direction column to group by.

    Returns
    -------
    tuple
        ``(directions, frequency, probability, angles)`` where

        * ``directions`` is the list of the 16 compass points, starting at
          East and proceeding counter-clockwise,
        * ``frequency`` holds the number of observations per direction,
        * ``probability`` holds the share of observations with
          ``Target == 'Yes'`` per direction, in percent rounded to one
          decimal,
        * ``angles`` holds the matching angles in radians.

        ``frequency``, ``probability`` and ``angles`` repeat their first
        element at the end so that a line drawn through them closes.
        A direction without any observation is reported with a frequency and
        probability of 0.
    """
    directions = ['E', 'ENE', 'NE', 'NNE','N', 'NNW', 'NW', 'WNW', 'W', 'WSW', 'SW', 'SSW','S','SSE','SE','ESE']

    # Group a boolean "rain tomorrow" flag by direction. Rows with a missing
    # Target count as "no rain", i.e. they stay in the denominator.
    rain_by_direction = data["Target"].eq("Yes").groupby(data[variable])

    # Number of observations per direction
    counts = rain_by_direction.size().reindex(directions, fill_value=0)
    # Share of rainy next days per direction
    shares = rain_by_direction.mean().reindex(directions, fill_value=0.0)

    # Plain scalars, with the first point repeated to close the polygon
    frequency = tuple(int(count) for count in counts)
    probability = tuple(round(float(share) * 100, 1) for share in shares)
    frequency += frequency[:1]
    probability += probability[:1]

    # Polar plot angles
    n_directions = len(directions)
    angles = [step / float(n_directions) * 2 * pi for step in range(n_directions)]
    angles += angles[:1]

    return directions, frequency, probability, angles

def polar_plot(polar_d,title,color,prob=False,size=7):
    """Draw a closed radar-style polar plot of per-direction values.

    Parameters
    ----------
    polar_d : tuple
        ``(directions, frequency, probability, angles)`` as returned by
        ``polar_data``; the last three repeat their first element at the end.
    title : str
        Plot title.
    color : str
        Colour of the line, the filled area and the value labels.
    prob : bool
        Plot ``probability`` (labels formatted as ``'%d %%'``) when true,
        otherwise ``frequency`` (labels formatted as ``'%d'``).
    size : float
        Width and height of the square figure, in inches.
    """
    directions, frequency, probability, angles = polar_d

    # Flatten to plain floats so scalars and one-element containers both work.
    values = np.asarray(probability if prob else frequency, dtype=float).ravel()
    label_format = '%d %%' if prob else '%d'

    fig = plt.figure(figsize=(size,size))
    ax = fig.add_subplot(111, polar=True)
    ax.set_title(title, size=12, weight="bold")
    ax.set_yticklabels([])
    # 'o-' already selects a solid line; repeating it via linestyle= only
    # makes matplotlib warn about a redundant definition.
    ax.plot(angles, values, 'o-', color=color, linewidth=1)
    ax.fill(angles, values, color, alpha=0.1)
    ax.set_xticks(angles[:-1])
    ax.set_xticklabels(directions, color='black', size=10)

    # Skip the closing point: it duplicates the first one and would print
    # the same label twice on top of itself.
    for angle, value in zip(angles[:-1], values[:-1]):
        ax.text(angle, value, label_format % int(value), color=color)

polar_plot(polar_data(data,"WindGustDir"),"Australia (frequency)","brown", prob=False)

In [None]:
polar_plot(polar_data(data,"WindGustDir"),"Australia (probability)","darkblue", prob=True)

In [None]:
polar_plot(polar_data(melbourne,"WindGustDir"),"Melbourne (frequency)","brown", prob=False)

In [None]:
polar_plot(polar_data(melbourne,"WindGustDir"),"Melbourne (probability)","darkblue", prob=True)

#### *F. Wind Gust Speed*

‘WindGustSpeed’ is the 4th most important predictor according to Decision Tree and AdaBoost algorithms. From the boxplot, it can be concluded that the higher ‘WindGustSpeed’ approximately above 55 km/h increases likelihood of rain the next day. The dimension has a range of 6 km/h to 135 km/h with mean of 40 km/h and median of 39 km/h. The standard deviation is 13.6 km/h.

In [None]:
def box(data,x,y,hue):
    """Draw a seaborn box plot of ``y`` grouped by ``x`` and split by ``hue``.

    The frame spines are hidden, a dotted horizontal grid is shown and the
    legend is removed.
    """
    figure, axis = plt.subplots(figsize=(20, 5))
    for side in ("right", "top", "left", "bottom"):
        axis.spines[side].set_visible(False)
    axis.yaxis.grid(True, linestyle=":", color='black')
    # Only the three columns involved are handed to seaborn.
    subset = data[[x, y, hue]]
    axis = seaborn.boxplot(data=subset, x=x, y=y, hue=hue)
    axis.get_legend().remove()

box(melbourne,"Month","WindGustSpeed","Target")

#### *G. Cloud cover at 3pm*

‘Cloud3pm’ scored higher in the Random Forest classifier than in the Decision Tree algorithm. From the boxplot, it can be seen that the likelihood of rain the next day increases when the median cloud coverage is above approximately 7 oktas. The dimension has a range of 0 to 9 oktas with a mean of 4.5 and a median of 5 oktas. The standard deviation is 2.72 oktas.

In [None]:
box(melbourne,"Month","Cloud3pm","Target")

### 4. Feature selection

From previous sections, the ‘Sunshine’, ‘Humidity3pm’ and ‘Pressure3pm’ were ranked as the strongest predictors for ‘Rain Tomorrow’ target class.

In [None]:
# Pairwise scatter plots of the three strongest predictors, coloured by class.
fig, axes = plt.subplots(1,3,figsize=(24,8))

class_palette = {"No":'tab:orange',"Yes":'tab:blue'}
# (x feature, y feature) for the left, middle and right panel
feature_pairs = [
    ('Humidity3pm', 'Pressure3pm'),
    ('Pressure3pm', 'Sunshine'),
    ('Humidity3pm', 'Sunshine'),
]

for panel, (x_feature, y_feature) in zip(axes, feature_pairs):
    seaborn.scatterplot(
        x=data[x_feature],
        y=data[y_feature],
        hue=data['Target'],
        palette=class_palette,
        ax=panel,
        alpha=0.1,
        size=1,
        legend=False
    )

From the scatterplots of the 3 selected features, we can observe a good separation of classes. The colours represent the target class, with orange indicating ‘No Rain Tomorrow’ and blue - ‘Rain Tomorrow’. The first and third scatterplots show a clear separation of target classes at a threshold of approximately 65% humidity at 3pm: humidity above that threshold significantly increases the chances of rain the next day. Similarly, pressure below 1005 hPa signals a higher probability of rain the next day, although the sparsity of datapoints below the 1005 hPa threshold indicates that pressure below that level occurs rarely in Australia. Finally, sunshine for more than 7 hours during the day reduces the probability of rain the next day. A negative correlation between ‘Sunshine’ and ‘Humidity’ can also be noticed, which may be explained by the fact that sunshine may reduce the level of humidity.

To investigate the effect that feature selection has on predictive accuracy, I have tested the accuracy of SVM prediction on 3 datasets containing 8 features, 3 features and 2 features. Before training the SVM classifier I have standardised the data to standard deviation 1 and mean 0. Then, I have balanced the data by Random Undersampling and held out 25% of the data as a test set for measuring accuracy. I have used cross-validation grid search to select the best hyperparameters. A radial kernel with C = 100 and gamma = 0.001 worked best for the 8- and 3-dimensional datasets, while a linear kernel with C = 10 was selected for the 2-dimensional feature space. As expected, the accuracy decreased with a lower number of features. Interestingly, reducing dimensions from 8 features to 3 features lowered the accuracy by only 1.08%, while reducing dimensions from 3 features to 2 features cut the accuracy by 3.09%. This implies strong predictive qualities of the three selected features ‘Sunshine’, ‘Humidity3pm’ and ‘Pressure3pm’ compared with the other features.

In [None]:
def svm(X,y):
    """Train an RBF support vector classifier and return its hold-out accuracy.

    Steps: balance the classes by random undersampling of the majority
    class, hold out a test split (``train_test_split`` default size),
    standardise the features, fit the classifier through ``GridSearchCV``
    and score the predictions on the held-out split.

    Parameters
    ----------
    X : array-like or pandas.DataFrame
        Feature matrix.
    y : array-like or pandas.Series
        Class labels, one per row of ``X``.

    Returns
    -------
    float
        Accuracy on the held-out split, in percent rounded to two decimals.
    """
    # The sampler only selects rows (driven by y and the seed), so it can run
    # before scaling without changing which observations are kept.
    undersample = RandomUnderSampler(sampling_strategy='majority', random_state=0)
    X, y = undersample.fit_resample(X,y)

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    # Fit the scaler on the training split only: estimating mean and variance
    # on all rows would leak information about the test data into training.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    tuned_parameters = [
        {'kernel': ['rbf'], 'gamma': [1e-3],'C': [100]},
        #{'kernel': ['linear'], 'C': [10]}
        ]

    classifier = GridSearchCV(SVC(), tuned_parameters, scoring='precision_macro')
    classifier.fit(X_train, y_train)

    predictions = classifier.predict(X_test)

    return round(accuracy_score(y_test,predictions)*100,2)
    
# Only complete observations are used for the SVM experiments.
clean_data = data.dropna(inplace=False)

# Feature subsets to compare, keyed by the row label shown in the score table.
feature_sets = {
    "8 Features": ['Humidity3pm','Sunshine','Pressure3pm','WindGustSpeed','Cloud3pm','Rainfall','MinTemp','Evaporation'],
    "3 Features": ['Humidity3pm','Sunshine','Pressure3pm'],
    "2 Features": ['Humidity3pm','Pressure3pm'],
}

accuracy, title = [],[]

for label, columns in feature_sets.items():
    title.append(label)
    X = clean_data[columns]
    y = clean_data['Target']
    accuracy.append(svm(X,y))

score = pd.DataFrame(np.array([accuracy]).T, columns=["Accuracy (%)"], index=title)
display(score)

### 5. Eigenvector Decomposition 

In order to reduce the number of data dimensions while preserving predictive qualities, we can look into eigenvector decomposition. In PCA, the first Principal Component is fit in a way that maximizes the variance of the projected data. Therefore, to avoid larger ranges distorting the fit, it is important to scale the features before the decomposition. After verifying that all selected features have a Gaussian distribution, we can standardise the data to a mean of 0 and a standard deviation of 1. The linear combination matrix below shows the decomposed Principal Components and the respective weight of each feature.

In [None]:
pca_features = ['Humidity3pm','Sunshine','Pressure3pm','WindGustSpeed','Cloud3pm','Rainfall','MinTemp','Evaporation']
X = clean_data[pca_features]
y = clean_data['Target']

# Standardise to mean 0 / standard deviation 1. The row labels of clean_data
# are kept so that X stays aligned with y, which still carries them.
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=pca_features, index=clean_data.index)

pca = PCA()
pca.fit(X)
x_new = pca.transform(X)
# One row per feature, one column per principal component
pca_comp_T = pca.components_.T

# Derive the component labels from the fitted model, not from a fixed count.
pc_names = ['PC'+str(i+1) for i in range(pca_comp_T.shape[1])]
PC_weights = pd.DataFrame(pca_comp_T, columns=pc_names, index=list(X.columns))
display(PC_weights.iloc[:,:3])

Biplot (or loading plot) shows the influence each feature has on first two Principal Components. It can be seen that ‘Humidity3pm’, ‘Cloud3pm’ and ‘Sunshine’ have significant weight on PC1, while ‘Pressure3pm’, ‘WindGustSpeed’ and ‘MinTemp’ have higher weight on PC2. The weights describe the influence each feature has on each principal component. These can be also examined by looking at linear combination matrix (above). From linear combination matrix follows, for example, that ‘Sunshine’ has the highest absolute weight on PC1 (0.54) and WindGustSpeed has the lowest absolute weight on PC1 (0.03). It can be also observed that ‘Humidity3pm’, ‘Cloud3pm’ and ‘Rainfall’ have strong positive correlation between each other and ‘Humidity3pm’ and ‘Sunshine’ have strong negative correlation.

In [None]:
def biplot(score, coeff , y, variables,size,colors,alpha,margins,pc1,pc2,extent=4):
    """Scatter the projected observations and overlay the feature loadings.

    Parameters
    ----------
    score : numpy.ndarray
        Projected data; column 0 is drawn on the x axis, column 1 on the y axis.
    coeff : numpy.ndarray
        Loadings with one row per variable and one column per component.
    y : array-like
        Class label of every row of ``score``.
    variables : iterable of str
        Variable names, in the row order of ``coeff``.
    size : float
        Marker size of the scatter points.
    colors : dict or any
        Mapping from class position (0, 1, ... in sorted label order) to a
        colour. Any non-dict value selects the default orange/blue palette,
        which keeps callers that pass a colormap rendering as before.
    alpha : float
        Opacity of the scatter points.
    margins : sequence of two floats
        x and y margins handed to ``Axes.margins``.
    pc1, pc2 : int
        1-based component numbers whose loadings are drawn and which label
        the axes.
    extent : float
        Scale factor applied to the loading arrows.
    """
    variables = list(variables)
    xs = score[:,0] # projection on the first plotted component
    ys = score[:,1] # projection on the second plotted component
    fig, ax = plt.subplots(figsize=(15,15))

    default_palette = {0:'tab:orange',1:'tab:blue'}
    palette = colors if isinstance(colors, dict) else default_palette

    # Compare on a plain array: a boolean Series would bring its own index
    # along when used to mask the ndarrays.
    labels = np.asarray(y)
    for position, label in enumerate(np.unique(labels)):
        members = labels == label
        ax.scatter(xs[members], ys[members], marker='o', s=size, alpha=alpha, c=palette[position])
    ax.margins(x=margins[0], y=margins[1])

    for name, loading in zip(variables, coeff):
        # Each variable has one loading per component; draw it as an arrow
        # from the origin and label the arrow head.
        dx = loading[pc1-1] * extent
        dy = loading[pc2-1] * extent
        ax.arrow(0, 0, dx, dy, color = 'k', alpha = 0.9,linestyle = '-',linewidth = 0.8, overhang=0.5)
        ax.text(dx, dy, name, color = 'k', ha = 'center', va = 'center',fontsize=12,weight='bold')

    ax.set_xlabel("PC{}".format(pc1), size=12,weight='bold')
    ax.set_ylabel("PC{}".format(pc2), size=12,weight='bold')
    for side in ("right", "top"):
        ax.spines[side].set_visible(False)

PC1 = 1
PC2 = 2
biplot(
    #PCAs, Feature vectors, labels, column names
    x_new[:,PC1-1:PC2], pca_comp_T, y, X.columns,
    #size, colours, alpha, margins xy, PCs 
    5,plt.cm.Set1, 0.3,[-0.3,0],PC1,PC2,
    )

From the Screeplot below, it can be seen that the cumulative variance explained by 2 Principal Components is 57.86%. The accuracy of SVM prediction based on the first two Principal Components is 78.53%, which is considerably higher than the predictive accuracy of the 2 selected features (75.76%), but is slightly lower than the predictive accuracy of all 8 features (79.93%). This shows that PC1 and PC2 have absorbed a high level of predictive power from the 8 features, which allows 2 PCs to achieve better performance than the 2 best features.

In [None]:
def screeplot(var_explained):
    """Plot the explained-variance ratio of each principal component.

    ``var_explained`` is a sequence of ratios, one per component, in
    component order. Every point is annotated with its value in percent and
    the x ticks are labelled PC1, PC2, ...
    """
    component_count = len(var_explained)
    tick_labels = ['PC'+str(k+1) for k in range(component_count)]
    # The figure grows one inch wider with every component.
    fig, ax = plt.subplots(figsize=(component_count,5))

    plt.plot(var_explained, linestyle='--', marker='o')
    for side in ("right", "top", "left", "bottom"):
        ax.spines[side].set_visible(False)
    for axis in (ax.xaxis, ax.yaxis):
        axis.grid(True, linestyle=":", color='black')
    plt.xticks(np.arange(0, component_count, step=1), tick_labels)

    # Percentage label slightly above each marker
    for position in range(component_count):
        ratio = var_explained[position]
        ax.text(position, ratio+0.005, '%10.2f%%' % (ratio*100), color='tab:blue')

    plt.xlabel("Principal Components", size=10)
    plt.ylabel("Explained Variance", size=10)
    plt.title("Scree Plot")

screeplot(pca.explained_variance_ratio_)

The table below summarizes the predictive accuracies achieved on held-out datapoints. We can see that by selecting the 3 main features “Humidity3pm”, “Pressure3pm” and “Sunshine”, it is possible to maintain most of the predictive power. It can also be observed that decomposing the data into PCs maintains higher accuracy than selecting the same number of the strongest features: 78.98% compared to 78.36% for 3-dimensional datasets and 78.50% compared to 75.76% for 2-dimensional datasets.

In [None]:
# Projected data as a frame; the column count follows the fitted projection
# instead of a hard-coded number of components.
pca_matrix = pd.DataFrame(x_new, columns=['PC'+str(i+1) for i in range(x_new.shape[1])])

title.append("3 PCs")
accuracy.append(svm(pca_matrix.iloc[:,:3], y))

title.append("2 PCs")
accuracy.append(svm(pca_matrix.iloc[:,:2], y))

score = pd.DataFrame(np.array([accuracy]).T, columns=["Accuracy (%)"], index=title)
display(score)

3D plot showing relationship of ‘Humidity3pm’, ‘Pressure3pm’, ‘Sunshine’ dimensions

In [None]:
def plot3D(features,target,elev,azim,title,size=40,alpha=0.9,edgecolor="white"):
    """Draw a 3D scatter plot of three features, coloured by target class.

    Parameters
    ----------
    features : sequence of three pandas.Series
        x, y and z coordinates; each Series' ``name`` labels its axis.
    target : pandas.Series
        Class labels; ``"No"`` is drawn in orange and ``"Yes"`` in blue.
    elev, azim : float
        Elevation and azimuth of the viewing angle, in degrees.
    title : str
        Plot title.
    size : float
        Marker size.
    alpha : float
        Marker opacity.
    edgecolor : str
        Marker edge colour.
    """
    # Create the 3D axes through the figure: constructing Axes3D(fig) directly
    # no longer attaches the axes to the figure on current matplotlib
    # releases, which leaves the plot empty.
    fig = plt.figure(figsize=(18,18))
    plot = fig.add_subplot(projection='3d')
    plot.view_init(elev=elev, azim=azim)

    colors = {"No":'tab:orange',"Yes":'tab:blue'}
    point_colors = target.replace(colors).tolist()
    plot.scatter(features[0],features[1],features[2],c=point_colors, edgecolor=edgecolor, s=size, alpha=alpha)
    plot.set_xlabel(features[0].name)
    plot.set_ylabel(features[1].name)
    plot.set_zlabel(features[2].name)
    plot.set_title(title, size=12, weight="bold")
    plt.show()

plot3D(
       #features
       (X['Sunshine'],X['Pressure3pm'],X['Humidity3pm']),
       #target, elevation, azimuth
       y,-170,170,
        "Humidity3pm, Pressure3pm, Sunshine"
      )

3D plot showing relationship of ‘PC1’, ‘PC2’, ‘PC3’ dimensions

In [None]:
plot3D(
       #features
       (pca_matrix.iloc[:,0],pca_matrix.iloc[:,1],pca_matrix.iloc[:,2]),
       #target, elevation, azimuth
       y,-170,75,
    "PC1, PC2, PC3"
      )