## 1. Take one of the supervised learning models you have built recently and apply at least
three dimensionality reduction techniques to it (separately). Be sure to create a short
summary of each technique you use. Indicate how each changed the model
performance. Reference:
https://machinelearningmastery.com/dimensionality-reduction-algorithms-with-python/


In [1]:
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd

In [2]:
#sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (mean_squared_error,r2_score)

In [3]:
#keras
import keras
from keras.layers import Dense
from keras.models import Sequential
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping

In [4]:
#function to calculate root mean squared error
def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

### Load abalone dataset

In [5]:
column_list = ['Sex','Length','Diameter','Height','Whole weight','Shucked weight','Viscera weight','Shell weight','Rings']
abalone_df = pd.read_table("../week_17/abalone.data",sep=",",header=None,names=column_list)
abalone_copy_df = abalone_df.copy()

### Remove outliers

In [6]:
from scipy import stats
z = np.abs(stats.zscore(abalone_df.drop('Sex',axis=1)))  
threshold = 3
abalone_df = abalone_df[(z < 3).all(axis=1)]    
abalone_df

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,M,0.455,0.365,0.095,0.5140,0.2245,0.1010,0.1500,15
1,M,0.350,0.265,0.090,0.2255,0.0995,0.0485,0.0700,7
2,F,0.530,0.420,0.135,0.6770,0.2565,0.1415,0.2100,9
3,M,0.440,0.365,0.125,0.5160,0.2155,0.1140,0.1550,10
4,I,0.330,0.255,0.080,0.2050,0.0895,0.0395,0.0550,7
...,...,...,...,...,...,...,...,...,...
4172,F,0.565,0.450,0.165,0.8870,0.3700,0.2390,0.2490,11
4173,M,0.590,0.440,0.135,0.9660,0.4390,0.2145,0.2605,10
4174,M,0.600,0.475,0.205,1.1760,0.5255,0.2875,0.3080,9
4175,F,0.625,0.485,0.150,1.0945,0.5310,0.2610,0.2960,10


### Label encode Sex column

In [7]:
from sklearn.preprocessing import LabelEncoder

le=LabelEncoder()

abalone_df['Sex_E']=le.fit_transform(abalone_df['Sex'])

# Drop sex variable after encoding:
abalone_df.drop('Sex',axis=1,inplace=True)

### train test split and standardize training data

In [8]:
#Set X as all the features except Rings and Ring label and y as target i,e Rings
X = abalone_df.drop(['Rings'], axis=1).values
#X = abalone_df[['Shell weight','Length','Height','Sex_F','Sex_I']].values 
y = abalone_df['Rings'].values

# Split into training and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state=42) #, stratify=y)

# #Standardize
sc= StandardScaler()
X_train=sc.fit_transform(X_train)
X_test=sc.fit_transform(X_test)

### Run Keras model without dimentionality reduction
- Test rmse :  1.943225067356561 Test MSE: 3.776123662402911
- R2 score :  0.5376443001183294

In [9]:

n_cols = X_train.shape[1]

model = Sequential()
model.add(Dense(units=10, input_dim=n_cols,activation='relu'))
model.add(Dense(10, activation='relu'))
model.add(Dense(10, activation='relu'))
model.add(Dense(units=1,activation='linear'))
model.compile(optimizer='adam', loss='mean_squared_error',  metrics=['mae','mse'])
 
early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=10)
 
history=model.fit(X_train,y_train,batch_size=5, validation_split = 0.2, callbacks=[early_stop], epochs=100,verbose=0)
 
# Model summary for number of parameters use in the algorithm
#model.summary()

#Predict testing labels
 
y_pred= model.predict(X_test)

#print("Train rmse : ", rmse(y_train, y_pred), "Train MSE:",mean_squared_error(y_train,y_pred))
print("Test rmse : ", rmse(y_test,y_pred), "Test MSE:",mean_squared_error(y_test, y_pred))
print("R2 score : ",r2_score(y_test,y_pred))

Test rmse :  1.943225067356561 Test MSE: 3.776123662402911
R2 score :  0.5376443001183294


# Dimentionality reduction techniques - applying on abalone dataset and keras model

### 1. SVD - Singular Value Decomposition
SVD is a dimentionality reduction technique used for sparse data(with most zero values). SVD is a technique from Linear Algebra. This is also called as feature projection and the algorithms used are referred to as projection methods. Projection methods seek to reduce the number of dimensions in the feature space while also preserving the most important structure or relationships between the variables observed in the data.

Examples of sparse data approriate for applying SVD are recommender systems ,customer-product purchases, user-movie Ratings

The skearn library has TrancatedSVD library to do SVD. You can set n_components to set number of features to be transformed into.

These the values after doing SVD. There is no improvement in the performance.
- Test rmse :  2.157681384112996 Test MSE: 4.655588955347773
- R2 score :  0.4299609117034472

In [10]:
from sklearn.decomposition import TruncatedSVD

svd = TruncatedSVD(n_components=3)

X_train_svd=svd.fit_transform(X_train)
X_test_svd=svd.fit_transform(X_test)

n_cols = X_train_svd.shape[1]
input_shape = (n_cols,)

model = Sequential()
model.add(Dense(units=10, input_dim=n_cols,activation='relu'))
model.add(Dense(10, activation='relu'))
model.add(Dense(10, activation='relu'))
model.add(Dense(units=1,activation='linear'))
model.compile(optimizer='adam', loss='mean_squared_error',  metrics=['mae','mse'])
 
early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=10)
 
history=model.fit(X_train_svd,y_train,batch_size=5, validation_split = 0.2, callbacks=[early_stop], epochs=100,verbose=0)
 
# Model summary for number of parameters use in the algorithm
#model.summary()
 
y_pred= model.predict(X_test_svd)
#print("Train rmse : ", rmse(y_train, y_pred), "Train MSE:",mean_squared_error(y_train,y_pred))
print("Test rmse : ", rmse(y_test,y_pred), "Test MSE:",mean_squared_error(y_test, y_pred))
print("R2 score : ",r2_score(y_test,y_pred))

Test rmse :  2.157681384112996 Test MSE: 4.655588955347773
R2 score :  0.4299609117034472


### 2. PCA - Principal Component Analysis
PCA is a a popular technique for dimentionality reduction for dense data(with few zero values). using n_components to decide number of features to be transformed into.The new transformed features are called Principal components.

A principal component is a linear combination of the original variables.Principal components are extracted in such a way that the first principal component explains maximum variance in the dataset.Second principal component tries to explain the remaining variance in the dataset and is uncorrelated to the first principal component.Third principal component tries to explain the variance which is not explained by the first two principal components and so on.

These are the values after doing PCA. There is no improvement in RMSE value.
- Test rmse :  2.1711020596125405 Test MSE: 4.7136841532538165
- R2 score :  0.4228476261521684

In [11]:
from sklearn.decomposition import PCA

pca = PCA(n_components=3)

X_train_pca=pca.fit_transform(X_train)
X_test_pca=pca.fit_transform(X_test)

n_cols = X_train_svd.shape[1]
input_shape = (n_cols,)

model = Sequential()
model.add(Dense(units=10, input_dim=n_cols,activation='relu'))
model.add(Dense(10, activation='relu'))
model.add(Dense(10, activation='relu'))
model.add(Dense(units=1,activation='linear'))
model.compile(optimizer='adam', loss='mean_squared_error',  metrics=['mae','mse'])
 
early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=10)
 
history=model.fit(X_train_pca,y_train,batch_size=5, validation_split = 0.2, callbacks=[early_stop], epochs=100,verbose=0)
 
# Model summary for number of parameters use in the algorithm
#model.summary()
 
y_pred= model.predict(X_test_pca)
#print("Train rmse : ", rmse(y_train, y_pred), "Train MSE:",mean_squared_error(y_train,y_pred))
print("Test rmse : ", rmse(y_test,y_pred), "Test MSE:",mean_squared_error(y_test, y_pred))
print("R2 score : ",r2_score(y_test,y_pred))

Test rmse :  2.1711020596125405 Test MSE: 4.7136841532538165
R2 score :  0.4228476261521684


### 3.Isomap Embedding
Isomap, creates an embedding of the dataset to preserver the relationship of the dataset.

Isomap tries to get lower dimension representation of data where points maintain geodesic distance between them like original representation. Scikit-learn provides an implementation of Isomap as a part of the manifold module.We can set n_components accepts integer value specifying number of features transformed dataset will have.default=2

Wthout providing n_neighbors parameters, rmse was quiet high.
These are the values after doing Isomap. There is no improvement in the RMSE value, in fact it has increased. 
- Test rmse :  3.406520363360927 Test MSE: 11.604380985992663
- R2 score :  -0.4208622842234946

In [12]:
from sklearn.manifold import Isomap

imap = Isomap(n_neighbors=15,n_components=3)

X_train_imap=imap.fit_transform(X_train)
X_test_imap=imap.fit_transform(X_test)

n_cols = X_train_imap.shape[1]
input_shape = (n_cols,)

model = Sequential()
model.add(Dense(units=10, input_dim=n_cols,activation='relu'))
model.add(Dense(10, activation='relu'))
model.add(Dense(10, activation='relu'))
model.add(Dense(units=1,activation='linear'))
model.compile(optimizer='adam', loss='mean_squared_error',  metrics=['mae','mse'])
 
early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=10)
 
history=model.fit(X_train_imap,y_train,batch_size=5, validation_split = 0.2, callbacks=[early_stop], epochs=100,verbose=0)
 
# Model summary for number of parameters use in the algorithm
#model.summary()
 
y_pred= model.predict(X_test_imap)
#print("Train rmse : ", rmse(y_train, y_pred), "Train MSE:",mean_squared_error(y_train,y_pred))
print("Test rmse : ", rmse(y_test,y_pred), "Test MSE:",mean_squared_error(y_test, y_pred))
print("R2 score : ",r2_score(y_test,y_pred))

Test rmse :  3.406520363360927 Test MSE: 11.604380985992663
R2 score :  -0.4208622842234946


### 4.Locally Linear Embedding
Locally Linear Embedding, or LLE, creates an embedding of the dataset and attempts to preserve the relationships between neighborhoods in the dataset.

LLE tries to find the lower-dimensional projection of data while maintaining distances within local neighborhood points. It can be viewed as applying a series of local PCAs that are then compared globally to find the most suited non-linear embedding.  Scikit-learn provides an estimator named LocallyLinearEmbedding as a part of the manifold module for performing Locally Linear Embedding on data. 

Wthout providing n_neighbors parameters, rmse was quiet high.
These are the values after doing Locally Linear Embedding. There is no improvement in the RMSE value, in fact it has increased. 
- Test rmse :  3.409951787183732 Test MSE: 11.627771190917528
- R2 score :  -0.42372622500914403

In [13]:
from sklearn.manifold import LocallyLinearEmbedding
lle = LocallyLinearEmbedding(n_neighbors=15,n_components=3)

X_train_lle=lle.fit_transform(X_train)
X_test_lle=lle.fit_transform(X_test)

n_cols = X_train_lle.shape[1]
input_shape = (n_cols,)

model = Sequential()
model.add(Dense(units=10, input_dim=n_cols,activation='relu'))
model.add(Dense(10, activation='relu'))
model.add(Dense(10, activation='relu'))
model.add(Dense(units=1,activation='linear'))
model.compile(optimizer='adam', loss='mean_squared_error',  metrics=['mae','mse'])
 
early_stop = keras.callbacks.EarlyStopping(monitor='val_loss', patience=10)
 
history=model.fit(X_train_lle,y_train,batch_size=5, validation_split = 0.2, callbacks=[early_stop], epochs=100,verbose=0)
 
# Model summary for number of parameters use in the algorithm
#model.summary()
 
y_pred= model.predict(X_test_lle)
#print("Train rmse : ", rmse(y_train, y_pred), "Train MSE:",mean_squared_error(y_train,y_pred))
print("Test rmse : ", rmse(y_test,y_pred), "Test MSE:",mean_squared_error(y_test, y_pred))
print("R2 score : ",r2_score(y_test,y_pred))


Test rmse :  3.409951787183732 Test MSE: 11.627771190917528
R2 score :  -0.42372622500914403


## 2. Write a function that will indicate if an inputted IPv4 address is accurate or not.
IP addresses are valid if they have 4 values between 0 and 255 (inclusive), punctuated
by periods.
- Input 1:
    - 2.33.245.5
- Output 1:
    - True

- Input 2:
    - 12.345.67.89
- Output 2:
    - False

In [14]:
def isIPv4Valid(ipv4):
    #Split the string using period
    ipv4_split = ipv4.split(".")
    
    #This takes care of if string has any other character than period then length of list after split will not be 4
    if len(ipv4_split) == 4:
        true_count = 0
        for i in ipv4_split:
            #Check if value is between 0 and 255 (inclusive)
            if int(i)>=0 and int(i)<=255:
                #increase true count by 1
                true_count = true_count + 1
                #print(i,true_count)
        #If all the 4 values are between 0 and 255 (inclusive)
        if true_count==4:
            print("True")
        else:
            print("False")
    else:
        print("False")
    


In [15]:
isIPv4Valid("2.33.245.5")
isIPv4Valid("12.345.67.89")
isIPv4Valid("212.45.67.189")
isIPv4Valid("1.45.567.0")
isIPv4Valid("0.0.0.0")
isIPv4Valid("456.768.656.434")
isIPv4Valid("1-35.78.254")
isIPv4Valid("0-21_134*42")

True
False
True
False
True
False
False
False
