In [1]:
import numpy as np
import pandas as pd

In [2]:
a = np.random.normal(size = 1000)
b = np.random.normal(10,11,size = 1000)
c = np.random.normal(12,14,size = 1000)
d = np.random.normal(23,34,size = 1000)
e = np.random.normal(4,5,size = 1000)
f = np.random.normal(5,6,size = 1000)

In [3]:
df = pd.DataFrame({'a':a,
                    'b':b,
                    'c':c,
                    'd':d,
                    'e':e,
                    'f':f,
                  })

In [4]:
class CalculatePCA:
    """PCA algorithm using eigen vectors and eigen values"""
    import numpy as np
    import pandas as pd
    
    def __init__(self):
        """Principal Component Analysis is an unsupervised learning algorithm that is used 
        for the dimensionality reduction in machine learning. It is a statistical process
        that converts the observations of correlated features into a set of linearly uncorrelated
        features with the help of orthogonal transformation."""
        pass
      
    def __Standardize(self, df):
        """Standardize the columns, where variance = 1, mean = 0"""
        for col in df.columns:
            column = df[col]
            colMean = column.mean()
            n = df.shape[0]
            stdDev = sum(((column - colMean)**2)/n)
            df[col] = (column - colMean)/stdDev
        return df
        
    def __corrMatrix(self,df):
        """Returns the correlation matrix for the given dataset"""
        df = self.__Standardize(df)
        cols = {}
        n = df.shape[0]
        for x in df.columns:
            rows = []
            xBar = df[x].mean()
            xVal = df[x]
            for y in df.columns:
                yBar = df[y].mean()
                yVal = df[y]
                xSol = xVal - xBar
                ySol = yVal - yBar
                Cov = (sum(xSol.T*ySol))/n
                rows.append(Cov)
            cols[x] = rows
        return pd.DataFrame(cols)
    
    def __returnEigens(self,df):
        """return the eigen values and eigen vectors for the given dataset"""
        df = self.__corrMatrix(df)
        eigenValue, eigenVector = np.linalg.eig(df)
        return pd.DataFrame({"EigenValue":eigenValue,
                             "EigenVector":list(eigenVector)
            
        })
    
    def __VarianceExplained(self, df):
        """This method returns a dataframe with variance explained by the eigen vectors in percentage"""
        df = self.__returnEigens(df)
        Variance = []
        for i in df["EigenValue"]:
            explainedVariance = (i/sum(df["EigenValue"]))*100
            Variance.append(explainedVariance)
        df = pd.DataFrame({"EigenValue":df["EigenValue"].values,
                            "ExplainedVariance": Variance,
                           "EigenVectors": df["EigenVector"]
                            })
        df.sort_values(by = "ExplainedVariance", ascending=False,inplace=True)
        return df
        
    def PCAbyVarianceExplained(self, df, varExplained = 95):
        """we can pass varExplained variable,
        it will return the variance explained by n_components equal to or just next value"""
        df = self.__VarianceExplained(df)
        sumVariance = 0
        flag = 0
        for i in df["ExplainedVariance"].values:
            sumVariance += i
            flag += 1
            if sumVariance >= varExplained:
                break
        df = df[0:flag]
        x = np.array([np.array(val) for val in df["EigenVectors"].values]).T
        print("The variance explained is,",round(df["ExplainedVariance"].sum(),2),
              "\nComponents:", df["ExplainedVariance"].to_list())
        return pd.DataFrame(x)
        
    
    def PCAbyNComponents(self, df, n_components):
        """This is standard PCA like method, it will return the dataframe object with number of components passed"""
        df = self.__VarianceExplained(df)[0:n_components]
        x = np.array([np.array(val) for val in df["EigenVectors"].values]).T
        print("The variance explained is,",round(df["ExplainedVariance"].sum(),2),
              "\nComponents:", df["ExplainedVariance"].to_list())
        return pd.DataFrame(x)
    
    def __str__(self):
        return """THis is a generic algorithm for PCA with some added advantage
        1. It will show the variance explained by the number of components taken,
        2. we can pass the value, how much variance we want to retain and it will give an dataframe object
        of the values"""
    


In [5]:
pca = CalculatePCA()
print(pca)

THis is a generic algorithm for PCA with some added advantage
        1. It will show the variance explained by the number of components taken,
        2. we can pass the value, how much variance we want to retain and it will give an dataframe object
        of the values


In [6]:
pca.PCAbyNComponents(df, 3)

The variance explained is, 98.65 
Components: [92.35010401666058, 3.8547086505722117, 2.4454062088687225]


Unnamed: 0,0,1,2
0,-0.99993,-0.004836,0.000126
1,-0.010772,0.020039,0.003
2,-0.001137,-0.003615,-0.012497
3,-0.004608,0.997125,0.072072
4,-0.000512,-0.010591,0.010773
5,0.000483,-0.072053,0.997258


In [7]:
# comparing the components with PCA from sklearn.decomposition
from sklearn.decomposition import PCA
pca = PCA(n_components=3)
pca.fit_transform(df)
pca.explained_variance_ratio_*100

array([92.35010402,  3.85470865,  2.44540621])

# Testing the same on a dummy data

In [8]:
df = pd.read_csv("https://raw.githubusercontent.com/dsrscientist/dataset1/master/winequality-red.csv")
df.drop('quality', axis = 1, inplace = True)
df.dropna(axis=0, inplace = True)

In [9]:
pca = CalculatePCA()
newData = pca.PCAbyNComponents(df, 3)

The variance explained is, 99.98 
Components: [99.80107167777265, 0.1565141868178203, 0.021007765250490574]


In [10]:
# comparing the components with PCA from sklearn.decomposition
from sklearn.decomposition import PCA
pca = PCA(n_components=3)
pca.fit_transform(df)
vals =[round(i, 4) for i in pca.explained_variance_ratio_*100]
vals

[99.8011, 0.1565, 0.021]