# Performing PCA to check which factor impacts crop yield the most

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [2]:
# Read the dataset from 'Final.csv' (path is relative to the notebook's
# working directory) and preview the first rows.
data = pd.read_csv('Final.csv')
data.head()

Unnamed: 0,S.No,Crop,District,Year,Min Temp,Max. Temp,Total Rainfall,Total Yield
0,1,bengal gram,ADILABAD,2016,12.34,42.66,1130.15,1551
1,4,groundnut,ADILABAD,2016,12.34,42.66,1130.15,2068
2,7,maize,ADILABAD,2016,12.34,42.66,1130.15,5712
3,10,bengal gram,BHADRADRI,2016,16.42,42.65,1145.008696,0
4,13,groundnut,BHADRADRI,2016,16.42,42.65,1145.008696,2186


In [3]:
# Summarise column dtypes and non-null counts to check for missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 279 entries, 0 to 278
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   S.No            279 non-null    int64  
 1   Crop            279 non-null    object 
 2   District        279 non-null    object 
 3   Year            279 non-null    int64  
 4   Min Temp        279 non-null    float64
 5   Max. Temp       279 non-null    float64
 6   Total Rainfall  279 non-null    float64
 7   Total Yield     279 non-null    int64  
dtypes: float64(3), int64(3), object(2)
memory usage: 17.6+ KB


In [4]:
# Standardise the numeric columns with StandardScaler so that columns measured
# in different units contribute on a comparable scale to the PCA below.
col = ['Min Temp', 'Max. Temp', 'Total Rainfall', 'Total Yield']
scaler = StandardScaler()

# Assign the scaled array directly. Wrapping it in a new DataFrame first would
# give it a fresh 0..n-1 index, and pandas would then align on index labels --
# silently producing NaNs if `data` ever has a non-default index.
data[col] = scaler.fit_transform(data[col])

In [5]:
# Keep only the columns that go into the PCA: identifiers, the year and the
# yield column are removed here.
data = data.drop(columns=['S.No', 'District', 'Year', 'Total Yield'])

In [6]:
# Replace the crop names with numeric codes so the column can be passed to PCA.
# NOTE(review): the codes impose an arbitrary order on crop names and this
# column is not standardised like the others -- confirm this is intended,
# since PCA is sensitive to both.
ordinal_encoder = OrdinalEncoder()
crop_codes = ordinal_encoder.fit_transform(data[['Crop']])
data['Crop'] = crop_codes

In [7]:
# Preview the frame after scaling, dropping columns and encoding 'Crop'.
data.head()

Unnamed: 0,Crop,Min Temp,Max. Temp,Total Rainfall
0,0.0,-1.709566,1.222052,1.824323
1,1.0,-1.709566,1.222052,1.824323
2,2.0,-1.709566,1.222052,1.824323
3,0.0,1.074937,1.212156,1.895889
4,1.0,1.074937,1.212156,1.895889


## PCA

In [8]:
def PCA_lib(df):
    """Fit a PCA with as many components as ``df`` has columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric feature table; one principal component is kept per column.

    Returns
    -------
    components : array
        ``df`` projected onto the fitted principal components.
    eigenValues : array
        The fitted model's ``explained_variance_`` attribute.
    eigenVectors : array
        The fitted model's ``components_`` attribute.
    """
    n_features = df.shape[1]
    model = PCA(n_components=n_features).fit(df)
    projected = model.transform(df)
    return projected, model.explained_variance_, model.components_

In [9]:
def evaluate(components, eigenValues, eigenVectors, df, threshold=0.85, top_n=4):
    """Summarise a PCA: which components matter and which features drive them.

    Parameters
    ----------
    components : array of shape (n_samples, n_components)
        Data projected onto the principal components.
    eigenValues : array-like
        Variance explained by each component, in component order.
    eigenVectors : array-like
        One row of feature loadings per component, with columns in the same
        order as ``df.columns``.
    df : pandas.DataFrame
        The frame the PCA was fitted on; only its column labels are used.
    threshold : float, default 0.85
        Fraction of total variance (in ``(0, 1]``) the retained components
        must explain.
    top_n : int, default 4
        Number of highest-|loading| features listed per retained component.

    Returns
    -------
    transform : pandas.DataFrame
        ``components`` with columns labelled ``PC1``, ``PC2``, ...
    impact : pandas.DataFrame
        One row per retained component: the ranked feature names followed by
        the explained and cumulative explained variance.

    Raises
    ------
    ValueError
        If ``threshold`` is outside ``(0, 1]`` or ``eigenValues`` is empty.
    """
    if not 0 < threshold <= 1:
        raise ValueError(f'threshold must be in (0, 1], got {threshold!r}')
    eigen_values = np.asarray(eigenValues, dtype=float)
    if eigen_values.size == 0:
        raise ValueError('eigenValues must contain at least one value')

    pc_labels = ['PC' + str(k + 1) for k in range(components.shape[1])]
    transform = pd.DataFrame(components, columns=pc_labels)
    display(transform)

    # Share of total variance per component, and its running total.
    var = eigen_values / eigen_values.sum()
    cumulative = np.cumsum(var)

    # First component count whose cumulative share reaches the threshold.
    # Fall back to all components if rounding keeps the total just below it,
    # so `no_of_pc` is always defined.
    reached = np.flatnonzero(cumulative >= threshold)
    no_of_pc = int(reached[0]) + 1 if reached.size else len(var)
    print(f'\n{no_of_pc} components explains {threshold * 100:g}% or more variability')

    weightage = {}
    for pc in range(no_of_pc):
        # Feature indices ordered by descending absolute loading.
        ranked = np.argsort(np.abs(eigenVectors[pc]))[::-1][:top_n]
        weightage['PC' + str(pc + 1)] = [df.columns[j] for j in ranked]

    impact = pd.DataFrame.from_dict(weightage, orient='index')
    # NOTE: despite the "(%)" in the labels these are fractions in [0, 1];
    # labels and values are kept as-is so existing consumers keep working.
    impact['Variability (%)'] = var[:no_of_pc]
    impact['Cummulative Variability (%)'] = cumulative[:no_of_pc]
    display(impact)
    return transform, impact

In [10]:
# Run PCA on the prepared frame, keeping one component per column.
components, eigenValues, eigenVectors = PCA_lib(data)

In [11]:
# Display the projected data and, for the components needed to reach the
# variance threshold, the features ranked by absolute loading.
transform, impact = evaluate(components, eigenValues, eigenVectors, data)

Unnamed: 0,PC1,PC2,PC3,PC4
0,2.605120,-0.939483,-1.0,-0.273686
1,2.605120,-0.939483,0.0,-0.273686
2,2.605120,-0.939483,1.0,-0.273686
3,1.726454,1.676043,-1.0,-0.655319
4,1.726454,1.676043,0.0,-0.655319
...,...,...,...,...
274,0.043094,0.155613,0.0,-0.307795
275,0.043094,0.155613,1.0,-0.307795
276,-1.594785,-0.218670,-1.0,0.044601
277,-1.594785,-0.218670,0.0,0.044601



3 components explains 85% or more variability


Unnamed: 0,0,1,2,3,Variability (%),Cummulative Variability (%)
PC1,Total Rainfall,Max. Temp,Min Temp,Crop,0.468911,0.468911
PC2,Min Temp,Max. Temp,Total Rainfall,Crop,0.250953,0.719863
PC3,Crop,Total Rainfall,Max. Temp,Min Temp,0.181818,0.901682
