# Import Packages

In [None]:

import ? as np # ? numpy
import pandas as pd

import ? as plt # ? matplotlib.pyplot
import seaborn as sns
%matplotlib inline

#library for PCA (new library imported for PCA)
from ? import PCA # ? sklearn.decomposition
#library for Standardization
from sklearn import ? # ? preprocessing

# Dataset: Breakfast Cereals
Data were collected on the nutritional information and consumer rating of 77 breakfast cereals. The data are available [here](http://lib.stat.cmu.edu/datasets/1993.expo/). The consumer rating is a rating of cereal “healthiness” for consumer information (not a rating by consumers). For each cereal, the data include 13 numerical variables, and we are interested in reducing this dimension. For each cereal, the information is based on a bowl of cereal rather than a serving size, because most people simply fill a cereal bowl (resulting in constant volume, but not weight).

## Description of the Variables in the Breakfast Cereal Dataset

**Variable: Description**
* mfr: Manufacturer of cereal (American Home Food Products, General Mills, Kellogg, etc.)
* type: Cold or hot
* calories: Calories per serving
* protein: Grams of protein
* fat: Grams of fat
* sodium: Milligrams of sodium
* fiber: Grams of dietary fiber
* carbo: Grams of complex carbohydrates
* sugars: Grams of sugars
* potass: Milligrams of potassium
* vitamins: Vitamins and minerals: 0, 25, or 100, indicating the typical percentage of the FDA recommendation
* shelf: Display shelf (1, 2, or 3, counting from the floor)
* weight: Weight in ounces of one serving
* cups: Number of cups in one serving
* rating: Rating of the cereal calculated by consumer reports



In [None]:
# Load the cereal data into a DataFrame. Exercise: replace `?` with the path to the CSV file.
cereals_df = pd.read_csv(?) # ? 'cereal.csv'
# Preview the first rows; pass a row count to head() to see more (the hint suggests 10).
cereals_df.head()   # ? 10

## Summary Statistics

In [None]:
# Summary statistics of the data. Exercise: replace `?` with the method call.
cereals_df.? # ? describe() can be used to check skewness (compare the mean with the median)

In [None]:
# What about categorical variables?
# Exercise: replace `?` with the name of a categorical column.

cereals_df[?].describe() # ? 'mfr'

In [None]:
# how to check difference in mean, median etc by each category?

cereals_df.groupby('mfr').median() # to distinguish between different category

In [None]:
# Is there any similar catgories based on mean, median?

cereals_df[cereals_df['mfr'].isin([?])].groupby('mfr').mean() # ? 'R','G'

### Visualizing the underlying probability density function

In [None]:
# What does the underlying probability density function (pdf) look like for each continuous variable?

sns.?(cereals_df, x="calories", kind="kde") # ? displot (the `kind="kde"` argument belongs to displot, not the older distplot)
# sns.displot(cereals_df, x="protein", kind="kde")

# Is there skewness?
sns.displot(cereals_df[?],kind='kde') # ? 'fiber', mean larger than median (right skewed)

# Reference: https://seaborn.pydata.org/tutorial/distributions.html

## Correlation Analysis

In [None]:
# Before computing correlations, check two things:
#   1. the columns are numeric, and
#   2. there are no missing values.
# Exercise: replace `?` with the method that summarises the columns of the DataFrame.

cereals_df.? # ? info()

In [None]:
# Prepare the data for the correlation analysis.

# Keep every row but skip the first three columns (they are of type object),
# then discard any row that has a missing record.
temp_df = cereals_df[cereals_df.columns[3:]].dropna(axis='index')
temp_df.head()


In [None]:
# Generate the correlation matrix of the cleaned numeric data.
# Exercise: replace `?` with the DataFrame method that computes pairwise correlations.

cormat = temp_df.? # ? corr()
round(cormat,2) # round to two decimal places

In [None]:
# How do we draw the correlation matrix as a heatmap?
# Exercise: replace `?` with the seaborn call; the trailing `;` is kept from the original cell.

sns.?; # ? heatmap(cormat)

# PCA

## Covariance for two variables

In [None]:
# Take only two variables to illustrate covariance and PCA.
# Exercise: replace `?` with the two column names; later cells use X['calories'] and X['rating'].

X = cereals_df[[?]] # ? 'calories','rating'
X.head()

In [None]:
# Covariance matrix of the two variables (calories first, rating second).

np.cov(X[['calories', 'rating']].to_numpy(), rowvar=False)

# What is the difference between correlation and covariance?

# Correlation coefficient, for comparison:

# np.corrcoef(X['calories'], X['rating'])



In [None]:
# Share of the total variance carried by 'calories' alone, computed from the
# covariance matrix instead of hand-copied numbers (roughly 379.63 / 577).
cov_matrix = np.cov(X['calories'], X['rating'])
cov_matrix[0, 0] / (cov_matrix[0, 0] + cov_matrix[1, 1]) # about 66% of variance is explained if we keep 'Calories' alone.

In [None]:
# Scatterplot of rating against calories (s = 70 is passed through as the marker size).
# Exercise: replace `?` with the seaborn plotting function.

sns.?(data = X, x = 'calories', y = 'rating', s = 70) # ? scatterplot

## PCA with only 2 components to start with

In [None]:
# PCA with two components on the two-variable data X.
# Exercise: replace each `?` using the hint in the trailing comment.

pcs  = ?(n_components = 2) # ? PCA, Step 1. load the algorithm

pcs.?(X) # ? fit, Step 2. fit the data

In [None]:
# How much variance is explained by each principal component?

pcs.explained_variance_  # The amount of variance explained by each of the selected components.

In [None]:
# And as a ratio of the total?
# Exercise: replace `?` with the attribute name.

pcs.? # ? explained_variance_ratio_ Percentage of variance explained by each of the selected components.

In [None]:
# Share of the total variance explained by the first principal component, read
# from the fitted model instead of pasted values
# (originally 498.02447768 / (498.02447768 + 78.93273879)).
pcs.explained_variance_[0] / (pcs.explained_variance_[0] + pcs.explained_variance_[1]) # explained variance by first principal component

In [None]:
# Summary table of the fitted two-component PCA.

# explained_variance_: the amount of variance explained by each selected component.
# explained_variance_ratio_: the share of variance explained by each selected component.

# Build the table directly in its final orientation: one row per statistic,
# one column per principal component.
pcsSummary = pd.DataFrame(
    [
        np.sqrt(pcs.explained_variance_),
        pcs.explained_variance_ratio_,
        np.cumsum(pcs.explained_variance_ratio_),
    ],
    index=['Std. Deviation', 'Proportion of Variance', 'Cumulative Proportion'],
    columns=['PC1', 'PC2'],
)

pcsSummary.round(2)

In [None]:
pcs.?  # ? components_   Weights of the principal components, used to transform the raw data (changing the reference axes)

## Further Analysis of PCA based on only two components

In [None]:
# Weights of the two principal components.

# components_ holds the principal axes in feature space, i.e. the directions of
# maximum variance in the data. Transposing puts the original variables on the
# rows and the components on the columns.

pcsComponents_df = pd.DataFrame(
    data=pcs.components_.T,
    index=['calories', 'rating'],
    columns=['PC1', 'PC2'],
)
print(pcsComponents_df)
print("...")


In [None]:
X.head() # what the raw data looks like (first rows)

In [None]:
X.mean() # column means, used below to centre the first observation by hand

In [None]:
# Values of PC1 for first observation

# pc1_weight_cal * (X - Xbar) + pc1_weight_rat * (Y - ybar)
print((-0.847053*(70-106.88)) + (0.531508*(68.40-42.66)))

In [None]:
# Values of PC2 for first observation

# pc2_weight_cal * (X - Xbar) + pc2_weight_rat * (Y - ybar)
print((0.531508*(70-106.88)) + (0.847053*(68.40-42.666)))

In [None]:
# transform: project every observation onto the fitted principal components.

projected = pcs.transform(cereals_df[['calories', 'rating']])
scores = pd.DataFrame(projected, columns=['PC1', 'PC2'])
scores.head() # transformed values of the first five observations

In [None]:
# Check that the total variance is preserved by the transformation.

# np.var defaults to ddof=0 (population variance). ddof=1 gives the sample
# variance, the convention used by np.cov and by PCA's explained_variance_,
# so the totals printed here match the figures shown in the earlier cells.
print(np.var(X['calories'], ddof=1) + np.var(X['rating'], ddof=1)) # Total variance for raw data (calories, rating)
print(np.var(scores['PC1'], ddof=1) + np.var(scores['PC2'], ddof=1)) # Total variance for transformed data (PC1, PC2)

# Total variance is the same for the raw data (calories, rating) and the
# transformed data (PC1, PC2), i.e. about 576.96 (= 498.02 + 78.93).

In [None]:
# Note that PC1 explains 86% of the variance, compared to 66% for 'Calories' alone!

var_calories = np.var(X['calories'])
var_rating = np.var(X['rating'])
print(var_calories / (var_calories + var_rating))  # share of total variance explained by calories alone

var_pc1 = np.var(scores['PC1'])
var_pc2 = np.var(scores['PC2'])
print(var_pc1 / (var_pc1 + var_pc2))  # share of total variance explained by PC1 alone

## PCA for Full Dataset 
(PCA is applicable only to numerical data, and only to the independent variables, i.e. X)

In [None]:
# PCA on the full numeric dataset, followed by a summary table of the components.
# Exercise: replace each `?` using the hint in the trailing comment.
pcs  = ? # ? PCA()
# The first three columns are skipped and rows with missing values are dropped before fitting.
pcs.?(cereals_df.iloc[:,3:].dropna(axis=0)) # ? fit, dropped first 3 columns, since PCA cannot be applied on categorical data
# One entry per component: standard deviation, share of variance, and running total of that share.
pcsSummary_df = pd.DataFrame({'Std. Deviation' : np.sqrt(pcs.explained_variance_),
                           'Proportion of Variance': pcs.explained_variance_ratio_,
                           'Cumulative proportion': np.cumsum(pcs.explained_variance_ratio_)})
# Transpose so that statistics are on the rows and components on the columns.
pcsSummary_df = pcsSummary_df.transpose()

pcsSummary_df.columns = ['PC{}'.format(i) for i in range(1, len(pcsSummary_df.columns)+1)] # Name the columns PC1, PC2, ... by number of components
print(pcsSummary_df.round(2))


In [None]:
# Scree plot: share of variance explained by each component.

PC_values = np.arange(1, pcs.n_components_ + 1)  # component numbers 1..n
scree_ax = plt.gca()
scree_ax.plot(PC_values, pcs.explained_variance_ratio_, 'o-', linewidth=2)
scree_ax.set_title('Scree Plot')
scree_ax.set_xlabel('Principal Component')
scree_ax.set_ylabel('Variance Explained')
plt.show()

In [None]:
# Cumulative explained-variance ratio against the number of components.

cumulative_ratio = pcs.explained_variance_ratio_.cumsum()
plt.plot(PC_values, cumulative_ratio, 'o-', linewidth=2)
plt.xlabel('number of components')
plt.ylabel('cumulative explained variance');

In [None]:
# Now let's look at the weights of the principal components.
# Exercise: narrow the column slice in the print call to show only the first five PCs.

pcsComponents_df = pd.DataFrame(
    pcs.components_.T,
    index=cereals_df.columns[3:],
    columns=pcsSummary_df.columns,
)
print(pcsComponents_df.iloc[:, :]) # ? 0:5

## May Further Explore
### What are the important factors contributing to PC1, PC2...Does that make sense?
### Normalisation (or Standardisation)
### Linear Regression
### Visualization of PCA