In [None]:
!pip install cufflinks plotly 
!pip install plotly
!pip install chart_studio 

In [None]:
import pandas as pd    # data processing, CSV file I/O (e.g. pd.read_csv)
import numpy as np     # linear algebra
import seaborn as sb
import copy

import plotly
import plotly.express as px
import chart_studio.plotly as py
import matplotlib.pyplot as plt

In [None]:
import os

import chart_studio

# Chart Studio credentials must not be committed with the notebook, so they are read from the
# environment. Export CHART_STUDIO_USERNAME and CHART_STUDIO_API_KEY before starting the
# kernel; a missing variable raises KeyError here instead of failing later during an upload.
# NOTE(review): the API key that used to be hardcoded in this cell should be treated as leaked
# and regenerated in the Chart Studio account settings.
chart_studio.tools.set_credentials_file(
    username=os.environ["CHART_STUDIO_USERNAME"],
    api_key=os.environ["CHART_STUDIO_API_KEY"],
)

In [None]:
# Load the edible-grains dataset and report its (rows, columns) shape.
csv_path = "../input/Edible_grains.csv"
edgr_data = pd.read_csv(csv_path)
print(edgr_data.shape)

In [None]:
# Preview the first rows (head() returns 5 rows by default).
edgr_data.head()

### Fields in the dataset:

- grain_name: Name of the edible grains.
- Producer: Manufacturer of the grains.
    - A = Agra Food Products
    - G = Green Light Foods
    - K = Kellog’s Food Products.
    - N = Nestle Products.
    - P = Periyar Food Products
    - Q = Quaker Oats
    - R = Ran Impex Inc.
- Variety: 
    - Cool
    - Hot
- Calories_content: calories per serving
- Protein_content: protein content in grains in grams
- Fat_content: fat_content in grams.
- Hydrated_carbon: Carbohydrates present in grains in grams
- Sugars_content: grams of sugar
- Potassium content: milligrams of potassium
- Vit_&_min: Vitamins and minerals – 0, 25, or 100, indicating the typical percentage of FDA recommended.
- Weight: weight in grams of one serving
- Cups: number of cups in one serving
- Rating: A rating of the cereal (Possibly from Consumer Reports)


#### Problem Statements:

- Bagaorys India Limited, a fast-moving consumer goods company, manufactures high-fiber breakfast cereals and health foods. They wish to study the nutrient content of the edible grains so that they can recommend appropriate grains to specific customers based on the ratings. 

- Our goal here is to find which products and producers actually produce healthy and highly rated cereals.

- Average rating and number of products sold by each producer.

- Find which products could be intensely marketed to consumers other than the rating factor so that they can make a profitable income and give a chance to their other products.

- Also, which demographic the company should target to make a profitable income.

- Build a model that might help the company predict the future ratings of a new product based on its nutritional value so that the company can decide whether they should produce it or not.

## Data Cleaning

In [None]:
# Column names, non-null counts and dtypes: a first check for missing values and wrong types.
edgr_data.info()

In [None]:
# Exact column labels as read from the CSV (the code below refers to columns by these names).
edgr_data.columns

In [None]:
# Count how many entries in the numeric columns are negative.

num = edgr_data._get_numeric_data()
(num < 0).to_numpy().sum()

In [None]:
# Replace every negative numeric entry with 0 (all other values are kept as they are).

num = num.mask(num < 0, 0)
# num.head(10)

In [None]:
# Write the cleaned numeric values (negatives replaced with 0) back into the main dataset.

edgr_data.update(num)

# Sanity check: recount the negative entries after the update.
num = edgr_data._get_numeric_data()
(num < 0).to_numpy().sum()

# edgr_data.head(10)

 ## Exploratory Data Analysis:

In [None]:
# First ten rows of the cleaned dataset.
edgr_data.head(10)

### Cold vs Hot Cereals:

   - RTE (ready-to-eat) cold cereal is any cereal (eg, corn flakes, shredded wheat, toasted oat cereal) that is usually consumed dry or with dairy/non-dairy milk. 
   
   - Hot cereal is defined as cereal that must be cooked (on the stovetop or in the microwave oven) before eating, including oatmeal, instant oatmeal, hot wheat, and other grain products. 

In [None]:
# Number of products per Variety code.
edgr_data.Variety.value_counts()

In [None]:
# Boolean mask: True for rows whose Variety code contains 'C'.
cool = edgr_data['Variety'].str.contains('C')

In [None]:
# Number of rows selected by the `cool` mask; every remaining row is counted as hot.
cool1 = len(edgr_data[cool])
hot = len(edgr_data) - cool1

In [None]:
import plotly.graph_objects as go

# Slice sizes come from the counts computed from the data (`hot`, `cool1`) rather than from
# hardcoded numbers, so the chart stays correct if the dataset changes.
labels = ['Hot Cereals', 'Cool Cereals']
values = [hot, cool1]

# pull is given as a fraction of the pie radius; only the first ("Hot") slice is pulled out.
fig = go.Figure(data = [go.Pie(labels = labels, values = values, pull = [0.2, 0])])
fig.update(layout_title_text='Hot v/s Cool Cereals')
fig.show()

- The market is dominated by Cool cereals, as they are easier to prepare and most of them are tastier for a morning breakfast.
- Also, most cereal producers make cool cereals because they target the younger demographic: these cereals are sweet and easy for busy parents to prepare.

### Products from each Producer:

In [None]:
# One row per producer code with the number of products it sells (missing codes are counted too).
product_count = edgr_data['Producer'].value_counts(dropna = False).reset_index()
product_count.columns = ['Producer', 'Number of Products']
# product_count
# edgr_data['Producer'].value_counts()

# Swap the one-letter producer codes for readable names. The result is assigned back to the
# column: calling replace(..., inplace=True) on a selected column is chained assignment and is
# not guaranteed to modify `product_count`.
product_count["Producer"] = product_count["Producer"].replace({"K": "Kellogs", "G": "Green Light Foods",
                                                               "P":"Periyar Foods", "R":"Ran Impex Inc",
                                                               "Q":"Quaker Oats", "N":"Nestle Products",
                                                               "A":"Agra Foods"})

product_count

In [None]:
# Visualization of number of products sold per Producer:

import plotly.graph_objects as go

x = product_count['Producer']
y = product_count['Number of Products']

# Hover text is generated from the data so that each label always matches its own bar;
# hardcoded sentences silently go stale when the counts or the row order change.
hovertext = [
    f"{producer} sells {count} product{'' if count == 1 else 's'}"
    for producer, count in zip(x, y)
]

fig = go.Figure(data = [go.Bar(x = x, y = y, hovertext = hovertext)])

# Customize aspect
fig.update_traces(marker_color='RGB(163,102,210)', marker_line_color='RGB(170,73,195)', marker_line_width=0.5, opacity=0.7)
fig.update_layout(title_text = 'Number of products sold per Producer')
fig.show()

- Kellogs and Green Light Foods produce a wide lineup of products while the other producers have a limited product line.
- This could indicate that Kellogs and Green Light Foods have a wide network of production units that populate the market with their products which can in turn catch the eyes of the daily consumer.

### Average ratings of products from each Producer

In [None]:
# Mean rating of each producer's products, one row per producer code.
producer_rating = edgr_data.groupby('Producer')['ratings'].mean().reset_index()
producer_rating.columns = ['Producer', 'Average Rating']
# producer_rating

# Swap the one-letter producer codes for readable names. The result is assigned back to the
# column: calling replace(..., inplace=True) on a selected column is chained assignment and is
# not guaranteed to modify `producer_rating`.
producer_rating["Producer"] = producer_rating["Producer"].replace({"K": "Kellogs", "G": "Green Light Foods",
                                                                   "P":"Periyar Foods", "R":"Ran Impex Inc",
                                                                   "Q":"Quaker Oats", "N":"Nestle Products",
                                                                   "A":"Agra Foods"})
producer_rating

In [None]:
# Visualization of the average rating per producer (one bar per producer):

fig = px.bar(producer_rating, x = 'Producer', y = 'Average Rating')
fig.show()

- While Nestle produces just 6 products, it has the highest average rating of all the producers.

### Ranking the products based on their ratings

In [None]:
# Rank the products by rating (highest rating gets the best rank), index the frame by that
# rank and order the rows by it.
edgr_data['Rank'] = edgr_data['ratings'].rank(ascending = False)
edgr_data = edgr_data.set_index('Rank').sort_index()
# edgr_data.head(10)
edgr_data.head(10)

In [None]:
# Visualization of product ranking: one horizontal bar per product, best-rated first.

# Order rows by rating and go back to a plain 0..n-1 index (the rank index is discarded).
edgr_data = edgr_data.sort_values('ratings', ascending = False).reset_index(drop = True)

plt.figure(figsize = (20,26))
sb.barplot(data = edgr_data, x = "ratings", y = "grain_name")
plt.title("Product Ratings", fontsize = 20)
plt.xlabel("Ratings", fontsize = 15)
plt.ylabel("Product names", fontsize = 15)
plt.show()

- The highest rated product is All-Bran with Extra Fiber, produced by Kellogs, beating the other products by a huge margin.
- If observed, it can be noticed that there are many low-rated products. The company should take into consideration that, over time, these products could yield a loss. The product rating and profit should be carefully monitored.

### Heat Map

In [None]:
# Compute the correlation matrix.
# Identifier, categorical and serving-size columns are excluded by name (both the spellings
# used originally and the column names used elsewhere in this notebook are listed), and
# select_dtypes then guarantees that only numeric columns reach .corr(), which raises on
# text columns in recent pandas versions.
excluded = ['Rank', 'name', 'grain_name', 'producer', 'Producer', 'Variety', 'weight', 'cups']
corr = (
    edgr_data.loc[:, ~edgr_data.columns.isin(excluded)]
    .select_dtypes(include = 'number')
    .corr()
)

# Generate a mask for the upper triangle (diagonal included) so each pair is drawn once.
# The builtin `bool` is used because the `np.bool` alias has been removed from NumPy.
mask = np.triu(np.ones_like(corr, dtype = bool))

# Set up the matplotlib figure
f, ax = plt.subplots(figsize = (16, 12))

# Generate a custom diverging colormap
cmap = sb.diverging_palette(220, 10, as_cmap = True)

# Draw the heatmap with the mask and correct aspect ratio
sb.heatmap(corr, mask = mask, cmap = cmap, vmax =.3, center = 0,
            square = True, linewidths = .5, cbar_kws = {"shrink": .5}, ax = ax)
ax.set_title('Correlation between nutritional content and ratings')
plt.show()

- In the heat map displayed above we see that the ratings of the products are negatively correlated with the calories and sugar content: products with more calories or sugar tend to have lower ratings.

### To find out how many of these cereals are actually good or bad

- Daily reference intakes for adults are:

    - Energy: 8,400kJ/2,000kcal
    - Total fat: less than 70g
    - Saturates: less than 20g
    - Carbohydrate: at least 260g
    - Total sugars: 90g
    - Protein: 50g
    - Salt: less than 6g
    
reference_link : https://www.nhs.uk/live-well/eat-well/what-are-reference-intakes-on-food-labels/

- Fat:
    - High in fat: more than 17.5g of fat per 100g
    - Low in fat: 3g of fat or less per 100g

- Sugars:
    - High in sugars: more than 22.5g of total sugars per 100g
    - Low in sugars: 5g of total sugars or less per 100g
    
- Salt:
    - High in salt: more than 1.5g of salt per 100g
    - Low in salt: 0.3g of salt or less per 100g

In [None]:
# NOTE(review): this binds a second name to the SAME DataFrame; it does not copy it. The
# 'Healthy' and 'Rank' columns added through `health_edgr_data` in the following cells therefore
# also appear on `edgr_data`, and the later `edgr_data.drop(columns = ['Healthy','Rank'])` relies
# on that. Switching to `.copy()` here requires updating that later cell as well.
health_edgr_data = edgr_data

In [None]:
# Add a 'Healthy' column: 'Y' when a product passes every threshold below, otherwise 'N'.
# NOTE(review): the 5 / 0.3 / 3 cut-offs mirror the "low" limits quoted in the text above, which
# are stated per 100 g; confirm that the dataset columns use the same units and basis.

low_sugar = edgr_data['sugars_content'] <= 5
low_sodium = edgr_data['sodium_content'] <= 0.3
low_fat = edgr_data['fat_content'] <= 3
enough_calories = edgr_data['calories_content'] >= 50

health_edgr_data['Healthy'] = np.where(low_sugar & low_sodium & low_fat & enough_calories, 'Y', 'N')

health_edgr_data['Healthy'].value_counts()

In [None]:
# Rank the products by rating (highest rating gets the best rank), then index the frame by
# that rank and order the rows by it so the table reads best-first.
health_edgr_data['Rank'] = health_edgr_data['ratings'].rank(ascending = False)
health_edgr_data = health_edgr_data.set_index('Rank').sort_index()
# health_edgr_data.head()
health_edgr_data[['grain_name', 'ratings', 'Healthy']]

- An interesting note is that the highest rated cereal, produced by Kellogs, has unhealthy amounts of salts, calories and sugars, so it might not be the healthiest choice.

### Cool and Healthy!

In [None]:
# choosing rows based on 'cool' Variety
cool = health_edgr_data["Variety"] == "C"

# choosing rows based on healthy option
healthy = health_edgr_data["Healthy"] == "Y"

# Filter on both conditions into a NEW frame. Boolean indexing leaves `health_edgr_data`
# untouched and keeps the column dtypes, whereas where(..., inplace = True) through an alias
# blanked out every non-matching row of `health_edgr_data` itself.
# dropna() is kept so that rows with any missing value are still excluded, as before.
cool_healthy = health_edgr_data.loc[cool & healthy].dropna()

# display
cool_healthy[['grain_name', 'Variety', 'ratings', 'Healthy']]

- While the highest rated product is speculated to be an unhealthy option, the company could take the initiative to market other options whose variety is cool and which have high ratings and healthy amounts of nutrients.
- Specifically, 
    - Shredded Wheat 'n'Bran 
    - Shredded Wheat spoon size
    - Shredded Wheat 
    - Puffed Wheat 
    - Puffed Rice	
    produced by Nestle and Quaker meet these requirements.

## Data Normalization and Outlier Treatment

In [None]:
# Displays the dataset without the 'Rank' column. The result is not assigned, so `edgr_data`
# itself still contains 'Rank' after this cell.
edgr_data.drop(columns=['Rank'])

In [None]:
# %matplotlib inline

# edgr_data2 = edgr_data
# notnorm = edgr_data2.drop(columns=['grain_name', 'Producer', 'Variety', 'Healthy'])

# plt.figure(figsize=(16,6))
# plt.ylim(0,0.70)

# sb.kdeplot(notnorm['calories_content'])
# sb.kdeplot(notnorm['protein_content']) 
# sb.kdeplot(notnorm['fat_content'])
# sb.kdeplot(notnorm['sodium_content'])
# sb.kdeplot(notnorm['fiber_content'])
# sb.kdeplot(notnorm['hydrated_carbon'])
# sb.kdeplot(notnorm['sugars_content'])
# sb.kdeplot(notnorm['potassium_content'])
# sb.kdeplot(notnorm['vit_&_min'])

- Need of Normalization:
    - Normalization is generally required when we are dealing with attributes on different scales; otherwise, an important attribute on a smaller scale may lose its influence because other attributes have values on a larger scale.
    - In simple words, when multiple attributes are there but attributes have values on different scales, this may lead to poor data models while performing data mining operations. So they are normalized to bring all the attributes on the same scale.

- In this case we normalise the data because there are different scales of comparison, viz: weight, shelf, cup and the nutritional content.

### Finding and Capping outliers

#### Outlier Detection:

In [None]:
# For all columns find outliers

def outlierCount(data):
    """Count the outliers in every numeric column of a DataFrame.

    A value is an outlier when it lies outside [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR] of its own
    column (both bounds inclusive). Missing values are never counted.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame; non-numeric columns are ignored.

    Returns
    -------
    pandas.DataFrame
        One row per numeric column with the columns 'Variable' and 'Outlier Count'. The frame
        is empty (but has both columns) when `data` has no numeric column.
    """
    records = []
    # .items() replaces .iteritems(), which was removed in pandas 2.0.
    for columnName, columnData in data.select_dtypes(include = 'number').items():
        q1 = columnData.quantile(0.25)
        q3 = columnData.quantile(0.75)
        iqr = q3 - q1
        UB = q3 + 1.5 * iqr
        LB = q1 - 1.5 * iqr
        # count() skips NaN, so missing values are not reported as outliers.
        records.append((columnName, columnData[~columnData.between(LB, UB)].count()))
    # Build the result once from the collected rows; DataFrame.append was removed in pandas 2.0
    # and growing a frame row by row is quadratic.
    return pd.DataFrame(records, columns = ['Variable', 'Outlier Count'])

outlierCount(edgr_data)    # outlier count for each numeric column of the dataset

In [None]:
from plotly.offline import init_notebook_mode, iplot

# (column, legend label, marker colour) for every variable shown as a box plot.
# A single table plus a comprehension replaces eight copy-pasted trace definitions.
box_specs = [
    ('calories_content', 'calories', 'rgb(214, 12, 140)'),
    ('protein_content', 'protein', 'RGB(255,101,80)'),
    ('sodium_content', 'sodium', 'RGB(255,169,80)'),
    ('fiber_content', 'fiber', 'RGB(111,169,80)'),
    ('hydrated_carbon', 'hydrated carbon', 'RGB(111,169,177)'),
    ('potassium_content', 'potassiun', 'RGB(111,59,177)'),
    ('vit_&_min', 'vit and min', 'RGB(215,118,92)'),
    ('weight', 'weight', 'RGB(37,134,221)'),
]

data = [
    go.Box(y = edgr_data[column], name = label, marker = dict(color = colour))
    for column, label, colour in box_specs
]
layout = go.Layout(title = "Visualization of Outliers:", plot_bgcolor= 'rgba(0, 0, 0, 0.20)')

fig = go.Figure(data = data, layout = layout)
# Render locally, like the other plotly figures in this notebook; py.iplot would upload the
# figure to Chart Studio and needs network access and stored credentials.
fig.show()

#### Capping outliers:

- Values above UB (Q3 + 1.5 × IQR) are replaced with the Q3 (75th percentile) value
- Values below LB (Q1 − 1.5 × IQR) are replaced with the Q1 (25th percentile) value

In [None]:
# Numeric columns of the dataset.
# NOTE(review): `edgr_data_num` does not appear to be used by the cells that follow; confirm
# before removing it.
edgr_data_num = edgr_data._get_numeric_data()

In [None]:
# function to cap the outliers:

def outliercap(data):
    """Return the numeric columns of `data` with their outliers capped.

    For each numeric column, values above Q3 + 1.5 * IQR are replaced with Q3 and values below
    Q1 - 1.5 * IQR are replaced with Q1; every other value (including NaN) is kept.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame; non-numeric columns are ignored and `data` itself is never modified.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the numeric columns, with the same index as `data`.
    """
    # Work on an explicit copy so the assignments below can never write through to `data`.
    data_num = data.select_dtypes(include = 'number').copy()
    for column in data_num:
        values = data_num[column]
        q1 = values.quantile(0.25)    #q1
        q3 = values.quantile(0.75)    #q3
        iqr = q3 - q1
        ub = q3 + 1.5 * iqr
        lb = q1 - 1.5 * iqr

        # mask() replaces by condition. Passing a Series of outlier values to Series.replace
        # together with a scalar replacement is rejected by current pandas versions.
        data_num[column] = values.mask(values > ub, q3).mask(values < lb, q1)
    return data_num

# Cap the outliers of the dataset; `capped_outliers` holds the numeric columns returned by outliercap.
capped_outliers = outliercap(edgr_data)
# capped_outliers.describe()

In [None]:
# Visualization of Outliers after Capping the values:

# (column, legend label, marker colour) for every variable shown as a box plot.
# A single table plus a comprehension replaces eight copy-pasted trace definitions.
capped_box_specs = [
    ('calories_content', 'calories', 'rgb(214, 12, 140)'),
    ('protein_content', 'protein', 'RGB(255,101,80)'),
    ('sodium_content', 'sodium', 'RGB(255,169,80)'),
    ('fiber_content', 'fiber', 'RGB(111,169,80)'),
    ('hydrated_carbon', 'hydrated carbon', 'RGB(111,169,177)'),
    ('potassium_content', 'potassiun', 'RGB(111,59,177)'),
    ('vit_&_min', 'vit and min', 'RGB(215,118,92)'),
    ('weight', 'weight', 'RGB(37,134,221)'),
]

data = [
    go.Box(y = capped_outliers[column], name = label, marker = dict(color = colour))
    for column, label, colour in capped_box_specs
]
layout = go.Layout(title = "Visualization of Outliers after capping:", plot_bgcolor= 'rgba(0, 0, 0, 0.20)')

fig = go.Figure(data = data, layout = layout)
# Render locally, like the other plotly figures in this notebook; py.iplot would upload the
# figure to Chart Studio and needs network access and stored credentials.
fig.show()

In [None]:
# (rows, columns) of the capped numeric frame.
capped_outliers.shape

In [None]:
# Update and create a new dataframe with the capped outliers:
# drop the helper columns first, then overwrite the matching columns with the capped values.
# NOTE(review): this expects 'Healthy' and 'Rank' to be columns of `edgr_data`; they are only
# there because `health_edgr_data` was bound to the same DataFrame object when they were added.

edgr_data_capped = edgr_data.drop(columns = ['Healthy','Rank'])
edgr_data_capped.update(capped_outliers)

### Data Normalization

In [None]:
# Numeric columns of the capped dataset; this is the input to normalisation and modelling.
mlm_data = edgr_data_capped._get_numeric_data()

In [None]:
# Normalizer stays imported so that any other cell referring to it keeps working.
from sklearn.preprocessing import MinMaxScaler, Normalizer

# Scale every COLUMN independently to the [0, 1] range so that all attributes share one scale.
# sklearn's Normalizer does something different: it rescales each ROW (sample) to unit norm,
# which does not put the attributes on a common scale and also mixes the target column
# ('ratings') into the scaled value of every feature in that row.
norm = MinMaxScaler()

norm_data = pd.DataFrame(
    norm.fit_transform(mlm_data),
    columns = mlm_data.columns,
    index = mlm_data.index,
)
norm_data.head()

In [None]:
# Summary statistics of the normalised columns.
norm_data.describe()

In [None]:
# Visualization of Normalised data: one density curve per column on shared axes.

fig, ax = plt.subplots(figsize = (20, 6))

# Each column is listed exactly once ('fiber_content' used to be drawn twice).
kde_columns = [
    'calories_content',
    'protein_content',
    'fat_content',
    'sodium_content',
    'fiber_content',
    'hydrated_carbon',
    'sugars_content',
    'potassium_content',
    'vit_&_min',
    'ratings',
]

for column in kde_columns:
    sb.kdeplot(norm_data[column], label = column, ax = ax)

# Without a legend and labels the overlapping curves cannot be told apart.
ax.set_title('Distribution of the normalised variables')
ax.set_xlabel('Normalised value')
ax.set_ylabel('Density')
ax.legend()
plt.show()

## Prediction Model

In [None]:
# Using Linear Regression:

from sklearn.model_selection import train_test_split   #for spliting the dataset
from sklearn.linear_model import LinearRegression   #for linear regression

In [None]:
# Quick look at the modelling input before it is split into features and target.
norm_data.head(3)

In [None]:
# Target: the 'ratings' column; features: every other normalised column.
Y = norm_data['ratings']
X = norm_data.drop(columns = 'ratings')

In [None]:
# Feature matrix and target shapes; both must have the same number of rows.
print(X.shape)
print(Y.shape)

In [None]:
# Split the data (20% held out for testing, fixed seed for a reproducible split) and fit a
# linear regression on the training part.

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 0)

# Model initialisation and fitting in one step (fit returns the fitted estimator)
reg = LinearRegression().fit(X_train, Y_train)

print('Coefficients: ', reg.coef_)
print('Intercept: ', reg.intercept_)

In [None]:
# Predictions of the fitted model for the held-out test features.
Y_pred = reg.predict(X_test)     #for test set

# Y_pred = reg.predict(X_train)       #for training set

In [None]:
# Model Evaluation: RMSE and R2 of the predictions on the held-out test set.

from sklearn.metrics import r2_score, mean_squared_error

mse = mean_squared_error(Y_test, Y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(Y_test, Y_pred)

# R2 is printed as a percentage.
print('RMSE = ', rmse)
print('R2 Score = ', r2*100)

In [None]:
# Residual plot: one point per test sample, with a reference line at zero.

# A residual is conventionally the observed value minus the predicted value, so a positive
# residual means the model under-predicted (the difference used to be taken the other way round).
residuals = Y_test - Y_pred

x = list(range(1, len(residuals) + 1))
plt.scatter(x, residuals, c = 'b')
plt.axhline(0, c = 'r')
plt.title('Residual Plot')
plt.xlabel('Test sample')
plt.ylabel('Residual (actual - predicted)')
plt.show()