In [153]:
 ## An Analysis of the trends in nutrient sentiments in official USDA Dietary guidelines over years (1980-Present) ##
    
                                ##        By: SHIVAM SAITH       ##
                                          

In [154]:
#Importing the necessary packages

#Plotly: Visualizations
#textblob: Sentiment Analysis
#vaderSentiment: Sentiment Analysis
#re:  Regular Expressions
#texttable: For displaying data in a pretty tabular format

from plotly import __version__
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
import sys
import re
import vaderSentiment as vs
import plotly.graph_objs as go
!{sys.executable} -m pip install textblob
!{sys.executable} -m pip install texttable

from texttable import Texttable
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()



In [155]:
# The mean functions used in calculating the average polarity of the sentiment  
# for all the sentiment values obtained for a particular year
def mean(polarityList):
    return float(sum(polarityList)) / max(len(polarityList), 1)

# Takes into consideration the size of the document 
def normalizedMean(polarityList):
    return float(float(sum(polarityList)) / max(len(polarityList), 1))/max(len(polarityList), 1)
    

In [156]:
# This method takes the year value and the name of the nutrient and returns a list with the sentiment polarity values
# for all the statements(sentences) containing one or more than once occurances of that nutrient name in that statement
# Method also takes in the package name to determine which package to use while calculating sentiment polarity scores.
def getNutrientRelevantStatementsPolarityList(year, nutrientName,packageName): 
    with open(year + '.txt', 'r') as myfile:
        text = myfile.read()
       
    # Data Pre-processing
    text1 =text.replace('\n________________\n','\n') #Removing the very common dirty phrase in the converted OCR text
    text2 = re.sub(r'\n([a-z]+)',r' \1', text1) # To remove erroneous new line characters '\n' in between a sentence.
    text3 =re.sub(r'\. ([a-z]+)',r' \1', text2) # To remove the last '.' used for abbreviations so that next steps dont confuse this with a new statement indicating '.'            
    text4 = re.sub('[^ A-Za-z0-9\.\n]+','',text3) # To remove special characters except full stops and new line characters
    list1 = [sentence for sentence in text4.split('\n')] # Splitting on the basis of new line characters
    list2 = []
    for i in list1:
        list2.append(i.split('.'))
    
    finalList = []
    
    for i in list2:
            for j in i:
                finalList.append(j)
    
    nutrientStatementList = []
    
    for i in finalList:
        if nutrientName.lower() in i:
            nutrientStatementList.append(i)
    
    
    statementsPolarityList = []
    
    #Choosing the package to calculate the sentiment polarities with
    for i in nutrientStatementList:
            if packageName.lower() == 'textblob':
                   statementsPolarityList.append(TextBlob(i).sentiment.polarity)
            else:
                   statementsPolarityList.append((analyzer.polarity_scores(i))['compound'])      
            
    return statementsPolarityList; 




In [157]:
#Methods to separate out the positive, negative and neutral polarity values (used in the visualizations later)

def getPositivePolarityList(nutrientPolarityList): 
    positivePolarityList = []
    for i in nutrientPolarityList:
        if i>0:
               positivePolarityList.append(i)
    return positivePolarityList;  

def getNegativePolarityList(nutrientPolarityList): 
    negativePolarityList = []
    for i in nutrientPolarityList:
        if i<0:
               negativePolarityList.append(i)
    return negativePolarityList; 

def getNeutralPolarityList(nutrientPolarityList): 
    neutralPolarityList = []
    for i in nutrientPolarityList:
        if i==0:
               neutralPolarityList.append(i)
    return neutralPolarityList; 

In [158]:
# Method to give a scatter plot for the polarity values of the statements containing the nutrient name in a particular
# year's dietary guideline. The method takes the year  value, the name of the nutrient, the list containing the 
# sentiment polarity values and the name of the package (I have used both TextBlob and Vader Sentiment which give
# slightly different results for polarity values).

def plotPolarityScatterPlot (nutrient,nutrientStatementPolarityList,year,packageName):
    import plotly
    plotly.tools.set_credentials_file(username='shivam.saith', api_key='xCijI2Ae8pZolWXkXMxN')
    import plotly.plotly as py
    import plotly.graph_objs as go

    import numpy as np


    trace0 = go.Scatter(
        x = np.arange(1,1000,2),
        y = getPositivePolarityList(nutrientStatementPolarityList),
        name = 'Positive',
        mode = 'markers',
        marker = dict(
            size = 10,
            color = 'rgba(0, 153, 51, .8)',
            line = dict(
                width = 2,
                color = 'rgb(0, 0, 0)'
            )
        )
    )

    trace1 = go.Scatter(
        x = np.arange(2,1001,2),
        y = getNegativePolarityList(nutrientStatementPolarityList),
        name = 'Negative',
        mode = 'markers',
        marker = dict(
            size = 10,
            color = 'rgb(230, 46, 0)',
            line = dict(
                width = 2,
                color = 'rgb(0, 0, 0)'
            )
        )
    )

    
    data = [trace0, trace1]
    

    layout = dict(title = 'Sentiment(using '+packageName+ ') scatter plot for ' + nutrient + ' ' + '(' + year +')',
                  yaxis = dict(title = 'Polarity',zeroline = False),
                  xaxis = dict(title = 'Statement #(not in original order)',zeroline = False)
                 )

    fig = dict(data=data, layout=layout)
    return fig;

In [159]:
# This method is used to generate a trendline of the mean sentiment polarity values for the given nutrient
# for all the years in the corpus(1980-2015)
#USDA Dietary guidelines are released every 5 years and the latest was released in 2015, next will be in 2020 

def getNutrientPolarityTrendLine(nutrientName,package):
    import numpy as np
    yearsList = np.arange(1980,2020,5)
    polarityMeansAcrossYearsList = []
    polarityHighsAcrossYearsList = []
    polarityLowsAcrossYearsList = []
    for i in yearsList:
        polarityMeansAcrossYearsList.append(mean(getNutrientRelevantStatementsPolarityList(year=str(i),nutrientName = nutrient,packageName=package)))
        polarityHighsAcrossYearsList.append(max(getNutrientRelevantStatementsPolarityList(year=str(i),nutrientName = nutrient,packageName=package)))
        polarityLowsAcrossYearsList.append(min(getNutrientRelevantStatementsPolarityList(year=str(i),nutrientName = nutrient,packageName=package)))
   
    trace_high = go.Scatter(
                 x= yearsList,
               y= polarityHighsAcrossYearsList,
                name = "Highest Polarity",
                line = dict(color = 'rgba(0, 153, 51, .8)'),
                opacity = 0.8)

    trace_low = go.Scatter(
                 x= yearsList,
                y= polarityLowsAcrossYearsList,
                name = "Lowest Polarity",
                line = dict(color = 'rgb(230, 46, 0)'),
                opacity = 0.8)     
        
    trace_mean = go.Scatter(
                 x= yearsList,
                y= polarityMeansAcrossYearsList,
                name = "Mean Polarity",
                line = dict(color = '#7F7F7F'),
                opacity = 0.8)       
        
    data = [trace_high,trace_low]
    layout = dict(
         title = 'Max and min polarity trends(using '+package+') in the sentiment of ' + nutrientName + '(s) ' + '(1980-2015)' ,
         yaxis = dict(title = 'Polarity'),
         xaxis = dict(title = 'Year')
    )

    fig = dict(data=data, layout=layout)
    return fig;

In [160]:
# This method is used to generate a trendline of the mean sentiment polarity values for the given nutrient
# for all the years in the corpus(1980-2015)
#USDA Dietary guidelines are released every 5 years and the latest was released in 2015, next will be in 2020 

def getNutrientPolarityMeanTrends(nutrientName,package):
    import numpy as np
    yearsList = np.arange(1980,2020,5)
    polarityMeansAcrossYearsList = []
    polarityHighsAcrossYearsList = []
    polarityLowsAcrossYearsList = []
    for i in yearsList:
        polarityMeansAcrossYearsList.append(mean(getNutrientRelevantStatementsPolarityList(year=str(i),nutrientName = nutrient,packageName=package)))
    trace_mean = go.Scatter(
                 x= yearsList,
                y= polarityMeansAcrossYearsList,
                name = "Mean Polarity",
                line = dict(color = '#66ccff'),
                opacity = 0.8)       
        
    data = [trace_mean]
    layout = dict(
         title = 'Mean sentiment trends (using '+package+') for ' + nutrientName + '(s) ' + '(1980-2015)' ,
         yaxis = dict(title = 'Mean Polarity'),
         xaxis = dict(title = 'Year')
    )

    fig = dict(data=data, layout=layout)
    return fig;

In [161]:
# This method gives a dictionary where the keys are the individual statements(containing the given nutrient name)
# and the values are the associated polarity sentiment values. 
# This method helps you get an internal view of how the statements are segregated from the original dietray guideline 
# text documents and and what is each such statement's polarity value.

def getNutrientRelevantStatementsPolarityDictionary(year, nutrientName,packageName): 
    with open(year + '.txt', 'r') as myfile:
        text = myfile.read()
    text1 =text.replace('\n________________\n','\n') #Removing the very common dirty phrase in the converted OCR text
    text2 = re.sub(r'\n([a-z]+)',r' \1', text1) # To remove erroneous new line characters '\n' in between a sentence.
    text3 =re.sub(r'\. ([a-z]+)',r' \1', text2) # To remove the last '.' used for abbreviations so that next steps dont confuse this with a new statement indicating '.'            
    text4 = re.sub('[^ A-Za-z0-9\.\n]+','',text3) # To remove special characters except full stops and new line characters
    list1 = [sentence for sentence in text4.split('\n')] # Splitting on the basis of new line characters
    list2 = []
    for i in list1:
        list2.append(i.split('.'))
    finalList = []
    for i in list2:
            for j in i:
                finalList.append(j)
    
    nutrientStatementList = []
    
    for i in finalList:
        if nutrientName.lower() in i:
            nutrientStatementList.append(i)
    
    
    dict = {}
    
    for i in nutrientStatementList:
            if packageName.lower() == 'textblob':
                   dict[i] = TextBlob(i).sentiment.polarity
            else:
                    dict[i] = (analyzer.polarity_scores(i))['compound']
    return dict; 




In [162]:
## Diplaying the statement and the respective sentiment polarity values calculated using TextBlob for the nutrient 
#  'Protein' for the year 2010 (2010 USDA Dietary Guidelines document) 

statementPolarityList = []

statementPolarityList.append(['Statement','Polarity'])

  
for statement, polarity in getNutrientRelevantStatementsPolarityDictionary(year = '2010', nutrientName = 'Protein',packageName='TextBlob').items():
    statementPolarityList.append([statement,polarity])

t = Texttable()
t.add_rows(statementPolarityList)
print(t.draw())

+-------------------------------------------------------------------+----------+
|                             Statement                             | Polarity |
|  Choose a variety of protein foods which include seafood lean     | 0.400    |
| meat and poultry eggs beans and peas soy products and unsalted    |          |
| nuts and seeds                                                    |          |
+-------------------------------------------------------------------+----------+
|  Replace protein foods that are higher in solid fats with choices | 0.083    |
| that are lower in solid fats and calories andor are sources of    |          |
| oils                                                              |          |
+-------------------------------------------------------------------+----------+
|  Approximately lesterol protein carbohydrates sodium potassium 32 | -0.400   |
| percent of children and adolescents ages 2 to and water alcohol   |          |
| and food safety and techno

In [163]:
# Invoking the scatter plot method below to get the sentiment polarity values(Calculated using TextBlob) for the 
# statements containing Vitamin for the USDA Dietary guideline in the year 2010. 

requiredNutrient = 'Vitamin'
requiredYear = '2010'

iplot(plotPolarityScatterPlot(nutrient = requiredNutrient,nutrientStatementPolarityList = getNutrientRelevantStatementsPolarityList(year = requiredYear,nutrientName = requiredNutrient,packageName = 'TextBlob'), year = requiredYear,packageName='TextBlob'), filename='Sentiment Scater plot')

In [164]:
# Invoking the scatter plot method below to get the sentiment polarity values(Calculated using Vader sentiment) for the 
# statements containing Vitamin for the USDA Dietary guideline in the year 2010. 

#Note: The count of statements is different in the plots using Textblob and Vader because we dont intend to show 
# the neutral polarity statement values (all zeroes) and both libraries have different inbuilt logic to calculate
# a neutral polarity for a statement.

requiredNutrient = 'Vitamin'
requiredYear = '2010'

iplot(plotPolarityScatterPlot(nutrient = requiredNutrient,nutrientStatementPolarityList = getNutrientRelevantStatementsPolarityList(year = requiredYear,nutrientName = requiredNutrient, packageName = 'Vader'), year = requiredYear,packageName='Vader'), filename='Sentiment Scatter plot')

In [188]:
# Invoking the scatter plot method below to get the sentiment polarity values(Calculated using Vader sentiment) for the 
# statements containing Vitamin for the USDA Dietary guideline in the year 2010. 

#Note: The count of statements is different in the plots using Textblob and Vader because we dont intend to show 
# the neutral polarity statement values (all zeroes) and both libraries have different inbuilt logic to calculate
# a neutral polarity for a statement.

requiredNutrient = 'Water'
requiredYear = '2010'

iplot(plotPolarityScatterPlot(nutrient = requiredNutrient,nutrientStatementPolarityList = getNutrientRelevantStatementsPolarityList(year = requiredYear,nutrientName = requiredNutrient, packageName = 'TextBlob'), year = requiredYear,packageName='TextBlob'), filename='Sentiment Scatter plot')

In [186]:
# Invoking the scatter plot method below to get the sentiment polarity values(Calculated using Vader sentiment) for the 
# statements containing Vitamin for the USDA Dietary guideline in the year 2010. 

#Note: The count of statements is different in the plots using Textblob and Vader because we dont intend to show 
# the neutral polarity statement values (all zeroes) and both libraries have different inbuilt logic to calculate
# a neutral polarity for a statement.

requiredNutrient = 'Water'
requiredYear = '2010'

iplot(plotPolarityScatterPlot(nutrient = requiredNutrient,nutrientStatementPolarityList = getNutrientRelevantStatementsPolarityList(year = requiredYear,nutrientName = requiredNutrient, packageName = 'Vader'), year = requiredYear,packageName='Vader'), filename='Sentiment Scatter plot')

In [166]:
#import plotly.plotly as py
#init_notebook_mode(connected=True)

nutrient = 'Carbohydrate'

iplot(getNutrientPolarityTrendLine(nutrientName=nutrient, package = 'Vader')) 

# Note : The difference between the max and min polarities for a year help understand the variation in the sentiment
#        of a nutrient for a particular year.

# Takeaway(s)

# The maximum range in sentiment for 'Carbohydrate' was in 2005 and the minimum was in 1985.

In [167]:
nutrient = 'Carbohydrate'

iplot(getNutrientPolarityMeanTrends(nutrientName=nutrient, package = 'Vader')) 

# Note # The Trenline below helps better identify the magnitude of the changes in the mean polarities over the years

# Takeaway(s)

# The trend that was seen for Carbohydrates from 1980 to 1995 seems to be repeating itself starting 2000.

In [169]:
nutrient = 'Protein'

iplot(getNutrientPolarityTrendLine(nutrientName= nutrient,package='Vader')) 

# Note : The difference between the max and min polarities for a year help understand the variation in the sentiment
#        polarity of a nutrient for a particular year.

#      Takeaway(s):

#      2010 had the maximum range of sentiment for Proteins whereas 2000 had minimum. 2000 had a minimal number of 
#      statements(mostly neutral) and the only statement with a polarity was positive in nature.


In [170]:
nutrient = 'Protein'

iplot(getNutrientPolarityMeanTrends(nutrientName=nutrient, package = 'Vader')) 

# Note # The Trenline below helps better identify the magnitude of the changes in the mean polarities over the years

# Takeaway(s)

# Even though not by much in terms of magnitude, the sentiment magnitude for protein seems to be following an alternate
# pattern every year till 2005.

In [172]:
nutrient = 'Vitamin'

iplot(getNutrientPolarityTrendLine(nutrientName= nutrient,package='Vader'))

# Note : The difference between the max and min polarities for a year help understand the variation in the sentiment
#        polarity of a nutrient for a particular year.

#Takeaway(s)

# 2010 had the maximum range of sentiment for Vitamins whereas 1980 had the mimnimum.

In [173]:
nutrient = 'Vitamin'

iplot(getNutrientPolarityMeanTrends(nutrientName=nutrient, package = 'Vader')) 

# Note # The Trenline below helps better identify the magnitude of the changes in the mean polarities over the years.

# Takeaway(s)

# The sentiment for Vitamins had an big dip in 1985 but picked up since with a drop again in 2000.

In [175]:
nutrient = 'Fat'

iplot(getNutrientPolarityTrendLine(nutrientName= nutrient,package='Vader'))

# Note : The difference between the max and min polarities for a year help understand the variation in the sentiment
#        of a nutrient for a particular year.

# Takeaway(s)

# The range of sentiment for fat in a particular year seems to be more or less consistent across the years. 

In [176]:
nutrient = 'Fat'

iplot(getNutrientPolarityMeanTrends(nutrientName=nutrient, package = 'Vader')) 

# Note # The Trenline below helps better identify the magnitude of the changes in the mean polarities over the years

## Takeaways:

# The sentiment for "Fats" had a peak sentiment in 1985 and a big dip in 2005.

# Fats have started being mentioned in a positive tone overall since 2005.

In [178]:
nutrient = 'Mineral'

iplot(getNutrientPolarityTrendLine(nutrientName= nutrient,package='Vader'))

# Note : The difference between the max and min polarities for a year help understand the variation in the sentiment
#        polarity of a nutrient for a particular year.

# Takeaway(s)

# The maximum range in sentiment for 'Minerals' was in 1990 and the minimum was in 1980.

In [179]:
nutrient = 'Mineral'

iplot(getNutrientPolarityMeanTrends(nutrientName=nutrient, package = 'Vader')) 

# Note # The Trenline below helps better identify the magnitude of the changes in the mean polarities over the years

# Takeaway(s)

# The sentiment for mineral had a big dip in 2005 but has picked up since.

In [181]:
nutrient = 'Water'

iplot(getNutrientPolarityTrendLine(nutrientName= nutrient,package='Vader'))

# Note : The difference between the max and min polarities for a year help understand the variation in the sentiment
#        polarity of a nutrient for a particular year.

# Takeaway(s)

# For 2 years out of the 8, the range of sentiment for water was Zero(overall sentiment being positive in both cases)


In [182]:
nutrient = 'Water'

iplot(getNutrientPolarityMeanTrends(nutrientName=nutrient, package = 'Vader')) 

# Note # The Trenline below helps better identify the magnitude of the changes in the mean polarities over the years

# Takeaway(s)

# After 1985, there do not seem to be major changes in the sentiment associated with water.