# Current Data Sources

1. Google Trends API
2. Twitter API

# Current Features
1. term	
2. current_popularity	
3. change_3mo	
4. change_6mo	
5. change_9mo	
6. change_12mo	
7. change_24mo	
8. popularity_2y	
9. sentiment
10. subjectivity

# Visulizations to make...
1. Sentiment PDF
2. popularity and predicted popularity

# Features to add...
1. sentiment standard deviation

In [18]:
#imports and instantiations
from pytrends.request import TrendReq
import tweepy
from statsmodels.tsa import ar_model, stattools, arima_model
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import numpy as np
from textblob import TextBlob
import pandas as pd
from potosnail import Stats
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import numpy as np
import plotly.express as px

In [52]:
#helper functions
def GetReport(keywords, span='today 5-y', geo='', quiet=True):
    '''observe a search term's popularity in the past 5 years'''
    pytrends = TrendReq(hl='en-US', tz=360)
    pytrends.build_payload(keywords, cat=0, timeframe=span, geo=geo, gprop='')
    ts = pytrends.interest_over_time().drop(['isPartial'], axis='columns')
    if quiet == False:
        print(ts.plot())
    return ts

def AnalyzeTwitter(keyword):
    '''find the average sentimental value and subjectivity of a given search term'''
    c1 = 'aHXduTrDkva3ItY52tUtYVPvA'
    c2 = 'Qs6d4oNT3zXxDqOhita7IG07CfAJGceoqIs1sGuA4OURlbLP6d'
    a1 = '1181578611171762177-sGQaj7E9fpWi2aEB3MfWL4nTRovXYk'
    a2 = 'wa77yBJZJSOKOAzdaJYDruc9U1HrGhzyDhWgKvSQpm2hv'
    auth = tweepy.OAuthHandler(c1, c2)
    auth.set_access_token(a1, a2)
    api = tweepy.API(auth)
    topic = api.search(keyword)
    sent = 0
    sub = 0
    sents = []
    for i in range(len(topic)):
        tweet = topic[i]._json['text'].replace('@', '')
        blob = TextBlob(tweet)
        sents.append(blob.sentiment[0])
        sent += blob.sentiment[0]/len(topic)
        sub += blob.sentiment[1]/len(topic)
    return sent, sub, sents

def Collect(keyword, quiet=True):
    row = {}
    tsdf = BuildTS(keyword)
    row['term'] = keyword
    current_popularity = list(tsdf[keyword][:260])[-1]
    row['current_popularity'] =  current_popularity
    row['change_3mo'] = '{}%'.format(round(((tsdf[keyword][271] - current_popularity) / current_popularity) * 100, 1))
    row['change_6mo'] = '{}%'.format(round(((tsdf[keyword][283] - current_popularity) / current_popularity) * 100, 1))
    row['change_9mo'] = '{}%'.format(round(((tsdf[keyword][295] - current_popularity) / current_popularity) * 100, 1))
    row['change_12mo'] = '{}%'.format(round(((tsdf[keyword][307] - current_popularity) / current_popularity) * 100, 1))
    row['change_24mo'] = '{}%'.format(round(((tsdf[keyword][355] - current_popularity) / current_popularity) * 100, 1))
    row['popularity_2y'] = round((((tsdf[keyword][355] - current_popularity) / current_popularity) + 1) * current_popularity)
    sentiment, subjectivity, sentiments = AnalyzeTwitter(keyword)
    row['sentiment'] = round(sentiment, 2)
    row['subjectivity'] = round(subjectivity, 2)
    row['sentiments_std'] = round(np.std(sentiments), 2)
    if quiet == True:
        return row
    else:
        return tsdf, row

def CollectLoop(terms_list):
    '''tells us how popularity for a given list of search terms are expected to change'''
    df = pd.DataFrame(Collect(terms_list[0]), index=[0])
    for term in terms_list[1:]:
        temp = pd.DataFrame(Collect(term), index=[0])
        df = pd.concat([df, temp])
    return df.reset_index().drop(['index'], axis='columns')

def PlotOne(keyword):
    '''the output a user gets when looking at one term'''
    ts, results = Collect(keyword, quiet=False)
    subj = results['subjectivity']
    obj = 1 - subj
    X = ['%subjective', '%objective']
    y = [subj, obj]
    X2 = ['sentiment']
    y2 = results['sentiment']
    if results['popularity_2y'] > results['current_popularity']:
        future = 'increase'
    else:
        future = 'decrease'
    fig = go.Figure(go.Indicator(
    domain = {'x': [0, 1], 'y': [0, 1]},
    value = results['sentiment'],
    mode = "gauge+number",
    title = {'text': "Sentiment of '{}' based on tweets".format(keyword)},
    gauge = {'axis': {'range': [-1, 1]},
             'steps' : [
                 {'range': [-1, 0], 'color': "red"},
                 {'range': [0, 1], 'color': "lightgreen"}]}))
    fig.show()
    fig = go.Figure(go.Indicator(
    domain = {'x': [0, 1], 'y': [0, 1]},
    value = results['subjectivity'],
    mode = "gauge+number",
    title = {'text': "Subjectivity of '{}' based on tweets".format(keyword)},
    gauge = {'axis': {'range': [0, 1]},
             'steps' : [
                 {'range': [0, 0.5], 'color': "yellow"},
                 {'range': [0.5, 1], 'color': "blue"}]}))
    fig.show()
    fig = px.line(ts, x='index', y=keyword, range_y=[0, 100])
    fig.show()
    
def PlotMany(keywords):
    df = CollectLoop(keywords)
    fig = px.bar(df, x='term', y='current_popularity', color='sentiment', range_y=[0, 100])
    fig.show()
    for i in range(len(keywords)):
        ser = Collect(keywords[i], quiet=False)[0]
        fig = px.line(ser, x='index', y=keywords[i], range_y=[0, 100])
        fig.show()
        
def CheckSeasonality(ser):
    varience = 0
    for i in range(len(ser)):
        varience += abs(np.mean(ser)[0] - ser.iloc[i][0])
    delta = abs(np.mean(ser.iloc[235:])[0] - np.mean(ser.iloc[:27])[0])
    si = varience/delta
    if si <= 250:
        return True
    else:
        return False
    
def BuildTS(keyword):
    ser = GetReport([keyword])
    s = CheckSeasonality(ser)
    if s == True:
        my_order = (2,1,2) #probably wrong, also needs to be programatic
        my_seasonal_order = (2, 1, 2, 52) #probably wrong, also needs to be programatic
        model = SARIMAX(ser, order=my_order, seasonal_order=my_seasonal_order).fit()
        pred = model.predict(start=len(ser), end=356)
        ser_ = pd.DataFrame(ser)
        pred_ = pd.DataFrame(pred)
        pred_.columns = [keyword]
        ser_.columns = [keyword]
        return pd.concat([ser_, pred_]).reset_index()
    if s == False:
        model = ar_model.AutoReg(ser, lags=4).fit()
        pred = model.predict(start=len(ser), end=356)
        ser_ = pd.DataFrame(ser)
        pred_ = pd.DataFrame(pred)
        pred_.columns = [keyword]
        ser_.columns = [keyword]
        return pd.concat([ser_, pred_]).reset_index()
    
def PredictSearches(to_predict):
    if type(to_predict) == str:
        return PlotOne(to_predict)
    if type(to_predict) == list:
        if len(to_predict) == 1:
            return PlotOne(to_predict[0])
        else:
            return PlotMany(to_predict)

In [51]:
CollectLoop(top_2020)

ValueError: cannot convert float NaN to integer

In [20]:
hobbies = ['biking', 'gardening', 'surfing', 'home design', 'workouts']
products = ['face mask', 'bike helmet', 'board shorts', 'back brace', 'puzzles']

In [21]:
upward = GetReport(['mom jeans'])
seasonal = GetReport(['sleeping bags'])
spike_2020 = GetReport(['face mask'])
little_trend = GetReport(['ashtray'])

In [53]:
PredictSearches(hobbies)    

# Checking acf and pacf

In [22]:
upward_acf = stattools.acf(upward)
seasonal_acf = stattools.acf(seasonal)
spike_2020_acf = stattools.acf(spike_2020)
little_trend_acf = stattools.acf(little_trend)

upward_pacf = stattools.pacf(upward)
seasonal_pacf = stattools.pacf(seasonal)
spike_2020_pacf = stattools.pacf(spike_2020)
little_trend_pacf = stattools.pacf(little_trend)

helpful reference: https://towardsdatascience.com/time-series-essentials-fe6727ab6a94

# EDA

1. How is the popularity of the most searched terms expected to change in the next 2 years?
2. How do the average sentiments and standard deviations of 2020's top searches compare to that of 2019 and 2018?

In [42]:
pytrends = TrendReq(hl='en-US', tz=360)
top_2020 = list(pytrends.top_charts(2020)['title'])
PredictSearches(top_2020)

In [58]:
df = pd.DataFrame(Collect('coronavirus'), index=[0])
temp = pd.DataFrame(Collect('Election results'), index=[0])
temp2 = BuildTS('Election results')

ValueError: cannot convert float NaN to integer

In [None]:
def Collect(keyword, quiet=True):
    row = {}
    tsdf = BuildTS(keyword)
    row['term'] = keyword
    current_popularity = list(tsdf[keyword][:260])[-1]
    row['current_popularity'] =  current_popularity
    row['change_3mo'] = '{}%'.format(round(((tsdf[keyword][271] - current_popularity) / current_popularity) * 100, 1))
    row['change_6mo'] = '{}%'.format(round(((tsdf[keyword][283] - current_popularity) / current_popularity) * 100, 1))
    row['change_9mo'] = '{}%'.format(round(((tsdf[keyword][295] - current_popularity) / current_popularity) * 100, 1))
    row['change_12mo'] = '{}%'.format(round(((tsdf[keyword][307] - current_popularity) / current_popularity) * 100, 1))
    row['change_24mo'] = '{}%'.format(round(((tsdf[keyword][355] - current_popularity) / current_popularity) * 100, 1))
    row['popularity_2y'] = round((((tsdf[keyword][355] - current_popularity) / current_popularity) + 1) * current_popularity)
    sentiment, subjectivity, sentiments = AnalyzeTwitter(keyword)
    row['sentiment'] = round(sentiment, 2)
    row['subjectivity'] = round(subjectivity, 2)
    row['sentiments_std'] = round(np.std(sentiments), 2)
    if quiet == True:
        return row
    else:
        return tsdf, row