# Business case

We are launching a new Shopify store. We want to sell either a product whose popularity is expected to grow, or to an interest group that is expected to grow.

In [2]:
#imports and instantiations
from pytrends.request import TrendReq
import tweepy
from statsmodels.tsa import ar_model, stattools, arima_model
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import numpy as np
from textblob import TextBlob
import pandas as pd
from potosnail import Stats
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import numpy as np
import plotly.express as px
from scipy.signal import find_peaks

In [17]:
#helper functions
def GetReport(keywords, span='today 5-y', geo='', quiet=True):
    '''Fetch interest-over-time for one or more search terms via pytrends.

    keywords: list of search terms handed to pytrends' build_payload.
    span: timeframe string passed as `timeframe`; the default 'today 5-y'
        requests the past five years.
    geo: region string passed through to pytrends unchanged.
    quiet: when False, also draw a line plot of the returned frame.

    Returns the DataFrame produced by interest_over_time() with the
    'isPartial' column removed.
    '''
    pytrends = TrendReq(hl='en-US', tz=360)
    pytrends.build_payload(keywords, cat=0, timeframe=span, geo=geo, gprop='')
    # errors='ignore': when there is no data for the query the result has no
    # 'isPartial' column (NOTE(review): pytrends appears to return an empty
    # frame in that case - confirm), so a plain drop would raise KeyError.
    ts = pytrends.interest_over_time().drop(columns=['isPartial'], errors='ignore')
    if not quiet:
        # plot directly; wrapping this in print() only echoed the Axes repr
        ts.plot()
    return ts

def AnalyzeTwitter(keyword):
    '''find the average sentiment and subjectivity of tweets matching a search term

    Credentials are read from the environment variables
    TWITTER_CONSUMER_KEY, TWITTER_CONSUMER_SECRET, TWITTER_ACCESS_TOKEN and
    TWITTER_ACCESS_TOKEN_SECRET; API secrets must never live in the notebook.

    keyword: query string passed to the Twitter search endpoint.

    Returns (sent, sub, sents):
        sent  - mean TextBlob polarity of the returned tweets
        sub   - mean TextBlob subjectivity of the returned tweets
        sents - list with the polarity of every tweet
    When the search returns no tweets the result is (0, 0, []).

    Raises RuntimeError when any of the credential variables is unset.
    '''
    import os

    env_names = ('TWITTER_CONSUMER_KEY', 'TWITTER_CONSUMER_SECRET',
                 'TWITTER_ACCESS_TOKEN', 'TWITTER_ACCESS_TOKEN_SECRET')
    missing = [name for name in env_names if not os.environ.get(name)]
    if missing:
        raise RuntimeError(
            'missing Twitter credentials; set: {}'.format(', '.join(missing)))
    consumer_key, consumer_secret, access_token, access_secret = (
        os.environ[name] for name in env_names)

    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_secret)
    api = tweepy.API(auth)
    # NOTE(review): newer tweepy releases expose this endpoint under a
    # different method name - confirm against the installed version.
    topic = api.search(keyword)

    sents = []
    subjectivities = []
    for status in topic:
        # strip '@' so mentions are scored as ordinary words
        tweet = status._json['text'].replace('@', '')
        sentiment = TextBlob(tweet).sentiment
        sents.append(sentiment[0])
        subjectivities.append(sentiment[1])

    if not sents:
        return 0, 0, sents
    return sum(sents) / len(sents), sum(subjectivities) / len(sents), sents

def Collect(keyword, quiet=True):
    '''Build one summary row (popularity outlook + Twitter sentiment) for a term.

    keyword: search term; also the name of the value column in the frame
        returned by BuildTS.
    quiet: when True return only the row dict; otherwise return
        (time-series frame, row dict).

    Row keys, in order: term, current_popularity, change_3mo, change_6mo,
    change_9mo, change_12mo, change_24mo, popularity_2y, sentiment,
    subjectivity, sentiments_std. The change_* values are strings such as
    '12.5%', measured relative to current_popularity.
    '''
    # Row labels in the BuildTS frame used for each horizon.
    # NOTE(review): these assume about 260 observed rows followed by the
    # forecast - confirm against the length GetReport actually returns.
    horizons = (
        ('change_3mo', 271),
        ('change_6mo', 283),
        ('change_9mo', 295),
        ('change_12mo', 307),
        ('change_24mo', 355),
    )

    tsdf = BuildTS(keyword)
    series = tsdf[keyword]
    # last value among the first 260 rows
    current_popularity = list(series[:260])[-1]

    row = {'term': keyword, 'current_popularity': current_popularity}
    for label, position in horizons:
        change = (series[position] - current_popularity) / current_popularity
        row[label] = '{}%'.format(round(change * 100, 1))

    # ((f - c) / c + 1) * c is just f, so take the value at row 355 directly;
    # this also removes the bare `except:` that only existed for c == 0.
    row['popularity_2y'] = round(series[355])

    sentiment, subjectivity, sentiments = AnalyzeTwitter(keyword)
    row['sentiment'] = round(sentiment, 2)
    row['subjectivity'] = round(subjectivity, 2)
    row['sentiments_std'] = round(np.std(sentiments), 2)

    if quiet:
        return row
    return tsdf, row

def CollectLoop(terms_list):
    '''tells us how popularity for a given list of search terms is expected to change

    terms_list: iterable of search terms.

    Returns a DataFrame with one row per term (the dict produced by Collect)
    and a 0..n-1 index. An empty input yields an empty DataFrame.
    '''
    # Build the frame once from all rows; concatenating inside the loop
    # copies the growing frame on every iteration.
    return pd.DataFrame([Collect(term) for term in terms_list])

def _show_gauge(value, title, axis_range, steps):
    '''Render one plotly gauge indicator with the given range and colour bands.'''
    fig = go.Figure(go.Indicator(
        domain={'x': [0, 1], 'y': [0, 1]},
        value=value,
        mode="gauge+number",
        title={'text': title},
        gauge={'axis': {'range': axis_range}, 'steps': steps}))
    fig.show()


def PlotOne(keyword):
    '''the output a user gets when looking at one term

    Shows, in order: a sentiment gauge (-1..1), a subjectivity gauge (0..1)
    and a line chart of the series returned by Collect (observed values
    followed by the forecast), with the y axis fixed to 0..100.
    '''
    ts, results = Collect(keyword, quiet=False)

    _show_gauge(
        results['sentiment'],
        "Sentiment of '{}' based on tweets".format(keyword),
        [-1, 1],
        [{'range': [-1, 0], 'color': "red"},
         {'range': [0, 1], 'color': "lightgreen"}])

    _show_gauge(
        results['subjectivity'],
        "Subjectivity of '{}' based on tweets".format(keyword),
        [0, 1],
        [{'range': [0, 0.5], 'color': "yellow"},
         {'range': [0.5, 1], 'color': "blue"}])

    fig = px.line(ts, x='index', y=keyword, range_y=[0, 100])
    fig.show()
    
def PlotMany(keywords):
    '''Compare several terms: one bar chart, then one line chart per term.

    keywords: list of search terms.

    The bar chart shows current_popularity per term coloured by sentiment;
    each line chart shows that term's series as returned by Collect.
    '''
    # Collect every term exactly once and reuse the result for both the bar
    # chart and the line charts; collecting twice doubled the network calls
    # and model fits and could pair charts built from different data.
    collected = [Collect(keyword, quiet=False) for keyword in keywords]

    df = pd.DataFrame([row for _, row in collected])
    fig = px.bar(df, x='term', y='current_popularity', color='sentiment', range_y=[0, 100])
    fig.show()

    for keyword, (ser, _) in zip(keywords, collected):
        fig = px.line(ser, x='index', y=keyword, range_y=[0, 100])
        fig.show()
        
def CheckSeasonality(ser, quiet=True):
    '''Heuristically decide whether a search-interest series looks seasonal.

    A "seasonality index" is computed as the total absolute deviation of the
    values from their mean, divided by the absolute difference between the
    mean of the rows from position 235 onward and the mean of the first 27
    rows. Much movement with little net drift gives a large index.

    ser: one-column DataFrame (only the first column is used); a 1-D Series
        or array is accepted as well.
    quiet: when False, print the index.

    Returns True when the index exceeds 250, otherwise False. An undefined
    index (fewer than 236 rows, NaN values, or a perfectly flat series) gives
    False; zero drift with non-zero deviation gives an infinite index and
    therefore True.
    '''
    values = np.asarray(ser, dtype=float)
    if values.ndim > 1:
        values = values[:, 0]

    head, tail = values[:27], values[235:]
    if head.size == 0 or tail.size == 0:
        # not enough rows to compare the start with the end
        si = float('nan')
    else:
        # mean is computed once instead of once per row
        total_deviation = np.abs(values - values.mean()).sum()
        delta = abs(tail.mean() - head.mean())
        # delta == 0 legitimately yields inf (or nan for 0/0); silence the warning
        with np.errstate(divide='ignore', invalid='ignore'):
            si = total_deviation / delta

    if not quiet:
        print(si)
    return bool(si > 250)
    
def BuildTS(keyword):
    '''Return the observed series for `keyword` followed by a model forecast.

    The history comes from GetReport([keyword]). A SARIMAX model is fitted
    when CheckSeasonality flags the series as seasonal, otherwise an AutoReg
    model with 4 lags. Predictions run from the first unobserved position
    through position 356 and are appended below the history. The combined
    frame is returned with its index reset into a column.
    '''
    history = GetReport([keyword])
    seasonal = CheckSeasonality(history)

    if seasonal == True:
        # TODO: these orders are placeholders (probably wrong) and should be
        # chosen programmatically
        fitted = SARIMAX(history, order=(2, 1, 2), seasonal_order=(2, 1, 2, 52)).fit()
    elif seasonal == False:
        fitted = ar_model.AutoReg(history, lags=4).fit()
    else:
        return None

    forecast = fitted.predict(start=len(history), end=356)
    observed_df = pd.DataFrame(history)
    forecast_df = pd.DataFrame(forecast)
    # give both frames the same single column name so they stack cleanly
    forecast_df.columns = [keyword]
    observed_df.columns = [keyword]
    return pd.concat([observed_df, forecast_df]).reset_index()
    
def PredictSearches(to_predict):
    '''Entry point: report on a single search term or compare several.

    to_predict: a search term (str), or a list/tuple of search terms.
        A str or a one-element sequence is shown with PlotOne; two or more
        terms are compared with PlotMany.

    Raises:
        ValueError: if an empty list/tuple is given.
        TypeError: if to_predict is neither a str nor a list/tuple.
    '''
    if isinstance(to_predict, str):
        return PlotOne(to_predict)
    if isinstance(to_predict, (list, tuple)):
        terms = list(to_predict)
        if not terms:
            raise ValueError('to_predict must contain at least one search term')
        if len(terms) == 1:
            return PlotOne(terms[0])
        return PlotMany(terms)
    # fail loudly instead of silently returning None for unsupported input
    raise TypeError(
        'to_predict must be a str or a list of str, got {}'.format(type(to_predict).__name__))
        
def CovidCheck(peaks):
    '''Tell whether the peaks inside the 205-225 window look like a one-off.

    peaks: positions (row indices) of detected peaks in a series.

    Returns True when at least one peak lies in positions 205..225
    (NOTE(review): presumably the weeks of the 2020 COVID spike in a 5-year
    weekly series - confirm) and none of those peaks has a matching peak
    exactly 52 positions earlier. Returns False otherwise, including when no
    peak lies in the window.
    '''
    peaks = np.asarray(peaks)
    window = np.arange(205, 226)
    affected = np.intersect1d(window, peaks)
    # compare each affected peak (not the whole array) with the position one
    # year earlier
    has_yearly_twin = bool(np.isin(affected - 52, peaks).any())
    return affected.size != 0 and not has_yearly_twin

# EDA

1. How is the popularity of the most searched terms expected to change in the next 2 years?
2. How do the average sentiments and standard deviations of 2020's top searches compare to those of 2019 and 2018?
3. Identify niches that are popular for Shopify stores and analyze their expected change in popularity. Next, identify a good niche to get into and inspect it as a search term.
4. Do the same for products

## Question 1

In [10]:
# Take the 'title' column of pytrends' top charts for 2020 and run the
# popularity / sentiment report on those terms.
pytrends = TrendReq(hl='en-US', tz=360)
top_2020 = list(pytrends.top_charts(2020)['title'])
PredictSearches(top_2020)

Most of these trends are expected to decrease significantly. 'Joe Biden' has been misclassified as seasonal, which causes the model to predict very unlikely spikes in popularity.

Conclusion: avoid launching stores around trends that pop up rapidly, as they are unlikely to last long.

## Question 2

In [11]:
# 2018: titles from pytrends' top charts for that year
# (relies on the `pytrends` client created in the Question 1 cell)
top_2018 = list(pytrends.top_charts(2018)['title'])

def Answer2(trends):
    '''Summarise tweet sentiment for a list of search terms.

    For every term the per-tweet sentiments from AnalyzeTwitter are reduced
    to a mean and a standard deviation. A bar chart of the means (coloured by
    standard deviation) is shown, and a sentence with the averages of both
    columns, rounded to two decimals, is returned.
    '''
    records = []
    for trend in trends:
        sentiments = AnalyzeTwitter(trend)[2]
        records.append((trend, np.mean(sentiments), np.std(sentiments)))

    results = pd.DataFrame({
        'term': [name for name, _, _ in records],
        'mean': [avg for _, avg, _ in records],
        'std': [spread for _, _, spread in records],
    })

    chart = px.bar(results, x='term', y='mean', color='std')
    chart.show()

    overall_mean = round(np.mean(results['mean']), 2)
    overall_std = round(np.mean(results['std']), 2)
    return 'the average term had a mean sentiment of {} and a standard deviation of {}'.format(overall_mean, overall_std)
        
# sentiment summary for the 2018 top-chart terms
Answer2(top_2018)

'the average term had a mean sentiment of 0.04 and a standard deviation of 0.16'

In [13]:
# 2019: top-chart titles for that year, then the same sentiment summary
top_2019 = list(pytrends.top_charts(2019)['title'])
Answer2(top_2019)

'the average term had a mean sentiment of 0.04 and a standard deviation of 0.14'

In [14]:
# 2020: top-chart titles for that year (the same request as in the
# Question 1 cell), then the same sentiment summary
top_2020 = list(pytrends.top_charts(2020)['title'])
Answer2(top_2020)

'the average term had a mean sentiment of 0.05 and a standard deviation of 0.18'

2020 had the highest average sentiment, though not by much. However, probably because of the controversial topics that year, its searches also had the highest standard deviation. This means the sentiment of tweets on these topics was more spread out, indicating more "mixed feelings" (e.g. many people were happy with the election, but many were not).

## Question 3

In [24]:
# search terms to evaluate: candidate niches (hobbies) and candidate products
hobbies = ['biking', 'gardening', 'surfing', 'home design', 'workouts']
products = ['face mask', 'bike helmet', 'swim trunks', 'back brace', 'puzzles']

In [28]:
# compare the hobby terms (several terms, so this goes through PlotMany)
PredictSearches(hobbies)

In [25]:
# compare the product terms (several terms, so this goes through PlotMany)
PredictSearches(products)

In [80]:
def _(ser, quiet=True):
    '''Experimental seasonality check that also counts prominent peaks.

    The seasonality index is the total absolute deviation from the mean
    divided by the absolute difference between the mean of the rows from
    position 235 onward and the mean of the first 27 rows. Peaks are found
    with scipy's find_peaks (prominence=10, distance=52) on a freshly
    fetched report for the frame's first column name.

    ser: one-column DataFrame whose column name is the search term.
    quiet: when False, print the peak positions and the index.

    Returns True when the index exceeds 250 and there are at least 3 peaks,
    or when the index does not exceed 250 (or is undefined) and there are
    more than 4 peaks; False otherwise.
    '''
    kw = list(ser.columns)[0]

    values = np.asarray(ser, dtype=float)
    if values.ndim > 1:
        values = values[:, 0]
    head, tail = values[:27], values[235:]
    if head.size == 0 or tail.size == 0:
        # not enough rows to compare the start with the end
        si = float('nan')
    else:
        # mean is computed once instead of once per row
        total_deviation = np.abs(values - values.mean()).sum()
        delta = abs(tail.mean() - head.mean())
        # delta == 0 legitimately yields inf (or nan for 0/0); silence the warning
        with np.errstate(divide='ignore', invalid='ignore'):
            si = total_deviation / delta

    # NOTE(review): this re-downloads the series although `ser` was passed
    # in; presumably `ser` holds the same data - confirm before reusing it.
    x = np.array(list(GetReport([kw])[kw]))
    peaks, _properties = find_peaks(x, prominence=10, distance=52)
    n_peaks = len(peaks)

    if not quiet:
        print(peaks, si)

    if si > 250:
        return n_peaks >= 3
    return n_peaks > 4

In [131]:
def CovidCheck(peaks):
    '''Tell whether the peaks inside the 205-225 window look like a one-off.

    peaks: positions (row indices) of detected peaks in a series.

    Returns True when at least one peak lies in positions 205..225
    (NOTE(review): presumably the weeks of the 2020 COVID spike in a 5-year
    weekly series - confirm) and none of those peaks has a matching peak
    exactly 52 positions earlier. Returns False otherwise, including when no
    peak lies in the window.
    '''
    peaks = np.asarray(peaks)
    window = np.arange(205, 226)
    affected = np.intersect1d(window, peaks)
    # compare each affected peak (not the whole array) with the position one
    # year earlier
    has_yearly_twin = bool(np.isin(affected - 52, peaks).any())
    return affected.size != 0 and not has_yearly_twin

In [132]:
import datetime

In [141]:
# today's date formatted as day/month/year
datetime.date.today().strftime("%d/%m/%Y")

'31/03/2021'