# Sentiment Analysis and Consumer Profiling

This script includes cleaning, EDA, feature creation, and some preliminary analysis. 

In [1]:
import pandas as pd
import numpy as np 
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
import nltk
import re
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib 
import matplotlib.patches as mpatches
import seaborn as sns
from textblob import TextBlob
from collections import Counter
from tqdm import tqdm
import scipy.stats as stats

In [2]:
food = pd.read_csv('./fastfood.csv', dtype=object, index_col=0)

In [3]:
food.shape

(418332, 14)

In [4]:
#dropping malformed data (invalid index)
food.drop(list(food.loc[food['unique_code'] == 'Nobody should be too big to fail...'].index),
          axis=0, inplace=True)

In [5]:
#dropping duplicate entries
food = food.drop_duplicates(subset='unique_code')

In [6]:
#eliminated more than 50% of observations
food.shape

(143983, 14)

In [7]:
food.columns

Index(['Company', 'favorite_count', 'number_of_people_they_follow',
       'number_of_user_tweets', 'retweet_count', 'text', 'time_tweeted',
       'unique_code', 'user_coordinates', 'user_followers_count',
       'user_is_verified', 'user_location', 'user_name', 'user_profile_text'],
      dtype='object')

In [8]:
food.reset_index(inplace=True, drop=True)

In [9]:
#dropping all rows with all null values
food = food.drop(food[food.isnull().all(axis=1)].index[0], axis=0).reset_index(drop=True)

In [10]:
#Dropping all values that are not company related (only 3 observations)
to_drop = []

for row_num, val in enumerate(food['Company']):
    if val[0] != '@':
        to_drop.append(row_num)
        
food = food.drop(to_drop, axis=0).reset_index(drop=True)

In [11]:
#replacing strings with integers
mapper = {'True': 1, 'False': 0}
food['user_is_verified'] = food.user_is_verified.map(mapper)

In [12]:
#filling nulls and converting data types
food['retweet_count'] = food.retweet_count.astype(int)

In [13]:
#Set to run Midnight and 5pm EST everyday, the times are in UTC, making EST
food['time_tweeted'] = pd.to_datetime(food['time_tweeted']) - pd.Timedelta(hours=4)

In [14]:
print('Final time range before terminating EC2', food['time_tweeted'].min(), food['time_tweeted'].max())

Final time range before terminating EC2 2018-03-03 13:56:18 2018-04-19 16:48:37


In [15]:
food['favorite_count'] = food.favorite_count.astype(int)
food['number_of_people_they_follow'] = food.number_of_people_they_follow.astype(int)
food['number_of_user_tweets'] = food.number_of_user_tweets.astype(int)
food['user_followers_count'] = food.user_followers_count.astype(int)

In [16]:
#creating a basic name category that isnt the handle

mapper = {'@DennysDiner': 'Dennys', '@ChipotleTweets': 'Chipotle',
         '@McDonalds': 'McDonalds', '@Wendys': 'Wendys', '@Starbucks':'Starbucks',
         '@dunkindonuts':'Dunkin_Donuts', '@dominos': 'Dominos', '@shakeshack': 'Shake_Shack',
         '@sonicdrivein': 'Sonic', '@wingstop': 'Wingstop', '@CrackerBarrel': 'Cracker_Barrel', 
         '@redrobinburgers': 'Red_Robin', '@Potbelly': 'Potbelly'}

food['name'] = food.Company.map(mapper)

Examined multiple methods of cleaning for sentiment data, only the final method runs now, but 
have kept my trial methods below (hashed out)

In [17]:
#shouldnt be removing stopwords before sentiment analysis:
#http://www.lrec-conf.org/proceedings/lrec2014/pdf/292_Paper.pdf
#testing different functions for preprocessing text for sentiment analysis


def Text_Cleaner(text, tokens=False):
    """Takes text, eliminates URLS, replaces contractions, tokenizes, 
    removes company names, lower cases, removes calls to twitter handles, 
    returns a string, same as version 1, but only looking at words"""
    text = re.sub(r'(https)[^\s]+', '', text)
    text = re.sub(r'can\'t', 'can not', text, flags=re.IGNORECASE)
    text = re.sub(r'don\'t', 'do not', text, flags=re.IGNORECASE)
    text = re.sub(r'isn\'t', 'is not', text, flags=re.IGNORECASE)
    text = re.sub(r'aren\'t', 'are not', text, flags=re.IGNORECASE)
    text = re.sub(r'wasn\'t', 'was not', text, flags=re.IGNORECASE)
    text = re.sub(r'weren\'t', 'were not', text, flags=re.IGNORECASE)
    text = re.sub(r'haven\'t', 'have not', text, flags=re.IGNORECASE)
    text = re.sub(r'\b(rt|RT)', '', text)
    text = re.sub(r'@[a-zA-Z0-9]+', '', text)
    text = re.sub('#', '', text)
    text = re.sub(r'(wtf)+\b', 'what the fuck', text, flags=re.IGNORECASE)
    tokenizer = RegexpTokenizer(r'\w+')
    words = tokenizer.tokenize(text)
    if tokens:
        words = [word.lower() for word in words]
        return words
    return ' '.join(words)

# def Text_Cleaner_version_1(text):
#     """Takes text, eliminates URLS, replaces contractions, tokenizes, 
#     removes company names, lower cases, removes calls to twitter handles, 
#     returns a string"""
#     text = re.sub(r'(https)[^\s]+', '', text)
#     text = re.sub(r'\b(rt|RT)', '', text)
#     text = re.sub(r'@[a-zA-Z0-9]+', '', text)
#     text = re.sub('#', '', text)
#     return text

#     LOOKING AT VARIOUS METHODS FOR PREPROCESSING FOR SENTIMENT ANALYSIS
#     tokenizer = RegexpTokenizer(r'\w+')
#     words = tokenizer.tokenize(text)
#     lower = [x.lower() for x in words]
#     words = [word for word in words if word != 'rt']
#     eliminator = [re.sub(r'(mcdon|dunki|denn|redro|sonic|starb|shakesh|domino|crackerb|chipot|wend)[a-z]+','',x)
#                   for x in lower]
#     return ' '.join(eliminator2)

In [18]:
# #Creating a test set of uncleaned data to check the value of the Text_Cleaner functions
# food['text_sentiment_no_clean'] = food['text'].apply(lambda x: TextBlob(x).sentiment.polarity)

In [19]:
# #converting text with version 1
# food['text_sentiment_v1'] = food['text'].apply(Text_Cleaner_version_1)
# #Calculating sentiment with TextBlob
# food['sentiment_score_v1'] = food['text_sentiment_v1'].apply(lambda x: TextBlob(x).sentiment.polarity)

In [20]:
#converting text with version 2
food['text_sentiment'] = food['text'].apply(Text_Cleaner)
#Calculating sentiment with TextBlob
food['sentiment_score'] = food['text_sentiment'].apply(lambda x: TextBlob(x).sentiment.polarity)

In [21]:
#Looking at overall sentiment by company (version 1) (0 neutral, 1 positive, -1 negative)
food.groupby('Company')['sentiment_score'].mean().sort_values()

Company
@dominos            0.054447
@McDonalds          0.088885
@DennysDiner        0.089561
@dunkindonuts       0.093069
@Wendys             0.106140
@redrobinburgers    0.114117
@wingstop           0.115563
@Starbucks          0.120630
@sonicdrivein       0.122653
@Potbelly           0.141328
@ChipotleTweets     0.143646
@CrackerBarrel      0.152685
@shakeshack         0.169174
Name: sentiment_score, dtype: float64

In [22]:
#creating sentiment dummy variables 

def dummy_maker(val):
    """Takes in a float and returns a dummy based on the value
    to be used in pandas.apply"""
    if val == 0:
        return 0
    elif val > 0:
        return 1
    else:
        return -1

food['sentiment_dummies'] = food['sentiment_score'].apply(dummy_maker)

# food['sentiment_dummies_v2'] = food['sentiment_score_v2'].apply(dummy_maker)

# food['sentiment_dummies_uncleaned'] = food['text_sentiment_no_clean'].apply(dummy_maker)

In [23]:
# print(food['sentiment_dummies_v1'].value_counts()) 
# print(food['sentiment_dummies_v2'].value_counts())
print(food['sentiment_dummies'].value_counts())

 0    61842
 1    59187
-1    22949
Name: sentiment_dummies, dtype: int64


In [24]:
#manually testing reliability of the different measures with a random subset
random_numbers = list(np.random.randint(0, 62415, 5))
for num, val in enumerate(food.iloc[random_numbers, :]['text']):
    print(val)
    print('\n')
    print(food.iloc[[num], :]['sentiment_dummies'].values)

RT @Wendys: The mixtape drops now. Not pulling punches. We Beefin’. https://t.co/H1Rm1ODYC4


[1]
Some businesses supporting #MarchForOurLives through either donations, discounts or percentage of proceeds, accordi… https://t.co/8xDcgzMQjW


[0]
RT @DennysDiner: a denny’s haiku 

if you need a bath,
but you’ve got no hot water,
just use warm gravy.


[-1]
@TMobile Tuesdays been slacking. What happened to my free @lyft credits and @dunkindonuts credit i dont want no free t-mobile umbrella fam


[0]
@Starbucks @jeonggukupdates @SugasHero Omg. Is this real HAHAHA I LOVE THIS


[1]


In [25]:
#function to quickly separate positive/negative tweets by company

def negativity_formatter(company, hourly_rate=False):
    """This function accepts a company and returns either 
    the separate company, positive, negative dataframes (in that order)
    OR the same order plus a dataframe of hourly rates, if hourly_rate = True"""
    df = food.loc[food['name'] == company]
    positive_df = df.loc[df['sentiment_dummies'] == 1]
    negative_df = df.loc[df['sentiment_dummies'] == -1]
    if not hourly_rate:
        return df, positive_df, negative_df
    else:
        rate_df = pd.DataFrame()
        rate_df['pos_count'] = positive_df.groupby(positive_df['time_tweeted'].dt.hour)['Company'].count()
        rate_df['neg_count'] = negative_df.groupby(negative_df['time_tweeted'].dt.hour)['Company'].count()
        rate_df['rate'] = rate_df['neg_count'] / (rate_df['pos_count'] + rate_df['neg_count'])
        return rate_df
        

In [26]:
#43% of tweets are retweets
food.loc[food['text'].str[:2] == 'RT'].shape[0] / food.shape[0]

0.4290655516815069

In [27]:
#Creating a dummy for whether the tweet is a retweet or not
retweets = []
for val in food['text']:
    if val[:2] == 'RT':
        retweets.append(1)
    else: 
        retweets.append(0)
food['is_a_retweet'] = retweets

In [28]:
#getting rid of tweets without a user 
food = food.drop(list(food.loc[(food['user_name'].isnull())].index), axis=0)

In [29]:
#dropping regional affiliates
associated_comps = []
names = []
for num, name in enumerate(food['user_name']):
    test = re.findall(r'\A(mcdon|dunki|redro|starbuc|shakesh|domino|crackerb)[a-z]+'
                      , name, flags=re.IGNORECASE)
    if test:
        if name not in ['Dunkin Fails', 'Dunkin Kitti', "McDonald's employee"]:
            associated_comps.append(num)
            names.append(name)
            
food = food.drop(associated_comps, axis=0)

## Beginning to look at user profiles by company

In [30]:
#customer profiles still skewed by large means- looking at medians & means
#creating a subset to examine the numeric characteristics of each company's customers
individual_users = food.drop_duplicates(subset='user_name')

customer_numeric_df = individual_users.groupby('name').agg({'favorite_count': ['mean', 'max'], 
                                      'number_of_people_they_follow': ['median', 'mean'],
                                      'number_of_user_tweets': ['median', 'mean'],
                                      'retweet_count': ['median', 'mean'],
                                      'user_followers_count': ['median', 'mean'],
                                       'user_is_verified': ['mean'],
                                      'sentiment_score': ['mean'], 
                                      'is_a_retweet': 'mean',
                                      'Company': 'count'})

customer_numeric_df.columns = [' '.join(col).strip() for col in customer_numeric_df.columns.values]

In [31]:
#Not enough observations for Potbelly (684), RedRobin (1463), dropping
food = food.drop(list(food.loc[(food['Company']== '@redrobinburgers')|
                   (food['Company']== '@Potbelly')].index), axis=0)
food = food.reset_index(drop=True)

In [32]:
#MISSING VALUES FOR CERTAIN COMPANIES AT CERTAIN HOURS- hourly is flawed
# negativity_by_comp = pd.DataFrame()
# for val in food.name.unique().tolist():
#     if val != 'McDonalds':
#         values = list(negativity_formatter(val, hourly_rate=True)['rate'].values)
#         negativity_by_comp[val] = values

In [33]:
food['day_date'] = food['time_tweeted'].dt.day
food['weekday'] = food['time_tweeted'].dt.weekday

## Beginning look at stock movements and sentiment

In [34]:
#specifying trading days with sufficient information range
stock_analysis = food.loc[(food['time_tweeted'] > pd.to_datetime('2018.03.11')) & (food['time_tweeted'] < pd.to_datetime('2018.03.31'))]

In [35]:
stock_analysis.shape

(66552, 21)

In [36]:
stocks = pd.read_csv('./twitter_stocks.csv')

In [37]:
stocks.drop(['High', 'Low', 'Adj Close'], axis=1, inplace=True)

In [38]:
stocks['change'] = stocks['Close'] - stocks['Open']

In [39]:
#creating datetime, and locating common trading dates
stocks['Date'] = pd.to_datetime(stocks.Date)
stocks['day'] = stocks['Date'].dt.day

In [40]:
stocks = stocks.loc[(stocks['day'] > 11) & (stocks['day'] < 31)]

In [41]:
#making sure values match before merging
stocks['Name'] = stocks.Name.str.replace('Shack Shack', 'Shake_Shack')
stocks['Name'] = stocks.Name.str.replace('Cracker Barrel', 'Cracker_Barrel')
stocks['Name'] = stocks.Name.str.replace('Dunkin Donuts', 'Dunkin_Donuts')

In [42]:
#grouping sentiments to merge
grouped_analysis = stock_analysis.groupby(['day_date', 'name'], as_index=False).agg(
                                                    {'sentiment_score':'mean',
                                                       'Company':'count'})

grouped_analysis.columns =  ['day_date', 'name', 'sentiment_score', 'num_observations']

In [43]:
merged_stock = pd.merge(grouped_analysis, stocks, left_on=['day_date', 'name'], right_on=['day', 'Name'])

In [44]:
#dropping redundant columns 
merged_stock.drop(['day', 'Name'], axis=1, inplace=True)

In [45]:
merged_stock = merged_stock.sort_values(['name', 'day_date'])

In [46]:
#correlations between sentiment_score and various other indicators (volume/change/market_cap)
merged_stock.corr()['sentiment_score']

day_date           -0.094365
sentiment_score     1.000000
num_observations   -0.013644
Open               -0.014959
Close              -0.012543
Volume              0.145016
Market Cap          0.089305
change              0.102586
Name: sentiment_score, dtype: float64

In [47]:
#creating change in sentiment day-to-day by company
merged_stock['diff_sent'] = merged_stock.groupby(['name'])['sentiment_score'].transform(lambda x: x.diff())

#correlation in sentiment/change in price across companies 
#this run for change in sentiment/price (with no shift) was insignificant
correlations = []
for comp in merged_stock.name.unique().tolist():
    x = merged_stock.loc[merged_stock['name'] == comp]
    correlations.append(x.corr().loc['sentiment_score', 'Volume'])
np.mean(correlations)

-0.0412179413740274

In [48]:
#same as above with different lagged sentiments
#sentiment lagged 1 day
merged_stock['diff_sent'] = merged_stock.groupby(['name'])['sentiment_score'].transform(lambda x: x.diff()).shift(-1)

#day to day correlation in change in sentiment/price across companies 
correlations = []
for comp in merged_stock.name.unique().tolist():
    x = merged_stock.copy()
    x = x.loc[merged_stock['name'] == comp]
    x.diff_sent = x.diff_sent.shift(-1)
    x = x[:-1]
    correlations.append(x.corr().loc['diff_sent', 'change'])

print('sentiment correlations lagged one day:', np.mean(correlations))

sentiment correlations lagged one day: -0.05092106518166626


In [49]:
#same as above with different lagged sentiments
#sentiment lagged 1 day
merged_stock['diff_sent'] = merged_stock.groupby(['name'])['sentiment_score'].transform(lambda x: x.diff())

#day to day correlation in change in sentiment/price across companies 
correlations = []

for comp in merged_stock.name.unique().tolist():
    x = merged_stock.copy()
    x = x.loc[merged_stock['name'] == comp]
    x.diff_sent = x.diff_sent.shift(1)
    x = x[1:]
    correlations.append(x.corr().loc['diff_sent', 'change'])

print('stock changes lagged one day correlation', np.mean(correlations))

stock changes lagged one day correlation 0.14710532159554887


In [50]:
#how many mentions of stock or market there are
stock = []
market = []
for ind, val in enumerate(stock_analysis.text_sentiment):
    x = re.search(r'(stock)', val, re.IGNORECASE)
    y = re.search(r'(market)',val, re.IGNORECASE)
    if x:
        stock.append(ind)
    if y:
        market.append(ind)
print(len(set(stock + market)))

184


In [51]:
merged_stock.columns

Index(['day_date', 'name', 'sentiment_score', 'num_observations', 'Date',
       'Open', 'Close', 'Volume', 'Market Cap', 'change', 'diff_sent'],
      dtype='object')

In [52]:
#permutation from direct correlation w/o lag, not change in sentiment
x = merged_stock['sentiment_score']
y = merged_stock['change']

correlations = np.empty(1000)

for i in range(1000):
    corrs = []
    for comp in merged_stock.name.unique().tolist():
        x = merged_stock.copy()
        x = x.loc[merged_stock['name'] == comp]
        perms = np.random.permutation(x['Volume'])
        corrs.append(stats.pearsonr(perms, x['sentiment_score'])[0])
    correlations[i] = np.mean(corrs)
    
print('p_value: ', np.sum(correlations > .0509) / len(correlations))

p_value:  0.27


In [53]:
##p-value from a permutation test for day lag 
correlations = np.empty(1000)

for i in range(1000):
    corrs = []
    for comp in merged_stock.name.unique().tolist():
        x = merged_stock.copy()
        x = x.loc[x['name'] == comp]
        x.diff_sent = x.diff_sent.shift(1)
        x = x[1:]
        corrs.append(x.corr().loc['diff_sent', 'change'])
    correlations[i] = np.mean(corrs)
    
print('p_value: ', np.sum(correlations > .149) / len(correlations))

p_value:  0.0


## Looking at Profiles of negative/positive sentiments

In [54]:
#set of english vocabulary
english_vocab = set(w.lower() for w in nltk.corpus.words.words())

In [55]:
def lang_composition(company):
    """Takes a company name and returns the total number of words present, 
    the number of unique words present, and the number of those words that are in the English
    language"""
    all_tweets, positive, negative = negativity_formatter(company)
    all_text = ' '
    pos_text = ' '
    neg_text = ' '
    for val in all_tweets.text:
        if val[:2] != 'RT':
            all_text = all_text + ' ' + val
    for val in positive.text:
        if val[:2] != 'RT':
            pos_text = pos_text + ' ' + val
    for val in negative.text:
        if val[:2] != 'RT':
            neg_text = neg_text + ' ' + val
    all_words = Text_Cleaner(all_text, tokens = True)
    pos_words = Text_Cleaner(pos_text, tokens = True)
    neg_words = Text_Cleaner(neg_text, tokens = True)
    info = [company]
    info.extend([len(all_words), len(pos_words), len(neg_words)])
    info.extend([len(set(all_words)), len(set(pos_words)), len(set(neg_words))])
    all_clean_words = [word for word in set(all_words) if word in english_vocab]
    pos_clean_words = [word for word in set(pos_words) if word in english_vocab]
    neg_clean_words = [word for word in set(neg_words) if word in english_vocab]
    info.extend([len(all_clean_words), len(pos_clean_words), len(neg_clean_words)])
    return info

In [56]:
#getting language composition for all companies
all_comps = []
for name in list(food.name.unique()):
    all_comps.append(lang_composition(name))
    
lang_comp_df = pd.DataFrame(all_comps)

In [57]:
#renaming all columns
lang_comp_df.columns = ['name', 'all_words','pos_all_words', 'neg_all_words', 'unique_words', 'pos_unique_words', 'neg_unique_words', 
                        'english_words', 'pos_english_words', 'neg_english_words']

In [58]:
#creating percentages for analysis
lang_comp_df['percent_unique'] = lang_comp_df['unique_words'] / lang_comp_df['all_words'] * 100
lang_comp_df['percent_english'] = lang_comp_df['english_words'] / lang_comp_df['unique_words'] * 100
lang_comp_df['pos_percent_unique'] = lang_comp_df['pos_unique_words'] / lang_comp_df['pos_all_words'] * 100
lang_comp_df['pos_percent_english'] = lang_comp_df['pos_english_words'] / lang_comp_df['pos_unique_words'] * 100
lang_comp_df['neg_percent_unique'] = lang_comp_df['neg_unique_words'] / lang_comp_df['neg_all_words'] * 100
lang_comp_df['neg_percent_english'] = lang_comp_df['neg_english_words'] / lang_comp_df['neg_unique_words'] * 100

In [59]:
#Merging sentiments 
lang_comp_df = pd.merge(lang_comp_df, food.groupby('name', as_index=False)['sentiment_score'].mean(), 
       on='name')

#Merging only negative sentiments 
food_neg = food.loc[food['sentiment_dummies'] == -1].groupby('name',as_index=False)['sentiment_score'].mean()
food_neg.columns = ['name', 'neg_sentiment']
lang_comp_df = pd.merge(lang_comp_df, food_neg, on='name')

#Merging only positive sentiments
food_pos = food.loc[food['sentiment_dummies'] == 1].groupby('name',as_index=False)['sentiment_score'].mean()
food_neg.columns = ['name', 'pos_sentiment']
lang_comp_df = pd.merge(lang_comp_df, food_pos, on='name')

In [60]:
#strongest correlations with overall sentiment- percent unique (.31), percent English (0.29)
lang_comp_df.corr()['sentiment_score_x']

all_words             -0.555627
pos_all_words         -0.425108
neg_all_words         -0.668694
unique_words          -0.489475
pos_unique_words      -0.383168
neg_unique_words      -0.607556
english_words         -0.442953
pos_english_words     -0.356219
neg_english_words     -0.548201
percent_unique         0.289452
percent_english        0.498901
pos_percent_unique     0.121678
pos_percent_english    0.354084
neg_percent_unique     0.548569
neg_percent_english    0.714317
sentiment_score_x      1.000000
neg_sentiment          0.784287
sentiment_score_y      0.635131
Name: sentiment_score_x, dtype: float64

## Looking at company tweets

In [61]:
company_tweets = pd.read_csv('./company_tweets.csv', index_col=0)

In [62]:
company_tweets.shape

(19760, 11)

In [63]:
company_tweets['time_tweeted'] = pd.to_datetime(company_tweets['time_tweeted'])

In [64]:
#there are no duplicates
company_tweets.unique_code.drop_duplicates(inplace=True)

In [65]:
company_tweets.shape

(19760, 11)

In [66]:
#dropping potbelly & redrobin
company_tweets.drop(list(company_tweets.loc[(company_tweets['name'] == '@redrobinburgers')
                  |(company_tweets['name'] == '@Potbelly')].index), axis=0,
                   inplace=True)

In [67]:
#getting normally spelled names
mapper = {'@DennysDiner': 'Dennys', '@ChipotleTweets': 'Chipotle',
         '@McDonalds': 'McDonalds', '@Wendys': 'Wendys', '@Starbucks':'Starbucks',
         '@dunkindonuts':'Dunkin_Donuts', '@dominos': 'Dominos', '@shakeshack': 'Shake_Shack',
         '@sonicdrivein': 'Sonic', '@wingstop': 'Wingstop', '@CrackerBarrel': 'Cracker_Barrel', 
         '@redrobinburgers': 'Red_Robin', '@Potbelly': 'Potbelly'}

company_tweets['Company'] = company_tweets.name.map(mapper)

In [68]:
food.time_tweeted.max()

Timestamp('2018-04-19 16:48:37')

In [69]:
company_tweets.reset_index(drop=True, inplace=True)

In [70]:
#dropping all company tweets that occurred after consumer tweet collection stopped
company_tweets.drop(list(
    company_tweets.loc[company_tweets['time_tweeted'] > food.time_tweeted.max()].index), axis=0, inplace=True)

In [71]:
print(company_tweets.shape)
company_tweets = company_tweets.reset_index(drop=True)

(16720, 12)


In [72]:
#dropping values that are before consumer observations
company_tweets.drop(list(company_tweets.loc[company_tweets['time_tweeted'] < 
                                       food.time_tweeted.min()].index), inplace=True)

In [73]:
company_tweets = company_tweets.reset_index(drop=True)

In [74]:
#Getting the hashtags for each company
hashtags = {}
callnames = {}
for company in tqdm(list(company_tweets.Company.unique())): 
    df = company_tweets.loc[company_tweets['Company'] == company]
    hash_base = []
    call_base = []
    for val in df['text']:
        hashes = re.findall(r'#[A-Za-z0-9]+\b', val)
        calls = re.findall(r'@[A-Za-z0-9]+\b', val)
        hash_base.extend(hashes)
        call_base.extend(calls)
    hashtags[company] = hash_base
    callnames[company] = call_base

100%|██████████| 11/11 [00:00<00:00, 167.51it/s]


In [75]:
tokenizer = RegexpTokenizer(r'#[A-Za-z0-9]+\b')
#Only interested in customers independently using the hashtags
hash_check = food.copy()
hash_check = hash_check.loc[food['is_a_retweet'] == 0]
hash_check['text'] = hash_check.text.apply(lambda x: tokenizer.tokenize(x))

In [76]:
#Identifying customer use of company hashtags

company_counts = {}
for company in list(company_tweets.Company.unique()):
    hash_comp = hash_check.loc[hash_check['name'] == company]
    hashes = {}
    for val in hash_comp['text']:
        for word in val:
            if word in hashtags[company]:
                try: 
                    hashes[word] += 1
                except:
                    hashes[word] = 1
    company_counts[company] = hashes

In [77]:
#putting statistics on hashtags/use into new company profile dataframe
new_df = pd.DataFrame(company_counts).T.sum(axis=1).reset_index()
new_df.columns = ['name', 'customer_hash_use']
new_df['customer_unique_hashes'] = pd.DataFrame(company_counts).T.count(axis=1).values

In [78]:
#examining company specific twitter behavior
comp_hash_uses = []
comp_unique_hashes = []
comp_handle_uses = []
comp_unique_handle_uses = []
for company in new_df.name.tolist():
    comp_hash_uses.append(len(hashtags[company]))
    comp_unique_hashes.append(len(set(hashtags[company])))
    comp_handle_uses.append(len(callnames[company]))
    comp_unique_handle_uses.append(len(set(callnames[company])))

new_df['comp_hash_uses'] = comp_hash_uses
new_df['comp_unique_hashes'] = comp_unique_hashes
new_df['comp_handle_uses'] = comp_handle_uses
new_df['comp_unique_handle_uses'] = comp_unique_handle_uses

In [79]:
#grouping company tweet info
comp_grouped = company_tweets.groupby('Company', as_index=False).mean().drop(
                            ['is_a_retweet', 'is_quote_status', 'unique_code'], axis=1)

In [80]:
comp_merged = pd.merge(comp_grouped, new_df, left_on='Company', right_on='name')
comp_merged.drop('name', axis=1, inplace=True)

In [81]:
comp_merged['market_cap'] = merged_stock.groupby('name')['Market Cap'].mean().values

In [82]:
comp_merged['sentiment'] = food.groupby('name')['sentiment_score'].mean().values

In [83]:
#Percentage of tweets that are direct customer contact
customer_contact = []
for val in company_tweets['text']:
    if val[0] == '@':
        customer_contact.append(1)
    else:
        customer_contact.append(0)
company_tweets['is_contact'] = customer_contact

company_tweets.groupby('name')['is_contact'].mean()

name
@ChipotleTweets    0.999342
@CrackerBarrel     0.769536
@DennysDiner       0.218382
@McDonalds         0.997368
@Starbucks         0.974342
@Wendys            0.997368
@dominos           0.990789
@dunkindonuts      0.971711
@shakeshack        0.959211
@sonicdrivein      0.959211
@wingstop          0.984868
Name: is_contact, dtype: float64

In [84]:
company_tweets['is_contact'].mean()

0.9031160784790135

In [85]:
#adding a tweets per follower category to account for size
comp_merged['percent_tweets_foll'] = comp_merged['number_of_tweets_total'] / comp_merged['company_followers_count']

In [86]:
comp_merged.drop('company_followers_count', axis=1, inplace=True)
print('Sentiment Correlations')
comp_merged.corr()['sentiment'].sort_values()[:-1]

Sentiment Correlations


retweet_count             -0.240913
comp_unique_handle_uses   -0.237383
market_cap                -0.216188
number_of_tweets_total    -0.126253
followers_count           -0.096115
comp_hash_uses            -0.012132
customer_unique_hashes     0.372163
favorite_count             0.375593
comp_handle_uses           0.391963
customer_hash_use          0.392710
comp_unique_hashes         0.395412
percent_tweets_foll        0.441691
Name: sentiment, dtype: float64

## Day of week optimizer

In [87]:
#ensuring that every weekday has equal representation (Friday will be slightly short-changed (by ~3 hours)
day_week = food.loc[(food['time_tweeted'] > '2018-03-09')]

In [88]:
#converting day nums to names, grouping by company and day of week and aggregating. 
mapper = {0:'Monday', 1:'Tuesday', 2:'Wednesday', 3:'Thursday', 4:'Friday', 5:'Saturday', 6:'Sunday'}
day_week = day_week.groupby(['name', 'weekday'], as_index=False).agg({'sentiment_score':'mean', 'user_name': 'count',
                                                          'retweet_count':'mean', 'favorite_count':'mean'})

day_week['weekday'] = day_week['weekday'].map(mapper)

In [89]:
#including average sentiment for visualization
merge = day_week.groupby('weekday', as_index=False)['sentiment_score'].mean()
merge.columns = ['weekday', 'avg_sentiment']
day_week = pd.merge(day_week, merge, on='weekday')

In [90]:
print('number of tweets by day')
day_week.groupby('weekday')['user_name'].sum().sort_values()

number of tweets by day


weekday
Sunday       15335
Monday       18728
Saturday     19039
Friday       20065
Wednesday    20959
Tuesday      22902
Thursday     23041
Name: user_name, dtype: int64

In [91]:
print('average sentiment by day')
day_week.groupby('weekday')['sentiment_score'].mean().sort_values()

average sentiment by day


weekday
Saturday     0.081780
Sunday       0.097866
Friday       0.100423
Monday       0.116630
Thursday     0.125735
Wednesday    0.129243
Tuesday      0.139522
Name: sentiment_score, dtype: float64

## Beginning profile analysis

In [92]:
profiles = food.dropna(subset=['user_profile_text']).reset_index(drop=True)

In [93]:
profiles.shape

(119113, 21)

In [94]:
#only interested in unique profiles
profiles = profiles.drop_duplicates(subset='user_name').reset_index(drop=True)

In [95]:
profiles.shape

(82276, 21)

In [96]:
#dropping columns I don't need for this
profiles.drop(['user_coordinates', 'unique_code', 'day_date', 'text_sentiment'], axis=1, inplace=True)

In [97]:
#adding an hour dummy
profiles['hour'] = profiles.time_tweeted.dt.hour

In [98]:
#getting rid of columns I don't need anymore
profiles = profiles.drop(['time_tweeted', 'Company'], axis=1)

In [99]:
#Maybe the number of hashtags a person uses is indicative of their personality/disposition towards the product
def num_hashes(text):
    """Counts the number of hashtags in the profile"""
    text = text.split()
    num_hashes = 0
    for word in text:
        if word[0] == '#':
            num_hashes = num_hashes + 1
    return num_hashes

profiles['number_of_hashes'] = profiles['user_profile_text'].apply(num_hashes)

In [100]:
#8% of users use a hashtag in their profile
np.sum(profiles.number_of_hashes > 1) / profiles.shape[0]

0.08159122952015169

In [101]:
#adding a dummy variable if the user lists a location, possibly indicator
profiles['lists_location'] = profiles.user_location.isnull().astype(int)

#unreliable for now to include
profiles = profiles.drop('user_location', axis=1)

In [102]:
def cleaning_additional(text):
    """standardizing contractions, removing urls, removing retweet indicators, removing handles"""
    text = re.sub(r'https[\S]+', ' ', text)
    text = re.sub(r'can\'t', 'can not', text, flags=re.IGNORECASE)
    text = re.sub(r'don\'t', 'do not', text, flags=re.IGNORECASE)
    text = re.sub(r'isn\'t', 'is not', text, flags=re.IGNORECASE)
    text = re.sub(r'aren\'t', 'are not', text, flags=re.IGNORECASE)
    text = re.sub(r'wasn\'t', 'was not', text, flags=re.IGNORECASE)
    text = re.sub(r'weren\'t', 'were not', text, flags=re.IGNORECASE)
    text = re.sub(r'haven\'t', 'have not', text, flags=re.IGNORECASE)
    text = re.sub(r'\b(rt|RT)', ' ', text)
    text = re.sub(r'@[\S]+', ' ', text)
    text = re.sub(r'[0-9]*[a-zA-Z]+[0-9]+[a-zA-Z]*[0-9]*[a-zA-Z]*', ' ', text)
    return text

profiles['cleaned'] = profiles.user_profile_text.apply(cleaning_additional)

In [103]:
def punctuation_cleaner(text):
    """Removes punctuation and places spaces"""
    text = re.sub(r'\\n', ' ', text)      
    text = re.sub(r'[!|?|.|,|(|)|||[|]|/|\\|-]', ' ', text)
    return text

profiles['cleaned'] = profiles.cleaned.apply(punctuation_cleaner)

In [104]:
def split_hashes(text):
    """Turns camel case hashtags into separate readable words"""
    tokenizer = RegexpTokenizer(r'\w+')
    words = tokenizer.tokenize(text)
    split2 = []
    for val in words:
        if val[0] == '#':
            words = ' '.join(re.findall('[A-Z][^A-Z]*', val))
            split2.append(words)
        else:
            split2.append(val)
    return ' '.join(split2)

profiles['without_hashes'] = profiles.cleaned.apply(split_hashes)

### T-test for mean difference between sentiment with/without retweets

In [105]:
full = [0.065029, 0.094908, 0.097293, 0.101714, 0.106599, 0.108249, 0.113954, 0.121663, 0.147069, 0.153147, 0.162750, 0.173199, 0.184626]
no_retweet = [0.089,0.132,0.072,0.0439,0.089,0.055887,0.124,0.089954,0.111304,0.134573,0.084565]

stats.ttest_ind(full, no_retweet)

Ttest_indResult(statistic=2.3715728048141727, pvalue=0.026886738879467794)

## Looking at possible occupational/regional tendencies with a noun phrase function. 

In [106]:
#words that aren't caught that don't add value to function below
drop_words = ['love','don t', 'twitter', 'don', 'tweets', 'opinions', 'your',
             'born', 'never']

In [107]:
def Profile_Noun_Finder(text):
    """Takes text, eliminates non-alpha characters, and returns all nouns"""
    text = re.sub(r'(https)[^\s]+', '', text)
    text = re.sub(r'[^a-zA-Z]+', ' ', text)
    tokenizer = RegexpTokenizer(r'\w+')
    words = tokenizer.tokenize(text)
    lowered = [word.lower() for word in words]
    words = [word for word in lowered if len(word) > 2]
    words = [word for word in words if word not in drop_words]
    new_text = ' '.join(words)
    wiki = TextBlob(new_text)
    return list(wiki.noun_phrases)

## Looking at data with/without verified users

In [108]:
profiles.columns

Index(['favorite_count', 'number_of_people_they_follow',
       'number_of_user_tweets', 'retweet_count', 'text',
       'user_followers_count', 'user_is_verified', 'user_name',
       'user_profile_text', 'name', 'sentiment_score', 'sentiment_dummies',
       'is_a_retweet', 'weekday', 'hour', 'number_of_hashes', 'lists_location',
       'cleaned', 'without_hashes'],
      dtype='object')

In [109]:
without = profiles.copy()
without = without.loc[without['user_is_verified'] == 0]

In [110]:
with_ver = profiles.copy()
with_ver = with_ver.loc[with_ver['user_is_verified'] == 1]

In [111]:
without.shape

(80638, 19)

In [112]:
with_ver.shape
852 / (39630 + 852)

0.02104639098858752

In [113]:
with_ver_x = with_ver.groupby('name').agg({'user_name':'count', 'sentiment_score':'mean'})
without_x = without.groupby('name').agg({'user_name':'count', 'sentiment_score':'mean'})
print('Verified:\n',with_ver_x['sentiment_score'].mean(),'\n', with_ver_x)
print('\nUnverified:\n',without_x['sentiment_score'].mean(), '\n', without_x)

Verified:
 0.13658429972302708 
                 user_name  sentiment_score
name                                      
Chipotle              175         0.161718
Cracker_Barrel         84         0.126873
Dennys                 56         0.132770
Dominos               162         0.117590
Dunkin_Donuts         297         0.118739
McDonalds             188         0.160339
Shake_Shack           220         0.140338
Sonic                 111         0.121065
Starbucks             196         0.163859
Wendys                102         0.138000
Wingstop               47         0.121135

Unverified:
 0.11665966878234232 
                 user_name  sentiment_score
name                                      
Chipotle             8708         0.157244
Cracker_Barrel       3323         0.147277
Dennys               9157         0.095777
Dominos              8559         0.062175
Dunkin_Donuts        8676         0.099223
McDonalds            9601         0.084571
Shake_Shack          4242   

In [114]:
with_ver_x['difference'] = with_ver_x['sentiment_score'] - without_x['sentiment_score']
with_ver_x

Unnamed: 0_level_0,user_name,sentiment_score,difference
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Chipotle,175,0.161718,0.004475
Cracker_Barrel,84,0.126873,-0.020403
Dennys,56,0.13277,0.036992
Dominos,162,0.11759,0.055415
Dunkin_Donuts,297,0.118739,0.019516
McDonalds,188,0.160339,0.075768
Shake_Shack,220,0.140338,-0.045877
Sonic,111,0.121065,0.018206
Starbucks,196,0.163859,0.041198
Wendys,102,0.138,0.025643


In [117]:
# food.to_csv('./complete_data_models.csv')