# Final Project

In [13]:
# Import statements
import os
import re
import nltk
import pandas as pd
import numpy as np
from afinn import Afinn

In [None]:
stopwords = nltk.corpus.stopwords.words("english")

# Clean tweets for sentiment analysis
def get_clean_words(words):
    def _isnum(w):
        try:
            int(w)
            return True
        except ValueError:
            return False
        
    # Set words to lowercase and remove them if they are stop words
    words = [w.lower() for w in re.findall('\w+', markup_text) if w.lower() not in stopwords]

    # Remove numbers
    words = [w for w in words if not _isnum(w)]
        
    # Remove hashtags
    words = [w for w in words if !w.startswith("#")]
    
    # Remove mentions
    words = [w for w in words if !w.startswith("@")]

    return words

In [None]:
def create_df(c):
    df = pd.DataFrame()
    divider = np.zeros(10)
    counter = 0
    for f in os.listdir("tweets/%s" % c):
        dft = pd.read_csv("tweets/%s/%s" % (c, f))
        divider[counter] = len(dft) - 1
        df = pd.concat([df, dft])
    return df, divider

In [16]:
categories = ["fast food", "airlines", "leagues", "colleges", "streaming", "news", "tech giants", "singers", "actors"]

fast_food, ff_divider = create_df(categories[0])
airlines, al_divider = create_df(categories[1])
leagues, lg_divider = create_df(categories[2])
colleges, cl_divider = create_df(categories[3])
streaming, st_divider = create_df(categories[4])
news, nw_divider = create_df(categories[5])
tech, tg_divider = create_df(categories[6])
singers, si_divider = create_df(categories[7])
actors, ac_divider = create_df(categories[8])

            

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


In [None]:
ff_tweets = list(fast_food['text'])
al_tweets = list(airlines['text'])
lg_tweets = list(leagues['text'])
cl_tweets = list(colleges['text'])
st_tweets = list(streaming['text'])
nw_tweets = list(news['text'])
tg_tweets = list(tech['text'])
si_tweets = list(singers['text'])
ac_tweets = list(actors['text'])

ff_clean = [get_clean_words(w) for w in ff_tweets]
al_clean = [get_clean_words(w) for w in al_tweets]
lg_clean = [get_clean_words(w) for w in lg_tweets]
cl_clean = [get_clean_words(w) for w in cl_tweets]
st_clean = [get_clean_words(w) for w in st_tweets]
nw_clean = [get_clean_words(w) for w in nw_tweets]
tg_clean = [get_clean_words(w) for w in tg_tweets]
si_clean = [get_clean_words(w) for w in si_tweets]
ac_clean = [get_clean_words(w) for w in ac_tweets]

In [None]:
afinn = Afinn()

def get_affinity_scores(tweets):
    scores = []
    for t in tweets:
        t_s = ' '.join(t)
        scores.append(afinn.score(t_s) / len(t))
    return scores

In [None]:
ff_affin = get_affinity_scores(ff_clean)
al_affin = get_affinity_scores(al_clean)
lg_affin = get_affinity_scores(lg_clean)
cl_affin = get_affinity_scores(cl_clean)
st_affin = get_affinity_scores(st_clean)
nw_affin = get_affinity_scores(nw_clean)
tg_affin = get_affinity_scores(tg_clean)
si_affin = get_affinity_scores(si_clean)
ac_affin = get_affinity_scores(ac_clean)

In [None]:
def affin_by_acct(tweets, divider):
    acct = []
    start = 0
    for x in range(10):
        acct.append(get_affinity_scores(tweets[start:divider[x]]))
        start = divider[x] + 1
    return acct

In [None]:
ff_acct = affin_by_acct(ff_clean, ff_divider)
al_acct = affin_by_acct(al_clean, al_divider)
lg_acct = affin_by_acct(lg_clean, lg_divider)
cl_acct = affin_by_acct(cl_clean, cl_divider)
st_acct = affin_by_acct(st_clean, st_divider)
nw_acct = affin_by_acct(nw_clean, nw_divider)
tg_acct = affin_by_acct(tg_clean, tg_divider)
si_acct = affin_by_acct(si_clean, si_divider)
ac_acct = affin_by_acct(ac_clean, ac_divider)