In [1]:
import pandas as pd

import re
from emoji import UNICODE_EMOJI
from textblob import TextBlob
import altair as alt
import numpy as np
from collections import Counter
import string

import nltk
nltk.download('vader_lexicon')
nltk.download('brown')
nltk.download('punkt')
nltk.download('stopwords')

from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /home/jovyan/nltk_data...
[nltk_data] Downloading package brown to /home/jovyan/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.
[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


### The data cleaning/manipulation technique/functions

In [2]:
def extract_tags(text):
    return re.findall("#([a-zA-Z0-9_]{1,50})", text)

    # will return a list of tags

In [3]:
def extract_emoji(text):
    return [ch for ch in text if ch in UNICODE_EMOJI['en']]

    # will return a list of emojis

In [4]:
# remove url, mentions, tags, punctuation, numbers, emojis
# we need clean text to extract words & perform sentiment analysis

def clean_tweet(txt): 
    temp = re.sub("@[A-Za-z0-9_]+","", txt)
    temp1 = re.sub("#[A-Za-z0-9_]+","", temp)
    temp2 = re.sub(r"http\S+", "", temp1)
    
    result=''.join(i for i in temp2.lower() if (i.isalpha() or i==' '))
    return result

    # return the cleaned tweet in lower case

In [5]:
def word_list(tweet):
    
    lst = word_tokenize(tweet)
    lst1 = []
    stops = list(stopwords.words('english'))
    for w in lst:
        if w not in stops:
            lst1.append(w)
    
    return lst1

    # return a list of words (without stopwords)

In [6]:
def sentiment(tweet):
    blob = TextBlob(tweet)
    
    return blob.sentiment.polarity

    # return the sentiment score

In [7]:
def get_date(date):
    
    return date[:10]

In [8]:
def get_hour(date):
    
    return date[11:13]


In [9]:
def get_10min(date):
    
    return date[14]+'0'

    # if we want to do in-depth analysis on certain day, this might come in handy

In [10]:
def get_min(date):
    
    return date[14:16]

    # if we want to do in-depth analysis on certain day, this might come in handy 

In [11]:
# the below 5 functions add a sentiment score label to each tweet entry
# 5 different labels: firm positive, positive, neutral, negative, firm negative

def firm_pos(score):
    if score >= 0.7:
        return 1
    else: return 0

In [12]:
def pos(score):
    if (score >= 0.25) & (score < 0.7):
        return 1
    else: return 0

In [13]:
def neutral(score):
    if (score >= -0.25) & (score < 0.25):
        return 1
    else: return 0

In [14]:
def neg(score):
    if (score > -0.7) & (score < -0.25):
        return 1
    else: return 0

In [15]:
def firm_neg(score):
    if score <= -0.7:
        return 1
    else: return 0

### **Import data, then check duplicate and missing value.**

In [16]:
df = pd.read_csv('Project Data/Kershaw 2017 WS.csv')
df['id'].duplicated(keep='last').sum()

0

In [17]:
df.isnull().sum()

id      0
date    3
text    3
dtype: int64

### **If problem exists, remove duplicate and drop nan rows**

In [18]:
df.shape

(78149, 3)

In [19]:
df = df.dropna(how='any').reset_index()
df.drop(columns=['index'],inplace=True)
df.shape

(78146, 3)

### Apply the techniques on the data, after that we'll have the used words, tags, emojis, sentiment score & label, and specific date/hour/min data. 

In [20]:
df['tags']= df.apply(lambda row: extract_tags(row['text']), axis=1)
df['emojis']= df.apply(lambda row: extract_emoji(row['text']), axis=1)
df['clean_text']= df.apply(lambda row: clean_tweet(row['text']), axis=1)
df['words']= df.apply(lambda row: word_list(row['clean_text']), axis=1)
df['sentiment_score']= df.apply(lambda row: sentiment(row['clean_text']), axis=1)
df['day']= df.apply(lambda row: get_date(row['date']), axis=1)
df['hour']= df.apply(lambda row: get_hour(row['date']), axis=1)
df['10min']= df.apply(lambda row: get_10min(row['date']), axis=1)
df['min']= df.apply(lambda row: get_min(row['date']), axis=1)
df['POS']= df.apply(lambda row: firm_pos(row['sentiment_score']), axis=1)
df['pos']= df.apply(lambda row: pos(row['sentiment_score']), axis=1)
df['neu']= df.apply(lambda row: neutral(row['sentiment_score']), axis=1)
df['neg']= df.apply(lambda row: neg(row['sentiment_score']), axis=1)
df['NEG']= df.apply(lambda row: firm_neg(row['sentiment_score']), axis=1)

df.head()

Unnamed: 0,id,date,text,tags,emojis,clean_text,words,sentiment_score,day,hour,10min,min,POS,pos,neu,neg,NEG
0,922975991626063872,2017-10-24 23:59:59+00:00,@EmresBrylcreem @aj_joven I can't believe we'r...,[],[],i cant believe were facing kershaw in game o...,"[cant, believe, facing, kershaw, game, one, im...",0.2,2017-10-24,23,50,59,0,0,1,0,0
1,922975974605643777,2017-10-24 23:59:55+00:00,That was the best national anthem I have ever ...,[],[😄],that was the best national anthem i have ever ...,"[best, national, anthem, ever, heard, watch, c...",0.4,2017-10-24,23,50,59,0,1,0,0,0
2,922975971963232257,2017-10-24 23:59:54+00:00,"Been a Puig fan since day 1, love Kershaw &amp...",[WorldSeries],[],been a puig fan since day love kershaw amp ja...,"[puig, fan, since, day, love, kershaw, amp, ja...",0.5,2017-10-24,23,50,59,0,1,0,0,0
3,922975956645568512,2017-10-24 23:59:50+00:00,So ecstatic in finally seeing Clayton Kershaw ...,"[ThisTeam, WorldSeries]",[],so ecstatic in finally seeing clayton kershaw ...,"[ecstatic, finally, seeing, clayton, kershaw, ...",-0.2,2017-10-24,23,50,59,0,0,1,0,0
4,922975956020576256,2017-10-24 23:59:50+00:00,#MLB\n\nWorld Series - Game 1\n[1902] 1H Los A...,[MLB],[],world series game h los angeles dodgers d ...,"[world, series, game, h, los, angeles, dodgers...",-0.4,2017-10-24,23,50,59,0,0,0,1,0
