See https://github.com/rl3279/twitter_sentiment_analysis for organized code. 

Main code used in this notebook is distributed into two files:
- [`preprocessing.py`](https://github.com/rl3279/twitter_sentiment_analysis/blob/main/preprocessing.py), containing all scripts to clean and process raw tweet text strings. 
- [`feature_engineering.py`](https://github.com/rl3279/twitter_sentiment_analysis/blob/main/feature_engineering.py), containing scripts to extract features from the processed text.

For reference, the exact content of these files will be attached at the very end of this notebook.

In [1]:
import my_globals
from preprocessing import cleaning, preprocess_pipeline
import feature_engineering as fe
import pandas as pd
import numpy as np

This notebook will only showcase the cleaning, preprocessing, feature-engineering workflow on a subset of data, since the resulting dataset is enormous.

In [3]:
from utils import get_sub_dataset
# Path to the full dataset, assembled from project-level settings.
# NOTE(review): DATA_PATH is not referenced again in this cell.
DATA_PATH = "/".join([my_globals.DATA_DIR, my_globals.MAIN_DATA_NAME])

# Draw a 5000-row sample with a fixed seed.
# NOTE(review): presumably this writes "twitter_seed42.csv" into DATA_DIR,
# which is read back just below -- confirm against utils.get_sub_dataset.
get_sub_dataset(size = 5000, random_seed=42)
data = pd.read_csv(
    "/".join([my_globals.DATA_DIR, "twitter_seed42.csv"]),
    encoding = "latin1",
    header = 0, 
)


# Add one-hot weekday columns and a parsed datetime column; drop "flag".
data = cleaning(data)
# Cleaned/normalised version of each tweet, used for TF-IDF below.
data["processed_text"] = data["text"].apply(preprocess_pipeline)

# Hand-crafted features are computed from the raw tweet text.
data["exclaim_freq"] = data["text"].apply(fe.exclaim_freq)
data["mention_count"] = data["text"].apply(fe.mention_count)
data["cap_freq"] = data["text"].apply(fe.cap_freq)
# TF-IDF features come from the processed text and are appended column-wise.
# NOTE(review): pd.concat needs pandas objects and the output below shows
# "tok_"-prefixed columns, but the get_tfidf listing attached at the end of
# this notebook returns a plain ndarray -- confirm which version was run.
tfidf = fe.get_tfidf(data["processed_text"])
data = pd.concat([data, tfidf], axis = 1)
data



Unnamed: 0,target,ids,date,user,text,weekday_Mon,weekday_Tue,weekday_Wed,weekday_Thu,weekday_Fri,...,tok_ðº,tok_ðºðµ,tok_ð¼ð,tok_ð¼ðµð½ñ,tok_ð¼ð½ðµ,tok_ð½ð,tok_ð½ð³ñ,tok_ð½ñ,tok_ð¾ð,tok_ð¾ñ
0,4,2175288554,Mon Jun 15 00:34:44 PDT 2009,Fatimaa14,Goodnight tweeets its too late but i waas lis...,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4,1771159237,Mon May 11 23:23:24 PDT 2009,MissKRYP2NT,ps. @pumpkyn once again u r kickin my ass w th...,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0,1574060114,Tue Apr 21 04:05:03 PDT 2009,JBear2978,My heart is in Maine,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0,2064836548,Sun Jun 07 07:17:20 PDT 2009,raine_angel,5 days on antibiotics and my pain is still a 10,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,4,2065993936,Sun Jun 07 09:41:14 PDT 2009,heathermoire,"One year ago today, I walked into the MSPCA an...",0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,4,1793309239,Thu May 14 02:11:27 PDT 2009,twistingaether,@marcthom Good! Do you like my scarf?,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4996,0,1993677299,Mon Jun 01 10:45:32 PDT 2009,maaangelaaa,"Busy, busy, busy at work. The weekend is only ...",1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4997,0,1882235764,Fri May 22 06:29:52 PDT 2009,TeamGiles,http://yfrog.com/5au8dj oh no! Knocked over my...,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4998,4,2058500604,Sat Jun 06 15:03:58 PDT 2009,AmyJessicaB,@erica15brown ohh mann you've gotta love fall ...,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Pre-processing 
steps include:
- Link deletion
- Username mentions deletion
- Decontraction ("can't" -> "cannot")
- Lemmatization ("shoes" -> "shoe"; "mice" -> "mouse")
- Stopwords deletion (*e.g.* "a", "the", "to", "of", etc.)
- Punctuation deletion
- Digits deletion





[`preprocessing.py`](https://github.com/rl3279/twitter_sentiment_analysis/blob/main/preprocessing.py) content:

In [None]:
import my_globals
import pandas as pd
import numpy as np
from typing import List
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import nltk
import re
import contractions
from functools import reduce
from dateutil.parser import parse


def setup_nltk():
    """Download the NLTK resources used by this module if they are missing.

    ``nltk.data.find`` expects a resource path that includes its category
    directory (e.g. ``"corpora/wordnet"``). A bare package id such as
    ``"wordnet"`` is never found, which would trigger a download attempt on
    every call, so each package id is paired with its resource path here.
    """
    # package id passed to nltk.download -> path checked by nltk.data.find
    resources = {
        "punkt": "tokenizers/punkt",
        "wordnet": "corpora/wordnet",
        "stopwords": "corpora/stopwords",
    }
    for package, resource_path in resources.items():
        try:
            nltk.data.find(resource_path)
        except LookupError:
            nltk.download(package)


def tokenize(s: str, how="word_tokenize") -> List[str]:
    """Split a string into tokens.

    :param s: input string
    :type s: str
    :param how: tokenisation strategy; ``"word_tokenize"`` uses NLTK's
        ``word_tokenize``, ``"split"`` uses plain whitespace splitting
    :type how: str
    :raises ValueError: if ``how`` is not one of the supported strategies
    :rtype: List[str]
    """
    if how == "word_tokenize":
        return word_tokenize(s)
    if how == "split":
        return s.split()
    # Previously an unknown strategy fell through and returned None, which
    # only surfaced later as a confusing TypeError in the caller.
    raise ValueError(
        f"Unknown tokenization method {how!r}; "
        "expected 'word_tokenize' or 'split'."
    )


def del_username(s: str) -> str:
    """Strip ``@username`` mentions from a tweet.

    The text is split on whitespace, every token beginning with ``@`` is
    discarded, and the remaining tokens are re-joined with single spaces.

    :param s: input string
    :type s: str
    :rtype: str
    """
    kept = []
    for token in tokenize(s, how="split"):
        if token.startswith("@"):
            continue
        kept.append(token)
    return " ".join(kept)


def del_punc(s: str) -> str:
    """Remove punctuation characters from a string.

    A character is dropped when it is contained in ``my_globals.PUNCS``;
    all other characters are kept in their original order.

    :param s: input string
    :type s: str
    :rtype: str
    """
    banned = my_globals.PUNCS
    kept_chars = filter(lambda ch: ch not in banned, s)
    return "".join(kept_chars)


def del_link(s: str) -> str:
    """Remove link-like substrings from a string.

    The text is split on whitespace and the link pattern is blanked out of
    each token individually. Tokens are then re-joined with single spaces,
    so a token that was entirely a link leaves an empty slot behind.

    :param s: input string
    :type s: str
    :rtype: str
    """
    link_pattern = re.compile(
        r"(https?:\/\/)?([\da-z\.-]+)\.([a-z\.]{2,6})([\/\w\.-]*)"
    )
    stripped = (
        link_pattern.sub("", token) for token in tokenize(s, how="split")
    )
    return " ".join(stripped)


def decontract(s: str) -> str:
    """Expand contractions in text.
    e.g. I'm -> I am; she'd -> she would

    Each whitespace-separated token is passed through ``contractions.fix``
    and the results are re-joined with single spaces.

    :param s: input string
    :type s: str
    :rtype: str
    """
    expanded = [contractions.fix(word) for word in tokenize(s, how="split")]
    return " ".join(expanded)


def del_stopwords(s: str) -> str:
    """Delete English stopwords from a string.

    The string is tokenised with the default tokenizer and every token whose
    lower-cased form is an NLTK English stopword is dropped. The comparison
    is case-insensitive because the pipeline lower-cases text only after
    this step, so capitalised stopwords ("The", "I") would otherwise
    survive. The case of the tokens that are kept is left untouched.

    :param s: input string
    :type s: str
    :rtype: str
    """
    stop_words = set(stopwords.words('english'))

    return " ".join(
        [t for t in tokenize(s) if t.lower() not in stop_words]
    )


def del_digits(s: str) -> str:
    """Drop tokens that consist solely of digits.

    The string is tokenised with the default tokenizer; tokens for which
    ``str.isdigit`` is true are removed and the rest are re-joined with
    single spaces. Digits embedded in other tokens are left alone.

    :param s: input string
    :type s: str
    :rtype: str
    """
    non_numeric = []
    for token in tokenize(s):
        if token.isdigit():
            continue
        non_numeric.append(token)
    return " ".join(non_numeric)


def lemmatize(s: str) -> str:
    """Replace every token of a string with its WordNet lemma.

    The string is tokenised with the default tokenizer, each token is run
    through ``WordNetLemmatizer.lemmatize``, and the lemmas are re-joined
    with single spaces.

    :param s: input string
    :type s: str
    :rtype: str
    """
    wordnet = WordNetLemmatizer()
    lemmas = map(wordnet.lemmatize, tokenize(s))
    return " ".join(lemmas)


def preprocess_pipeline(s: str, return_lower = True) -> str:
    """Apply every pre-processing step to a string, in a fixed order.

    Order: link removal, username removal, decontraction, lemmatisation,
    stopword removal, punctuation removal, digit removal.

    :param s: input string
    :type s: str
    :param return_lower: lower-case the final result when true
    :type return_lower: bool
    :rtype: str
    """
    steps = (
        del_link,
        del_username,
        decontract,
        lemmatize,
        del_stopwords,
        del_punc,
        del_digits,
    )
    # Feed the output of each step into the next one.
    text = s
    for step in steps:
        text = step(text)

    if return_lower:
        return text.lower()
    return text

def str_datetime(s: str):
    """Parse a datetime string into a weekday abbreviation and a timestamp.

    The input is parsed with ``dateutil.parser.parse`` and rendered as
    ``'%a %Y-%m-%d %H:%M:%S'``; the first three characters are returned as
    the weekday and everything after the separating space as the datetime.

    :param s: input string containing datetime information
    :type s: str
    :rtype: tuple[str, str]
    """
    moment = parse(s)
    rendered = moment.strftime('%a %Y-%m-%d %H:%M:%S')
    weekday = rendered[:3]
    timestamp = rendered[4:]
    return weekday, timestamp


def cleaning(df: pd.DataFrame):
    """Cleaning script of the data (or subset).

    Parses the ``date`` column into a weekday and a formatted datetime,
    one-hot encodes the weekday into seven ``weekday_*`` float columns,
    appends those columns plus ``datetime`` to the frame, and drops the
    ``flag`` column. The input frame itself is not modified.

    :param df: input dataframe; must contain ``date`` and ``flag`` columns.
    :type df: pd.DataFrame
    :raises KeyError: if the ``date`` or ``flag`` column is missing.
    :rtype: pd.DataFrame
    """
    weekday_columns = [
        "weekday_" + w
        for w in ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
    ]
    # Parse weekday and datetime. Reuse df's index so the column-wise concat
    # below lines rows up even when df is a subset without a 0..n-1 index.
    weekday_datetime = pd.DataFrame(
        list(df.loc[:, "date"].apply(str_datetime)),
        columns=["weekday", "datetime"],
        index=df.index,
    )
    # One-hot encode weekday. reindex guarantees all seven columns exist;
    # weekdays absent from the data become 0.0 rather than NaN.
    weekdaydummies = pd.get_dummies(
        weekday_datetime["weekday"],
        prefix="weekday",
        dtype=float,
    ).reindex(columns=weekday_columns, fill_value=0.0)
    # Concatenate weekday dummies and datetime to the other features.
    df = pd.concat(
        [df, weekdaydummies, weekday_datetime["datetime"]],
        axis=1,
    )
    # Drop the column with single unique value.
    df = df.drop("flag", axis=1)
    return df


# Feature engineering

proposed features include:
- "!" usage frequency
- "@" usage count 
- capitalized letter usage frequency
- TF-IDF (Term Frequency - Inverse Document Frequency) from processed text.

[`feature_engineering.py`](https://github.com/rl3279/twitter_sentiment_analysis/blob/main/feature_engineering.py) content:

In [None]:
import numpy as np
import pandas as pd
from preprocessing import preprocess_pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer


def exclaim_freq(s: str) -> float:
    """Frequency of exclamation points in a tweet.

    Whitespace is removed first, so the result is the share of
    non-whitespace characters that are ``"!"``. An empty or whitespace-only
    string yields ``0.0`` instead of raising ``ZeroDivisionError``.

    :param s: input str
    :type s: str
    :rtype: float
    """
    compact = "".join(s.split())
    if not compact:
        return 0.0
    return compact.count("!") / len(compact)


# Backward-compatible alias: the function was originally defined under this
# misspelled name, while callers use ``exclaim_freq``.
exlaim_freq = exclaim_freq


def mention_count(s: str) -> int:
    """Count the mentions in a tweet.

    A mention is any whitespace-separated token that begins with ``@``.

    :param s: input str
    :type s: str
    :rtype: int
    """
    mentions = [word for word in s.split() if word.startswith("@")]
    return len(mentions)


def cap_freq(s: str) -> float:
    """Frequency of capitalized letter usage in a tweet.

    The tweet is run through ``preprocess_pipeline`` with lower-casing
    disabled; with the default ``return_lower=True`` every character would
    be lower-case and the result would always be 0. The return value is the
    share of characters in the processed text that are upper-case, or
    ``0.0`` when the processed text is empty.

    :param s: input str
    :type s: str
    :rtype: float
    """
    processed = preprocess_pipeline(s, return_lower=False)
    if not processed:
        # Nothing survives pre-processing; avoid dividing by zero.
        return 0.0
    count = sum(1 for ch in processed if ch.isupper())
    return count / len(processed)


def get_tfidf(data: pd.Series) -> np.ndarray:
    """Encode a Series of text strings as a dense TF-IDF matrix.

    Token counts are produced by a freshly fitted ``CountVectorizer`` and
    then re-weighted by a freshly fitted ``TfidfTransformer``; the result
    is converted to a dense array before being returned.

    :param data: input data
    :type data: pd.Series
    :rtype: np.ndarray
    """
    term_counts = CountVectorizer().fit_transform(data)
    weighted = TfidfTransformer().fit_transform(term_counts)
    return weighted.toarray()
