In [2]:
import numpy as np
import pandas as pd
import my_globals
from typing import List
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import nltk
import re
import contractions
from functools import reduce

In [3]:
from utils import get_sub_dataset

size = 1000,
random_seed = 5
get_sub_dataset(size = size, random_seed=random_seed)
data = pd.read_csv(my_globals.DATA_DIR + f"/twitter_seed{random_seed}.csv")
data

Unnamed: 0,target,ids,date,flag,user,text
0,4,1986304945,Sun May 31 18:38:39 PDT 2009,NO_QUERY,kjgriffin18,Omfggg rob patterison won again fuck yes!
1,0,2253630799,Sat Jun 20 08:29:31 PDT 2009,NO_QUERY,chloefletch23,Defs shouldnt have wore heels last night. Now ...
2,0,1979545392,Sun May 31 03:30:31 PDT 2009,NO_QUERY,groovychick3290,@NB82 lol nope im housebound all wk til friday...
3,0,1963101679,Fri May 29 12:12:19 PDT 2009,NO_QUERY,jusjuhi,my new dress looks sort of...horrible http:/...
4,0,2228651766,Thu Jun 18 14:42:47 PDT 2009,NO_QUERY,cizzln,or they just picked the#squarespace winner lik...
...,...,...,...,...,...,...
995,0,1832774127,Sun May 17 21:28:14 PDT 2009,NO_QUERY,deadpresident,"Market gained 15% , my portfolio gained only 12%"
996,0,2285120770,Mon Jun 22 14:44:00 PDT 2009,NO_QUERY,phrakonline,damn i feel terrible...like death warmed up i...
997,4,2063802433,Sun Jun 07 04:08:00 PDT 2009,NO_QUERY,ireckon,@KirstyWrites @ajaxive i washed my makeup off ...
998,0,2064078409,Sun Jun 07 05:08:55 PDT 2009,NO_QUERY,CrunchBytes,Beeb not showing in car footage on iplayer. R...


## Pre-processing 
- Punctuation deletion
- Stopwords deletion
- Digits deletion
- Link deletion
- Decontraction
- Lemmetization


In [4]:
import my_globals
from typing import List
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import nltk
import re
import contractions
from functools import reduce


def setup_nltk():
    """Downloads necessary packages within nltk"""
    packages = ["punkt", "wordnet", "stopwords"]
    for p in packages:
        try:
            nltk.data.find(p)
        except LookupError:
            nltk.download(p)


def tokenize(s: str, how="word_tokenize") -> List[str]:
    if how == "word_tokenize":
        return word_tokenize(s)
    elif how == "split":
        return s.split()


def del_username(s: str) -> str:
    """Delete @Username from a tweet str.

    :param s: input string
    :type s: str
    :rtype: str
    """

    return " ".join([t for t in tokenize(s, how="split") if not t.startswith("@")])


def del_punc(s: str) -> str:
    """Delete punctuations from str.

    :param s: input string
    :type s: str
    :rtype: str
    """
    punc = my_globals.PUNCS
    return "".join([w for w in s if w not in punc])


def del_link(s: str) -> str:
    """Delete links from str.

    :param s: input string
    :type s: str
    :rtype: str
    """
    r = r"(https?:\/\/)?([\da-z\.-]+)\.([a-z\.]{2,6})([\/\w\.-]*)"
    return " ".join([re.sub(r, "", t) for t in tokenize(s, how="split")])


def decontract(s: str) -> str:
    """Remove contractions in text.
    e.g. I'm -> I am; she'd -> she would

    :param s: input string
    :type s: str
    :rtype: str
    """
    tokens = []
    for t in tokenize(s, how="split"):
        tokens.append(contractions.fix(t))
    return " ".join(tokens)


def del_stopwords(s: str) -> str:
    """Delete stopwords and punctuation from a string.
    Note that the type-hinting indicates that this function ought
    to be run first in the pre-processing pipeline.

    :param s: input string
    :type s: str
    """
    stop_words = set(stopwords.words('english'))

    return " ".join([t for t in tokenize(s) if t not in stop_words])


def del_digits(s: str) -> str:
    """Delete digits from str.

    :param s: input string
    :type s: str
    :rtype: str
    """
    return " ".join([w for w in tokenize(s) if not w.isdigit()])


def lemmatize(s: str) -> str:
    """Lemmatize str.

    :param s: input string
    :type s: str
    :rtype: str
    """
    lemmatizer = WordNetLemmatizer()
    return " ".join([lemmatizer.lemmatize(t) for t in tokenize(s)])


def preprocess_pipeline(s: str) -> str:
    """Run string through all pre-processing functions.

    :param s: input string
    :type s: str
    :rtype: str
    """
    s = reduce(
        lambda value, function: function(value),
        (
            del_link,
            del_username,
            decontract,
            lemmatize,
            del_stopwords,
            del_punc,
            del_digits
        ),
        s,
    )

    return s.lower()


In [5]:
import pandas as pd
import numpy as np

import datetime
from dateutil.parser import parse


def str_datetime(s: str):
    """Parse and format a datetime str to weekday and datetime.MAXYEAR
    
    :param s: input string containing datetime information
    :type s: str
    :rtype: tuple[str, str]
    """
    ss = parse(s).strftime('%a %Y-%m-%d %H:%M:%S')
    return ss[:3], ss[4:]


def cleaning(df: pd.DataFrame):
    """Cleaning script of the data (or subset).
    
    :param df: input dataframe.
    :type df: pd.DataFrame
    :rtype: pd.DataFrame
    """
    # Parse weekday and datetime
    weekday_datetime = pd.DataFrame(
        list(df.loc[:, "date"].apply(str_datetime)),
        columns=["weekday", "datetime"]
    )
    # One-hot encode weekday
    weekdaydummies = pd.get_dummies(
        weekday_datetime['weekday'], 
        prefix='weekday', 
        dtype=float
    )
    weekdaydummies = pd.DataFrame(
        weekdaydummies, 
        columns=['weekday_'+w for w in [
            "Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"
        ]]
    )
    # Concatenate weekday dummies to other features
    weekdaydummies_datetime = pd.concat(
        [weekdaydummies, weekday_datetime['datetime']], 
        axis=1
    )
    df = pd.concat([df, weekdaydummies_datetime], axis=1)
    # Drop the column with single unique value.
    df.drop("flag", axis = 1, inplace=True)
    return df


In [6]:
data = pd.read_csv(my_globals.DATA_DIR + "/twitter_seed5.csv")
data = cleaning(data)
data["processed_text"] = data["text"].apply(preprocess_pipeline)
data



Unnamed: 0,target,ids,date,user,text,weekday_Mon,weekday_Tue,weekday_Wed,weekday_Thu,weekday_Fri,weekday_Sat,weekday_Sun,datetime,processed_text
0,4,1986304945,Sun May 31 18:38:39 PDT 2009,kjgriffin18,Omfggg rob patterison won again fuck yes!,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2009-05-31 18:38:39,omfggg rob patterison fuck yes
1,0,2253630799,Sat Jun 20 08:29:31 PDT 2009,chloefletch23,Defs shouldnt have wore heels last night. Now ...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,2009-06-20 08:29:31,defs wore heel last night now knee playing lik...
2,0,1979545392,Sun May 31 03:30:31 PDT 2009,groovychick3290,@NB82 lol nope im housebound all wk til friday...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2009-05-31 03:30:31,lol nope housebound wk til friday weather chan...
3,0,1963101679,Fri May 29 12:12:19 PDT 2009,jusjuhi,my new dress looks sort of...horrible http:/...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,2009-05-29 12:12:19,new dress look sort
4,0,2228651766,Thu Jun 18 14:42:47 PDT 2009,cizzln,or they just picked the#squarespace winner lik...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2009-06-18 14:42:47,picked squarespace winner like min ago
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0,1832774127,Sun May 17 21:28:14 PDT 2009,deadpresident,"Market gained 15% , my portfolio gained only 12%",0.0,0.0,0.0,0.0,0.0,0.0,1.0,2009-05-17 21:28:14,market gained portfolio gained
996,0,2285120770,Mon Jun 22 14:44:00 PDT 2009,phrakonline,damn i feel terrible...like death warmed up i...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,2009-06-22 14:44:00,damn feel death warmed hope horrible cold go a...
997,4,2063802433,Sun Jun 07 04:08:00 PDT 2009,ireckon,@KirstyWrites @ajaxive i washed my makeup off ...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2009-06-07 04:08:00,washed makeup look happened queen s birthday look
998,0,2064078409,Sun Jun 07 05:08:55 PDT 2009,CrunchBytes,Beeb not showing in car footage on iplayer. R...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2009-06-07 05:08:55,beeb showing car footage iplayer rb stuck behi...


In [13]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

vectorizer = CountVectorizer()
tfidf = TfidfTransformer()
X = vectorizer.fit_transform(data["processed_text"])
X = tfidf.fit_transform(X)
X.toarray().shape

(1000, 2857)