In [93]:
from nltk.tokenize import TweetTokenizer
import numpy as np
import pandas as pd
import re
from sklearn.base import TransformerMixin
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import OneHotEncoder, MultiLabelBinarizer

from IPython.display import display, HTML

In [2]:
from nltk.stem import WordNetLemmatizer 
import nltk
from nltk.corpus import wordnet

In [3]:
def extract_hashtags(text):
    return re.findall("(#\w+)", text)

def extract_link(text):
    return re.findall(r"https?://[\w./-]+", text)

def extract_handles(text):
    return re.findall("@\S+", text)

def remove_elements_from_text(row, text_col, remove_col_list):
    text = row[text_col]
    for cur_col in remove_col_list:
        text = " ".join([x for x in text.split() if x not in row[cur_col]])
    return text

In [4]:
def clean_location(row, location_col='location'):
    if row[location_col] is np.nan or not row[location_col].strip().replace("\'|#|@", ""):
        return "N/A"
    ret_val = re.sub("\W+", ' ', row[location_col].lower())
    ret_val = re.sub("[\s\d]+", ' ', ret_val.strip()).strip()
    if len(ret_val) == 0:
        return "N/A"
    else:
        return ret_val

In [212]:
from nltk.tag import StanfordNERTagger, StanfordPOSTagger

In [8]:
wnl = WordNetLemmatizer()
tknzr = TweetTokenizer(reduce_len=True)

- Function to take the sentence and output the lemmatized list
    - Input is text
    - Next tokenize and get pos_tag
    - Use the pos_tage with token as input to the lemmatizer
    

In [6]:
def lemmatize_sentence(text):
    tokens_pos_tag = nltk.pos_tag(tknzr.tokenize(text))
    tag_dict = {
        "J": wordnet.ADJ,
        "N": wordnet.NOUN,
        "V": wordnet.VERB,
        "R": wordnet.ADV,
    }
    tokens_wordnet = [
        (word, tag_dict.get(pos[0], wordnet.NOUN)) for word, pos in tokens_pos_tag
    ]
    lem_tokens = [wnl.lemmatize(word, pos) for word, pos in tokens_wordnet]
    return lem_tokens

In [98]:
class CatFeatureTransformer(TransformerMixin):
    def __init__(self, col_list):
        self.cols = col_list
        self.ohe = OneHotEncoder(handle_unknown="ignore")

    def fit(self, X, y=None):
        return self.ohe.fit(X[self.cols])

    def fit_transform(self, X, y=None):
        return self.ohe.fit_transform(X[self.cols])

    def transform(self, X, y=None):
        return self.ohe.transform(X[self.cols])
    
    def get_feature_names(self):
        return self.ohe.get_feature_names()

In [129]:
class TfidfTransformer(TransformerMixin):
    def __init__(self, text_col, **kwargs):
        self.text_col = text_col
        self.tfidf = TfidfVectorizer(stop_words='english', **kwargs)
        
    def fit(self, X, y=None):
        return self.tfidf.fit(X[self.text_col])
    
    def fit_transform(self, X, y=None):
        return self.tfidf.fit_transform(X[self.text_col])
    
    def transform(self, X, y=None):
        return self.tfidf.transform(X[self.text_col])
    
    def get_feature_names(self):
        return self.tfidf.get_feature_names()
    
    def set_params(self, **params):
        return self.tfidf.set_params(**params)

In [214]:
postag = StanfordPOSTagger('english-bidirectional-distsim.tagger',
                           '/home/sjustice/My_code/stanford-tagger-4.0.0/models/english-bidirectional-distsim.tagger')

LookupError: 

===========================================================================
NLTK was unable to find the english-bidirectional-distsim.tagger file!
Use software specific configuration paramaters or set the STANFORD_MODELS environment variable.
===========================================================================

In [None]:
nert = StanfordNERTagger('/home/sjustice/My_code/stanford-tagger-4.0.0/models/')

In [30]:
train_df = pd.read_csv("train.csv").assign(
    keyword=lambda x: x['keyword'].fillna("N/A"),
    hash_in_text=lambda x: x["text"].str.contains("#\w+"),
    hashtags=lambda x: x["text"].apply(extract_hashtags),
    link_in_text=lambda x: x["text"].str.contains("http"),
    links=lambda x: x["text"].apply(extract_link),
    handle_in_text=lambda x: x['text'].str.contains("@\S+"),
    handles=lambda x: x['text'].apply(extract_handles),
    reduced_text=lambda x: x.apply(remove_elements_from_text, text_col='text',
                                   remove_col_list=['hashtags', 'links', 'handles'], axis=1),
    location_orig=lambda x: x.apply(clean_location, axis=1),
    tokens=lambda x: x['text'].apply(tknzr.tokenize),
    lem_tokens=lambda x: x['text'].apply(lemmatize_sentence),
    lem_text=lambda x: x['lem_tokens'].apply(" ".join)
)

single_location_list = train_df.groupby("location_orig", as_index=False).agg({"id": "count"}).rename(
    columns={"id": "count"}
).query("count == 1")["location_orig"].tolist()

train_df = train_df.assign(location = train_df["location_orig"].where(
    ~train_df["location_orig"].isin(single_location_list), other="N/A"
))

- There are a lot of location categories that only appear once
    - 2818 locations only appear once
    - Some of them are in lowercase
    - Convert the location to lower and remove punctuation
    - Maybe remove those or ones with less than 5 occurances and replace them with None?

- Replace all locations that appear only once with N/A

- Perform a train/test split and then use the stemmer and the vectorizer on the train set
- Need to be able to align the features from the train set so that they are the same in the test set
    - Only include the ones in the train set - No new features in the test set
    
- Perform lematization on the text. 

In [20]:
vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1,2), max_df=0.9, min_df=10)

In [31]:
X_train, X_test, y_train, y_test = train_test_split(
    train_df[
        [
            "keyword",
            "location",
            "text",
            "hash_in_text",
            "link_in_text",
            "handle_in_text",
            "tokens",
            "lem_tokens",
            "lem_text",
        ]
    ],
    train_df["target"],
    random_state=42,
    stratify=train_df["target"],
    test_size=0.2,
)

In [92]:
cat_feature_list =['keyword', 'location', 'hash_in_text', 'handle_in_text']
cft = CatFeatureTransformer(cat_feature_list)

In [85]:
cft.fit(X_train)

OneHotEncoder(handle_unknown='ignore')

In [88]:
tft = TfidfTransformer(text_col='lem_text', ngram_range=(1,2), max_df=0.9, min_df=10)

In [90]:
tft.fit_transform(X_train).todense()[:10]

matrix([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]])

In [101]:
union = FeatureUnion([('ohe', CatFeatureTransformer(cat_feature_list)),
                      ('tfidf', TfidfTransformer(text_col='lem_text'))])

In [103]:
union.fit(X_train)

FeatureUnion(transformer_list=[('ohe', OneHotEncoder(handle_unknown='ignore')),
                               ('tfidf',
                                TfidfVectorizer(stop_words='english'))])

In [106]:
union.transformer_list

[('ohe', OneHotEncoder(handle_unknown='ignore')),
 ('tfidf', TfidfVectorizer(stop_words='english'))]

In [107]:
pipe.named_steps

{'text_union': FeatureUnion(transformer_list=[('ohe',
                                 <__main__.CatFeatureTransformer object at 0x7fbfc6aa45d0>),
                                ('tfidf',
                                 <__main__.TfidfTransformer object at 0x7fbfc6aa4210>)]),
 'rf': RandomForestClassifier(n_jobs=2, random_state=42)}

In [130]:
pipe = Pipeline(
    [
        (
            "text_union",
            FeatureUnion(
                [
                    ("ohe", CatFeatureTransformer(cat_feature_list)),
                    ("tfidf", TfidfTransformer(text_col="lem_text")),
                ]
            ),
        ),
        ("rf", RandomForestClassifier(random_state=42, n_jobs=2) )
    ]
)

In [37]:
def transform_df(df, text_col, ohe_col_list, tfidf_vec, ohe):
    return pd.concat([pd.DataFrame.sparse.from_spmatrix(
        data=ohe.transform(df[ohe_col_list]), columns=ohe.get_feature_names()),
        pd.DataFrame.sparse.from_spmatrix(
        data=tfidf_vec.transform(df[text_col]), columns=tfidf_vec.get_feature_names()
    )], axis=1)

In [47]:
train_sparse_df = transform_df(df=X_train, text_col='lem_text',
                               ohe_col_list=['keyword', 'location', 'hash_in_text', 'handle_in_text'],
                               ohe=ohe, tfidf_vec=vectorizer)

In [48]:
test_sparse_df = transform_df(df=X_test, text_col='lem_text',
                               ohe_col_list=['keyword', 'location', 'hash_in_text', 'handle_in_text'],
                               ohe=ohe, tfidf_vec=vectorizer)

- Things to check for in the text
    - @s - tweets directed at other users
    - #s - hashtags
    - retweets - urls
    - Name entity
    - Allcaps words??? - Remove them or keep them?
    - Remove the ats, hashtags, and urls from the text before putting it through a tfidf vectorizer
    - A lot of repeated location categories - Need to handle instances like 'Chicago, IL' and 'Chicago,IL'

In [40]:
clf = RandomForestClassifier(random_state=42, n_jobs=2)

In [122]:
param_grid = {
    "rf__n_estimators": range(100, 10001, 500),
    "rf__max_depth": [None, 5, 10],
    "rf__min_samples_split": [2, 4, 6],
    "rf__min_samples_leaf": [1, 2, 3],
    "rf__max_features": ["auto", "log2", "5", "10"],
    "text_union__tfidf__ngram_range": [(1,1), (1,2), (1,3), (2,3)],
    "text_union__tfidf__max_df": [0.9, 0.95, 0.99],
    "text_union__tfidf__min_df": [2, 5, 10]
}

In [131]:
rcv = RandomizedSearchCV(estimator=pipe, param_distributions=param_grid, n_iter=30,
                         scoring='f1', cv=3, random_state=42, verbose=2, n_jobs=4)

In [132]:
rcv.fit(X_train, y_train)

Fitting 3 folds for each of 30 candidates, totalling 90 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:  2.0min
[Parallel(n_jobs=4)]: Done  90 out of  90 | elapsed:  5.5min finished


RandomizedSearchCV(cv=3,
                   estimator=Pipeline(steps=[('text_union',
                                              FeatureUnion(transformer_list=[('ohe',
                                                                              <__main__.CatFeatureTransformer object at 0x7fbfc6645a50>),
                                                                             ('tfidf',
                                                                              <__main__.TfidfTransformer object at 0x7fbfc6645a90>)])),
                                             ('rf',
                                              RandomForestClassifier(n_jobs=2,
                                                                     random_state=42))]),
                   n_iter=30, n_jobs=4,
                   param_distributions={'rf__max_depth': [None, 5, 10],
                                        'rf__max_features': ['auto', 'log2',
                                                           

In [133]:
rcv.best_params_

{'text_union__tfidf__ngram_range': (1, 2),
 'text_union__tfidf__min_df': 2,
 'text_union__tfidf__max_df': 0.9,
 'rf__n_estimators': 7100,
 'rf__min_samples_split': 6,
 'rf__min_samples_leaf': 3,
 'rf__max_features': 'auto',
 'rf__max_depth': None}

In [134]:
rcv.best_score_

0.7053640177844493

In [65]:
gcv = GridSearchCV(estimator=clf, param_grid=param_grid, scoring='f1', cv=3, verbose=2, n_jobs=4)

In [66]:
gcv.fit(train_sparse_df, y_train)

Fitting 3 folds for each of 2160 candidates, totalling 6480 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:  9.5min


KeyboardInterrupt: 

In [56]:
gcv.best_score_

0.7204523306750437