In [7]:
import cudf
from cuml import RandomForestClassifier as cuRF

from nltk import pos_tag
from nltk.corpus import wordnet
from nltk.tokenize import TweetTokenizer
from nltk.stem import WordNetLemmatizer 

import numpy as np
import pandas as pd
import re
from sklearn.base import TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.preprocessing import OneHotEncoder, MultiLabelBinarizer

from IPython.display import display, HTML

In [2]:
def extract_hashtags(text):
    return re.findall("(#\w+)", text)

def extract_link(text):
    return re.findall(r"https?://[\w./-]+", text)

def extract_handles(text):
    return re.findall("@\S+", text)

def remove_elements_from_text(row, text_col, remove_col_list):
    text = row[text_col]
    for cur_col in remove_col_list:
        text = " ".join([x for x in text.split() if x not in row[cur_col]])
    return text

In [3]:
def clean_location(row, location_col='location'):
    if row[location_col] is np.nan or not row[location_col].strip().replace("\'|#|@", ""):
        return "N/A"
    ret_val = re.sub("\W+", ' ', row[location_col].lower())
    ret_val = re.sub("[\s\d]+", ' ', ret_val.strip()).strip()
    if len(ret_val) == 0:
        return "N/A"
    else:
        return ret_val

In [4]:
# from nltk.tag import StanfordNERTagger, StanfordPOSTagger

- Function to take the sentence and output the lemmatized list
    - Input is text
    - Next tokenize and get pos_tag
    - Use the pos_tage with token as input to the lemmatizer
    

In [4]:
def lemmatize_sentence(text):
    tokens_pos_tag = pos_tag(tknzr.tokenize(text))
    tag_dict = {
        "J": wordnet.ADJ,
        "N": wordnet.NOUN,
        "V": wordnet.VERB,
        "R": wordnet.ADV,
    }
    tokens_wordnet = [
        (word, tag_dict.get(pos[0], wordnet.NOUN)) for word, pos in tokens_pos_tag
    ]
    lem_tokens = [wnl.lemmatize(word, pos) for word, pos in tokens_wordnet]
    return lem_tokens

In [5]:
class CudfConversion(TransformerMixin):
    def __init__(self):
#         self.cudf = cudf.DataFrame()
        pass
    
    def fit(self, X, y=None):
        return self
    
#     def fit_transform(self, X, y=None):
#         self.df = cudf.from_pandas(pd.DataFrame.sparse.from_spmatrix(data=X))
#         return df
    
    def transform(self, X, y=None):
        return pd.DataFrame(data=X.todense()).astype('float32')
#         self.cudf = cudf.from_pandas(pd.DataFrame.sparse.from_spmatrix(data=X)).astype('float32')
#         return self.cudf

In [214]:
postag = StanfordPOSTagger('english-bidirectional-distsim.tagger',
                           '/home/sjustice/My_code/stanford-tagger-4.0.0/models/english-bidirectional-distsim.tagger')

LookupError: 

===========================================================================
NLTK was unable to find the english-bidirectional-distsim.tagger file!
Use software specific configuration paramaters or set the STANFORD_MODELS environment variable.
===========================================================================

In [None]:
nert = StanfordNERTagger('/home/sjustice/My_code/stanford-tagger-4.0.0/models/')

In [8]:
from nltk import download
download('averaged_perceptron_tagger')
download('wordnet')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/sjustice/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/sjustice/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [6]:
wnl = WordNetLemmatizer()
tknzr = TweetTokenizer(reduce_len=True)

train_df = pd.read_csv("train.csv").assign(
    keyword=lambda x: x['keyword'].fillna("N/A"),
    hash_in_text=lambda x: x["text"].str.contains("#\w+"),
    hashtags=lambda x: x["text"].apply(extract_hashtags),
    link_in_text=lambda x: x["text"].str.contains("http"),
    links=lambda x: x["text"].apply(extract_link),
    handle_in_text=lambda x: x['text'].str.contains("@\S+"),
    handles=lambda x: x['text'].apply(extract_handles),
    reduced_text=lambda x: x.apply(remove_elements_from_text, text_col='text',
                                   remove_col_list=['hashtags', 'links', 'handles'], axis=1),
    location_orig=lambda x: x.apply(clean_location, axis=1),
    tokens=lambda x: x['text'].apply(tknzr.tokenize),
    lem_tokens=lambda x: x['text'].apply(lemmatize_sentence),
    lem_text=lambda x: x['lem_tokens'].apply(" ".join)
)

single_location_list = train_df.groupby("location_orig", as_index=False).agg({"id": "count"}).rename(
    columns={"id": "count"}
).query("count == 1")["location_orig"].tolist()

train_df = train_df.assign(location = train_df["location_orig"].where(
    ~train_df["location_orig"].isin(single_location_list), other="N/A"
))

- There are a lot of location categories that only appear once
    - 2818 locations only appear once
    - Some of them are in lowercase
    - Convert the location to lower and remove punctuation
    - Maybe remove those or ones with less than 5 occurances and replace them with None?

- Replace all locations that appear only once with N/A

- Perform a train/test split and then use the stemmer and the vectorizer on the train set
- Need to be able to align the features from the train set so that they are the same in the test set
    - Only include the ones in the train set - No new features in the test set
    
- Perform lematization on the text. 

In [134]:
# Use a column transformer instead of the FeatureUnion since it accomplishes the same thing without
# needing a custom class
# Will need a custom class for the cudf conversion though (I think??)
column_trans = ColumnTransformer([("ohe", OneHotEncoder(handle_unknown='ignore', dtype=np.int32), cat_feature_list),
                                  ("tfidf", TfidfVectorizer(dtype=np.float32), 'lem_text')])

In [8]:
X_train, X_test, y_train, y_test = train_test_split(
    train_df[
        [
            "keyword",
            "location",
            "text",
            "hash_in_text",
            "link_in_text",
            "handle_in_text",
#             "tokens",
#             "lem_tokens",
            "lem_text",
        ]
    ],
    train_df["target"],
    random_state=42,
    stratify=train_df["target"],
    test_size=0.2,
)

In [9]:
cat_feature_list =['keyword', 'location', 'hash_in_text', 'handle_in_text']

In [51]:
pipe = Pipeline(
    [
        (
            "text_transform",
            ColumnTransformer([("ohe", OneHotEncoder(handle_unknown='ignore'), cat_feature_list),
                                  ("tfidf", TfidfVectorizer(), 'lem_text')]),
        ),
#         ("cudf_convert", CudfConversion()),
        ("rf", RandomForestClassifier(random_state=42, n_jobs=1) )
    ]
)

- Things to check for in the text
    - @s - tweets directed at other users
    - #s - hashtags
    - retweets - urls
    - Name entity
    - Allcaps words??? - Remove them or keep them?
    - Remove the ats, hashtags, and urls from the text before putting it through a tfidf vectorizer
    - A lot of repeated location categories - Need to handle instances like 'Chicago, IL' and 'Chicago,IL'

In [48]:
param_grid = {
    "rf__n_estimators": range(3000, 10001, 500),
    "rf__max_depth": [None, 10, 20, 40],
    "rf__min_samples_split": [2, 4, 6],
    "rf__min_samples_leaf": [1, 2, 3],
    "rf__max_features": ["auto", "log2", 20, 40, 60, 100],
    "text_transform__tfidf__ngram_range": [(1,1), (1,2), (1,3), (2,3)],
    "text_transform__tfidf__max_df": [0.9, 0.95, 0.99],
    "text_transform__tfidf__min_df": [2, 5, 10]
}

In [52]:
rcv = RandomizedSearchCV(estimator=pipe, param_distributions=param_grid, n_iter=100,
                         scoring='f1', cv=3, random_state=42, verbose=2, n_jobs=15)

In [53]:
rcv.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=15)]: Using backend LokyBackend with 15 concurrent workers.
[Parallel(n_jobs=15)]: Done  11 tasks      | elapsed:   41.1s
[Parallel(n_jobs=15)]: Done 132 tasks      | elapsed:  5.8min
[Parallel(n_jobs=15)]: Done 300 out of 300 | elapsed: 11.1min finished


RandomizedSearchCV(cv=3,
                   estimator=Pipeline(steps=[('text_transform',
                                              ColumnTransformer(transformers=[('ohe',
                                                                               OneHotEncoder(handle_unknown='ignore'),
                                                                               ['keyword',
                                                                                'location',
                                                                                'hash_in_text',
                                                                                'handle_in_text']),
                                                                              ('tfidf',
                                                                               TfidfVectorizer(),
                                                                               'lem_text')])),
                                             (

In [54]:
rcv.best_params_

{'text_transform__tfidf__ngram_range': (1, 1),
 'text_transform__tfidf__min_df': 10,
 'text_transform__tfidf__max_df': 0.9,
 'rf__n_estimators': 9500,
 'rf__min_samples_split': 4,
 'rf__min_samples_leaf': 1,
 'rf__max_features': 'auto',
 'rf__max_depth': None}

In [55]:
rcv.best_score_

0.7003852673755585

In [10]:
from cuml import RandomForestClassifier as cuRF

In [10]:
cuda_pipe = Pipeline(
    [
        (
            "text_transform",
            ColumnTransformer([("ohe", OneHotEncoder(handle_unknown='ignore', dtype=np.int32), cat_feature_list),
                              ("tfidf", TfidfVectorizer(dtype=np.float32), 'lem_text')]),
        ),
        ("cudf_convert", CudfConversion()),
        ("curf", cuRF() )
    ]
)

In [11]:
cuda_param_grid = {
#     "curf__convert_dtype": [True],
    "curf__n_estimators": range(3000, 10001, 500),
    "curf__max_depth": [10, 20, 40],
#     "curf__min_samples_split": [2, 4, 6],
    "curf__min_rows_per_node": [1, 2, 3],
    "curf__max_features": ["auto", "log2", 20, 40, 60, 100],
    "text_transform__tfidf__ngram_range": [(1,1), (1,2), (1,3), (2,3)],
    "text_transform__tfidf__max_df": [0.9, 0.95, 0.99],
    "text_transform__tfidf__min_df": [2, 5, 10]
}

In [12]:
cuda_rcv = RandomizedSearchCV(estimator=cuda_pipe, param_distributions=cuda_param_grid, n_iter=100,
                              scoring='f1', cv=3, random_state=42, verbose=2, n_jobs=1)

In [None]:
cuda_rcv.fit(X_train, y_train, curf__convert_dtype=True)

Fitting 3 folds for each of 100 candidates, totalling 300 fits
[CV] text_transform__tfidf__ngram_range=(1, 3), text_transform__tfidf__min_df=2, text_transform__tfidf__max_df=0.9, curf__n_estimators=9000, curf__min_rows_per_node=2, curf__max_features=20, curf__max_depth=40 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


In [56]:
gcv.best_score_

0.7204523306750437

In [65]:
cuda_gcv = GridSearchCV(estimator=cuda_pipe, param_grid=cuda_param_grid, scoring='f1', cv=3, verbose=2, n_jobs=4)