# Test preprocess

In [4]:
df = pd.read_csv("data/tokyo_2020_tweets.csv")

In [5]:

from collections import Counter
import logging 
import pandas as pd
import ast
import os

from tqdm import tqdm
from langdetect import detect
import re 

def remove_hashtags(tokens):
    tokens = filter(lambda x: "#" not in x, tokens)
    return list(tokens)

def remove_hastags_sign(tokens):
    tokens = map(lambda x: x.replace('#',""), tokens)
    return list(tokens)

def remove_url(tokens):
    tokens = filter(lambda x: "http" not in x, tokens)
    return list(tokens)

def remove_html(tokens):
    tokens = filter(lambda x: x[0]+x[-1] != '<>', tokens)
    return list(tokens)

def remove_mentions(tokens):
    tokens = filter(lambda x: "@" not in x, tokens)
    return list(tokens)

def detect_language(text):
  try: 
    return detect(text)
  except: 
    print("unreconnized character")

def remove_emoji(string):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F" # emoticons
                           u"\U0001F300-\U0001F5FF" # symbols & pictographs
                           u"\U0001F680-\U0001F6FF" # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF" # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', string)



# PREPROCESS

In [None]:
class DataPreprocessor():
    def __init__(self, data:pd.DataFrame, column: str):
        self.column = column
        self.data = data
        self.corpus = self.data[column].astype(str).array
    @property
    def preprocess(self,f_hastags = remove_hashtags, threshold = 50, url = True, html = True, hashtags=True, mentions = True):
        """_summary_

        Args:
            url (bool, optional): _description_. Defaults to True.
            html (bool, optional): _description_. Defaults to True.
            hashtags (bool, optional): _description_. Defaults to True.
            mentions (bool, optional): _description_. Defaults to True.

        Returns:
            _type_: _description_
        """
        tokenizer = TweetTokenizer()
        tokenized_sentences = []
        logging.info("tokenizing")
        for sentence in tqdm(self.corpus):
            tokens = tokenizer.tokenize(sentence)
            if url :
                tokens = remove_url(tokens)
            if html: 
                tokens = remove_html(tokens)
            if hashtags: 
                tokens = f_hastags(tokens)
            if mentions: 
                tokens = remove_mentions(tokens)
            tokens = list(map(lambda x: x.lower(), tokens))
            tokenized_sentences.append(tokens)
        logging.info("Phrasing")
        phrases = Phrases(tokenized_sentences, threshold=threshold)
        phraser = Phraser(phrases)
        clean_corpus = []
        for sentence in tokenized_sentences:
            clean_corpus.append(phraser[sentence])
        return [" ".join(i) for i in clean_corpus]
    
    def get_df(self, to_csv, language='en', detect_language_=True, remove_emojis=False):
        """_summary_

        Args:
            language (str, optional): _description_. Defaults to 'en'.
            detect_language_ (bool, optional): _description_. Defaults to True.

        Returns:
            _type_: _description_
        """
        
        logging.info("****PREPROCESSING COLUMN {} OF TWEETS DF".format(self.column))
        df = self.data.copy()
        if remove_emojis : 
            df[self.column] = df[self.column].apply(lambda x : str(remove_emoji(x)))

        df["cleaned_{}".format(self.column)] = self.preprocess
        if detect_language_:
            logging.info("language detection")
            tqdm.pandas()
            df["language"] = df["cleaned_{}".format(self.column)].progress_apply(lambda x : detect_language(x))
            df = df[df['language'] == language]
            shape = df.shape[0]
            df = df.drop(["language"],axis=1)
            logging.info("{} % of tweets in {}".format(df.shape[0]/shape*100, language))
        if to_csv: 
            df.to_csv(structure_dict["output_path"] + "preprocessed_df.csv", index=False)
        return df

In [18]:
column = 'label'
csv_path = "C:/Users/Stanislasd’Orsetti/NLP_project/data/labellized_hashtags_df.csv"
df = pd.read_csv(csv_path)


In [52]:
label_column = "label"

In [60]:
df["Badminton"].loc[:].apply(lambda x: int(x))

0        0
1        0
2        0
3        0
4        0
        ..
11737    0
11738    0
11739    0
11740    0
11741    0
Name: Badminton, Length: 11742, dtype: int64

In [62]:
import pandas as pd
from sklearn.model_selection import train_test_split



In [63]:
column = 'label'
csv_path = "C:/Users/Stanislasd’Orsetti/NLP_project/data/labellized_hashtags_df.csv"
df = pd.read_csv(csv_path)


df_dummies = df[label_column].apply(lambda x : ast.literal_eval(x)).str.join('|').str.get_dummies()
list_columns =df_dummies.columns.tolist()
df_expl = df.copy()
df= pd.concat([df_expl, df_dummies], axis=1 )

for elem in list_columns : 
    df[elem].loc[df[elem].isnull()] = df[elem].loc[df[elem].isnull()].apply(lambda x: 0)
    df[elem].loc[:]=df[elem].loc[:].apply(lambda x: int(x))
df = df.loc[:,~df.columns.duplicated()]
LABEL_COLUMNS = df_dummies.columns.tolist()[:]

train_df, val_df = train_test_split(df, test_size=0.05)

train_with = train_df[train_df[LABEL_COLUMNS].sum(axis=1) > 0]
train_without= train_df[train_df[LABEL_COLUMNS].sum(axis=1) == 0]

train_df = pd.concat(
    [train_with.sample(600),
    train_without]
    )


In [65]:
import sys
sys.path.insert(0,"C:/Users/Stanislasd’Orsetti/NLP_project/")

from hashtags.modelisation.databuilder import create_dataset
from config import structure_dict
from hashtags.modelisation.model import LABEL_COLUMNS, HashtagTweetTagger
from hashtags.modelisation.torch_dataset import HashTagTweetDataModule
from hashtags.modelisation.utils import *
import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from transformers import BertTokenizerFast as BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
import pandas as pd
import torch
from tqdm import tqdm



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


TypeError: cannot convert the series to <class 'int'>

In [None]:

#BUILD TRAINING DF
train_df, val_df = create_dataset(df,column)
train_df = train_df.copy()
val_df = val_df.copy()

tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_NAME)

#TORCHDATA
data_module = HashTagTweetDataModule(
  train_df,
  val_df,
  tokenizer,
  batch_size=BATCH_SIZE,
  max_token_len=MAX_TOKEN_COUNT
)

#MODEL
model = HashtagTweetTagger(
  n_classes=len(LABEL_COLUMNS),
  n_warmup_steps=warmup_steps,
  n_training_steps=total_training_steps
)

#TRAINING
logger = TensorBoardLogger("lightning_logs", name="hashtags_tweets")
checkpoint_callback = ModelCheckpoint(
  dirpath="checkpoints",
  filename="best-checkpoint",
  save_top_k=1,
  verbose=True,
  monitor="val_loss",
  mode="min"
)

early_stopping_callback = EarlyStopping(monitor='val_loss', patience=2)

trainer = pl.Trainer(
  accelerator="cpu",
  logger=logger,
  checkpoint_callback=checkpoint_callback,
  callbacks=[early_stopping_callback],
  max_epochs=N_EPOCHS,
  gpus=1,
  progress_bar_refresh_rate=30,
)

trainer.fit(model, data_module)
trainer.test()

torch.save(model.state_dict(), "hashtags/model.pt")

