In [None]:
# !pip install transformers

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re
from wordcloud import WordCloud 
from collections import Counter
from nltk.corpus import stopwords
from nltk import ngrams
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
import seaborn as sns
import matplotlib.pyplot as plt
import os
import string
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer ,CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression 
from sklearn.ensemble import RandomForestClassifier ,AdaBoostClassifier ,GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB ,MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense ,LSTM ,Dropout ,BatchNormalization,Bidirectional ,Embedding
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [None]:
files = {'train':'../input/nlp-getting-started/train.csv',
        'test':'../input/nlp-getting-started/test.csv',
         'sample':'../input/nlp-getting-started/sample_submission.csv'
        }

In [None]:
train = pd.read_csv(files['train'])
train.head()

In [None]:
print(train.isna().sum())

In [None]:
train.info()

In [None]:
train.isna().sum().sort_values(ascending=False)/train.shape[0]

In [None]:
print('NO. Unique Keyword : ',len(train.keyword.unique()))
train.keyword.value_counts(ascending=False)

In [None]:
print('NO. Unique Location : ',len(train.location.unique()))
train.location.value_counts(ascending=False)

In [None]:
train.duplicated().sum()

In [None]:
train.describe()

In [None]:
# Drop the `id` column. Re-assigning (instead of inplace=True) together with
# errors='ignore' keeps this cell idempotent: running it a second time, after
# `id` is already gone, no longer raises KeyError.
train = train.drop(columns='id', errors='ignore')

In [None]:
train.head()

In [None]:
train.text.values[:20]

In [None]:
train.target.value_counts()

In [None]:
train.isna().sum()

In [None]:
train['word_counts'] = train['text'].apply(lambda x : len(x.split()))
train.head()

In [None]:
# Lookup table: slang / abbreviation -> expansion, consumed by word_abbrev().
# Every key MUST be lower-case: word_abbrev() lowercases the token before the
# lookup, so an upper-case key can never match.
# NOTE(review): clean_tweet() strips non-ASCII characters before the
# abbreviation step, so the "€" entry looks unreachable from that path.
abbreviations = {
    "$" : " dollar ",
    "€" : " euro ",
    "4ao" : "for adults only",
    "a.m" : "before midday",
    "a3" : "anytime anywhere anyplace",
    "aamof" : "as a matter of fact",
    "acct" : "account",
    "adih" : "another day in hell",
    "afaic" : "as far as i am concerned",
    "afaict" : "as far as i can tell",
    "afaik" : "as far as i know",
    "afair" : "as far as i remember",
    "afk" : "away from keyboard",
    "app" : "application",
    "approx" : "approximately",
    "apps" : "applications",
    "asap" : "as soon as possible",
    "asl" : "age, sex, location",
    "atk" : "at the keyboard",
    "ave." : "avenue",
    "aymm" : "are you my mother",
    "ayor" : "at your own risk",
    "b&b" : "bed and breakfast",
    "b+b" : "bed and breakfast",
    "b.c" : "before christ",
    "b2b" : "business to business",
    "b2c" : "business to customer",
    "b4" : "before",
    "b4n" : "bye for now",
    "b@u" : "back at you",
    "bae" : "before anyone else",
    "bak" : "back at keyboard",
    "bbbg" : "bye bye be good",
    "bbc" : "british broadcasting corporation",
    "bbias" : "be back in a second",
    "bbl" : "be back later",
    "bbs" : "be back soon",
    "be4" : "before",
    "bfn" : "bye for now",
    "blvd" : "boulevard",
    "bout" : "about",
    "brb" : "be right back",
    "bros" : "brothers",
    "brt" : "be right there",
    "bsaaw" : "big smile and a wink",
    "btw" : "by the way",
    "bwl" : "bursting with laughter",
    "c/o" : "care of",
    "cet" : "central european time",
    "cf" : "compare",
    "cia" : "central intelligence agency",
    "csl" : "can not stop laughing",
    "cu" : "see you",
    "cul8r" : "see you later",
    "cv" : "curriculum vitae",
    "cwot" : "complete waste of time",
    "cya" : "see you",
    "cyt" : "see you tomorrow",
    "dae" : "does anyone else",
    "dbmib" : "do not bother me i am busy",
    "diy" : "do it yourself",
    "dm" : "direct message",
    "dwh" : "during work hours",
    "e123" : "easy as one two three",
    "eet" : "eastern european time",
    "eg" : "example",
    "embm" : "early morning business meeting",
    "encl" : "enclosed",
    "encl." : "enclosed",
    "etc" : "and so on",
    "faq" : "frequently asked questions",
    "fawc" : "for anyone who cares",
    "fb" : "facebook",
    "fc" : "fingers crossed",
    "fig" : "figure",
    "fimh" : "forever in my heart",
    "ft." : "feet",
    "ft" : "featuring",
    "ftl" : "for the loss",
    "ftw" : "for the win",
    "fwiw" : "for what it is worth",
    "fyi" : "for your information",
    "g9" : "genius",
    "gahoy" : "get a hold of yourself",
    "gal" : "get a life",
    "gcse" : "general certificate of secondary education",
    "gfn" : "gone for now",
    "gg" : "good game",
    "gl" : "good luck",
    "glhf" : "good luck have fun",
    "gmt" : "greenwich mean time",
    "gmta" : "great minds think alike",
    "gn" : "good night",
    "g.o.a.t" : "greatest of all time",
    "goat" : "greatest of all time",
    "goi" : "get over it",
    "gps" : "global positioning system",
    "gr8" : "great",
    "gratz" : "congratulations",
    "gyal" : "girl",
    "h&c" : "hot and cold",
    "hp" : "horsepower",
    "hr" : "hour",
    "hrh" : "his royal highness",
    "ht" : "height",
    "ibrb" : "i will be right back",
    "ic" : "i see",
    "icq" : "i seek you",
    "icymi" : "in case you missed it",
    "idc" : "i do not care",
    "idgadf" : "i do not give a damn fuck",
    "idgaf" : "i do not give a fuck",
    "idk" : "i do not know",
    "ie" : "that is",
    "i.e" : "that is",
    "ifyp" : "i feel your pain",
    "ig" : "instagram",  # was "IG": unreachable because lookups are lower-cased
    "iirc" : "if i remember correctly",
    "ilu" : "i love you",
    "ily" : "i love you",
    "imho" : "in my humble opinion",
    "imo" : "in my opinion",
    "imu" : "i miss you",
    "iow" : "in other words",
    "irl" : "in real life",
    "j4f" : "just for fun",
    "jic" : "just in case",
    "jk" : "just kidding",
    "jsyk" : "just so you know",
    "l8r" : "later",
    "lb" : "pound",
    "lbs" : "pounds",
    "ldr" : "long distance relationship",
    "lmao" : "laugh my ass off",
    "lmfao" : "laugh my fucking ass off",
    "lol" : "laughing out loud",
    "ltd" : "limited",
    "ltns" : "long time no see",
    "m8" : "mate",
    "mf" : "motherfucker",
    "mfs" : "motherfuckers",
    "mfw" : "my face when",
    "mofo" : "motherfucker",
    "mph" : "miles per hour",
    "mr" : "mister",
    "mrw" : "my reaction when",
    "ms" : "miss",
    "mte" : "my thoughts exactly",
    "nagi" : "not a good idea",
    "nbc" : "national broadcasting company",
    "nbd" : "not big deal",
    "nfs" : "not for sale",
    "ngl" : "not going to lie",
    "nhs" : "national health service",
    "nrn" : "no reply necessary",
    "nsfl" : "not safe for life",
    "nsfw" : "not safe for work",
    "nth" : "nice to have",
    "nvr" : "never",
    "nyc" : "new york city",
    "oc" : "original content",
    "og" : "original",
    "ohp" : "overhead projector",
    "oic" : "oh i see",
    "omdb" : "over my dead body",
    "omg" : "oh my god",
    "omw" : "on my way",
    "p.a" : "per annum",
    "p.m" : "after midday",
    "pm" : "prime minister",
    "poc" : "people of color",
    "pov" : "point of view",
    "pp" : "pages",
    "ppl" : "people",
    "prw" : "parents are watching",
    "ps" : "postscript",
    "pt" : "point",
    "ptb" : "please text back",
    "pto" : "please turn over",
    "qpsa" : "what happens", #"que pasa",
    "ratchet" : "rude",
    "rbtl" : "read between the lines",
    "rlrt" : "real life retweet",
    "rofl" : "rolling on the floor laughing",
    "roflol" : "rolling on the floor laughing out loud",
    "rotflmao" : "rolling on the floor laughing my ass off",
    "rt" : "retweet",
    "ruok" : "are you ok",
    "sfw" : "safe for work",
    "sk8" : "skate",
    "smh" : "shake my head",
    "sq" : "square",
    "srsly" : "seriously",
    "ssdd" : "same stuff different day",
    "tbh" : "to be honest",
    "tbs" : "tablespoonful",
    "tbsp" : "tablespoonful",
    "tfw" : "that feeling when",
    "thks" : "thank you",
    "tho" : "though",
    "thx" : "thank you",
    "tia" : "thanks in advance",
    "til" : "today i learned",
    "tl;dr" : "too long i did not read",
    "tldr" : "too long i did not read",
    "tmb" : "tweet me back",
    "tntl" : "trying not to laugh",
    "ttyl" : "talk to you later",
    "u" : "you",
    "u2" : "you too",
    "u4e" : "yours for ever",
    "utc" : "coordinated universal time",
    "w/" : "with",
    "w/o" : "without",
    "w8" : "wait",
    "wassup" : "what is up",
    "wb" : "welcome back",
    "wtf" : "what the fuck",
    "wtg" : "way to go",
    "wtpa" : "where the party at",
    "wuf" : "where are you from",
    "wuzup" : "what is up",
    "wywh" : "wish you were here",
    "yd" : "yard",
    "ygtr" : "you got that right",
    "ynk" : "you never know",
    "zzz" : "sleeping bored and tired"
}

In [None]:
# Remove all URLs, replace by URL
def remove_URL(text):
    """Return `text` with every http/https URL replaced by the token ``URL``.

    A URL here is ``http://`` or ``https://`` followed by a run of
    non-whitespace characters.
    """
    return re.sub(r'https?://\S+', r'URL', text)

# Remove HTML tags
def remove_HTML(text):
    """Return `text` with every ``<...>`` span deleted.

    The match is non-greedy, so each tag is removed on its own; a lone ``<``
    without a later ``>`` is left untouched.
    """
    return re.sub(r'<.*?>', r'', text)

# Remove non printable characters
def remove_not_ASCII(text):
    """Return `text` keeping only characters found in ``string.printable``."""
    allowed = string.printable
    return ''.join(filter(lambda ch: ch in allowed, text))

# Change an abbreviation by its true meaning
def word_abbrev(word):
    """Return the expansion of `word` from `abbreviations`, or `word` itself.

    The lookup is done on the lower-cased token; a token with no entry is
    returned unchanged (original casing preserved).
    """
    key = word.lower()
    if key in abbreviations:
        return abbreviations[key]
    return word

# Replace all abbreviations
def replace_abbrev(text):
    """Expand every whitespace-separated token of `text` via word_abbrev().

    Tokens are re-joined with single spaces and the result keeps one trailing
    space after the last token (an empty input yields an empty string).
    """
    return ''.join(word_abbrev(token) + " " for token in text.split())

# Remove @ and mention, replace by USER
def remove_mention(text):
    """Return `text` with each ``@`` + non-whitespace run replaced by ``USER``."""
    return re.sub(r'@\S+', r'USER', text)

# # Remove numbers, replace it by NUMBER
# def remove_number(text):
#     num = re.compile(r'[-+]?[.\d]*[\d]+[:,.\d]*')
#     return num.sub(r'NUMBER', text)

# # Remove numbers, replace it by NUMBER
def remove_number(text):
    """Return `text` with each run of digits replaced by ``' number '``.

    The replacement carries its own surrounding spaces, so digits glued to
    letters are split apart (``'101b'`` -> ``' number b'``).
    """
    return re.sub(r'\d+', r' number ', text)

# Remove all emojis, replace by EMOJI
# def remove_emoji(text):
#     emoji_pattern = re.compile("["
#                            u"\U0001F600-\U0001F64F"  # emoticons
#                            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
#                            u"\U0001F680-\U0001F6FF"  # transport & map symbols
#                            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
#                            u"\U00002702-\U000027B0"
#                            u"\U000024C2-\U0001F251"
#                            "]+", flags=re.UNICODE)
#     return emoji_pattern.sub(r'EMOJI', text)

def remove_emoji(string):
    """Replace every run of characters in the class below with ``' emoji '``.

    Note the parameter is named `string`, which shadows the imported `string`
    module inside this function (the module is not used here).

    NOTE(review): several ranges overlap or subsume one another. In
    particular ``\\U000024C2-\\U0001F251`` and ``\\U00010000-\\U0010ffff``
    are very wide: between them they cover everything from U+24C2 upward,
    which includes CJK and other non-Latin scripts, not only emoji.
    """
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # regional indicator symbols (flags)
                               u"\U00002500-\U00002BEF"  # box drawing .. misc symbols and arrows (not CJK)
                               u"\U00002702-\U000027B0"  # dingbats
                               u"\U00002702-\U000027B0"  # (duplicate of the previous range)
                               u"\U000024C2-\U0001F251"  # very wide: U+24C2 up to U+1F251
                               u"\U0001f926-\U0001f937"  # subset of the ranges above
                               u"\U00010000-\U0010ffff"  # every supplementary-plane code point
                               u"\u2640-\u2642"          # gender signs
                               u"\u2600-\u2B55"          # misc symbols .. heavy large circle
                               u"\u200d"                 # zero width joiner
                               u"\u23cf"                 # eject symbol
                               u"\u23e9"                 # fast-forward symbol
                               u"\u231a"                 # watch
                               u"\ufe0f"  # variation selector-16
                               u"\u3030"                 # wavy dash
                               "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r' emoji ', string)


# Replace some others smileys with SADFACE
def transcription_sad(text):
    """Return `text` with sad-face emoticons replaced by the token ``SADFACE``.

    An emoticon is: eyes (one of ``8 : = ;``), an optional nose (``'`` or
    ``-``) and a mouth (one of ``(``, ``\\`` or ``/``), e.g. ``:(`` or ``:-/``.
    """
    # The former unused `eyes`/`nose` locals were dropped: the non-raw
    # "['`\-]" literal contained an invalid escape sequence.
    smiley = re.compile(r'[8:=;][\'\-]?[(\\/]')
    return smiley.sub(r'SADFACE', text)

# Replace some smileys with SMILE
def transcription_smile(text):
    """Return `text` with happy emoticons replaced by the token ``SMILE``.

    An emoticon is: eyes (one of ``8 : = ;``), an optional nose (``'`` or
    ``-``) and a mouth (one of ``)``, ``d``, ``D`` or ``p``), e.g. ``:)``.
    """
    # The former unused `eyes`/`nose` locals were dropped: the non-raw
    # "['`\-]" literal contained an invalid escape sequence.
    smiley = re.compile(r'[8:=;][\'\-]?[)dDp]')
    return smiley.sub(r'SMILE', text)

# Replace <3 with HEART
def transcription_heart(text):
    """Return `text` with every literal ``<3`` replaced by the token ``HEART``."""
    return re.sub(r'<3', r'HEART', text)

# Factorize elongated words, add ELONG
def remove_elongated_words(text):
    """Collapse a trailing letter repeated 3+ times and append ``ELONG``.

    A word that ends in the same lower-case letter three or more times in a
    row keeps a single copy of that letter and gains a following ``ELONG``
    marker (``'sooo'`` -> ``'so ELONG'``).
    """
    return re.sub(r'\b(\S*?)([a-z])\2{2,}\b', r'\1\2 ELONG', text)

# Factorize repeated punctuation, add REPEAT
def remove_repeat_punct(text):
    """Collapse runs of 2+ of ``! ? .`` into one mark followed by ``REPEAT``.

    The run may mix the three marks; the mark kept is the last one of the run
    (``'what?!'`` -> ``'what! REPEAT'``).
    """
    return re.sub(r'([!?.]){2,}', r'\1 REPEAT', text)


# Remove all punctuations
def remove_all_punct(text):
    """Return `text` with every character of ``string.punctuation`` deleted."""
    # Mapping a code point to None tells str.translate to drop it.
    strip_map = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(strip_map)

# Pad punctuation marks with spaces (does not remove them)
def remove_punct(text):
    """Surround punctuation marks with spaces so they become separate tokens.

    Despite the name, nothing is removed: each listed punctuation character
    is replaced by itself padded with one space on each side. A run of two or
    more dots is treated as an ellipsis and becomes the single token
    ``' ... '``.

    The previous implementation padded every ``.`` first, so its later
    ``'...'`` / ``'..'`` replacements could never match and an ellipsis was
    emitted as three separate dots. Matching dot runs and single marks in one
    pass fixes that.
    """
    punctuations = '@#!?+&*[]-%.:/();$=><|{}^' + "'`"
    # The dot-run alternative comes first so it wins over the single-character
    # class at the same position.
    token = re.compile(r'\.{2,}|[' + re.escape(punctuations) + r']')

    def _pad(match):
        mark = match.group()
        return ' ... ' if len(mark) > 1 else f' {mark} '

    return token.sub(_pad, text)

# Remove all english stopwords
def remove_stopwords(text, stop_words=None):
    """Return `text` without the whitespace-separated tokens in `stop_words`.

    Parameters
    ----------
    text : str
        Input text; it is split on whitespace and re-joined with single spaces.
    stop_words : collection of str, optional
        Tokens to drop. When omitted, NLTK's English stop-word list is loaded
        once and cached on the function.

    The previous version tested ``word not in (stopwords)`` against the NLTK
    corpus reader object itself instead of a word list, which fails at call
    time.
    """
    if stop_words is None:
        stop_words = getattr(remove_stopwords, '_english', None)
        if stop_words is None:
            # Cache the set so applying this per tweet does not reload it.
            stop_words = frozenset(stopwords.words('english'))
            remove_stopwords._english = stop_words
    return ' '.join(word for word in text.split() if word not in stop_words)

In [None]:
def clean_tweet(text):
    """Normalise one raw tweet into the text stored in the `cl_text` column.

    Steps, in order: URLs -> ``URL``; HTML tags removed; emoji -> ``emoji``;
    remaining non-printable/non-ASCII characters removed; lower-casing;
    abbreviation expansion; mentions -> ``USER``; ``<3`` -> ``HEART``;
    digit runs -> ``number``; emoticons -> ``SADFACE`` / ``SMILE``;
    elongated words and repeated punctuation marked with ``ELONG`` /
    ``REPEAT``.

    Two ordering bugs are fixed relative to the earlier version:
    * remove_emoji() ran after remove_not_ASCII(), so every emoji had
      already been stripped and the ``emoji`` token was never produced;
    * transcription_heart() ran after remove_number(), so the ``3`` of
      ``<3`` had already become ``number`` and ``HEART`` never matched.
    """
    # Remove non text (emoji must be tagged before the ASCII filter eats them)
    text = remove_URL(text)
    text = remove_HTML(text)
    text = remove_emoji(text)
    text = remove_not_ASCII(text)

    # Lower text, replace abbreviations
    text = text.lower()
    text = replace_abbrev(text)
    text = remove_mention(text)

    # Hearts contain a digit, so they are transcribed before numbers are masked
    text = transcription_heart(text)
    text = remove_number(text)

    # Smileys stay after remove_number, as before, so a digit 8 followed by a
    # bracket (e.g. "2018)") is still not read as an emoticon.
    text = transcription_sad(text)
    text = transcription_smile(text)

    # Mark repeated punctuation / elongated words
    text = remove_elongated_words(text)
    text = remove_repeat_punct(text)

    #text = remove_all_punct(text)
    #text = remove_punct(text)
    #text = remove_stopwords(text)

    return text

In [None]:
train["cl_text"] = train["text"].apply(clean_tweet)

In [None]:
train['cl_text']

In [None]:
# Build the `clean_text` column: lower-case, tag mentions/hashtags/URLs,
# expand a few contractions, lemmatise, then drop stop words and punctuation.
#
# Fixes relative to the earlier version of this cell:
#   * hashtags were replaced by 'USER' (copy-paste of the mention rule); they
#     now become 'HASH', the token drop_hash() uses;
#   * contractions were expanded without a space ("it's" -> "itis",
#     "don't" -> "donot"); the expansion is now a separate word;
#   * the follow-up 'iam' -> 'i am' patch is gone: it is no longer needed and
#     it also rewrote unrelated words containing "iam" (e.g. "miami");
#   * regex literals are raw strings (no invalid '\w' escapes).
url = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))


def _normalize_token(token):
    """Apply the per-token regex substitutions to one whitespace-split token."""
    token = re.sub(r'^@(\w)+', 'USER', token)
    token = re.sub(r'^#(\w)+', 'HASH', token)
    token = re.sub(r"'s", ' is', token)
    token = re.sub(r"n't", ' not', token)
    token = re.sub(r"'m", ' am', token)
    return re.sub(url, 'URL', token)


def _clean_text(text):
    """Run the full cleaning sequence on one tweet and return the cleaned text."""
    text = ' '.join(_normalize_token(tw) for tw in text.lower().split())
    # Re-tokenise between stages, exactly as the original chain of .apply calls did.
    text = ' '.join(lemmatizer.lemmatize(word) for word in word_tokenize(text))
    text = ' '.join(word for word in word_tokenize(text) if word not in stop_words)
    text = ' '.join(word for word in word_tokenize(text) if word not in string.punctuation)
    return " ".join(word_tokenize(text))


train['clean_text'] = train['text'].apply(_clean_text)
train.head()

In [None]:
train['clean_word_counts'] = train['clean_text'].apply(lambda x : len(x.split()))
train.head()

In [None]:
np.max(train['word_counts']), np.min(train['word_counts'])

In [None]:
np.max(train['clean_word_counts']), np.min(train['clean_word_counts'])

In [None]:
# length of the sequence text
length =[len(i.split()) for i in train.text]
np.max(length)

In [None]:
all_word = ' '.join([text for text in train['clean_text']])
wordcloud = WordCloud(width=800, height=500, random_state=21, max_font_size=110).generate(all_word) 
plt.figure(figsize=(10, 7))
plt.imshow(wordcloud)
plt.axis('off')
plt.show()

In [None]:
all_word = ' '.join([text for text in train['text']])
wordcloud = WordCloud(width=800, height=500, random_state=21, max_font_size=110).generate(all_word) 
plt.figure(figsize=(10, 7))
plt.imshow(wordcloud)
plt.axis('off')
plt.show()

In [None]:
# Make A List Of All Words in DataFrame
def common_words_list(column):
    """Return every whitespace-separated token of `train[column]` as one list.

    Also prints the number of distinct tokens and the total token count.
    """
    words = [token for entry in train[column] for token in entry.split()]
    print('unique words : ',len(set(words)))
    print('total words :',len(words))
    return words

In [None]:
org_words = common_words_list('text')
print('clean : ')
clean_words = common_words_list('clean_text')

In [None]:
# Most Common 10 Words
Counter(org_words).most_common(10)

In [None]:
# Most Common 10 Words
Counter(clean_words).most_common(10)

In [None]:
def to_lower_case(text):
    """Lower-case every string in the iterable `text`; return a new list.

    Each string is split on whitespace and re-joined with single spaces, so
    runs of whitespace are collapsed as a side effect.
    """
    lowered = []
    for entry in text:
        lowered.append(' '.join(piece.lower() for piece in entry.split()))
    return lowered

def drop_retweet(text):
    """Replace leading ``@mention`` handles with ``USER`` in each tweet.

    `text` is an iterable of tweets. Every tweet is split on whitespace and a
    token that starts with ``@`` followed by word characters has that prefix
    replaced by ``USER`` (trailing punctuation is kept). Returns a list.

    The pattern is now a raw string; the previous ``'^@(\\w)+'`` literal relied
    on an invalid ``\\w`` string escape.
    """
    mention = re.compile(r'^@(\w)+')
    return [' '.join(mention.sub('USER', tw) for tw in tweet.split()) for tweet in text]

def drop_hash(text):
    """Replace hashtags with the token ``HASH`` in each tweet.

    `text` is an iterable of tweets; each is split on whitespace and every
    ``#`` followed by zero or more word characters is replaced by ``HASH``.
    Returns a list.

    The previous pattern was ``'"#[\\w]*"'``: the stray double quotes were part
    of the regex, so only hashtags literally wrapped in quotes ever matched.
    """
    hashtag = re.compile(r'#[\w]*')
    return [' '.join(hashtag.sub('HASH', tw) for tw in tweet.split()) for tweet in text]


def drop_url(text):
    """Replace URLs with the token ``URL`` in each tweet of `text`.

    Each tweet is split on whitespace, the URL pattern is substituted inside
    every token, and the tokens are re-joined with single spaces. Returns a
    list with one cleaned string per input tweet.
    """
    url = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
    cleaned = []
    for tweet in text:
        pieces = (re.sub(url, 'URL', piece) for piece in tweet.split())
        cleaned.append(' '.join(pieces))
    return cleaned

def lemtize_tweets(tweets):
    """Lemmatise every token of every tweet; return a list of strings.

    Each tweet is tokenised with `word_tokenize`, each token is passed through
    a `WordNetLemmatizer`, and the lemmas are joined with single spaces.
    """
    lemmatizer = WordNetLemmatizer()
    result = []
    for tweet in tweets:
        lemmas = (lemmatizer.lemmatize(token) for token in word_tokenize(tweet))
        result.append(' '.join(lemmas))
    return result

def drop_stop_words(tweets):
    """Remove NLTK English stop words from every tweet; return a list.

    Each tweet is tokenised with `word_tokenize`; tokens found in the stop
    word set are dropped and the rest are joined with single spaces.
    """
    stop_words = set(stopwords.words('english'))
    kept = []
    for tweet in tweets:
        kept.append(' '.join(token for token in word_tokenize(tweet) if token not in stop_words))
    return kept


def clean_spaces(tweets):
    """Re-tokenise each tweet with `word_tokenize` and join tokens by one space."""
    normalised = []
    for tweet in tweets:
        normalised.append(" ".join(word_tokenize(tweet)))
    return normalised


In [None]:
def ngrams(number,corpus):
    """Return ``(ngram, count)`` pairs for `corpus`, most frequent first.

    Only n-grams of exactly `number` words are counted (via CountVectorizer).
    Note: this definition shadows the `ngrams` name imported from nltk at the
    top of the notebook.
    """
    vectorizer = CountVectorizer(ngram_range=(number, number))
    totals = vectorizer.fit_transform(corpus).sum(axis=0)
    pairs = [(term, totals[0, column]) for term, column in vectorizer.vocabulary_.items()]
    pairs.sort(key=lambda pair: pair[1], reverse=True)
    return pairs


In [None]:
def get_clean_data(target):
    """Return the cleaned tweets of `train` whose `target` equals `target`.

    The raw `text` values are passed, in order, through lower-casing, mention
    tagging, URL tagging, lemmatisation, stop-word removal and whitespace
    normalisation. The result is a list of strings.
    """
    tweets = train.loc[train["target"] == target, "text"]
    cleaning_steps = (
        to_lower_case,
        drop_retweet,
        drop_url,
        lemtize_tweets,
        drop_stop_words,
        clean_spaces,
    )
    for step in cleaning_steps:
        tweets = step(tweets)
    return tweets

In [None]:
# N Gram with target 0
ngrams(3,get_clean_data(0))[:20]

In [None]:
# Ngram With target 1
ngrams(3,get_clean_data(1))[:20]

In [None]:
to_lower_pipe     = FunctionTransformer(to_lower_case)

drop_retweet_pipe    = FunctionTransformer(drop_retweet)

drop_links_pipe      = FunctionTransformer(drop_url)

lematize_pipe        = FunctionTransformer(lemtize_tweets)

clean_spaces_pipe    = FunctionTransformer(clean_spaces)

drop_stop_words_pipe = FunctionTransformer(drop_stop_words)

drop_hash_pipe = FunctionTransformer(drop_hash)

In [None]:
# Baseline: bag-of-words counts + logistic regression on the `cl_text` column.
model = LogisticRegression()
tf = CountVectorizer()
# NOTE: `sc` is created but unused - its pipeline step below is commented out.
sc = StandardScaler()

# The first step is labelled 'tfidf' although `tf` is a CountVectorizer.
pipeline = Pipeline([('tfidf',tf),
#                      ('sc',sc),
                     ('model',model)])
    
# Split Data: 80/20 hold-out with a fixed seed
x =train['cl_text'].copy()
y =train.target.copy()
x_train ,x_test ,y_train,y_test =train_test_split(x,y,test_size=0.2,random_state=42)

# Train & Test Pipe: fit on the training part, report metrics on the hold-out
pipeline.fit(x_train,y_train)
y_hat =pipeline.predict(x_test)
print(classification_report(y_test,y_hat))

In [None]:
def Train_and_evaluate_pipeline(pipe):
    """Fit a pipeline on raw tweets and print its hold-out classification report.

    Parameters
    ----------
    pipe : list of (name, estimator) tuples
        Steps handed to `sklearn.pipeline.Pipeline`.

    Returns
    -------
    The fitted `Pipeline`. It is trained on 80% of `train.text` /
    `train.target` (split with random_state=42) and scored on the other 20%.
    """
    features = train.text.copy()
    labels = train.target.copy()
    x_train, x_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.2, random_state=42
    )

    fitted = Pipeline(pipe)
    fitted.fit(x_train, y_train)
    print(classification_report(y_test, fitted.predict(x_test)))
    return fitted

In [None]:
# TF-IDF features + logistic regression, evaluated on the raw `text` column.
model = LogisticRegression()

# Steps run in list order; the 'hash' step (drop_hash) comes after the
# lemmatise / clean_spaces steps, i.e. it sees already re-tokenised text.
pipeline = [('lower',to_lower_pipe),
                     ('retweet',drop_retweet_pipe),
                     ('urls',drop_links_pipe),
                     ('lematize',lematize_pipe),
                     ('spacs',clean_spaces_pipe),
                     ('hash',drop_hash_pipe),
                     ('tf_idf',TfidfVectorizer()),
                     ('model',model)]
Train_and_evaluate_pipeline(pipeline)

In [None]:
# TF-IDF features + random forest.
# random_state is fixed so the printed report is reproducible across re-runs;
# without it the forest is re-seeded on every execution.
model = RandomForestClassifier(random_state=42)
pipeline = [('lower',to_lower_pipe),
                     ('retweet',drop_retweet_pipe),
                     ('urls',drop_links_pipe),
                     ('lematize',lematize_pipe),
                     ('spacs',clean_spaces_pipe),
                     ('hash',drop_hash_pipe),
                     ('tf_idf',TfidfVectorizer()),
                     ('model',model)]
Train_and_evaluate_pipeline(pipeline)

In [None]:
# TF-IDF features + decision tree.
# random_state is fixed so the printed report is reproducible across re-runs.
model = DecisionTreeClassifier(random_state=42)
pipeline = [('lower',to_lower_pipe),
                     ('retweet',drop_retweet_pipe),
                     ('urls',drop_links_pipe),
                     ('lematize',lematize_pipe),
                     ('spacs',clean_spaces_pipe),
                     ('tf_idf',TfidfVectorizer()),
                     ('model',model)]
Train_and_evaluate_pipeline(pipeline)

# Best Score with SVC 82%

In [None]:
# TF-IDF features + SVC. The fitted pipeline is kept in `pipeline_svc`
# (the commented-out submission cell further down predicts with it).
model = SVC()
pipeline = [('lower',to_lower_pipe),
                     ('retweet',drop_retweet_pipe),
                     ('urls',drop_links_pipe),
                     ('lematize',lematize_pipe),
                     ('spacs',clean_spaces_pipe),
#                      ('hash',drop_hash_pipe),

                     ('tf_idf',TfidfVectorizer()),
                     ('model',model)]
pipeline_svc = Train_and_evaluate_pipeline(pipeline)
# 82% (score noted by the author for this run)

In [None]:
# TF-IDF features + AdaBoost.
# random_state is fixed so the printed report is reproducible across re-runs.
model = AdaBoostClassifier(random_state=42)
pipeline = [('lower',to_lower_pipe),
                     ('retweet',drop_retweet_pipe),
                     ('urls',drop_links_pipe),
                     ('lematize',lematize_pipe),
                     ('spacs',clean_spaces_pipe),
                     ('tf_idf',TfidfVectorizer()),
                     ('model',model)]
Train_and_evaluate_pipeline(pipeline)

In [None]:
# TF-IDF features + gradient boosting.
# random_state is fixed so the printed report is reproducible across re-runs.
model = GradientBoostingClassifier(random_state=42)
pipeline = [('lower',to_lower_pipe),
                     ('retweet',drop_retweet_pipe),
                     ('urls',drop_links_pipe),
                     ('lematize',lematize_pipe),
                     ('spacs',clean_spaces_pipe),
                     ('tf_idf',TfidfVectorizer()),
                     ('model',model)]
Train_and_evaluate_pipeline(pipeline)

In [None]:
# TF-IDF features + multinomial naive Bayes, same cleaning steps as above.
model = MultinomialNB()
pipeline = [('lower',to_lower_pipe),
                     ('retweet',drop_retweet_pipe),
                     ('urls',drop_links_pipe),
                     ('lematize',lematize_pipe),
                     ('spacs',clean_spaces_pipe),
                     ('tf_idf',TfidfVectorizer()),
                     ('model',model)]
Train_and_evaluate_pipeline(pipeline)

In [None]:
# TF-IDF features + k-nearest neighbours (default settings).
model = KNeighborsClassifier()

pipeline = [('lower',to_lower_pipe),
                     ('retweet',drop_retweet_pipe),
                     ('urls',drop_links_pipe),
                     ('lematize',lematize_pipe),
                     ('spacs',clean_spaces_pipe),
                     ('tf_idf',TfidfVectorizer()),
                     ('model',model)]
Train_and_evaluate_pipeline(pipeline)

In [None]:
# Raw term counts + logistic regression.
model = LogisticRegression()

# NOTE: the vectoriser step is still labelled 'tf_idf' but it is a
# CountVectorizer here (plain counts, no IDF weighting).
pipeline = [('lower',to_lower_pipe),
                     ('retweet',drop_retweet_pipe),
                     ('urls',drop_links_pipe),
                     ('lematize',lematize_pipe),
                     ('spacs',clean_spaces_pipe),
                     ('tf_idf',CountVectorizer()),
                     ('model',model)]
Train_and_evaluate_pipeline(pipeline)

In [None]:
# Raw term counts + random forest.
# random_state is fixed so the printed report is reproducible across re-runs.
model = RandomForestClassifier(random_state=42)

# NOTE: the vectoriser step is labelled 'tf_idf' but it is a CountVectorizer.
pipeline = [('lower',to_lower_pipe),
                     ('retweet',drop_retweet_pipe),
                     ('urls',drop_links_pipe),
                     ('lematize',lematize_pipe),
                     ('spacs',clean_spaces_pipe),
                     ('tf_idf',CountVectorizer()),
                     ('model',model)]
Train_and_evaluate_pipeline(pipeline)

In [None]:
# Raw term counts + k-nearest neighbours (default settings).
model = KNeighborsClassifier()

# NOTE: the vectoriser step is labelled 'tf_idf' but it is a CountVectorizer.
pipeline = [('lower',to_lower_pipe),
                     ('retweet',drop_retweet_pipe),
                     ('urls',drop_links_pipe),
                     ('lematize',lematize_pipe),
                     ('spacs',clean_spaces_pipe),
                     ('tf_idf',CountVectorizer()),
                     ('model',model)]
Train_and_evaluate_pipeline(pipeline)

In [None]:
# Raw term counts + AdaBoost.
# random_state is fixed so the printed report is reproducible across re-runs.
model = AdaBoostClassifier(random_state=42)

# NOTE: the vectoriser step is labelled 'tf_idf' but it is a CountVectorizer.
pipeline = [('lower',to_lower_pipe),
                     ('retweet',drop_retweet_pipe),
                     ('urls',drop_links_pipe),
                     ('lematize',lematize_pipe),
                     ('spacs',clean_spaces_pipe),
                     ('tf_idf',CountVectorizer()),
                     ('model',model)]
Train_and_evaluate_pipeline(pipeline)

In [None]:
# Raw term counts + gradient boosting.
# random_state is fixed so the printed report is reproducible across re-runs.
model = GradientBoostingClassifier(random_state=42)

# NOTE: the vectoriser step is labelled 'tf_idf' but it is a CountVectorizer.
pipeline = [('lower',to_lower_pipe),
                     ('retweet',drop_retweet_pipe),
                     ('urls',drop_links_pipe),
                     ('lematize',lematize_pipe),
                     ('spacs',clean_spaces_pipe),
                     ('tf_idf',CountVectorizer()),
                     ('model',model)]
Train_and_evaluate_pipeline(pipeline)

In [None]:
# Raw term counts + SVC (default settings).
model = SVC()

# NOTE: the vectoriser step is labelled 'tf_idf' but it is a CountVectorizer.
pipeline = [('lower',to_lower_pipe),
                     ('retweet',drop_retweet_pipe),
                     ('urls',drop_links_pipe),
                     ('lematize',lematize_pipe),
                     ('spacs',clean_spaces_pipe),
                     ('tf_idf',CountVectorizer()),
                     ('model',model)]
Train_and_evaluate_pipeline(pipeline)

In [None]:
# Raw term counts + multinomial naive Bayes.
model = MultinomialNB()

# NOTE: the vectoriser step is labelled 'tf_idf' but it is a CountVectorizer.
pipeline = [('lower',to_lower_pipe),
                     ('retweet',drop_retweet_pipe),
                     ('urls',drop_links_pipe),
                     ('lematize',lematize_pipe),
                     ('spacs',clean_spaces_pipe),
                     ('tf_idf',CountVectorizer()),
                     ('model',model)]
Train_and_evaluate_pipeline(pipeline)

In [None]:
# Raw term counts + decision tree.
# random_state is fixed so the printed report is reproducible across re-runs.
model = DecisionTreeClassifier(random_state=42)

# NOTE: the vectoriser step is labelled 'tf_idf' but it is a CountVectorizer.
pipeline = [('lower',to_lower_pipe),
                     ('retweet',drop_retweet_pipe),
                     ('urls',drop_links_pipe),
                     ('lematize',lematize_pipe),
                     ('spacs',clean_spaces_pipe),
                     ('tf_idf',CountVectorizer()),
                     ('model',model)]
Train_and_evaluate_pipeline(pipeline)

# LSTM

In [None]:
# x = train.text.copy()
# x = to_lower_case(x)
# x = drop_retweet(x)
# x = drop_url(x)
# x = lemtize_tweets(x)
# x = clean_spaces(x) 
# y =train.target.copy()

# tokenize = Tokenizer()
# tokenize.fit_on_texts(x)
# x = tokenize.texts_to_sequences(x)
# print(len(tokenize.word_index)+1)
# vocab_length =len(tokenize.index_word)+1
# x = pad_sequences(x,maxlen=np.max(length))
# x_train ,x_test ,y_train,y_test =train_test_split(x,y,test_size=0.2,random_state=42)

In [None]:
# model =Sequential()
# # mode
# model.add(Embedding(input_length=vocab_length,output_dim=100,input_dim=vocab_length))
# model.add(Bidirectional(LSTM(100)))

# # model.add(Dropout(0.4))
# # model.add(Dense(4,activation='relu'))
# # model.add(Dense(32,activation='relu'))
# # model.add(Dense(32,activation='relu'))

# model.add(Dense(1,activation='sigmoid'))
# epochs = 1
# lr_schedule = keras.optimizers.schedules.ExponentialDecay(
#     initial_learning_rate=1e-2,
#     decay_steps=1000,
#     decay_rate=0.9)
# model.compile(optimizer=keras.optimizers.Adam(learning_rate=lr_schedule)
#               ,loss='binary_crossentropy',metrics=['accuracy'])


In [None]:
# history = model.fit(x_train,y_train,
# #                     batch_size=32,validation_split=.2,
#                     epochs=1,verbose=1)

In [None]:
# model.evaluate(x_test,y_test)

In [None]:
# from sklearn.metrics import accuracy_score
# y_hat = model.predict(x_test)
# print(classification_report(y_test,np.round(y_hat)))
# print(accuracy_score(y_test,np.round(y_hat)))

In [None]:
# test = pd.read_csv(files['test'])
# test.head()
# test.isna().sum()
# x = test['text']
# test_prediction = pipeline_svc.predict(x)
# sample = pd.read_csv(files['sample'],index_col='id')
# sample
# sample['target'] = test_prediction
# 
# sample.to_csv('sample_submission.csv')

# BERT PYTORCH

In [None]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer , AutoModel , BertModel , BertTokenizer
from transformers import AdamW ,get_linear_schedule_with_warmup

In [None]:
MODEL_NAME ='bert-base-cased'
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

In [None]:
# Encode a toy string to inspect which fields the tokenizer returns.
# `padding='max_length'` replaces the deprecated `pad_to_max_length=True`, and
# `return_length` is a boolean flag: the earlier value 'max_length' only
# worked because a non-empty string is truthy.
sample = tokenizer.encode_plus(
    'hi',
    add_special_tokens=True,
    truncation=True,
    max_length=200,
    padding='max_length',
    return_length=True,
)
sample.keys()

In [None]:
class Data_Model(torch.utils.data.Dataset):
    """Map-style dataset that tokenises one text per item for BERT.

    Parameters
    ----------
    data : indexable of str
        The texts; ``data[idx]`` must return one string.
    target : indexable
        The labels, aligned with `data`; its length defines the dataset size.
    max_length : int
        Every sample is truncated / padded to exactly this many tokens.
    tokenizer : object exposing ``encode_plus`` (e.g. a BertTokenizer)
    """

    def __init__(self,data,target,max_length,tokenizer):
        self.data       = data
        self.target     = target
        self.max_length = max_length
        self.tokenizer  = tokenizer

    def __len__(self):
        return len(self.target)

    def __getitem__(self,idx):
        """Return a dict with the raw text, the three encoded tensors and the label."""
        sample = self.data[idx]
        sample_label = self.target[idx]

        encoding_sample = self.tokenizer.encode_plus(
          sample,
          add_special_tokens=True,
          max_length=self.max_length,
          return_token_type_ids=True,
          # `padding='max_length'` replaces the deprecated pad_to_max_length=True
          padding='max_length',
          return_attention_mask=True,
          truncation=True,
          return_tensors='pt',
        )
        return {
            'text':sample,
            # return_tensors='pt' yields a leading batch axis; flatten drops it
            'input_ids':encoding_sample['input_ids'].flatten(),
            'attention_mask':encoding_sample['attention_mask'].flatten(),
            'token_type_ids':encoding_sample['token_type_ids'].flatten(),
            # explicit integer dtype: class-index targets for CrossEntropyLoss
            'label': torch.tensor(sample_label, dtype=torch.long)
               }

In [None]:
# train.iloc[0].values

In [None]:
x = train.cl_text.values
y = train.target.values
x_train,x_test,y_train ,y_test= train_test_split(x,y,random_state=42,test_size=0.3)

# Create Data Class

In [None]:
max_len = 150
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
train_data = Data_Model(data=x_train,target=y_train,max_length=max_len,tokenizer=tokenizer)
test_data = Data_Model(data=x_test,target=y_test,max_length=max_len,tokenizer=tokenizer)

In [None]:
batch = 32
# Shuffle only the training split so each epoch sees the samples in a new
# order; the evaluation loader keeps a fixed order.
train_data_loader = torch.utils.data.DataLoader(train_data,batch_size=batch,shuffle=True)
test_data_loader = torch.utils.data.DataLoader(test_data,batch_size=batch)

In [None]:
class Bert_Classification_Model(nn.Module):
    """Pretrained BERT encoder followed by a single linear classification layer."""

    def __init__(self,n_classes):
        super().__init__()
        # Encoder weights come from the checkpoint named by MODEL_NAME.
        self.bert = BertModel.from_pretrained(MODEL_NAME)
        # The encoder's hidden size fixes the input width of the linear head.
        hidden_size = self.bert.config.hidden_size
        self.out = nn.Linear(hidden_size, n_classes)

    def forward(self,input_ids,attention_mask,token_type_ids):
        """Run the encoder and return the linear head's output for the batch."""
        # With return_dict=False the encoder returns a tuple; its second
        # element is what gets fed to the classification head.
        _first_output, head_input = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            return_dict=False,
        )
        return self.out(head_input)
# Prefer the GPU when one is visible; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Two output classes; nn.Module.to returns the module itself.
model = Bert_Classification_Model(2).to(device)

In [None]:
# Must equal the number of passes made by the training loop further down.
# The linear schedule decays the learning rate to zero after `total_steps`
# optimizer steps, so sizing it for fewer epochs than are actually run leaves
# the remaining epochs training with a learning rate of 0.
EPOCHS = 3

# Loss function
loss = nn.CrossEntropyLoss().to(device)

# Optimizer
opt  = AdamW(model.parameters(), 2e-5,correct_bias=False)

# Total number of optimizer steps over the whole run (batches per epoch * epochs);
# the scheduler is stepped once per batch.
total_steps = len(train_data_loader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
  opt,
  num_warmup_steps=0,
  num_training_steps=total_steps
)

In [None]:
def train_model(model,data_loader,loss,optimizer,device,scheduler,n_example):
    """Run one training pass over ``data_loader``.

    Parameters
    ----------
    model : module called with ``input_ids``, ``attention_mask`` and ``token_type_ids``.
    data_loader : iterable of batches; each batch is a dict holding those three
        tensors plus ``label``.
    loss : callable ``loss(outputs, targets)`` returning a scalar tensor.
    optimizer, scheduler : both are stepped once per batch.
    device : device the batch tensors are moved to.
    n_example : number of training examples, used only for the printed accuracy.

    Returns
    -------
    (losess, correct_y_hat) : list of per-batch loss values, and the number of
        correct predictions (a tensor once at least one batch was processed).
    """
    # Training mode must be switched on explicitly: BertModel.from_pretrained
    # returns the encoder in eval mode, which leaves dropout disabled.
    model.train()

    losess = []
    correct_y_hat = 0

    for d, data in enumerate(data_loader):
        input_ids = data['input_ids'].to(device)
        attention_mask = data['attention_mask'].to(device)
        token_type_ids = data['token_type_ids'].to(device)
        targets = data['label'].to(device)

        # forward pass
        y_h = model(input_ids = input_ids ,
                    attention_mask = attention_mask ,
                    token_type_ids = token_type_ids,
                   )
        # compute loss and record its value
        l = loss(y_h,targets)
        losess.append(l.item())

        # backward pass
        l.backward()

        # Clip BEFORE the optimizer step: clipping after step()/zero_grad()
        # acts on already-consumed (cleared) gradients and has no effect.
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

        # index of the highest-scoring class for each example
        _, preds = torch.max(y_h, dim=1)
        correct_y_hat += torch.sum(preds == targets)

        if d % 50 == 0:
            print('batch : ' ,d ,' END')
    print('loss mean : ',np.mean(losess),' acc :',correct_y_hat/n_example)

    # Leave the model in inference mode so that evaluation / prediction code
    # that does not set the mode itself runs with dropout disabled.
    model.eval()

    return losess , correct_y_hat

In [None]:
def eval_model(data_loader):
    """Print the accuracy of the global ``model`` over ``data_loader``.

    Each batch is a dict with ``input_ids``, ``attention_mask``,
    ``token_type_ids`` and ``label``; tensors are moved to the global
    ``device``. The accuracy is the number of correct predictions divided by
    the size of the loader's own dataset.
    """
    # Inference mode: disables dropout so the reported accuracy is deterministic.
    model.eval()

    prediction = 0
    with torch.no_grad():
        for sample  in data_loader:
            inputs = sample['input_ids'].to(device)
            attention =sample['attention_mask'].to(device)
            token_type = sample['token_type_ids'].to(device)
            label  = sample['label'].to(device)
            y_h = model(input_ids=inputs,attention_mask=attention,token_type_ids=token_type)

            # index of the highest-scoring class for each example
            _ , pred = torch.max(y_h,dim=1)

            # count the correct predictions
            prediction += torch.sum(pred == label)

    # Normalise by the dataset behind THIS loader instead of the global
    # `test_data`, so the function is correct for any loader passed in.
    print('test accuracy : ',prediction/len(data_loader.dataset))

In [None]:
EPOCHS = 3
for epoch_number in range(1, EPOCHS + 1):
    # Header for this epoch.
    print('Epoch ', epoch_number)
    print('----------------------------------')
    print()

    # One optimisation pass over the training split.
    losess, correct_y_hat = train_model(
        model,
        train_data_loader,
        loss,
        opt,
        device,
        scheduler,
        len(train_data),
    )
    print()

    # Accuracy on the held-out split.
    eval_model(test_data_loader)
    print('----------------------------------')
    print()

In [None]:
# Read the sample submission file.
sample = pd.read_csv(files['sample'])

# Read the test set and add the cleaned-text column used by the model.
test = pd.read_csv(files['test'])
test = test.assign(cl_text=test["text"].apply(clean_tweet))

In [None]:
class Data_Model_prediction:
    """Map-style dataset that tokenizes one unlabelled text per index.

    Parameters
    ----------
    data : texts to encode; a pandas Series (read positionally) or any
        sequence indexable by integer position.
    tokenizer : object exposing a Hugging Face style ``encode_plus`` method.
    max_length : every encoded sequence is padded / truncated to this many
        tokens (defaults to 150, the value that used to be hard-coded).
    """
    def __init__(self,data,tokenizer,max_length=150):
        self.data       = data
        self.tokenizer  = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of texts in ``data``."""
        return len(self.data)

    def __getitem__(self,idx):
        """Tokenize the ``idx``-th text.

        Returns a dict with the raw ``text`` and the flattened ``input_ids``,
        ``attention_mask`` and ``token_type_ids`` tensors.
        """
        # A DataLoader hands out positions 0..len-1. Plain `series[idx]` is a
        # label lookup and fails on a Series whose index is not 0..n-1, so use
        # positional access whenever it is available.
        if hasattr(self.data, 'iloc'):
            sample = self.data.iloc[idx]
        else:
            sample = self.data[idx]

        encoding_sample = self.tokenizer.encode_plus(
            sample,
            add_special_tokens=True,
            max_length=self.max_length,
            # `padding='max_length'` is the supported spelling of the
            # deprecated `pad_to_max_length=True`.
            padding='max_length',
            truncation=True,
            return_token_type_ids=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'text': sample,
            # Drop the leading axis of the (1, max_length) tensors so the
            # DataLoader can stack items into a batch.
            'input_ids': encoding_sample['input_ids'].flatten(),
            'attention_mask': encoding_sample['attention_mask'].flatten(),
            'token_type_ids': encoding_sample['token_type_ids'].flatten(),
        }

In [None]:
# Same checkpoint and sequence-length value as used for the training datasets.
tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
max_len = 150

# Unlabelled test texts, tokenized when indexed.
test_data_prediction = Data_Model_prediction(
    data=test.cl_text,
    tokenizer=tokenizer,
)

In [None]:
# Batch the unlabelled test set without shuffling, so predictions stay in row order.
test_data_prediction_loader = torch.utils.data.DataLoader(
    dataset=test_data_prediction,
    batch_size=batch,
)

In [None]:
# Inference only: switch off dropout and gradient tracking.
model.eval()

prediction_test = []
with torch.no_grad():
    for encoded_batch in test_data_prediction_loader:
        inputs = encoded_batch['input_ids'].to(device)
        attention = encoded_batch['attention_mask'].to(device)
        token_type = encoded_batch['token_type_ids'].to(device)
        y_h = model(input_ids=inputs,attention_mask=attention,token_type_ids=token_type)

        # index of the highest-scoring class for each example
        _ , pred = torch.max(y_h,dim=1)

        # Store plain Python ints on the host. Extending with the tensor itself
        # appends 0-dim (possibly CUDA) tensors, which np.array() cannot convert.
        prediction_test.extend(pred.cpu().tolist())

In [None]:
# Reload the sample submission file and display it.
sample = pd.read_csv(files['sample'])
sample

In [None]:
# First ten values of the sample file's target column next to the first ten model predictions.
sample.target[:10] , prediction_test[:10]

In [None]:
# View the collected predictions as a NumPy array.
np.array(prediction_test)

In [None]:
# Store the model predictions in the 'predict' column of the sample frame.
sample['predict'] =np.array(prediction_test)

In [None]:
# Number of training rows per target value.
train.target.value_counts()

In [None]:
# Display the sample frame, now including the 'predict' column.
sample

In [None]:
# class custom_model(nn.Module):
#     def __init__(self,):
#         super(custom_model,self).__init__()
#         self.bert = AutoModel.from_pretrained('bert-base-cased')
#         self.clf  = nn.Linear(768,2)
    
#     def forward(self,inputs_ids,attenstion_mask,token_type_ids):
#         outputs = self.bert(inputs_ids,attenstion_mask,token_type_ids)
        
#         pooler_output =outputs.pooler_output
        
#         logits = self.clf(pooler_output).squeeze(-1)
#         return logits

In [None]:
# model = custom_model()
# optimizer = torch.optim.AdamW(model.parameters())
# criterion = nn.CrossEntropyLoss()

In [None]:
# MAX_LEN = 100
# train_loss = 0
# for batch in train_data_loader:
#     optimizer.zero_grad()
#     text = batch[0]
#     label = batch[1].long()
#     encoded = tokenizer.batch_encode_plus(
#                   list(text),
#                   padding='max_length',
#                   max_length=MAX_LEN,
#                   truncation=True,
#                   return_tensors='pt',
#                   return_attention_mask=True,  
#                   return_token_type_ids=True)
#     input_ids=encoded['input_ids']
#     attention_mask=encoded['attention_mask']
#     token_type_ids=encoded['token_type_ids']
#     preds=model(input_ids, attention_mask, token_type_ids)
#     loss=criterion(preds, label)
#     loss.backward()
#     optimizer.step()
# #     train_loss += loss.item()
# train_loss/=len(train_data_loader)
# print(train_loss)

In [None]:
# x_train.iloc[0]