In [None]:
#importing libraries
import pandas as pd
import numpy as np
from warnings import simplefilter
simplefilter("ignore")

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import chardet #evaluate encoding of csv
! pip install keybert
from keybert import KeyBERT
import re
import string
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords        
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.stem import PorterStemmer
nltk.download("vader_lexicon") # load the Lexicon that quantifies polar sentiment (positive/negative)

from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
# from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split

In [None]:
# look at the first 100,000 bytes of the training file to guess the character encoding
with open("/kaggle/input/nlp-getting-started/train.csv", 'rb') as rawdata:
    result = chardet.detect(rawdata.read(100000))

# check what the character encoding might be
print(result)

In [None]:
df_train = pd.read_csv("/kaggle/input/nlp-getting-started/train.csv")
X_test = pd.read_csv("/kaggle/input/nlp-getting-started/test.csv")

# EDA

In [None]:
df_train.head()

In [None]:
df_train.describe(include="all").T

In [None]:
df_train.info()

In [None]:
#visualize the missing values
#plt.subplots returns (figure, axes) in that order
fig, ax = plt.subplots(figsize=(10,10))

#draw on the axes created above rather than on the implicit current axes
ax = sns.heatmap(df_train.isna(),yticklabels=False,cbar=False,cmap='BuGn_r', alpha = 0.9, ax=ax)

plt.xticks(rotation = 45, ha="right")

plt.tight_layout()

#note that low number of missings won't show in the chart

## ID

In [None]:
df_train["id"]
#the id does not carry any useful information, so we can drop it

## Keyword

In [None]:
df_train["keyword"].describe()

In [None]:
#has some nan values
df_train["keyword"].isna().value_counts(normalize=True)
# 0.8% (61) are missing - we will proceed imputing these values using KeyBERT

To avoid data leakage we first need to split the data into train and validation sets, and then build the list of unique keywords from the train set only.

Let's go forward and keep analyzing the data; afterwards we will build a function to transform the Keyword column. Let's have a look at the kind of keywords we are dealing with:

In [None]:
df_train[~df_train["keyword"].isna()]["keyword"].unique().tolist()[:30] #remove this slicing if you want to explore all the list

We can see that most of the spaces are wrongly encoded as %20. Also, there are words that are similar; we will take care of them when dealing with the text feature.

## Text

In [None]:
df_train["text"].values.tolist()[:30] #remove this slicing if you want to explore all the list

As we can see here, we have several problems:
1. There is a multitude of links. We don't need them, but we will create a new feature "has_link"
2. Some words have been encoded incorrectly, e.g. \x89Ûªt
3. Tabs and escape characters need to be removed
4. People tags are not useful. We will take them out and add a new feature "has_tag"
5. Hashtags will be removed. We will create a new feature "ashtags" that we will then merge with the keyword feature, creating "keyword+". We will also normalize the keywords using stemming
6. At the end we will create a new feature based on the sentiment score, using the compound value
7. Afterwards we will normalize the text by applying punctuation removal, stopword removal and stemming


## Location

In [None]:
df_train["location"]
#lots of values missing

In [None]:
df_train["location"].isna().value_counts(normalize=True)*100
#33%  missing locations

In [None]:
df_train[~df_train["location"].isna()]["location"].values.tolist()[:30] #remove this slicing if you want to explore all the list

More than 15% of the values are missing and this feature does not carry very useful information - we will drop it and create a new feature "has_location"

## Target

In [None]:
df_train["target"].describe()

In [None]:
df_train["target"].value_counts(normalize=True)*100
#42% are disaster

# Feature Transformation and Engineering

In [None]:
#splitting data
y = df_train["target"]
# X.drop("target", axis=1, inplace=True)
X_train, X_valid, y_train, y_valid = train_test_split(df_train.drop("target", axis=1), y, test_size=0.25, random_state=42)

In [None]:
def keyword_extractor(text):
    """
    Pick the best-matching keyword for a text using KeyBERT.

    Relies on two module-level names that must exist before the call:
    ``kw_model`` (the KeyBERT model) and ``kwd_list`` (the candidate
    keywords to choose from).

    Extraction settings:
        - keyphrase n-gram range (1, 1)
        - English stop words
        - Maximal Marginal Relevance (MMR) set to True
        - Diversity set to 0.2
        - Top n keywords/keyphrases set to 1

    Args:
        text (str): text to process

    Returns:
        str: the extracted keyword(s) joined without a separator;
            an empty string if no keyword is returned
    """
    extraction_options = dict(
        keyphrase_ngram_range=(1, 1),
        stop_words="english",
        candidates=kwd_list,
        use_mmr=True,
        diversity=0.2,
        top_n=1,
    )
    ranked = kw_model.extract_keywords(text, **extraction_options)

    # keep only the first element of each result entry
    # (presumably (keyword, score) pairs -- confirm against the KeyBERT docs)
    pieces = []
    for entry in ranked:
        pieces.append(str(entry[0]))

    return "".join(pieces)

In [None]:
def location_transformer(df):
    """
    Add a binary "has_location" column to ``df`` in place.

    The flag is 0 where "location" is missing and 1 otherwise.
    Nothing is returned.
    """
    location_present = ~df["location"].isna()
    df["has_location"] = np.where(location_present, 1, 0)
    

In [None]:
    #to avoid leaking we'll use X_train only to make the unique keywords list
    #dropna() removes the missing keywords up front: list.remove(np.nan) raises
    #ValueError when no keyword is missing and relies on the identity of the NaN object
    kwd_list = X_train["keyword"].dropna().unique().tolist()

    #removing %20 from the list and substituting with a space
    #("%20" is a literal substring, so a plain str.replace is enough)
    kwd_list = [kwd.replace("%20", " ") for kwd in kwd_list]

In [None]:
def kwd_transformer(df):
    """
    Clean and complete the "keyword" column of ``df`` in place.

    1. Removing %20 from keywords (replaced with a space)
    2. Imputing missing keywords with keyBERT (via ``keyword_extractor``)
    3. Changing "wild fires" to "wildfires"

    The KeyBERT model is stored in the module-level ``kw_model``. It is loaded
    only when at least one keyword is missing and reused on later calls.

    Args:
        df (pd.DataFrame): frame with "keyword" and "text" columns; modified in place

    Returns:
        None
    """
    global kw_model

    #removing space encoding ("%20" is a literal substring, not a regex)
    df["keyword"] = df["keyword"].str.replace("%20", " ", regex=False)
    print("removed space encoding")

    #imputing missing keywords
    #isna() catches every missing marker (np.nan, None, pd.NA); an `is np.nan`
    #identity test only matches the np.nan singleton itself
    missing = df["keyword"].isna()
    if missing.any():
        #loading the model is expensive: do it once and only when needed
        if "kw_model" not in globals():
            kw_model = KeyBERT()
        imputed = df.loc[missing, "text"].apply(keyword_extractor)
        #assign by position so the result does not depend on index alignment
        df.loc[missing, "keyword"] = imputed.to_numpy()
    print("Imputed missing Keywords")

    #making wildfires an unique word
    df.loc[df["keyword"] == "wild fires", "keyword"] = "wildfires"
    print("made 'wild fires'=='wildfires' ")
    


In [None]:
def polarity(row):
    """
    Compute the compound sentiment score of a text.

    Uses the module-level ``analyzer`` (the SentimentIntensityAnalyzer
    created in ``text_transformer``), which must exist before the call.

    Args:
        row (str): the text to score (despite the name, not a DataFrame row)

    Returns:
        pd.Series: one-element Series holding the "compound" score
    """
    scores = analyzer.polarity_scores(row)
    return pd.Series([scores["compound"]])

In [None]:
def text_transformer(df):
    """
    Engineer text-derived features on ``df`` in place (nothing is returned).

    Requires the "text" and "keyword" columns. The steps run in this order and each
    one sees the text as already modified by the previous ones:

    1. Removing links from text and creating a feature "has_link"
    2. Creating a text length feature "text_len" (measured after link removal) and
        binning it in 2 chunks, label-encoded as "text_len_bins" - the rationale
        is that tweet length relates to the target (see the EDA plots)
    3. Creating a "has_ashtags" feature and an "ashtags" list feature (lower-cased),
        building "keyword+", then removing "#" from the text
    4. Removing all the @tags from the text and creating a feature "has_tag"
    5. Replacing newline characters in the text with a space
    6. Creating a compound sentiment feature "compound"

    Side effects: sets the module-level globals ``stemmer`` and ``analyzer``
    and prints a progress message after most steps.
    """
    
    # 1------LINKS
    #creating new feature has_link
    #creating the regex link pattern
    link_pattern = r"https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&\/\/=]*)"
    link = re.compile(link_pattern)
    
    #creating feature
    df["has_link"] = df["text"].str.contains(link, regex=True).astype(int)
    
    #removing links from the text
    df["text"] = df["text"].str.replace(link,"", regex=True)
    
    print("links done")
    
    # 2--------LENGTH
    #creating new feature text_len (links have already been removed at this point)
    df["text_len"] = df["text"].apply(lambda x: len(x))
    
    #creating 2 bins for the length
    #NOTE(review): with an integer bin count pd.cut derives the edges from this frame's
    #own values, so train/valid/test may be cut at different lengths - consider fixed edges
    df["text_len_bins"] = pd.cut(df['text_len'].astype(int), 2) #140chr / 2
    #encoding the bins
    label = LabelEncoder()
    df["text_len_bins"] = label.fit_transform(df["text_len_bins"])
    
    print("len done")
    
    # 3-------HASHTAGS
    #creating the has_ashtags feature and the ashtags feature
    #hashtag pattern
    ashtag_pattern = r"#(\w+)"
    ashtag = re.compile(ashtag_pattern)
    
    #create feature if it has hashtags
    df["has_ashtags"] = df["text"].str.contains(ashtag, regex=True).astype(int)
    
    #create ashtags feature (list of lower-cased hashtag words, without the "#")
    df["ashtags"] = df["text"].apply(lambda x: re.findall(ashtag, x.lower()))
    
    #merging keywords and ashtags
    df["keyword+"] = df.apply(lambda x: [x["keyword"]]+x["ashtags"] if x["keyword"] not in x["ashtags"] else x["ashtags"],axis=1) #removing duplicates and merging columns
    
    df["keyword+"] = df["keyword+"].apply(lambda x: " ".join(x))
    
    #stemming keywords
    #NOTE(review): the assignment below overwrites the merged keyword+hashtags value built
    #above, so "keyword+" ends up holding only the stemmed keyword - confirm this is intended
    global stemmer
    stemmer = PorterStemmer()
    df["keyword+"] = df["keyword"].apply(lambda x : stemmer.stem(x))
    
    #replace hashtag with word without #
    df["text"] = df["text"].str.replace("#","", regex=False)
    
    print("ashtags done")
    
    # 4-------TAGS
    #creating has_tag feature
    
    #tag pattern
    tag_pattern = r"@(\w+)"
    tag = re.compile(tag_pattern)
    
    #add has tag
    df["has_tag"] = df["text"].str.contains(tag, regex=True).astype(int)
    
    #remove all @person from the text feature
    df["text"] = df["text"].str.replace(tag,"", regex=True)
    
    print("tags done")
    
    # 5------\n
    #replacing \n with a space
    df["text"] = df["text"].str.replace("\n"," ", regex=False)
    
    # 6-------SENTIMENT SCORE
    #add sentiment compound feature
    global analyzer
    analyzer = SentimentIntensityAnalyzer()
    
    #polarity() returns a one-element Series for each row's text
    df["compound"] = df.apply(lambda row: polarity(row["text"]), axis=1)
    
    print("sentiment done")
    

In [None]:
kwd_transformer(X_train)
text_transformer(X_train)
location_transformer(X_train)

In [None]:
df=pd.concat([X_train,y_train], axis=1)
df.head()

In [None]:
sns.histplot(data=df, x="text_len", hue="target", bins=14) 
#we can see a distinction between text shorter than 60 chars being less frequent

In [None]:
sns.boxplot(data=df, x="target", y="text_len") # 1 are slightly longer

In [None]:
sns.barplot(data=df, x="text_len_bins",y="text_len",hue="target", orient="v") 
#disaster tweets are longer for bin one we created

In [None]:
sns.boxplot(data=df, x="target", y="compound") #disaster tweets are clearly more prone to have a negative compound sentiment

In [None]:
#visualizing keywords with most disaster labels
order = pd.crosstab(df["keyword+"], df.target).sort_values(1, ascending=False).index

In [None]:
fig, ax = plt.subplots(figsize=(10,35)
                       )
ax = sns.countplot(data=df, y="keyword+", hue="target", orient="v", order = order)

ax.xaxis.tick_top()
ax.xaxis.set_label_position('top')

plt.legend(loc='upper right')
plt.tight_layout()

In [None]:
sns.countplot(data=df, x="has_ashtags", hue="target")
# not having an hashtag could be a discriminant for a disaster tweet

In [None]:
sns.countplot(data=df, x="has_tag", hue="target")
# a tag could be a discriminant, even though we must take into account that the disaster tweets are fewer than the non-disaster ones

In [None]:
sns.countplot(data=df, x="has_location", hue="target")
# same as before

In [None]:
sns.countplot(data=df, x="has_link", hue="target")
# here we can see how having a link could be indicative of a disaster

In [None]:
kwd_transformer(X_valid)
text_transformer(X_valid)
location_transformer(X_valid)

In [None]:
kwd_transformer(X_test)
text_transformer(X_test)
location_transformer(X_test)

In [None]:
print(X_train.shape,y_train.shape, X_valid.shape, y_valid.shape, X_test.shape)

Changing categorical to numerical features:
1. One-hot encode "keyword+"
2. TF-IDF vectorization of the text feature, so that machine learning algorithms can be applied

In [None]:
encoder = OneHotEncoder(handle_unknown="ignore", sparse=False)

#TRAIN
train_kwd_dummies = pd.DataFrame(encoder.fit_transform(X_train["keyword+"].values.reshape(-1,1)))
X_train.reset_index(inplace=True) #needs index to be resetted with concat

#concat
X_train = pd.concat([X_train, train_kwd_dummies], axis=1)

X_train.index = X_train["index"]
X_train.drop("index", axis=1, inplace=True)


In [None]:
valid_kwd_dummies = pd.DataFrame(encoder.transform(X_valid["keyword+"].values.reshape(-1,1)))

#VALID
X_valid.reset_index(inplace=True) #needs index to be resetted

#concat
X_valid = pd.concat([X_valid, valid_kwd_dummies], axis=1)

X_valid.index = X_valid["index"]
X_valid.drop("index", axis=1, inplace=True)

In [None]:
test_kwd_dummies = pd.DataFrame(encoder.transform(X_test["keyword+"].values.reshape(-1,1)))

#TEST
X_test.reset_index(inplace=True) #needs index to be resetted

#concat
X_test = pd.concat([X_test, test_kwd_dummies], axis=1)

X_test.index = X_test["index"]
X_test.drop("index", axis=1, inplace=True)

In [None]:
print(X_train.shape,y_train.shape, X_valid.shape, y_valid.shape, X_test.shape)

In [None]:
def text_process(text):
    """
    Turn a raw text into a list of cleaned tokens (used as the TF-IDF analyzer).

    Takes in a string of text, then performs the following:
    1. Remove all punctuation (every character in ``string.punctuation``)
    2. Split on whitespace and remove all English stopwords (case-insensitive)
    3. Stem the remaining words with the Porter stemmer

    Args:
        text (str): text to clean

    Returns:
        list: the stemmed, stopword-free tokens in their original order
    """
    # Strip every punctuation character in a single pass
    nopunc = text.translate(str.maketrans("", "", string.punctuation))

    # Load the stopword list once per call instead of once per word,
    # and keep it in a set so each membership test is O(1)
    english_stopwords = set(stopwords.words('english'))

    #Stem text
    stemmer = PorterStemmer()

    # Drop the stopwords and stem what is left
    return [stemmer.stem(word) for word in nopunc.split() if word.lower() not in english_stopwords]


In [None]:
#TRAINING
#vectorizing text columns
vectorizer = TfidfVectorizer(analyzer=text_process, min_df=5, max_df=0.5, ngram_range=(1,2))

train_text_tfidf = vectorizer.fit_transform(X_train['text'])

#making a dataframe to concatenate
train_text_tfidf = pd.DataFrame(train_text_tfidf.toarray())

X_train.reset_index(inplace=True) #needs index to be resetted

#concat
X_train = pd.concat([X_train, train_text_tfidf], axis=1)

X_train.index = X_train["index"]
X_train.drop("index", axis=1, inplace=True)

In [None]:
#VALIDATION
valid_text_tfidf = vectorizer.transform(X_valid['text']) #avoid data leakage

#making a dataframe to concatenate
valid_text_tfidf = pd.DataFrame(valid_text_tfidf.toarray())

X_valid.reset_index(inplace=True) #needs index to be resetted

#concat
X_valid = pd.concat([X_valid, valid_text_tfidf], axis=1)

X_valid.index = X_valid["index"]
X_valid.drop("index", axis=1, inplace=True)

In [None]:
#TEST
test_text_tfidf = vectorizer.transform(X_test['text']) #avoid data leakage

#making a dataframe to concatenate
test_text_tfidf = pd.DataFrame(test_text_tfidf.toarray())

X_test.reset_index(inplace=True) #needs index to be resetted

#concat
X_test = pd.concat([X_test, test_text_tfidf], axis=1)

X_test.index = X_test["index"]
X_test.drop("index", axis=1, inplace=True)

In [None]:
#scaling the compound feature
from sklearn.preprocessing import MinMaxScaler
scaler=MinMaxScaler()

In [None]:
scaler.fit(X_train["compound"].values.reshape(-1,1))
# scaler.data_max_

In [None]:
X_train["compound"] = scaler.transform(X_train["compound"].values.reshape(-1,1))
X_valid["compound"] = scaler.transform(X_valid["compound"].values.reshape(-1,1))
X_test["compound"] = scaler.transform(X_test["compound"].values.reshape(-1,1))

In [None]:
def drop_columns(df):
    """
    Remove the raw and intermediate columns from ``df`` in place.

    Drops "keyword", "id", "ashtags", "location", "text_len", "keyword+" and "text".
    A KeyError is raised by pandas if any of them is absent.

    Returns:
        None: the value of the in-place ``drop`` call is passed through
    """
    obsolete = ["keyword", "id", "ashtags", "location", "text_len", "keyword+", "text"]
    outcome = df.drop(columns=obsolete, inplace=True)
    return outcome

In [None]:
#dropping non necessary features
drop_columns(X_train)
drop_columns(X_valid)
drop_columns(X_test)

In [None]:
print(X_train.shape,y_train.shape, X_valid.shape, y_valid.shape, X_test.shape)

In [None]:
#pca didn't give any improvement
# pca = PCA(n_components=6, random_state=42)

In [None]:

# X_train_pca = pca.fit_transform(X_train)
# X_valid_pca=pca.transform(X_valid)

In [None]:
# pca.explained_variance_
#only first 6 explain variance

In [None]:
# print(X_train_pca.shape,y_train.shape, X_valid_pca.shape, y_valid.shape)

# Model Evaluation

In [None]:
from sklearn.linear_model import LogisticRegression

clf = LogisticRegression(C=1,random_state=42).fit(X_train, y_train)

In [None]:
prediction = clf.predict(X_valid)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, f1_score

print(confusion_matrix(y_valid, prediction))
print(accuracy_score(y_valid, prediction))
print(classification_report(y_valid, prediction))
print(f1_score(y_valid, prediction))


In [None]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=100, random_state=42)


In [None]:
rfc.fit(X_train,y_train)

In [None]:
prediction = rfc.predict(X_valid)

In [None]:
print(confusion_matrix(y_valid, prediction))
print(accuracy_score(y_valid, prediction))
print(classification_report(y_valid, prediction))
print(f1_score(y_valid, prediction))

In [None]:
from sklearn.svm import SVC
svm=SVC()

In [None]:
svm.fit(X_train,y_train)

In [None]:
prediction = svm.predict(X_valid)

In [None]:
print(confusion_matrix(y_valid, prediction))
print(accuracy_score(y_valid, prediction))
print(classification_report(y_valid, prediction))
print(f1_score(y_valid, prediction))

In [None]:
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()

In [None]:
nb.fit(X_train,y_train)

In [None]:
prediction = nb.predict(X_valid)

print(confusion_matrix(y_valid, prediction))
print(accuracy_score(y_valid, prediction))
print(classification_report(y_valid, prediction))
print(f1_score(y_valid, prediction))

Random Forest performed slightly better than the other classifiers. We will fine-tune this model:

In [None]:
from sklearn.model_selection import GridSearchCV, cross_validate, KFold
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score

In [None]:
scoring = {'accuracy' : make_scorer(accuracy_score), 
           'precision' : make_scorer(precision_score),
           'recall' : make_scorer(recall_score), 
           'f1_score' : make_scorer(f1_score)}

kfold = KFold(n_splits=10)

results = cross_validate(estimator=clf,
                            X=X_train,
                            y=y_train,
                            cv=kfold,
                            scoring=scoring)


print("Accuracy: {:.2f} %".format(results["test_accuracy"].mean()*100))
print("Standard Deviation: {:.2f} %".format(results["test_accuracy"].std()*100))
print("F1_score: {:.2f} %".format(results["test_f1_score"].mean()*100))
print("Precision: {:.2f} %".format(results["test_precision"].mean()*100))
print("Recall: {:.2f} %".format(results["test_recall"].mean()*100))


In [None]:
parameters = [{'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000] }]
grid_search = GridSearchCV(estimator = clf,
                           param_grid = parameters,
                           scoring = 'f1',
                           cv = 5,
                           n_jobs = -1,
                           verbose=2)
grid_search.fit(X_train, y_train)
best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_
print("Best f1: {:.2f} %".format(best_accuracy*100))
print("Best Parameters:", best_parameters)

In [None]:
#predict with the tuned model: GridSearchCV refits the best parameter setting on the whole
#training set (refit=True by default), whereas `clf` is still the untuned C=1 model
test_prediction = grid_search.best_estimator_.predict(X_test)

In [None]:
test_prediction

In [None]:
submission = pd.DataFrame({'id': pd.read_csv("/kaggle/input/nlp-getting-started/test.csv").id.values,'target': test_prediction})


In [None]:
submission.to_csv("submission.csv", index=False)

In [None]:
len(submission)