# NLP Machine Learning Project 2022

Useful links:

https://towardsdatascience.com/natural-language-processing-nlp-for-machine-learning-d44498845d5b
https://www.andyfitzgeraldconsulting.com/writing/keyword-extraction-nlp/

In [1]:
# pip install nltk

In [41]:
import nltk
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize, TweetTokenizer
from nltk.corpus import stopwords, wordnet
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfTransformer
import re
# from term_frequency import term_frequencies, feature_names, df_term_frequencies

stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
transformer = TfidfTransformer()
tt = TweetTokenizer()

In [3]:
# RUN this only for the first time and then comment:

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('omw-1.4')

In [4]:
df = pd.read_csv(r'train.csv')

df.replace('NaN', np.NaN, inplace = True)

# to count the number of NaN's in each column, just change the column name in this line to see how many missing values of that
# variable per other column
print(df[df.keyword.isnull()].count())

df.head()

id          61
keyword      0
location     0
text        61
target      61
dtype: int64


Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [5]:
##Removing punctuation
import string

def remove_punct(text):
    text_nopunct = "".join([char for char in text if char not in string.punctuation])
    return text_nopunct

# df["text_clean"] = df["text"].apply(lambda x: remove_punct(x))

df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [44]:
stop = set(stopwords.words('english'))

# divides tweet into list of its words
df['tokenized_tweet'] = df.apply(lambda row: tt.tokenize(row['text']), axis=1)

# 'stemmer' reduces all words to their stems by bluntly cutting off prefixes - not useful with names, etc.
df['stemmed_tweet'] = df['tokenized_tweet'].apply(lambda x: [stemmer.stem(y) for y in x])

# 'lemmatizer' gets rid of plurals, etc., it's gentler than stemming
df['lemmatized_tweet'] = df['tokenized_tweet'].apply(lambda x: [lemmatizer.lemmatize(y) for y in x]) 

# removes 'stop words' such as 'the', 'are', etc. it knows these stop words where i defined 'stop' variable, comes from a library
df['tweet_stop'] = df['lemmatized_tweet'].apply(lambda x: [y for y in x if y not in stop])

# hashtags
df['hashtag'] = df.text.apply(lambda x: re.findall(r"#(\w+)", x))

In [45]:
df.head()

Unnamed: 0,id,keyword,location,text,target,tokenized_tweet,stemmed_tweet,lemmatized_tweet,tweet_stop,hashtag
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,"[Our, Deeds, are, the, Reason, of, this, #eart...","[our, deed, are, the, reason, of, thi, #earthq...","[Our, Deeds, are, the, Reason, of, this, #eart...","[Our, Deeds, Reason, #earthquake, May, ALLAH, ...",[earthquake]
1,4,,,Forest fire near La Ronge Sask. Canada,1,"[Forest, fire, near, La, Ronge, Sask, ., Canada]","[forest, fire, near, la, rong, sask, ., canada]","[Forest, fire, near, La, Ronge, Sask, ., Canada]","[Forest, fire, near, La, Ronge, Sask, ., Canada]",[]
2,5,,,All residents asked to 'shelter in place' are ...,1,"[All, residents, asked, to, ', shelter, in, pl...","[all, resid, ask, to, ', shelter, in, place, '...","[All, resident, asked, to, ', shelter, in, pla...","[All, resident, asked, ', shelter, place, ', n...",[]
3,6,,,"13,000 people receive #wildfires evacuation or...",1,"[13,000, people, receive, #wildfires, evacuati...","[13,000, peopl, receiv, #wildfir, evacu, order...","[13,000, people, receive, #wildfires, evacuati...","[13,000, people, receive, #wildfires, evacuati...",[wildfires]
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,"[Just, got, sent, this, photo, from, Ruby, #Al...","[just, got, sent, thi, photo, from, rubi, #ala...","[Just, got, sent, this, photo, from, Ruby, #Al...","[Just, got, sent, photo, Ruby, #Alaska, smoke,...","[Alaska, wildfires]"


In [46]:
def get_part_of_speech():
    POS = wordnet.synsets(word)

In [47]:
df.tweet_stop = df.tweet_stop.apply(lambda x: [' '.join(str(y)) for y in x])
print(df.tweet_stop)

0       [O u r, D e e d s, R e a s o n, # e a r t h q ...
1       [F o r e s t, f i r e, n e a r, L a, R o n g e...
2       [A l l, r e s i d e n t, a s k e d, ', s h e l...
3       [1 3 , 0 0 0, p e o p l e, r e c e i v e, # w ...
4       [J u s t, g o t, s e n t, p h o t o, R u b y, ...
                              ...                        
7608    [T w o, g i a n t, c r a n e, h o l d i n g, b...
7609    [@ a r i a _ a h r a r y, @ T h e T a w n i e ...
7610    [M 1, ., 9 4, [, 0 1 : 0 4, U T C, ], ?, 5 k m...
7611    [P o l i c e, i n v e s t i g a t i n g, e - b...
7612    [T h e, L a t e s t, :, M o r e, H o m e s, R ...
Name: tweet_stop, Length: 7613, dtype: object


In [49]:
# this is my attempt at creating a matrix of term frequencies using tf-idf but the problem is that
# a lot of the 'words' people use are literally jibberish

from sklearn.feature_extraction.text import CountVectorizer

df.tweet_stop = df.tweet_stop.apply(lambda x: [''.join(str(y)) for y in x])
# initialize and fit CountVectorizer
vectorizer = CountVectorizer()
term_frequencies = vectorizer.fit_transform(df.text)

# get vocabulary of terms
feature_names = vectorizer.get_feature_names()

# get corpus index
corpus_index = [f"Tweet {i+1}" for i in range(len(df.tokenized_tweet))]

# create pandas DataFrame with term frequencies
df_term_frequencies = pd.DataFrame(term_frequencies.T.todense(), index=feature_names, columns=corpus_index)

# df_term_frequencies.head(30)
print(df_term_frequencies.iloc[:])

df_term_frequencies['frequency_summation'] = df_term_frequencies.iloc[:].sum(axis=1)
print(df_term_frequencies.iloc[:])

df_term_frequencies = df_term_frequencies[df_term_frequencies['frequency_summation'] >= 2]
# df_term_frequencies = df_term_frequencies.loc[df_term_frequencies.frequency_summation >= 5]
print(df_term_frequencies.iloc[:])

              Tweet 1  Tweet 2  Tweet 3  Tweet 4  Tweet 5  Tweet 6  Tweet 7  \
00                  0        0        0        0        0        0        0   
000                 0        0        0        1        0        0        0   
0000                0        0        0        0        0        0        0   
007npen6lg          0        0        0        0        0        0        0   
00cy9vxeff          0        0        0        0        0        0        0   
...               ...      ...      ...      ...      ...      ...      ...   
ûóher               0        0        0        0        0        0        0   
ûókody              0        0        0        0        0        0        0   
ûónegligence        0        0        0        0        0        0        0   
ûótech              0        0        0        0        0        0        0   
ûówe                0        0        0        0        0        0        0   

              Tweet 8  Tweet 9  Tweet 10  ...  Twee

In [None]:
# DO NOT RUN THIS
# Using Skicit-learn to split data into training and testing sets
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets
train_features, test_features, train_labels, test_labels = 
        train_test_split(features, labels, test_size = 0.25, random_state = 42)

In [None]:
# Wrapper method - selma DO NOT RUN THIS

from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline


sfs = SFS(RandomForestClassifier(n_jobs=-1), 
           k_features=(3, 15),
           forward=True, 
           floating=False, 
           scoring='roc-auc',
           cv=5)

pipe = make_pipeline(StandardScaler(), sfs)

pipe.fit(X_train, y_train)

print('best combination (ACC: %.3f): %s\n' % (sfs.k_score_, sfs.k_feature_idx_))
print('all subsets:\n', sfs.subsets_)
plot_sfs(sfs.get_metric_dict(), kind='std_err');