# NLP Machine Learning Project 2022

Useful links:

https://towardsdatascience.com/natural-language-processing-nlp-for-machine-learning-d44498845d5b
https://www.andyfitzgeraldconsulting.com/writing/keyword-extraction-nlp/

In [147]:
# pip install nltk

In [148]:
import nltk
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import PorterStemmer, WordNetLemmatizer

stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

In [149]:
# nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('stopwords')

In [150]:
df = pd.read_csv(r'train.csv')

df.replace('NaN', np.NaN, inplace = True)

# to count the number of NaN's in each column, just change the column name in this line to see how many missing values of that
# variable per other column
print(df[df.keyword.isnull()].count())

df.head()

id          61
keyword      0
location     0
text        61
target      61
dtype: int64


Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [151]:
##Removing punctuation
import string

def remove_punct(text):
    text_nopunct = "".join([char for char in text if char not in string.punctuation])
    return text_nopunct

# df["text_clean"] = df["text"].apply(lambda x: remove_punct(x))

df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [152]:
stop = set(stopwords.words('english'))

# divides tweet into list of its words
df['tokenized_tweet'] = df.apply(lambda row: nltk.word_tokenize(row['text']), axis=1)

# 'stemmer' reduces all words to their stems by bluntly cutting off prefixes - not useful with names, etc.
df['stemmed_tweet'] = df['tokenized_tweet'].apply(lambda x: [stemmer.stem(y) for y in x])

# 'lemmatizer' gets rid of plurals, etc., it's gentler than stemming
df['lemmatized_tweet'] = df['tokenized_tweet'].apply(lambda x: [lemmatizer.lemmatize(y) for y in x]) 

# removes 'stop words' such as 'the', 'are', etc. it knows these stop words where i defined 'stop' variable, comes from a library
df['tweet_stop'] = df['lemmatized_tweet'].apply(lambda x: [y for y in x if y not in stop])



In [153]:
df.head()

Unnamed: 0,id,keyword,location,text,target,tokenized_tweet,stemmed_tweet,lemmatized_tweet,tweet_stop
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,"[Our, Deeds, are, the, Reason, of, this, #, ea...","[our, deed, are, the, reason, of, thi, #, eart...","[Our, Deeds, are, the, Reason, of, this, #, ea...","[Our, Deeds, Reason, #, earthquake, May, ALLAH..."
1,4,,,Forest fire near La Ronge Sask. Canada,1,"[Forest, fire, near, La, Ronge, Sask, ., Canada]","[forest, fire, near, la, rong, sask, ., canada]","[Forest, fire, near, La, Ronge, Sask, ., Canada]","[Forest, fire, near, La, Ronge, Sask, ., Canada]"
2,5,,,All residents asked to 'shelter in place' are ...,1,"[All, residents, asked, to, 'shelter, in, plac...","[all, resid, ask, to, 'shelter, in, place, ', ...","[All, resident, asked, to, 'shelter, in, place...","[All, resident, asked, 'shelter, place, ', not..."
3,6,,,"13,000 people receive #wildfires evacuation or...",1,"[13,000, people, receive, #, wildfires, evacua...","[13,000, peopl, receiv, #, wildfir, evacu, ord...","[13,000, people, receive, #, wildfire, evacuat...","[13,000, people, receive, #, wildfire, evacuat..."
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,"[Just, got, sent, this, photo, from, Ruby, #, ...","[just, got, sent, thi, photo, from, rubi, #, a...","[Just, got, sent, this, photo, from, Ruby, #, ...","[Just, got, sent, photo, Ruby, #, Alaska, smok..."


In [None]:
print("hallo lior")