# NLP Machine Learning Project 2022

Useful links:

https://towardsdatascience.com/natural-language-processing-nlp-for-machine-learning-d44498845d5b
https://www.andyfitzgeraldconsulting.com/writing/keyword-extraction-nlp/

In [1]:
# pip install nltk

In [14]:
import nltk
import pandas as pd
import numpy as np
import string
from nltk.tokenize import word_tokenize, TweetTokenizer
from nltk.corpus import stopwords, wordnet
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfTransformer
import re
from nltk.probability import FreqDist
import matplotlib.pyplot as plt
# from term_frequency import term_frequencies, feature_names, df_term_frequencies

stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
transformer = TfidfTransformer()
tt = TweetTokenizer()

In [15]:
# !pip install wordcloud

In [16]:
# nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('stopwords')

In [17]:
df = pd.read_csv(r'train.csv')

df.replace('NaN', np.NaN, inplace = True)

# to count the number of NaN's in each column, just change the column name in this line to see how many missing values of that
# variable per other column
# print(df[df.keyword.isnull()].count())

df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [18]:
def preprocess_tweets(text):
    
# remove mentions and URLs
    text_noMentionURL = re.sub(r"(?:\@|http?\://|https?\://|www)\S+", "", text)
    text_noMentionURL = " ".join(text_noMentionURL.split())
    
# remove '#' symbols and add space before capital letter
    text_noHash = re.sub(r"([A-Z]+)", r" \1", text)
    text_noHash = re.sub(r"(#)", "", text_noHash)
    text_noHash = " ".join(text_noHash.split())
    
# remove all other punctuation
    text_noNoise = "".join([char for char in text_noHash if char not in string.punctuation])

    return text_noNoise.lower()


df['tweet_noNoise'] = df["text"].apply(lambda x: preprocess_tweets(x))
df.head()

Unnamed: 0,id,keyword,location,text,target,tweet_noNoise
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,our deeds are the reason of this earthquake ma...
1,4,,,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask canada
2,5,,,All residents asked to 'shelter in place' are ...,1,all residents asked to shelter in place are be...
3,6,,,"13,000 people receive #wildfires evacuation or...",1,13000 people receive wildfires evacuation orde...
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,just got sent this photo from ruby alaska as s...


In [19]:
# lemmatizer with Part-of-Speech (POS)

from nltk.corpus import wordnet
from collections import Counter

# this function can be called within lemmatize() to tag the word with its POS
def get_pos(word):
    probable_part_of_speech = wordnet.synsets(word)
  
    pos_counts = Counter()

    pos_counts["n"] = len(  [ item for item in probable_part_of_speech if item.pos()=="n"]  )
    pos_counts["v"] = len(  [ item for item in probable_part_of_speech if item.pos()=="v"]  )
    pos_counts["a"] = len(  [ item for item in probable_part_of_speech if item.pos()=="a"]  )
    pos_counts["r"] = len(  [ item for item in probable_part_of_speech if item.pos()=="r"]  )
  
    most_likely_part_of_speech = pos_counts.most_common(1)[0][0]
    return most_likely_part_of_speech

# tokenize tweets = divide into words
tokenized_tweets = df.apply(lambda row: tt.tokenize(row['tweet_noNoise']), axis=1)

# lemmatize the list of all tokens
df['lemmatized_list'] = tokenized_tweets.apply(lambda y: [lemmatizer.lemmatize(x, get_pos(x)) for x in y])
df['lemmatized_list'].apply(lambda x: ' '.join([tweet for tweet in x]))

# removes stop words (are, on, in, etc.) MUST COME BEFORE LIST -> STR
stop = set(stopwords.words('english'))
df['lemmatized_list'] = df['lemmatized_list'].apply(lambda x: [y for y in x if y not in stop])

# turn the resulting list back into a string
df['processed_tweet'] = df.lemmatized_list.agg(lambda x: ','.join(map(str, x)))


df.head()

Unnamed: 0,id,keyword,location,text,target,tweet_noNoise,lemmatized_list,processed_tweet
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,our deeds are the reason of this earthquake ma...,"[deed, reason, earthquake, may, allah, forgive...","deed,reason,earthquake,may,allah,forgive,u"
1,4,,,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask canada,"[forest, fire, near, la, ronge, sask, canada]","forest,fire,near,la,ronge,sask,canada"
2,5,,,All residents asked to 'shelter in place' are ...,1,all residents asked to shelter in place are be...,"[resident, ask, shelter, place, notify, office...","resident,ask,shelter,place,notify,officer,evac..."
3,6,,,"13,000 people receive #wildfires evacuation or...",1,13000 people receive wildfires evacuation orde...,"[13000, people, receive, wildfire, evacuation,...","13000,people,receive,wildfire,evacuation,order..."
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,just got sent this photo from ruby alaska as s...,"[get, send, photo, ruby, alaska, smoke, wildfi...","get,send,photo,ruby,alaska,smoke,wildfire,pour..."


In [20]:
# enter whatever word you'd like to see how lemmatizing works
print(lemmatizer.lemmatize('weeds'))

weed


In [30]:
# first attempts at tf-idf
# input needs to exclude mentions & hashtags, be in lower case, remove punct, lemmatized

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(norm=None)
tfidf_scores = vectorizer.fit_transform(df.processed_tweet)

feature_names = vectorizer.get_feature_names()

# get corpus index
corpus_index = [f"Tweet {i+1}" for i in range(len(df.processed_tweet))]

# create pandas DataFrame with tf-idf scores
# try:
df_tf_idf = pd.DataFrame(tfidf_scores.T.todense(), index=feature_names, columns=corpus_index)
df_tf_idf.head(20)


Unnamed: 0,Tweet 1,Tweet 2,Tweet 3,Tweet 4,Tweet 5,Tweet 6,Tweet 7,Tweet 8,Tweet 9,Tweet 10,...,Tweet 7604,Tweet 7605,Tweet 7606,Tweet 7607,Tweet 7608,Tweet 7609,Tweet 7610,Tweet 7611,Tweet 7612,Tweet 7613
0000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0011,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
001116,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0025,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
005225,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
007,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
00c,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
00kj,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
# IGNORE FOR RIGHT NOW
# don't think we need to remove stop words with tf-idf but keep here

stop = set(stopwords.words('english'))

# removes 'stop words' such as 'the', 'are', etc. it knows these stop words where i defined 'stop' variable, comes from a library
df['tweet_stop'] = df['lemmatized_tweet'].apply(lambda x: [y for y in x if y not in stop])

# hashtags
df['hashtag'] = df.text.apply(lambda x: re.findall(r"#(\w+)", x))

df.head()

Unnamed: 0,id,keyword,location,text,target,tweet_noNoise,lemmatized_list,processed_tweet,tweet_stop,hashtag
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,our deeds are the reason of this earthquake ma...,"[deed, reason, earthquake, may, allah, forgive...","deed,reason,earthquake,may,allah,forgive,u","[e, e, ,, r, e, n, ,, e, r, h, q, u, k, e, ,, ...",[earthquake]
1,4,,,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask canada,"[forest, fire, near, la, ronge, sask, canada]","forest,fire,near,la,ronge,sask,canada","[f, r, e, ,, f, r, e, ,, n, e, r, ,, l, ,, r, ...",[]
2,5,,,All residents asked to 'shelter in place' are ...,1,all residents asked to shelter in place are be...,"[resident, ask, shelter, place, notify, office...","resident,ask,shelter,place,notify,officer,evac...","[r, e, e, n, ,, k, ,, h, e, l, e, r, ,, p, l, ...",[]
3,6,,,"13,000 people receive #wildfires evacuation or...",1,13000 people receive wildfires evacuation orde...,"[13000, people, receive, wildfire, evacuation,...","13000,people,receive,wildfire,evacuation,order...","[1, 3, 0, 0, 0, ,, p, e, p, l, e, ,, r, e, c, ...",[wildfires]
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,just got sent this photo from ruby alaska as s...,"[get, send, photo, ruby, alaska, smoke, wildfi...","get,send,photo,ruby,alaska,smoke,wildfire,pour...","[g, e, ,, e, n, ,, p, h, ,, r, u, b, ,, l, k, ...","[Alaska, wildfires]"


In [12]:
# IGNORE FOR RIGHT NOW

df.tweet_stop = df.tweet_stop.apply(lambda x: [' '.join(str(y)) for y in x])
print(df.tweet_stop)

AttributeError: 'DataFrame' object has no attribute 'tweet_stop'

In [11]:
# IGNORE FOR RIGHT NOW

# this is my attempt at creating a matrix of term frequencies using tf-idf but the problem is that
# a lot of the 'words' people use are literally jibberish

from sklearn.feature_extraction.text import CountVectorizer

df.tweet_stop = df.tweet_stop.apply(lambda x: [''.join(str(y)) for y in x])
# initialize and fit CountVectorizer
vectorizer = CountVectorizer()
term_frequencies = vectorizer.fit_transform(df.text)

# get vocabulary of terms
feature_names = vectorizer.get_feature_names()

# get corpus index
corpus_index = [f"Tweet {i+1}" for i in range(len(df.tokenized_tweet))]

# create pandas DataFrame with term frequencies
df_term_frequencies = pd.DataFrame(term_frequencies.T.todense(), index=feature_names, columns=corpus_index)

# df_term_frequencies.head(30)
print(df_term_frequencies.iloc[:])

df_term_frequencies['frequency_summation'] = df_term_frequencies.iloc[:].sum(axis=1)
print(df_term_frequencies.iloc[:])

df_term_frequencies = df_term_frequencies[df_term_frequencies['frequency_summation'] >= 2]
# df_term_frequencies = df_term_frequencies.loc[df_term_frequencies.frequency_summation >= 5]
print(df_term_frequencies.iloc[:])

AttributeError: 'DataFrame' object has no attribute 'tweet_stop'

In [None]:
# Split up the matrix
features = whatever_our_big_matrix_is_called.drop("y", 1)       # feature matrix
labels = whatever_our_big_matrix_is_called['y']               # target feature
whatever_our_big_matrix_is_called.head()

In [10]:
# DO NOT RUN THIS
# Using Skicit-learn to split data into training and testing sets
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = 0.25, random_state = 42)

print('These are the features and the labels:\n')
print('train features:')
print(train_features)
print('test features:')
print(test_features)
print('train labels:')
print(train_labels)
print('test labels:')
print(test_labels)

NameError: name 'features' is not defined

# Wrapper method

In [9]:
# Wrapper method 1 with k range - selma

from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline


sfs = SFS(RandomForestClassifier(n_jobs=-1), 
           k_features=(3, 15),
           forward=True, 
           floating=False, 
           scoring='accuracy',
           cv=5)

pipe = make_pipeline(StandardScaler(), sfs)

pipe.fit(train_features, train_labels)

print('best combination (ACC: %.3f): %s\n' % (sfs.k_score_, sfs.k_feature_idx_))
sfs.k_feature_names_

NameError: name 'X_train' is not defined

In [None]:
# Wrapper method 2 - selma
sfs = SFS(RandomForestClassifier(),
          k_features=11,
          forward=True,
          floating=False,
          scoring = 'accuracy',
          cv = 5)

sfs.fit(train_features, train_labels)
sfs.k_feature_names_     # to get the final set of features

# Filter method

In [None]:
#Remove the constants, does not run, first fix data above
#I think tweets must be preprocessed first though
constant_features = [var for var in X_train.columns if X_train[var].std() == 0] 

X_train.drop(labels=constant_features, axis=1, inplace=True)
X_test.drop(labels=constant_features, axis=1, inplace=True) 
 

X_train.shape, X_test.shape

TypeError: could not convert string to float: 'screaming'

In [63]:
#Removing Quasi-constant features,does not run

# Define the threshold as 0.01
quasi_remover = VarianceThreshold(threshold=0.01)
# Find the values with low variance
quasi_remover.fit(X_train) 
sum(quasi_remover.get_support())
# Apply to datasets
X_train = quasi_remover.transform(X_train)
X_test = quasi_remover.transform(X_test)

X_train.shape, X_test.shape

ValueError: could not convert string to float: 'forest%20fires'

In [None]:
#X_train= pd.DataFrame(X_train)
#X_train.columns = features

#X_test= pd.DataFrame(X_test)
#X_test.columns = features

In [None]:
#REMOVE DUPLICATE FEATURES
duplFeatures = []
for i in range(0, len(X_train.columns)):
    oneCol = X_train.columns[i]
for othCol in X_train.columns[i + 1:]:
    if X_train[oneCol].equals(X_train[othCol]):
            duplFeatures.append(othCol)
            
X_train.drop(labels=duplFeatures, axis=1, inplace=True)
X_test.drop(labels=duplFeatures, axis=1, inplace=True)

X_train.shape, X_test.shape

In [None]:
#REMOVE CORRELATED FEATURES
correl_Feat = set() 
correl_matrix = dataset.corr()
    
for i in range(len(corr_matrix.columns)):
   for j in range(i):
       if abs(correl_matrix.iloc[i, j]) > 0.8:
       colName = correl_matrix.columns[i]  
       correl_Feat.add(colname)
        
X_train.drop(labels=correl_Feat, axis=1, inplace=True)
X_test.drop(labels=correl_Feat, axis=1, inplace=True)

In [None]:
#after each filter or removing of features,
#a copy can be made and stored to later compare performance. We can check performance of original list,
#vs after removing correlated vs after removing quasi constants.

# Classifier

In [None]:
# RandomForest - train the model and test, report score
from sklearn.ensemble import RandomForestClassifier
print('\nThis is RandomForest score:')
model = RandomForestClassifier()
model.fit(train_features, train_labels)
print(model.score(test_features, test_labels))

In [None]:
# Linear Regression - train the model and test, report score
from sklearn.linear_model import LinearRegression
print('\nThis is Linear Regression score:')
model2 = LinearRegression()
model2.fit(train_features, train_labels)
model2.score(test_features, test_labels)