# NLP Machine Learning Project 2022

Useful links:

https://towardsdatascience.com/natural-language-processing-nlp-for-machine-learning-d44498845d5b
https://www.andyfitzgeraldconsulting.com/writing/keyword-extraction-nlp/

In [1]:
# pip install nltk

In [24]:
import nltk
import pandas as pd
import numpy as np
import string
from nltk.tokenize import word_tokenize, TweetTokenizer
from nltk.corpus import stopwords, wordnet
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfTransformer
import re
from nltk.probability import FreqDist
import matplotlib.pyplot as plt
# from term_frequency import term_frequencies, feature_names, df_term_frequencies

stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
transformer = TfidfTransformer()
tt = TweetTokenizer()

In [25]:
# !pip install wordcloud

In [26]:
# nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('stopwords')

In [27]:
df = pd.read_csv(r'train.csv')

df.replace('NaN', np.NaN, inplace = True)

# to count the number of NaN's in each column, just change the column name in this line to see how many missing values of that
# variable per other column
# print(df[df.keyword.isnull()].count())

df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [28]:
def preprocess_tweets(text):
    
# remove mentions and URLs
    text_noMentionURL = re.sub(r"(?:\@|http?\://|https?\://|www)\S+", "", text)
    text_noMentionURL = " ".join(text_noMentionURL.split())
    
# remove '#' symbols and add space before capital letter
    text_noHash = re.sub(r"([A-Z]+)", r" \1", text)
    text_noHash = re.sub(r"(#)", "", text_noHash)
    text_noHash = " ".join(text_noHash.split())
    
# remove all other punctuation
    text_noNoise = "".join([char for char in text_noHash if char not in string.punctuation])

    return text_noNoise.lower()


df['tweet_noNoise'] = df["text"].apply(lambda x: preprocess_tweets(x))
df.head()

Unnamed: 0,id,keyword,location,text,target,tweet_noNoise
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,our deeds are the reason of this earthquake ma...
1,4,,,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask canada
2,5,,,All residents asked to 'shelter in place' are ...,1,all residents asked to shelter in place are be...
3,6,,,"13,000 people receive #wildfires evacuation or...",1,13000 people receive wildfires evacuation orde...
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,just got sent this photo from ruby alaska as s...


In [29]:
# lemmatizer with Part-of-Speech (POS)

from nltk.corpus import wordnet
from collections import Counter

# this function can be called within lemmatize() to tag the word with its POS
def get_pos(word):
    probable_part_of_speech = wordnet.synsets(word)
  
    pos_counts = Counter()

    pos_counts["n"] = len(  [ item for item in probable_part_of_speech if item.pos()=="n"]  )
    pos_counts["v"] = len(  [ item for item in probable_part_of_speech if item.pos()=="v"]  )
    pos_counts["a"] = len(  [ item for item in probable_part_of_speech if item.pos()=="a"]  )
    pos_counts["r"] = len(  [ item for item in probable_part_of_speech if item.pos()=="r"]  )
  
    most_likely_part_of_speech = pos_counts.most_common(1)[0][0]
    return most_likely_part_of_speech

# tokenize tweets = divide into words
tokenized_tweets = df.apply(lambda row: tt.tokenize(row['tweet_noNoise']), axis=1)

# lemmatize the list of all tokens
df['lemmatized_list'] = tokenized_tweets.apply(lambda y: [lemmatizer.lemmatize(x, get_pos(x)) for x in y])
df['lemmatized_list'].apply(lambda x: ' '.join([tweet for tweet in x]))

# removes stop words (are, on, in, etc.) MUST COME BEFORE LIST -> STR
stop = set(stopwords.words('english'))
df['lemmatized_list'] = df['lemmatized_list'].apply(lambda x: [y for y in x if y not in stop])

# turn the resulting list back into a string
df['processed_tweet'] = df.lemmatized_list.agg(lambda x: ','.join(map(str, x)))

# remove weird characters and 1 character after
df['processed_tweet'] = df['processed_tweet'].replace({'û.':''}, regex=True)
df['processed_tweet'] = df['processed_tweet'].replace({'ì.':''}, regex=True)
df['processed_tweet'] = df['processed_tweet'].replace({'å.':''}, regex=True)
df['processed_tweet'] = df['processed_tweet'].replace({'â.':''}, regex=True)

df.head()

Unnamed: 0,id,keyword,location,text,target,tweet_noNoise,lemmatized_list,processed_tweet
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,our deeds are the reason of this earthquake ma...,"[deed, reason, earthquake, may, allah, forgive...","deed,reason,earthquake,may,allah,forgive,u"
1,4,,,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask canada,"[forest, fire, near, la, ronge, sask, canada]","forest,fire,near,la,ronge,sask,canada"
2,5,,,All residents asked to 'shelter in place' are ...,1,all residents asked to shelter in place are be...,"[resident, ask, shelter, place, notify, office...","resident,ask,shelter,place,notify,officer,evac..."
3,6,,,"13,000 people receive #wildfires evacuation or...",1,13000 people receive wildfires evacuation orde...,"[13000, people, receive, wildfire, evacuation,...","13000,people,receive,wildfire,evacuation,order..."
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,just got sent this photo from ruby alaska as s...,"[get, send, photo, ruby, alaska, smoke, wildfi...","get,send,photo,ruby,alaska,smoke,wildfire,pour..."


In [30]:
# enter whatever word you'd like to see how lemmatizing works
print(lemmatizer.lemmatize('weeds'))

weed


In [31]:
# first attempts at tf-idf
# input needs to exclude mentions & hashtags, be in lower case, remove punct, lemmatized

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(norm=None)
tfidf_scores = vectorizer.fit_transform(df.processed_tweet)

feature_names = vectorizer.get_feature_names()

# get corpus index
corpus_index = [f"Tweet {i+1}" for i in range(len(df.processed_tweet))]

# create pandas DataFrame with tf-idf scores
df_tf_idf = pd.DataFrame(tfidf_scores.T.todense(), index=feature_names, columns=corpus_index)
# df_tf_idf.reset_index()

df_tf_idf = df_tf_idf.T.reset_index()

print(df_tf_idf)



         level_0  0000  001  0011  001116  0025  005225  007  00c  00kj  ...  \
0        Tweet 1   0.0  0.0   0.0     0.0   0.0     0.0  0.0  0.0   0.0  ...   
1        Tweet 2   0.0  0.0   0.0     0.0   0.0     0.0  0.0  0.0   0.0  ...   
2        Tweet 3   0.0  0.0   0.0     0.0   0.0     0.0  0.0  0.0   0.0  ...   
3        Tweet 4   0.0  0.0   0.0     0.0   0.0     0.0  0.0  0.0   0.0  ...   
4        Tweet 5   0.0  0.0   0.0     0.0   0.0     0.0  0.0  0.0   0.0  ...   
...          ...   ...  ...   ...     ...   ...     ...  ...  ...   ...  ...   
7608  Tweet 7609   0.0  0.0   0.0     0.0   0.0     0.0  0.0  0.0   0.0  ...   
7609  Tweet 7610   0.0  0.0   0.0     0.0   0.0     0.0  0.0  0.0   0.0  ...   
7610  Tweet 7611   0.0  0.0   0.0     0.0   0.0     0.0  0.0  0.0   0.0  ...   
7611  Tweet 7612   0.0  0.0   0.0     0.0   0.0     0.0  0.0  0.0   0.0  ...   
7612  Tweet 7613   0.0  0.0   0.0     0.0   0.0     0.0  0.0  0.0   0.0  ...   

      zz4  zzaes  zzcb  zzi  zzk  zztb 

In [32]:
# put targets into their own dataframe so i can merge them with tf-idf scores
target_df = pd.DataFrame()
target_df['target'] = df.target
print(target_df)

      target
0          1
1          1
2          1
3          1
4          1
...      ...
7608       1
7609       1
7610       1
7611       1
7612       1

[7613 rows x 1 columns]


In [33]:
# IGNORE FOR RIGHT NOW
# don't think we need to remove stop words with tf-idf but keep here

#stop = set(stopwords.words('english'))

# removes 'stop words' such as 'the', 'are', etc. it knows these stop words where i defined 'stop' variable, comes from a library
#df['tweet_stop'] = df['lemmatized_tweet'].apply(lambda x: [y for y in x if y not in stop])

# hashtags
#df['hashtag'] = df.text.apply(lambda x: re.findall(r"#(\w+)", x))

#df.head()

In [34]:
# IGNORE FOR RIGHT NOW

#df.tweet_stop = df.tweet_stop.apply(lambda x: [' '.join(str(y)) for y in x])
#print(df.tweet_stop)

In [45]:
# IGNORE FOR RIGHT NOW - THIS IS THE COUNT VECTORIZER FOR IF WE WANT TO REMOVE KEYBOARD SMASHES LATER

from sklearn.feature_extraction.text import CountVectorizer

# remove terms that appear only once

vectorizer = CountVectorizer()
term_frequencies = vectorizer.fit_transform(df.processed_tweet)

# get vocabulary of terms
feature_names = vectorizer.get_feature_names()
corpus_index = [f"Tweet {i+1}" for i in range(len(df.processed_tweet))]

# create pandas DataFrame with term frequencies
df_term_frequencies = pd.DataFrame(term_frequencies.T.todense(), index=feature_names, columns=corpus_index)

df_term_frequencies['frequency_summation'] = df_term_frequencies.iloc[:].sum(axis=1)
# df_term_frequencies = df_term_frequencies[df_term_frequencies['frequency_summation'] >= 2]
print(df_term_frequencies.iloc[:])

        Tweet 1  Tweet 2  Tweet 3  Tweet 4  Tweet 5  Tweet 6  Tweet 7  \
0000          0        0        0        0        0        0        0   
001           0        0        0        0        0        0        0   
0011          0        0        0        0        0        0        0   
001116        0        0        0        0        0        0        0   
0025          0        0        0        0        0        0        0   
...         ...      ...      ...      ...      ...      ...      ...   
zztb          0        0        0        0        0        0        0   
zzxgh         0        0        0        0        0        0        0   
zzxt          0        0        0        0        0        0        0   
zzzg          0        0        0        0        0        0        0   
zzzz          0        0        0        0        0        0        0   

        Tweet 8  Tweet 9  Tweet 10  ...  Tweet 7605  Tweet 7606  Tweet 7607  \
0000          0        0         0  ...     

In [35]:
# Split up the matrix
df_tf_idf.drop(df_tf_idf.columns[0], axis=1, inplace=True)
features = df_tf_idf    # feature matrix
labels = target_df['target']              # target feature
df_tf_idf.head()

Unnamed: 0,0000,001,0011,001116,0025,005225,007,00c,00kj,01,...,zz4,zzaes,zzcb,zzi,zzk,zztb,zzxgh,zzxt,zzzg,zzzz
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [36]:
# Using Skicit-learn to split data into training and testing sets
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = 0.25, random_state = 42)

print('These are the features and the labels:\n')
print('train features:')
print(train_features)
print('test features:')
print(test_features)
print('train labels:')
print(train_labels)
print('test labels:')
print(test_labels)

These are the features and the labels:

train features:
      0000  001  0011  001116  0025  005225  007  00c  00kj   01  ...  zz4  \
5151   0.0  0.0   0.0     0.0   0.0     0.0  0.0  0.0   0.0  0.0  ...  0.0   
6351   0.0  0.0   0.0     0.0   0.0     0.0  0.0  0.0   0.0  0.0  ...  0.0   
3443   0.0  0.0   0.0     0.0   0.0     0.0  0.0  0.0   0.0  0.0  ...  0.0   
7164   0.0  0.0   0.0     0.0   0.0     0.0  0.0  0.0   0.0  0.0  ...  0.0   
7037   0.0  0.0   0.0     0.0   0.0     0.0  0.0  0.0   0.0  0.0  ...  0.0   
...    ...  ...   ...     ...   ...     ...  ...  ...   ...  ...  ...  ...   
5226   0.0  0.0   0.0     0.0   0.0     0.0  0.0  0.0   0.0  0.0  ...  0.0   
5390   0.0  0.0   0.0     0.0   0.0     0.0  0.0  0.0   0.0  0.0  ...  0.0   
860    0.0  0.0   0.0     0.0   0.0     0.0  0.0  0.0   0.0  0.0  ...  0.0   
7603   0.0  0.0   0.0     0.0   0.0     0.0  0.0  0.0   0.0  0.0  ...  0.0   
7270   0.0  0.0   0.0     0.0   0.0     0.0  0.0  0.0   0.0  0.0  ...  0.0   

      z

# Wrapper method

In [None]:
# Wrapper method 1 with k range - selma

from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline


sfs = SFS(RandomForestClassifier(), 
           k_features=(3, 15),
           forward=True, 
           floating=False, 
           scoring='accuracy',
           cv=5)

pipe = make_pipeline(StandardScaler(), sfs)

pipe.fit(train_features, train_labels)

print('best combination (ACC: %.3f): %s\n' % (sfs.k_score_, sfs.k_feature_idx_))
sfs.k_feature_names_

In [75]:
# Wrapper method 2 - selma
sfs = SFS(RandomForestClassifier(),
          k_features=2828,
          forward=True,
          floating=False,
          scoring = 'accuracy',
          cv = 0)

sfs.fit(train_features, train_labels)

#print('best combination (ACC: %.3f): %s\n' % (sfs.k_score_, sfs.k_feature_idx_))
print('best combination (ACC: %.3f)' % (sfs.k_score_))
#sfs.k_feature_names_     # to get the final set of features

best combination (ACC: 0.000)



STOPPING EARLY DUE TO KEYBOARD INTERRUPT...

# Filter method

In [None]:
#Remove the constants, does not run, first fix data above
#I think tweets must be preprocessed first though
constant_features = [var for var in train_features.columns if train_features[var].std() == 0] 

no_constant_train = train_features.drop(labels=constant_features, axis=1, inplace=True)
no_constant_test = test_features.drop(labels=constant_features, axis=1, inplace=True) 
 

print(no_constant_train)
#no_constant_train.shape

In [40]:
#Removing Quasi-constant features,does not run
from sklearn.feature_selection import VarianceThreshold, SelectKBest

# Define the threshold as 0.01
quasi_remover = VarianceThreshold(threshold=0.01)
# Find the values with low variance
quasi_remover.fit(no_constant_train) 
sum(quasi_remover.get_support())
# Apply to datasets
no_quasi_train = quasi_remover.transform(no_constant_train)
no_quasi_test = quasi_remover.transform(no_constant_test)

no_quasi_train.shape, no_quasi_test.shape

ValueError: Expected 2D array, got scalar array instead:
array=nan.
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [46]:
#REMOVE DUPLICATE FEATURES
duplFeatures = []
for i in range(0, len(train_features.columns)):
    oneCol = train_features.columns[i]
for othCol in train_features.columns[i + 1:]:
    if train_features[oneCol].equals(train_features[othCol]):
            duplFeatures.append(othCol)
            
no_dupl = train_features.drop(labels=duplFeatures, axis=1, inplace=True)
#X_test.drop(labels=duplFeatures, axis=1, inplace=True)

print(no_dupl)
no_dupl.shape #, X_test.shape

None


AttributeError: 'NoneType' object has no attribute 'shape'

In [None]:
#REMOVE CORRELATED FEATURES
correl_Feat = set() 
correl_matrix = train_features.corr()
    
for i in range(len(corr_matrix.columns)):
   for j in range(i):
       if abs(correl_matrix.iloc[i, j]) > 0.8:
           colName = correl_matrix.columns[i]  
           correl_Feat.add(colname)
            
no_correl = train_features.drop(labels=correl_Feat, axis=1, inplace=True)

print(no_correl)
no_correl.shape

In [None]:
#after each filter or removing of features,
#a copy can be made and stored to later compare performance. We can check performance of original list,
#vs after removing correlated vs after removing quasi constants.

# Classifier

In [None]:
# FOR WRAPPER METHOD - GENERATE THE NEW TRAIN AND TEST DATAFRAMES BASED ON SELECTED FEATURES

# Note that the transform call is equivalent to
# features_train[:, sfs.k_feature_idx_]

features_train_sfs = sfs.transform(X_train)
features_test_sfs = sfs.transform(X_test)
print(features_train_sfs)
print(features_test_sfs)

# Fit the estimator using the new feature subset
# and make a prediction on the test data
model = RandomForestClassifier()
model.fit(features_train_sfs, labels_train)
labels_pred = model.predict(features_test_sfs)

# Compute the accuracy of the prediction
acc = float((labels_test == labels_pred).sum()) / labels_pred.shape[0]
print('Test set accuracy: %.2f %%' % (acc * 100))

In [None]:
# RandomForest - train the model and test, report score
from sklearn.ensemble import RandomForestClassifier
print('\nThis is RandomForest score:')
model = RandomForestClassifier()
model.fit(train_features, train_labels)
print(model.score(test_features, test_labels))

In [None]:
# Linear Regression - train the model and test, report score
from sklearn.linear_model import LinearRegression
print('\nThis is Linear Regression score:')
model2 = LinearRegression()
model2.fit(train_features, train_labels)
model2.score(test_features, test_labels)