# NLP Machine Learning Project 2022

Useful links:

https://towardsdatascience.com/natural-language-processing-nlp-for-machine-learning-d44498845d5b
https://www.andyfitzgeraldconsulting.com/writing/keyword-extraction-nlp/

In [None]:
#pip install nltk

In [None]:
#pip install pyspellchecker

In [1]:
import nltk
import pandas as pd
import numpy as np
import string
from nltk.tokenize import word_tokenize, TweetTokenizer
from nltk.corpus import stopwords, wordnet
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfTransformer
import re
from nltk.probability import FreqDist
import matplotlib.pyplot as plt
# from term_frequency import term_frequencies, feature_names, df_term_frequencies

stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
transformer = TfidfTransformer()
tt = TweetTokenizer()

In [2]:
# !pip install wordcloud

In [3]:
# nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('stopwords')

In [4]:
df = pd.read_csv(r'train.csv')

df.replace('NaN', np.NaN, inplace = True)

# to count the number of NaN's in each column, just change the column name in this line to see how many missing values of that
# variable per other column
# print(df[df.keyword.isnull()].count())

df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [5]:
def preprocess_tweets(text):
    
# remove mentions and URLs
    text_noMentionURL = re.sub(r"(?:\@|http?\://|https?\://|www)\S+", "", text)
    text_noMentionURL = " ".join(text_noMentionURL.split())
    
# remove '#' symbols and add space before capital letter
    text_noHash = re.sub(r"([A-Z]+)", r" \1", text_noMentionURL)
    text_noHash = re.sub(r"(#)", "", text_noHash)
    text_noHash = " ".join(text_noHash.split())
    
# remove numbers
    text_noHash = re.sub(r"[0-9]+", "", text_noHash)
    
# remove all other punctuation
    text_noNoise = "".join([char for char in text_noHash if char not in string.punctuation])

    return text_noNoise.lower()


df['tweet_noNoise'] = df["text"].apply(lambda x: preprocess_tweets(x))


def no_smash(text):
    
    no_smash = re.sub(r"(?:(?![aeiou])[a-z]){5}", r"", text)
    no_smash = " ".join(no_smash.split())
    
    return no_smash
    
df['tweet_noNoise'] = df["tweet_noNoise"].apply(lambda x: no_smash(x))

df.head()

Unnamed: 0,id,keyword,location,text,target,tweet_noNoise
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,our deeds are the reason of this earthquake ma...
1,4,,,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask canada
2,5,,,All residents asked to 'shelter in place' are ...,1,all residents asked to shelter in place are be...
3,6,,,"13,000 people receive #wildfires evacuation or...",1,people receive wildfires evacuation orders in ...
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,just got sent this photo from ruby alaska as s...


In [6]:
# lemmatizer with Part-of-Speech (POS)

from nltk.corpus import wordnet
from collections import Counter
from spellchecker import SpellChecker

# this function can be called within lemmatize() to tag the word with its POS
def get_pos(word):
    probable_part_of_speech = wordnet.synsets(word)
  
    pos_counts = Counter()

    pos_counts["n"] = len(  [ item for item in probable_part_of_speech if item.pos()=="n"]  )
    pos_counts["v"] = len(  [ item for item in probable_part_of_speech if item.pos()=="v"]  )
    pos_counts["a"] = len(  [ item for item in probable_part_of_speech if item.pos()=="a"]  )
    pos_counts["r"] = len(  [ item for item in probable_part_of_speech if item.pos()=="r"]  )
  
    most_likely_part_of_speech = pos_counts.most_common(1)[0][0]
    return most_likely_part_of_speech

def reduce_lengthening(text):
    pattern = re.compile(r"(.)\1{2,}")
    return pattern.sub(r"\1\1", text)

# tokenize tweets = divide into words
tokenized_tweets = df.apply(lambda row: tt.tokenize(row['tweet_noNoise']), axis=1)

# lemmatize the list of all tokens
df['lemmatized_list'] = tokenized_tweets.apply(lambda y: [lemmatizer.lemmatize(x, get_pos(x)) for x in y])
df['lemmatized_list'].apply(lambda x: ' '.join([tweet for tweet in x]))

# removes stop words (are, on, in, etc.) MUST COME BEFORE LIST -> STR
stop = set(stopwords.words('english'))
df['lemmatized_list'] = df['lemmatized_list'].apply(lambda x: [y for y in x if y not in stop])

# turn the resulting list back into a string
df['old_processed_tweet'] = df.lemmatized_list.agg(lambda x: ','.join(map(str, x)))

# remove weird characters and 1 character after
df['old_processed_tweet'] = df['old_processed_tweet'].replace({'û.':''}, regex=True)
df['old_processed_tweet'] = df['old_processed_tweet'].replace({'ì.':''}, regex=True)
df['old_processed_tweet'] = df['old_processed_tweet'].replace({'å.':''}, regex=True)
df['old_processed_tweet'] = df['old_processed_tweet'].replace({'â.':''}, regex=True)

df['old_processed_tweet'] = df['old_processed_tweet'].apply(lambda x: [reduce_lengthening(x)])
df['processed_tweet'] = df.old_processed_tweet.agg(lambda x: ','.join(map(str, x)))

df.head()

Unnamed: 0,id,keyword,location,text,target,tweet_noNoise,lemmatized_list,old_processed_tweet,processed_tweet
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,our deeds are the reason of this earthquake ma...,"[deed, reason, earthquake, may, allah, forgive...","[deed,reason,earthquake,may,allah,forgive,u]","deed,reason,earthquake,may,allah,forgive,u"
1,4,,,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask canada,"[forest, fire, near, la, ronge, sask, canada]","[forest,fire,near,la,ronge,sask,canada]","forest,fire,near,la,ronge,sask,canada"
2,5,,,All residents asked to 'shelter in place' are ...,1,all residents asked to shelter in place are be...,"[resident, ask, shelter, place, notify, office...","[resident,ask,shelter,place,notify,officer,eva...","resident,ask,shelter,place,notify,officer,evac..."
3,6,,,"13,000 people receive #wildfires evacuation or...",1,people receive wildfires evacuation orders in ...,"[people, receive, wildfire, evacuation, order,...","[people,receive,wildfire,evacuation,order,cali...","people,receive,wildfire,evacuation,order,calif..."
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,just got sent this photo from ruby alaska as s...,"[get, send, photo, ruby, alaska, smoke, wildfi...","[get,send,photo,ruby,alaska,smoke,wildfire,pou...","get,send,photo,ruby,alaska,smoke,wildfire,pour..."


In [8]:
from spellchecker import SpellChecker
spell = SpellChecker()
def spell_check(text):
    
    text_noMiss = re.sub(text, spell.correction(text), text)
    text_noMiss = " ".join(text_noMiss.split())
    
    return text_noMiss
testing = df.loc[500:550]   
testing.processed_tweet = testing.processed_tweet.apply(lambda txt: spell_check(txt))
testing.head(50)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


Unnamed: 0,id,keyword,location,text,target,tweet_noNoise,lemmatized_list,old_processed_tweet,processed_tweet
500,725,attacked,"LEALMAN, FLORIDA",Christian Attacked by Muslims at the Temple Mo...,1,christian attacked by muslims at the temple mo...,"[christian, attack, muslim, temple, mount, wav...","[christian,attack,muslim,temple,mount,wave,isr...","christian,attack,muslim,temple,mount,wave,isra..."
501,726,attacked,"Los Angeles, CA",@envw98 @NickCoCoFree @JulieDiCaro @jdabe80 Wh...,0,why am i the worst person questioning how juli...,"[worst, person, question, julie, attack, guy, ...","[worst,person,question,julie,attack,guy,empathy]","worst,person,question,julie,attack,guy,empathy"
502,727,attacked,"San Francisco, CA",Kelly Osbourne attacked for racist Donald Trum...,1,kelly osbourne attacked for racist donald trum...,"[kelly, osbourne, attack, racist, donald, trum...","[kelly,osbourne,attack,racist,donald,trump,rem...","kelly,osbourne,attack,racist,donald,trump,rema..."
503,728,attacked,#GDJB #ASOT,@eunice_njoki aiii she needs to chill and answ...,0,aiii she needs to chill and answer calmly its ...,"[aiii, need, chill, answer, calmly, like, shes...","[aii,need,chill,answer,calmly,like,shes,attack]","aii,need,chill,answer,calmly,like,shes,attack"
504,729,attacked,"Groningen, Netherlands, Europe",Christian Attacked by Muslims at the Temple Mo...,1,christian attacked by muslims at the temple mo...,"[christian, attack, muslim, temple, mount, wav...","[christian,attack,muslim,temple,mount,wave,isr...","christian,attack,muslim,temple,mount,wave,isra..."
505,730,attacked,"Livingston, IL U.S.A.",Christian Attacked by Muslims at the Temple Mo...,1,christian attacked by muslims at the temple mo...,"[christian, attack, muslim, temple, mount, wav...","[christian,attack,muslim,temple,mount,wave,isr...","christian,attack,muslim,temple,mount,wave,isra..."
506,731,attacked,Arundel,Christian Attacked by Muslims at the Temple Mo...,1,christian attacked by muslims at the temple mo...,"[christian, attack, muslim, temple, mount, wav...","[christian,attack,muslim,temple,mount,wave,isr...","christian,attack,muslim,temple,mount,wave,isra..."
507,732,attacked,,I attacked Robot-lvl 19 and I've earned a tota...,0,i attacked robotlvl and ive earned a total of ...,"[attack, robotlvl, ive, earn, total, free, sat...","[attack,robotlvl,ive,earn,total,free,satoshis,...","attack,robotlvl,ive,earn,total,free,satoshis,r..."
508,734,attacked,America,Christian Attacked by Muslims at the Temple Mo...,1,christian attacked by muslims at the temple mo...,"[christian, attack, muslim, temple, mount, wav...","[christian,attack,muslim,temple,mount,wave,isr...","christian,attack,muslim,temple,mount,wave,isra..."
509,735,attacked,"Anna Maria, FL",@christinalavv @lindsay_wynn3 I just saw these...,0,i just saw these tweets and i feel really atta...,"[saw, tweet, feel, really, attack]","[saw,tweet,feel,really,attack]","saw,tweet,feel,really,attack"


In [10]:
# enter whatever word you'd like to see how lemmatizing works
print(lemmatizer.lemmatize('weeds'))

weed


In [11]:
# first attempts at tf-idf
# input needs to exclude mentions & hashtags, be in lower case, remove punct, lemmatized

from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(norm=None)
tfidf_scores = vectorizer.fit_transform(df.processed_tweet)

feature_names = vectorizer.get_feature_names()

# get corpus index
corpus_index = [f"Tweet {i+1}" for i in range(len(df.processed_tweet))]

# create pandas DataFrame with tf-idf scores
df_tf_idf = pd.DataFrame(tfidf_scores.T.todense(), index=feature_names, columns=corpus_index)
# df_tf_idf.reset_index()

df_tf_idf = df_tf_idf.T.reset_index()

print(df_tf_idf)



         level_0   aa  aahh  aal  aall  aamp  aan  aand  aar  aashiqui  ...  \
0        Tweet 1  0.0   0.0  0.0   0.0   0.0  0.0   0.0  0.0       0.0  ...   
1        Tweet 2  0.0   0.0  0.0   0.0   0.0  0.0   0.0  0.0       0.0  ...   
2        Tweet 3  0.0   0.0  0.0   0.0   0.0  0.0   0.0  0.0       0.0  ...   
3        Tweet 4  0.0   0.0  0.0   0.0   0.0  0.0   0.0  0.0       0.0  ...   
4        Tweet 5  0.0   0.0  0.0   0.0   0.0  0.0   0.0  0.0       0.0  ...   
...          ...  ...   ...  ...   ...   ...  ...   ...  ...       ...  ...   
7608  Tweet 7609  0.0   0.0  0.0   0.0   0.0  0.0   0.0  0.0       0.0  ...   
7609  Tweet 7610  0.0   0.0  0.0   0.0   0.0  0.0   0.0  0.0       0.0  ...   
7610  Tweet 7611  0.0   0.0  0.0   0.0   0.0  0.0   0.0  0.0       0.0  ...   
7611  Tweet 7612  0.0   0.0  0.0   0.0   0.0  0.0   0.0  0.0       0.0  ...   
7612  Tweet 7613  0.0   0.0  0.0   0.0   0.0  0.0   0.0  0.0       0.0  ...   

      zonesthank  zonewolf  zoom  zotar  zouma  zqp

In [12]:
# put targets into their own dataframe so i can merge them with tf-idf scores
target_df = pd.DataFrame()
target_df['target'] = df.target
print(target_df)

      target
0          1
1          1
2          1
3          1
4          1
...      ...
7608       1
7609       1
7610       1
7611       1
7612       1

[7613 rows x 1 columns]


In [13]:
# IGNORE FOR RIGHT NOW
# don't think we need to remove stop words with tf-idf but keep here

#stop = set(stopwords.words('english'))

# removes 'stop words' such as 'the', 'are', etc. it knows these stop words where i defined 'stop' variable, comes from a library
#df['tweet_stop'] = df['lemmatized_tweet'].apply(lambda x: [y for y in x if y not in stop])

# hashtags
#df['hashtag'] = df.text.apply(lambda x: re.findall(r"#(\w+)", x))

#df.head()

In [14]:
# IGNORE FOR RIGHT NOW

#df.tweet_stop = df.tweet_stop.apply(lambda x: [' '.join(str(y)) for y in x])
#print(df.tweet_stop)

In [15]:
# IGNORE FOR RIGHT NOW - THIS IS THE COUNT VECTORIZER FOR IF WE WANT TO REMOVE KEYBOARD SMASHES LATER

from sklearn.feature_extraction.text import CountVectorizer

# remove terms that appear only once

vectorizer = CountVectorizer()
term_frequencies = vectorizer.fit_transform(df.processed_tweet)

# get vocabulary of terms
feature_names = vectorizer.get_feature_names()
corpus_index = [f"Tweet {i+1}" for i in range(len(df.processed_tweet))]

# create pandas DataFrame with term frequencies
df_term_frequencies = pd.DataFrame(term_frequencies.T.todense(), index=feature_names, columns=corpus_index)

df_term_frequencies['frequency_summation'] = df_term_frequencies.iloc[:].sum(axis=1)
# df_term_frequencies = df_term_frequencies[df_term_frequencies['frequency_summation'] >= 2]
print(df_term_frequencies.iloc[:])



        Tweet 1  Tweet 2  Tweet 3  Tweet 4  Tweet 5  Tweet 6  Tweet 7  \
aa            0        0        0        0        0        0        0   
aahh          0        0        0        0        0        0        0   
aal           0        0        0        0        0        0        0   
aall          0        0        0        0        0        0        0   
aamp          0        0        0        0        0        0        0   
...         ...      ...      ...      ...      ...      ...      ...   
zqp           0        0        0        0        0        0        0   
zr            0        0        0        0        0        0        0   
zumiez        0        0        0        0        0        0        0   
zurich        0        0        0        0        0        0        0   
zz            0        0        0        0        0        0        0   

        Tweet 8  Tweet 9  Tweet 10  ...  Tweet 7605  Tweet 7606  Tweet 7607  \
aa            0        0         0  ...     

In [13]:
# Split up the matrix
df_tf_idf.drop(df_tf_idf.columns[0], axis=1, inplace=True)
features = df_tf_idf                      # feature matrix
labels = target_df['target']              # target feature
features.head()

Unnamed: 0,aa,aahh,aal,aall,aamp,aan,aand,aar,aashiqui,aat,...,zonesthank,zonewolf,zoom,zotar,zouma,zqp,zr,zumiez,zurich,zz
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [53]:
# Using Skicit-learn to split data into training and testing sets
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = 0.25, random_state = 42)

print('These are the features and the labels:\n')
print('train features:')
print(train_features)
print('test features:')
print(test_features)
print('train labels:')
print(train_labels)
print('test labels:')
print(test_labels)

#only to see if filters later work
train_features.shape
#test_features.shape

These are the features and the labels:

train features:
       aa  aahh  aal  aall  aamp  aan  aand  aar  aashiqui  aat  ...  \
5151  0.0   0.0  0.0   0.0   0.0  0.0   0.0  0.0       0.0  0.0  ...   
6351  0.0   0.0  0.0   0.0   0.0  0.0   0.0  0.0       0.0  0.0  ...   
3443  0.0   0.0  0.0   0.0   0.0  0.0   0.0  0.0       0.0  0.0  ...   
7164  0.0   0.0  0.0   0.0   0.0  0.0   0.0  0.0       0.0  0.0  ...   
7037  0.0   0.0  0.0   0.0   0.0  0.0   0.0  0.0       0.0  0.0  ...   
...   ...   ...  ...   ...   ...  ...   ...  ...       ...  ...  ...   
5226  0.0   0.0  0.0   0.0   0.0  0.0   0.0  0.0       0.0  0.0  ...   
5390  0.0   0.0  0.0   0.0   0.0  0.0   0.0  0.0       0.0  0.0  ...   
860   0.0   0.0  0.0   0.0   0.0  0.0   0.0  0.0       0.0  0.0  ...   
7603  0.0   0.0  0.0   0.0   0.0  0.0   0.0  0.0       0.0  0.0  ...   
7270  0.0   0.0  0.0   0.0   0.0  0.0   0.0  0.0       0.0  0.0  ...   

      zonesthank  zonewolf  zoom  zotar  zouma  zqp   zr  zumiez  zurich   zz  

(5709, 11573)

# Wrapper method

In [None]:
# Wrapper method 1 with k range - selma

from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline


sfs = SFS(RandomForestClassifier(), 
           k_features=(3, 15),
           forward=True, 
           floating=False, 
           scoring='accuracy',
           cv=5)

pipe = make_pipeline(StandardScaler(), sfs)

pipe.fit(train_features, train_labels)

print('best combination (ACC: %.3f): %s\n' % (sfs.k_score_, sfs.k_feature_idx_))
sfs.k_feature_names_

In [None]:
# Wrapper method 2 - selma
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
sfs = SFS(RandomForestClassifier(),
          k_features=100,
          forward=True,
          floating=False,
          scoring = 'accuracy',
          cv = 0)

sfs.fit(train_features, train_labels)

#print('best combination (ACC: %.3f): %s\n' % (sfs.k_score_, sfs.k_feature_idx_))
print('best combination (ACC: %.3f)' % (sfs.k_score_))
#sfs.k_feature_names_     # to get the final set of features

# Filter method

In [54]:
#Removing constant features
from sklearn.feature_selection import VarianceThreshold

#Create constant filter and apply to training features
constant_filter = VarianceThreshold(threshold=0)
constant_filter.fit(train_features)

len(train_features.columns[constant_filter.get_support()])

constant_columns = [column for column in train_features.columns
                    if column not in train_features.columns[constant_filter.get_support()]]
print(constant_columns)

train_features.drop(labels=constant_columns, axis=1, inplace=True)
test_features.drop(labels=constant_columns, axis=1, inplace=True)

train_features.shape

['aand', 'aashiqui', 'aayf', 'abes', 'abeyth', 'aboard', 'absence', 'accidentalprophecy', 'accidentwho', 'accuracy', 'acquiesce', 'actavis', 'actin', 'acura', 'adamantly', 'addiction', 'adjuster', 'adoptive', 'aeg', 'affic', 'afloat', 'aggarwal', 'agnivesh', 'agt', 'aguero', 'ahahahga', 'ahamedis', 'ahrar', 'aii', 'airbullet', 'airhead', 'airhorns', 'airlift', 'akilah', 'akrams', 'akwa', 'alaskaseafood', 'albertsons', 'alchemist', 'alexandrian', 'alexis', 'ali', 'alice', 'allied', 'alloosh', 'almighty', 'alois', 'alternate', 'aluminum', 'alwx', 'ambition', 'ambleside', 'ameribag', 'amicos', 'amicospizzato', 'amiddleaged', 'ampamp', 'ampask', 'ampstart', 'ampwanted', 'amreading', 'amritsar', 'anakin', 'anders', 'ani', 'animation', 'anime', 'annonymous', 'anonymous', 'anthology', 'antifeminist', 'antiochus', 'anu', 'anxietyproblems', 'anytime', 'apiece', 'appropriate', 'appropriation', 'april', 'archetype', 'areal', 'aredeluged', 'argsuppose', 'arian', 'ariana', 'ariz', 'arra', 'arrange'

(5709, 9832)

In [55]:
#Removing Quasi-constant features
# Define the threshold as 0.01 and create the quasi constant filter
q_constant_remover = VarianceThreshold(threshold=0.01)

#Apply filter to the training set
q_constant_remover.fit(train_features) 

#See that the training set now only contains non-constant and non-quasi features
train_features = q_constant_remover.transform(train_features)
test_features = q_constant_remover.transform(test_features)

train_features.shape

(5709, 9832)

In [20]:
#Removing duplicates has been removed as it already should be done with tf-idf

In [56]:
#REMOVE CORRELATED FEATURES
train_features= pd.DataFrame(train_features)
test_features= pd.DataFrame(test_features)

correlated_features = set()
correlation_matrix = train_features.corr()
    
for i in range(len(correlation_matrix .columns)):
    for j in range(i):
        if abs(correlation_matrix.iloc[i, j]) > 0.9:
            colname = correlation_matrix.columns[i]
            correlated_features.add(colname)
            
train_features.drop(labels=correlated_features, axis=1, inplace=True)
test_features.drop(labels=correlated_features, axis=1, inplace=True)

train_features.shape

(5709, 6998)

# Classifier

In [None]:
# FOR WRAPPER METHOD - GENERATE THE NEW TRAIN AND TEST DATAFRAMES BASED ON SELECTED FEATURES

# Note that the transform call is equivalent to
# features_train[:, sfs.k_feature_idx_]

features_train_sfs = sfs.transform(train_features)
features_test_sfs = sfs.transform(test_features)
print(features_train_sfs)
print(features_test_sfs)

# Fit the estimator using the new feature subset
# and make a prediction on the test data
model = RandomForestClassifier()
model.fit(features_train_sfs, train_labels)
labels_pred = model.predict(features_test_sfs)

# Compute the accuracy of the prediction
acc = float((test_labels == labels_pred).sum()) / labels_pred.shape[0]
print('Test set accuracy: %.2f %%' % (acc * 100))

In [57]:
# RandomForest - train the model and test, report score
from sklearn.ensemble import RandomForestClassifier
print('\nThis is RandomForest score:')
model = RandomForestClassifier()
model.fit(train_features, train_labels)
print(model.score(test_features, test_labels))


This is RandomForest score:
0.7841386554621849


In [None]:
# Logistic Regression - train the model and test, report score
from sklearn.linear_model import LogisticRegression
print('\nThis is Logistic Regression score:')
model2 = LogisticRegression()
model2.fit(train_features, train_labels)
model2.score(test_features, test_labels)