In [398]:
import pandas as pd
from textblob import TextBlob, Word
from nltk.stem.snowball import SnowballStemmer
import string
from spellchecker import SpellChecker
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import nltk
from sklearn.linear_model import LogisticRegression
import featuretools as ft
from featuretools.nlp_primitives import DiversityScore, LSA, MeanCharactersPerWord, PartOfSpeechCount, PolarityScore, PunctuationCount, StopwordCount, TitleWordCount, UniversalSentenceEncoder, UpperCaseCount
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

nltk.download('punkt')
stemmer = SnowballStemmer('english')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/steven.glembocki/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [399]:
def word_tokenize(text, how = 'lemma'):
    words = TextBlob(text).words
    if how == 'lemma':
        return [word.lemmatize() for word in words]
    elif how == 'stem':
        return [stemmer.stem(word) for word in words]
    
def remove_url(text):
    url = re.compile(r'https?://\S+|www\.\S+')
    return url.sub(r'', text).strip()

def remove_html(text):
    html = re.compile(r'<.*?>')
    return html.sub(r'', text)

def remove_emoji(text):
    emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        "]+", flags = re.UNICODE)
    return emoji_pattern.sub(r'', text)

def remove_punct(text):
    table=str.maketrans('', '', string.punctuation)
    return text.translate(table).strip()

spell = SpellChecker()

def correct_spellings(text):
    corrected_text = []
    misspelled_words = spell.unknown(text.split())
    for word in text.split():
        if word in misspelled_words:
            corrected_text.append(spell.correction(word))
        else:
            corrected_text.append(word)
    return " ".join(corrected_text)

In [400]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
sample = pd.read_csv('sample_submission.csv')

In [401]:
sample.head()

Unnamed: 0,id,target
0,0,0
1,2,0
2,3,0
3,9,0
4,11,0


In [402]:
submission = pd.DataFrame(test['id'])
submission['target'] = 0
submission.to_csv('submission.csv', index = False)
submission.head()

Unnamed: 0,id,target
0,0,0
1,2,0
2,3,0
3,9,0
4,11,0


In [403]:
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [404]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
id          7613 non-null int64
keyword     7552 non-null object
location    5080 non-null object
text        7613 non-null object
target      7613 non-null int64
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [405]:
train['text'] = train['text'].str.replace(' AHHHH ', ' ahh ')
train['text'] = train['text'].str.replace(' AHHhhh ', ' ahh ')
train['text'] = train['text'].str.replace('AYHHHHHDJJFJRJJRDJJEKS', '')
train['text'] = train['text'].str.replace(' Aaaaaand ', ' and ')
train['text'] = train['text'].str.replace('  ', ' ')
train['text'] = train['text'].str.replace(' bc ', ' because ')
train['text'] = train['text'].str.replace('#RT ', '')
train['text'] = train['text'].str.replace('RT ', '')
train['text'] = train['text'].str.replace('&amp', '')
train['text'] = train['text'].str.replace('\x89Û', '')
train['text'] = train['text'].str.replace('\x9d', '')
train['text'] = train['text'].str.replace('\x89ã¢', '')
train['text'] = train['text'].str.replace('\x89âÂ', '')
train['text'] = train['text'].str.replace('÷', '')
train['text'] = train['text'].str.replace('Ï', '')
train['text'] = train['text'].str.replace('ª', '')
train['text'] = train['text'].str.replace('åÊ', ' ')
train['text'] = train['text'].str.replace('¢', '')
train['text'] = train['text'].str.replace(' ur ', ' your ')
train['text'] = train['text'].str.replace(' u ', ' you ')
train['text'] = train['text'].str.replace('Ì', 'i')
train['text'] = train['text'].str.replace('inbetween', 'in between')
train['text'] = train['text'].str.replace('Damnnnn', 'damn')
train['text'] = train['text'].str.replace(' hwy ', ' highway ')
train['text'] = train['text'].str.replace(' Hwy ', ' highway ')
train['text'] = train['text'].str.replace('LOLOL', 'lol')
train['text'] = train['text'].str.replace('LOOOOOOL', 'lol')
train['text'] = train['text'].str.replace('LOOOOOOOOOOOOL', 'lol')
train['text'] = train['text'].str.replace('yourboyshawn', 'your boy shawn')
train['text'] = train['text'].str.replace('WorldNews', 'world news')
train['text'] = train['text'].str.replace('Worldnews', 'world news')
train['text'] = train['text'].str.replace('WhiteHouse', 'White House')
train['text'] = train['text'].str.replace('Whitehouse', 'White House')
train['text'] = train['text'].str.replace('WorstSummerJob', 'worst summer job')
train['text'] = train['text'].str.replace('wrongperson', 'wrong person')
train['text'] = train['text'].str.replace('wrongway', 'wrong way')
train['text'] = train['text'].str.replace('volcanotornado', 'volcano tornado')
train['text'] = train['text'].apply(lambda x: remove_url(x))
train['text'] = train['text'].apply(lambda x: remove_html(x))
train['text'] = train['text'].apply(lambda x: remove_emoji(x))
train['text'] = train['text'].apply(lambda x: remove_punct(x))
train['text'] = train['text'].apply(lambda x: re.sub(r'[^a-zA-Z\']', ' ', x)) #
train['text'] = train['text'].apply(lambda x: re.sub(r'[^\x00-\x7F]+', ' ', x)) #
train['text'] = train['text'].apply(lambda x: ''.join([i for i in x if not i.isdigit()]))
train['text'] = train['text'].apply(lambda x: re.sub(r'\bt.co/\w+', '', x))
#train['text'] = train['text'].apply(lambda x: correct_spellings(x))
train['text'] = train['text'].str.lower() #

In [406]:
remove_html('hello &amp')

'hello &amp'

In [407]:
print(re.sub(r'\b@/\w+', '', 'hello world @remove'))

hello world @remove


In [408]:
train = train.fillna('na')
test = test.fillna('na')

del train['keyword']
del train['location']
del test['keyword']
del test['location']

train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 3 columns):
id        7613 non-null int64
text      7613 non-null object
target    7613 non-null int64
dtypes: int64(2), object(1)
memory usage: 178.6+ KB


In [409]:
#train2 = train.copy()
#del train2['target']
#entities = {
    #'text' : (train2, 'id')
#}

#trans = [DiversityScore, LSA, MeanCharactersPerWord,
    #PartOfSpeechCount, PunctuationCount, StopwordCount,
    #TitleWordCount, UpperCaseCount]

#features, feature_names = ft.dfs(entities = entities, target_entity = 'text', 
    #trans_primitives = trans, verbose = True,
    #features_only = False, max_depth = 1)

In [410]:
#entities2 = {
    #'text2' : (test, 'id')
#}

#trans2 = [DiversityScore, LSA, MeanCharactersPerWord,
    #PartOfSpeechCount, PunctuationCount, StopwordCount,
    #TitleWordCount, UpperCaseCount]

#features2, feature_names2 = ft.dfs(entities = entities2, target_entity = 'text2', 
    #trans_primitives = trans2, verbose = True,
    #features_only = False, max_depth = 1)

In [411]:
#print(features.shape)
#print(features2.shape)

In [412]:
#pol = PolarityScore()
#train['polarity'] = train['text'].apply(lambda x: pol(x))
#test['polarity'] = test['text'].apply(lambda x: pol(x))

In [413]:
from nltk.corpus import stopwords
import nltk
stop_words = stopwords.words('english')
stop_words[:5]

['i', 'me', 'my', 'myself', 'we']

In [414]:
X, y = train.drop('target', axis = 1), train['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 1)

In [418]:
#insantiate and fit transform tfidf vectorizer (term frequency) 20214 20129 20092 20088 20086 20084 20030

scores = []

for i in range(5000, 25000, 1000):
    vect = CountVectorizer(
        #analyzer = lambda x: word_tokenize(x, how = 'stem'),
        stop_words = stop_words,
        strip_accents = 'unicode',
        max_features = i,
        lowercase = True, #not working
        ngram_range = (1, 1) #not working
    )

    #create document term matrices

    train_dtm = pd.DataFrame(vect.fit_transform(X_train['text']).todense(), columns = vect.get_feature_names())
    test_dtm = pd.DataFrame(vect.transform(X_test['text']).todense(), columns = vect.get_feature_names())

    #ensure columns match

    #print(train_dtm.shape)
    #print(test_dtm.shape)

    logreg = LogisticRegression(solver = 'lbfgs', multi_class = 'ovr')
    logreg.fit(train_dtm, y_train)
    preds = pd.DataFrame(logreg.predict(test_dtm), columns = ['target'])
    scores.append(i)
    print(str(i) + " = " + str(accuracy_score(y_test, preds)))

#8004201680672269 @10,000 CV lemma
#803046218487395 @9000 Tfidf lemma
#805672268907563 @7000 Tfidf stem

5000 = 0.7904411764705882
6000 = 0.7883403361344538
7000 = 0.7909663865546218
8000 = 0.7888655462184874
9000 = 0.789390756302521
10000 = 0.7904411764705882
11000 = 0.7909663865546218
12000 = 0.7941176470588235
13000 = 0.7935924369747899
14000 = 0.792016806722689
15000 = 0.792016806722689
16000 = 0.792016806722689
17000 = 0.792016806722689
18000 = 0.792016806722689


KeyboardInterrupt: 

In [416]:
print(train_dtm.shape)
print(test_dtm.shape)

(5709, 13901)
(1904, 13901)


In [370]:
vect = TfidfVectorizer(
    stop_words = stop_words,
    lowercase = True, #not working
    ngram_range = (1, 1) #not working
)

s = ['I went to the store', 'the', 'the cat is sleeping']
dtm = pd.DataFrame(vect.fit_transform(s).todense(), columns = vect.get_feature_names())
dtm

Unnamed: 0,cat,sleeping,store,went
0,0.0,0.0,0.707107,0.707107
1,0.0,0.0,0.0,0.0
2,0.707107,0.707107,0.0,0.0


In [357]:
print(vect.get_feature_names())



In [338]:
#insantiate and fit transform tfidf vectorizer (term frequency)

#vect = TfidfVectorizer(
    #analyzer = lambda x: word_tokenize(x, how = 'stem'),
    #ngram_range = (2, 2),
    #stop_words = 'english',
    #strip_accents = 'unicode'
#)

#create document term matrices

#train_dtm2 = pd.DataFrame(vect.fit_transform(train['keyword']).todense(), columns = vect.get_feature_names())
#test_dtm2 = pd.DataFrame(vect.transform(test['keyword']).todense(), columns = vect.get_feature_names())

#ensure columns match

#print(train_dtm2.shape)
#print(test_dtm2.shape)

In [339]:
#insantiate and fit transform tfidf vectorizer (term frequency)

#vect = TfidfVectorizer(
    #analyzer = lambda x: word_tokenize(x, how = 'stem'),
    #ngram_range = (2, 2),
    #stop_words = 'english',
    #strip_accents = 'unicode'
#)

#create document term matrices

#train_dtm3 = pd.DataFrame(vect.fit_transform(train['location']).todense(), columns = vect.get_feature_names())
#test_dtm3 = pd.DataFrame(vect.transform(test['location']).todense(), columns = vect.get_feature_names())

#ensure columns match

#print(train_dtm3.shape)
#print(test_dtm3.shape)

In [340]:
#train_dtm.reset_index(drop = True, inplace = True)
#test_dtm.reset_index(drop = True, inplace = True)
#features.reset_index(drop = True, inplace = True)
#features2.reset_index(drop = True, inplace = True)

In [341]:
#train_dtm_test = pd.concat([train_dtm, features], axis = 1)
#test_dtm_test = pd.concat([test_dtm, features2], axis = 1)

In [271]:
logreg = LogisticRegression()
logreg.fit(train_dtm, y_train)
preds = pd.DataFrame(logreg.predict(test_dtm), columns = ['target'])
print(accuracy_score(y_test, preds))

0.7967436974789915




In [419]:
vect = TfidfVectorizer(
    #analyzer = lambda x: word_tokenize(x, how = 'lemma'),
    stop_words = 'english',
    strip_accents = 'unicode',
    lowercase = True, #not working
    ngram_range = (1, 1) #not working
)

train_dtm = pd.DataFrame(vect.fit_transform(train['text']).todense(), columns = vect.get_feature_names())
test_dtm = pd.DataFrame(vect.transform(test['text']).todense(), columns = vect.get_feature_names())

logreg = LogisticRegression(solver = 'lbfgs')
logreg.fit(train_dtm, train['target'])
preds = pd.DataFrame(logreg.predict(test_dtm), columns = ['target'])

In [420]:
test_ids = test['id']
result = pd.DataFrame(test_ids, columns = ['id'])
result['target'] = preds

In [421]:
result.to_csv('submission.csv', index = False)