# Kaggle Challenge Submission file 

<br>

## **_Word Embeddings + Logistic Regression_**


## **Loading Data**

In [116]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import string
import re

import nltk
nltk.download("stopwords")
nltk.download("punkt")
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB

import warnings

warnings.filterwarnings('ignore')

stopword = set(stopwords.words('english'))
RSEED = 42

[nltk_data] Downloading package stopwords to /Users/xuxu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/xuxu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/xuxu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/xuxu/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/xuxu/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [64]:
# Load the Kaggle test set from a path relative to the notebook; the cells
# below read its `text` column for cleaning and its rows for prediction.
df_test = pd.read_csv("../data/test.csv")

In [65]:
df_test.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


## Data cleaning and feature engineering

In [66]:
def remove_url(text):
    """Return ``text`` with every URL match replaced by an empty string.

    The pattern requires an explicit ``http://`` or ``https://`` scheme, so
    links written without a scheme (e.g. a bare ``www.`` address) are left
    untouched.
    """
    url_pattern = r'(https?:\/\/)(\s)*(www\.)?(\s)*((\w|\s)+\.)*([\w\-\s]+\/)*([\w\-]+)((\?)?[\w\s]*=\s*[\w\%&]*)*'
    return re.sub(url_pattern, '', text)

def remove_emoji(text):
    """Strip every run of characters matched by the symbol class below.

    NOTE(review): the class is far broader than emoji. The two ranges
    U+24C2..U+1F251 and U+10000..U+10FFFF together cover every code point from
    U+24C2 upwards, so most of the narrower ranges listed are redundant and any
    non-emoji character in that span is removed as well. Presumably acceptable
    for this data set; confirm before reusing on multilingual text.
    """
    symbol_class = (
        "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002500-\U00002BEF"  # originally labelled "chinese char"; NOTE(review): label looks wrong, confirm
        u"\U00002702-\U000027B0"
        u"\U00002702-\U000027B0"  # same range repeated; kept so the pattern text is unchanged
        u"\U000024C2-\U0001F251"  # very wide range, see docstring
        u"\U0001f926-\U0001f937"
        u"\U00010000-\U0010ffff"  # every code point above U+FFFF
        u"\u2640-\u2642"
        u"\u2600-\u2B55"
        u"\u200d"
        u"\u23cf"
        u"\u23e9"
        u"\u231a"
        u"\ufe0f"  # originally commented "dingbats"; NOTE(review): comment looks misplaced
        u"\u3030"
        "]+"
    )
    return re.compile(symbol_class, flags=re.UNICODE).sub(r'', text)

def remove_html(text):
    """Delete HTML tags and character entities from ``text``.

    Matches anything between ``<`` and the next ``>``, plus lower-case named
    entities and decimal / hexadecimal numeric entities ending in ``;``.
    """
    markup_pattern = r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});'
    return re.sub(markup_pattern, '', text)

def remove_punctuation(text):
    """Drop every character listed in ``string.punctuation`` from ``text``."""
    # Mapping a code point to None tells str.translate to delete that character.
    deletions = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(deletions)

In [69]:
# Run the cleaners in a fixed order. URLs go first because the URL pattern
# needs the "://" that punctuation removal would strip.
df_test["text_clean"] = (
    df_test["text"]
    .apply(remove_url)
    .apply(remove_emoji)
    .apply(remove_html)
    .apply(remove_punctuation)
)

In [70]:
# Tokenise the cleaned text and lower-case every token in one pass.
df_test["text_token"] = df_test["text_clean"].apply(
    lambda sentence: [token.lower() for token in word_tokenize(sentence)]
)
# Keep only the tokens that are not in the English stop-word set.
df_test["text_final"] = df_test["text_token"].apply(
    lambda tokens: [token for token in tokens if token not in stopword]
)
# Tag the remaining tokens with their part of speech as (word, tag) pairs.
df_test["pos_tags"] = df_test["text_final"].apply(nltk.tag.pos_tag)

In [72]:
def convert_to_wordnet(tag):
    """Map a POS tag to a WordNet part-of-speech constant by its first letter.

    Tags starting with 'J', 'V' or 'R' map to ``wordnet.ADJ``, ``wordnet.VERB``
    and ``wordnet.ADV`` respectively. Every other tag, including those starting
    with 'N', maps to ``wordnet.NOUN``.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('R', wordnet.ADV),
    )
    for prefix, pos in prefix_to_pos:
        if tag.startswith(prefix):
            return pos
    # Nouns and all unrecognised tags share the noun fallback.
    return wordnet.NOUN

# Re-label each (word, POS tag) pair with the WordNet constant the lemmatiser takes.
df_test["wordnet_tag"] = df_test["pos_tags"].apply(
    lambda tagged: [(token, convert_to_wordnet(pos)) for token, pos in tagged]
)

In [73]:
wnl = WordNetLemmatizer()

# Lemmatise every (word, wordnet_pos) pair.
df_test["lemmatize"] = df_test["wordnet_tag"].apply(
    lambda pairs: [wnl.lemmatize(token, pos) for token, pos in pairs]
)
# Second stop-word pass over the lemmas (presumably because a lemma can itself
# be a stop word even when the inflected form was not).
df_test["lemmatize"] = df_test["lemmatize"].apply(
    lambda lemmas: [lemma for lemma in lemmas if lemma not in stopword]
)
# Join the lemmas back into one space-separated string per row.
df_test["text_lemma"] = df_test["lemmatize"].apply(
    lambda lemmas: ' '.join(str(lemma) for lemma in lemmas)
)

In [74]:
# Length features. Suffix 1 is computed on the cleaned text, suffix 2 on the
# lemmatised text.
# Character counts.
df_test["char_count1"] = df_test["text_clean"].map(len)
df_test["char_count2"] = df_test["text_lemma"].map(len)
# Whitespace-separated word counts.
df_test['word_count1'] = df_test['text_clean'].map(lambda sentence: len(sentence.split()))
df_test['word_count2'] = df_test['text_lemma'].map(lambda sentence: len(sentence.split()))
# Mean word length. np.mean of an empty list gives NaN for rows with no words.
df_test['mword_leng1'] = df_test['text_clean'].map(
    lambda sentence: np.mean([len(word) for word in sentence.split()])
)
df_test['mword_leng2'] = df_test['text_lemma'].map(
    lambda sentence: np.mean([len(word) for word in sentence.split()])
)

In [75]:
df_test.head(20)

Unnamed: 0,id,keyword,location,text,text_clean,text_token,text_final,pos_tags,wordnet_tag,lemmatize,text_lemma,char_count1,char_count2,word_count1,word_count2,mword_leng1,mword_leng2
0,0,,,Just happened a terrible car crash,Just happened a terrible car crash,"[just, happened, a, terrible, car, crash]","[happened, terrible, car, crash]","[(happened, VBN), (terrible, JJ), (car, NN), (...","[(happened, v), (terrible, a), (car, n), (cras...","[happen, terrible, car, crash]",happen terrible car crash,34,25,6,4,4.833333,5.5
1,2,,,"Heard about #earthquake is different cities, s...",Heard about earthquake is different cities sta...,"[heard, about, earthquake, is, different, citi...","[heard, earthquake, different, cities, stay, s...","[(heard, RB), (earthquake, NN), (different, JJ...","[(heard, r), (earthquake, n), (different, a), ...","[heard, earthquake, different, city, stay, saf...",heard earthquake different city stay safe ever...,61,50,9,7,5.888889,6.285714
2,3,,,"there is a forest fire at spot pond, geese are...",there is a forest fire at spot pond geese are ...,"[there, is, a, forest, fire, at, spot, pond, g...","[forest, fire, spot, pond, geese, fleeing, acr...","[(forest, JJS), (fire, NN), (spot, NN), (pond,...","[(forest, a), (fire, n), (spot, n), (pond, n),...","[forest, fire, spot, pond, geese, flee, across...",forest fire spot pond geese flee across street...,94,51,19,9,4.0,4.777778
3,9,,,Apocalypse lighting. #Spokane #wildfires,Apocalypse lighting Spokane wildfires,"[apocalypse, lighting, spokane, wildfires]","[apocalypse, lighting, spokane, wildfires]","[(apocalypse, NN), (lighting, VBG), (spokane, ...","[(apocalypse, n), (lighting, v), (spokane, n),...","[apocalypse, light, spokane, wildfire]",apocalypse light spokane wildfire,37,33,4,4,8.5,7.5
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan,Typhoon Soudelor kills 28 in China and Taiwan,"[typhoon, soudelor, kills, 28, in, china, and,...","[typhoon, soudelor, kills, 28, china, taiwan]","[(typhoon, NN), (soudelor, NN), (kills, VBZ), ...","[(typhoon, n), (soudelor, n), (kills, v), (28,...","[typhoon, soudelor, kill, 28, china, taiwan]",typhoon soudelor kill 28 china taiwan,45,37,8,6,4.75,5.333333
5,12,,,We're shaking...It's an earthquake,Were shakingIts an earthquake,"[were, shakingits, an, earthquake]","[shakingits, earthquake]","[(shakingits, NNS), (earthquake, NN)]","[(shakingits, n), (earthquake, n)]","[shakingits, earthquake]",shakingits earthquake,29,21,4,2,6.5,10.0
6,21,,,They'd probably still show more life than Arse...,Theyd probably still show more life than Arsen...,"[theyd, probably, still, show, more, life, tha...","[theyd, probably, still, show, life, arsenal, ...","[(theyd, NN), (probably, RB), (still, RB), (sh...","[(theyd, n), (probably, r), (still, r), (show,...","[theyd, probably, still, show, life, arsenal, ...",theyd probably still show life arsenal yesterd...,68,54,12,9,4.75,5.111111
7,22,,,Hey! How are you?,Hey How are you,"[hey, how, are, you]",[hey],"[(hey, NN)]","[(hey, n)]",[hey],hey,15,3,4,1,3.0,3.0
8,27,,,What a nice hat?,What a nice hat,"[what, a, nice, hat]","[nice, hat]","[(nice, JJ), (hat, NN)]","[(nice, a), (hat, n)]","[nice, hat]",nice hat,15,8,4,2,3.0,3.5
9,29,,,Fuck off!,Fuck off,"[fuck, off]",[fuck],"[(fuck, NN)]","[(fuck, n)]",[fuck],fuck,8,4,2,1,3.5,4.0


In [77]:
# Manual patch of row 13's lemmatised text.
# NOTE(review): presumably this row's lemma came out empty after stop-word
# removal and would otherwise produce no embedding; confirm.
# .loc is used instead of chained indexing (df['col'][13] = ...), which writes
# through a temporary Series and is not guaranteed to update df_test.
# The length features computed earlier from `text_lemma` are not recomputed
# here, so they still describe the unpatched value for this row.
df_test.loc[13, 'text_lemma'] = 'what if'

## **Building the Word Embeddings Model**

In [78]:
import spacy

In [79]:
# Load the preprocessed training frame; later cells read its `text_lemma`,
# `char_count2` and `target` columns.
# NOTE(review): unpickling can execute arbitrary code, so only load pickle
# files that were produced by a trusted run of this project.
df_train = pd.read_pickle("../data/preprocess_train.pkl")

In [83]:
# spaCy pipeline that sent_vectorizer calls to obtain per-token vectors.
nlp = spacy.load('en_core_web_lg')

def sent_vectorizer(sent):
    """Embed a sentence as the element-wise sum of its spaCy token vectors.

    Parameters
    ----------
    sent : str
        Text to run through the module-level ``nlp`` pipeline.

    Returns
    -------
    numpy.ndarray
        Sum of ``token.vector`` over all tokens of ``sent``. When the text
        yields no tokens, a zero vector of length ``vocab.vectors_length`` is
        returned, so the per-sentence results can still be stacked into one
        2-D feature matrix (assumes the loaded pipeline ships static vectors,
        i.e. ``vectors_length`` equals the token vector width).
    """
    doc = nlp(sent)
    token_vectors = [token.vector for token in doc]

    if not token_vectors:
        # The old code returned an empty array here, which has a different
        # length from every other sentence vector and breaks scaling/PCA.
        return np.zeros(doc.vocab.vectors_length, dtype="float32")

    # Accumulate left to right, exactly as before, so sums are unchanged.
    # The former bare `except: pass` is gone: it would have silently dropped
    # tokens and hidden real errors.
    sent_vec = token_vectors[0]
    for vec in token_vectors[1:]:
        sent_vec = np.add(sent_vec, vec)

    return np.asarray(sent_vec)
  
# Embed every lemmatised sentence; each list holds one vector per row, in row order.
X_test = [sent_vectorizer(sentence) for sentence in df_test['text_lemma']]

X_train = [sent_vectorizer(sentence) for sentence in df_train['text_lemma']]

In [84]:
# Report how many sentence vectors were produced for each split.
for label, vectors in (('Number of X_train: ', X_train), ('Number of X_test: ', X_test)):
    print(label, len(vectors))

Number of X_train:  7613
Number of X_test:  3263


### **_PCA for Dimension Reduction_**

In [98]:
# Standardise the embeddings with StandardScaler,
# then reduce their dimensionality with PCA

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Learn the scaling on the training embeddings only, then apply it to both splits.
scaler = StandardScaler().fit(X_train)
X_scaled_train = scaler.transform(X_train)
X_scaled_test = scaler.transform(X_test)

# Same pattern for PCA: fit on train, project test with the fitted components.
pca_ = PCA(n_components=0.99, random_state=RSEED)
X_pca_train = pca_.fit_transform(X_scaled_train)
X_pca_test = pca_.transform(X_scaled_test)

In [99]:
# Wrap the PCA components in DataFrames with string column labels ("0", "1", ...).
# `char_count2` is appended below; mixing integer and string labels in one
# frame makes recent scikit-learn releases reject the input with a TypeError
# during feature-name validation.
features_train = pd.DataFrame(X_pca_train).rename(columns=str)
features_test = pd.DataFrame(X_pca_test).rename(columns=str)

# The PCA rows are in the same order as the source frames, so align by position:
# resetting the index keeps concat from matching rows on whatever index labels
# the source frame happens to carry.
df_new_train = pd.concat(
    [features_train, df_train['char_count2'].reset_index(drop=True)], axis=1
)
df_new_test = pd.concat(
    [features_test, df_test['char_count2'].reset_index(drop=True)], axis=1
)

In [100]:
features_train.shape

(7613, 249)

In [101]:
features_test.shape

(3263, 249)

In [102]:
df_new_test.shape

(3263, 250)

In [103]:
df_new_train.shape

(7613, 250)

In [119]:
# Fit the classifier on the full training feature frame.
clf_ed_pca = LogisticRegression(max_iter=500, random_state=RSEED).fit(
    df_new_train, df_train["target"]
)

# Scored on the same rows the model was fitted on, so this is training-set
# accuracy rather than a held-out estimate.
accuracy = clf_ed_pca.score(df_new_train, df_train["target"])
print(accuracy)

0.8121634047024826


In [105]:
# Predict a label for every test row; the result follows df_new_test's row order.
prediction = clf_ed_pca.predict(df_new_test)

In [106]:
# Load the submission template; its `target` column is overwritten below.
df_sub = pd.read_csv('../data/sample_submission.csv')

In [107]:
df_sub.shape

(3263, 2)

In [108]:
# `prediction` follows the row order of df_test and is assigned by position.
# Check that the template lists the same ids in the same order, so a reordered
# or mismatched template fails loudly instead of producing a wrong submission.
if df_sub['id'].tolist() != df_test['id'].tolist():
    raise ValueError("sample_submission.csv ids do not line up with test.csv ids")
df_sub['target'] = prediction

In [111]:
df_sub.head(10)

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1
5,12,1
6,21,0
7,22,0
8,27,0
9,29,0


In [112]:
# Write the submission file; index=False keeps the row index out of the CSV.
df_sub.to_csv("../data/submission.csv", index=False)

In [113]:
# Read the written file back so the next cell can display what was saved.
sub = pd.read_csv("../data/submission.csv")

In [114]:
sub.head(10)

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1
5,12,1
6,21,0
7,22,0
8,27,0
9,29,0
