In [None]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt
import random
from nltk.stem.snowball import SnowballStemmer 
from nltk.corpus import stopwords
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
import eli5
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import gensim
from sklearn import manifold
from gensim.models.word2vec import Word2Vec
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

## Load the data and split it

In [None]:
# Load the competition files; only the `text` column is used as input and
# `target` as the label.
test = pd.read_csv('../input/nlp-getting-started/test.csv')
train = pd.read_csv('../input/nlp-getting-started/train.csv')
X = train['text']
y = train['target']
test = test['text']

# Hold out 10% of the labelled data, stratified on the label. The split is
# seeded so that the reported metrics are reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, stratify=y, random_state=0)
# Downstream cleaning helpers iterate over plain Python lists.
X_train, X_test, y_train, y_test = list(X_train), list(X_test), list(y_train), list(y_test)

## Make tokenizer and stemmer for cleaning data

In [None]:
# English stop-word list from NLTK; `tokenizer` drops tokens found in it.
stop = stopwords.words('english')
def tokenizer(text, stop_words=None):
    """Clean each document and split it into non-stop-word tokens.

    For every string in ``text`` the function removes URLs (anything starting
    with ``http`` up to the next whitespace), strips every character that is
    not an ASCII letter or whitespace, splits on whitespace and discards
    stop words.

    Parameters
    ----------
    text : iterable of str
        Raw documents.
    stop_words : iterable of str, optional
        Words to discard. Defaults to the module-level ``stop`` list.
        Matching is case-insensitive; the surviving tokens keep their
        original casing.

    Returns
    -------
    list of list of str
        One token list per input document, in input order.
    """
    if stop_words is None:
        stop_words = stop
    # A lower-cased set gives O(1), case-insensitive membership tests.
    stop_set = {word.lower() for word in stop_words}

    tokenized = []
    for string in text:
        # Remove URLs first: once punctuation and digits are stripped, a URL
        # such as "http://1.2.3.4" would collapse to a bare "http" token that
        # the URL pattern no longer matches.
        string = re.sub(r'http\S+', '', string)
        string = re.sub(r'[^a-z\sA-Z]', '', string)
        tokenized.append([w for w in string.split() if w.lower() not in stop_set])
    return tokenized

# Single Snowball stemmer instance (English) shared by every `stemmer` call.
snow_stemmer = SnowballStemmer(language='english')


def stemmer(text):
    """Stem every token of every tokenized document.

    Parameters
    ----------
    text : iterable of iterable of str
        Tokenized documents (one token sequence per document).

    Returns
    -------
    list of list of str
        The same structure with each token replaced by
        ``snow_stemmer.stem(token)``.
    """
    return [[snow_stemmer.stem(token) for token in tokens] for tokens in text]

# Tokenize and then stem each split.
# NOTE: this rebinds X_train / test / X_test from raw strings to token lists,
# so the cell must be run exactly once after the data-loading cell.
X_train = stemmer(tokenizer(X_train))
test = stemmer(tokenizer(test))
X_test = stemmer(tokenizer(X_test))

# Re-join the stemmed tokens into space-separated strings for the vectorizer.
X_train_corrected = list(map(" ".join, X_train))
X_test_corrected = list(map(" ".join, X_test))
test_corrected = list(map(" ".join, test))


## Compute TF-IDF features

In [None]:
# The vectorizer is told not to lowercase and uses its built-in English
# stop-word list; no custom preprocessor is supplied.
tfidf = TfidfVectorizer(
    lowercase=False,
    stop_words='english',
    preprocessor=None,
)

# Learn the vocabulary / IDF weights on the training split only, then apply
# the fitted transform to the hold-out split and to the competition test set.
X_tfidf_train = tfidf.fit_transform(X_train_corrected)
X_tfidf_test = tfidf.transform(X_test_corrected)
test = tfidf.transform(test_corrected)

# Cell output: (n_samples, n_features) of the train and hold-out matrices.
(X_tfidf_train.shape, X_tfidf_test.shape)

## Find params for logistic regression based on TF-IDF features

In [None]:
# Candidate regularisation strengths and solvers to search over.
param = {
    'C': [0.1, 1, 2, 3],
    'solver': ['lbfgs', 'liblinear'],
}
logit = LogisticRegression(random_state=0)

# Cross-validated grid search scored by F1, using all available cores.
logit_grid = GridSearchCV(
    estimator=logit,
    param_grid=param,
    scoring='f1',
    n_jobs=-1,
)
logit_grid.fit(X_tfidf_train, y_train)

# Cell output: the best parameter combination found.
logit_grid.best_params_

In [None]:
# Build the final classifier from the hyper-parameters selected by the grid
# search (`logit_grid`) rather than a hand-picked C that was never part of the
# searched grid, so the model actually reflects the validated configuration.
logit = LogisticRegression(**logit_grid.best_params_, random_state=0)
logit.fit(X_tfidf_train, y_train)

## Show F1 score and accuracy

In [None]:
# Evaluate on the hold-out split.
y_pred = logit.predict(X_tfidf_test)
print('F1 = ', f1_score(y_true = y_test, y_pred = y_pred))
# precision_score measures precision, not accuracy, so label it accordingly.
print('Precision = ', precision_score(y_true = y_test, y_pred = y_pred))
# Accuracy is the fraction of hold-out samples whose prediction equals the label.
print('Accuracy = ', np.mean(np.asarray(y_pred) == np.asarray(y_test)))

## Now let's show the importance weights of the words

In [None]:
# scikit-learn 1.0 deprecated `get_feature_names()` and 1.2 removed it in
# favour of `get_feature_names_out()`; use whichever this installation has.
feature_name_getter = getattr(tfidf, 'get_feature_names_out', None) or tfidf.get_feature_names
# Show the 20 most positive and 20 most negative feature weights.
eli5.show_weights(estimator=logit, feature_names=list(feature_name_getter()),
                  top=(20, 20))

### Interestingly, the word 'california' has quite a large weight.
### Now we will try to create word vectors using word2vec and visualize them with a 3D scatter plot

In [None]:
# Train skip-gram (sg=1) word2vec embeddings on the stemmed training tokens.
corpus = X_train
# gensim 4 renamed `size` -> `vector_size` and `iter` -> `epochs`; passing the
# old names there raises TypeError, so pick the keyword names by version.
if int(gensim.__version__.split('.')[0]) >= 4:
    w2v_size_kwargs = {'vector_size': 200, 'epochs': 30}
else:
    w2v_size_kwargs = {'size': 200, 'iter': 30}
nlp = gensim.models.word2vec.Word2Vec(corpus, window=6, min_count=1, sg=1,
                                      **w2v_size_kwargs)

### First of all, let's see how well our model understood the text and which words it considers closest in meaning to the word 'fire'.

In [None]:
# Query through the KeyedVectors (`.wv`): the model-level `most_similar`
# is deprecated in gensim 3.x and removed in gensim 4.
nlp.wv.most_similar("fire")

### Not very good. But it still makes sense, because it also shows 'forest' and 'wild'. The model could work better if we cleaned the text more thoroughly and tuned the word2vec parameters.

### Here we just reduce the dimension of the vector embeddings from 200 to 3. These 3 dimensions are the new axes of our 3D scatter plot.

In [None]:
## choose the word 'fire' and collect its 20 nearest neighbours in the embedding space
word = "fire"
# Vector access goes through `.wv`, which works on both gensim 3.x and 4.x.
tot_words = [word] + [tupla[0] for tupla in
                 nlp.wv.most_similar(word, topn=20)]
word_vectors = nlp.wv[tot_words]

## t-SNE (initialised with PCA) to reduce the embedding dimensionality to 3
# Perplexity must be smaller than the number of samples, so cap it; the seed
# makes the layout reproducible between runs.
tsne = manifold.TSNE(perplexity=min(40, len(tot_words) - 1), n_components=3,
                     init='pca', random_state=0)
coords_3d = tsne.fit_transform(word_vectors)

## create data frame with 3 axes; "input" flags the query word (1) vs. neighbours (0)
dtf_ = pd.DataFrame(coords_3d, index=tot_words, columns=["x","y","z"])
dtf_["input"] = (dtf_.index == word).astype(int)

# plot 3d
fig = plt.figure(figsize = (10,8))
ax = fig.add_subplot(111, projection='3d')
neighbour_rows = dtf_[dtf_["input"]==0]
query_rows = dtf_[dtf_["input"]==1]
ax.scatter(neighbour_rows['x'], neighbour_rows['y'], neighbour_rows['z'], c="black")
ax.scatter(query_rows['x'], query_rows['y'], query_rows['z'], c="red")
ax.set(xlabel=None, ylabel=None, zlabel=None, xticklabels=[],
       yticklabels=[], zticklabels=[])
# Annotate every point with its word. Loop variables are named so that they
# do not overwrite the notebook-level `y` (training labels).
for label, (px, py, pz) in dtf_[["x","y","z"]].iterrows():
    ax.text(px, py, pz, s=label)
plt.show()

It seems to me that our vector embeddings reflect the meaning of words fairly well, considering that the model's parameters have not been tuned.

In [None]:
# Predict labels for the competition test set (`test` holds its TF-IDF matrix).
y_pred = logit.predict(test)

# `test` no longer holds the raw frame, so re-read the CSV to recover the ids.
new_id = pd.read_csv('../input/nlp-getting-started/test.csv')
ids = new_id['id'].tolist()

# Write the id/target pairs to the submission file.
final_submission = pd.DataFrame({'id': ids, 'target': y_pred})
final_submission.to_csv('final_submission.csv', index=False)