In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of each file found.
for root, _, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the three CSV files from the nlp-getting-started input folder.
train, test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../input/nlp-getting-started/train.csv",
        "../input/nlp-getting-started/test.csv",
        "../input/nlp-getting-started/sample_submission.csv",
    )
)

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Preview the first rows of the test frame.
test.head()

In [None]:
# Distinct values found in each column of the training frame ...
print(train.apply(lambda column: column.unique()))
# ... followed by how many distinct values each column has.
print(train.nunique())

In [None]:
!pip install spacy -q
!python -m spacy download en_core_web_sm -q

# spaCy pipeline starter code - https://www.dataquest.io/blog/tutorial-text-classification-in-python-using-spacy/

# Imports

In [None]:
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
import string
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English
import spacy
import en_core_web_sm
import re

# Custom transformer using spaCy

In [None]:
from sklearn.model_selection import train_test_split

# Model input: the text column with the keyword and location columns appended,
# separated by single spaces.
# NOTE(review): astype(str) typically renders missing keyword/location values as
# the literal string "nan", which then becomes part of the text - confirm that
# is intended.
X = train['text'] + ' ' +  train['keyword'].astype(str) + ' ' +  train['location'].astype(str) # the features we want to analyze
ylabels = train['target'] # the labels, or answers, we want to test against

# Hold out 30% of the rows for evaluation. Fixing random_state makes the split
# (and every metric computed from it) identical on each Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(X, ylabels, test_size=0.3, random_state=42)

In [None]:
# Eyeball a slice of the training inputs to sanity-check the concatenated text.
# NOTE(review): a plain integer slice with [] on a Series is presumably
# positional here rather than label-based - confirm if exact rows matter.
X_train[100:500]
#type(X_train[1])
#y_train[:100]

In [None]:

# ASCII punctuation characters; spacy_tokenizer uses them to filter tokens.
punctuations = string.punctuation
# Only lemmas are read from the parsed documents, so the dependency parser and
# NER components are disabled: nothing in this notebook consumes their output,
# and they would otherwise run on every document during fit/predict.
nlp = spacy.load('en_core_web_sm', disable=["parser", "ner"])
stop_words = spacy.lang.en.stop_words.STOP_WORDS
# Blank English pipeline (tokenizer only, no tagger/parser/NER/vectors). It is
# only referenced from commented-out code in this notebook.
parser = English()

def spacy_tokenizer(sentence):
    """Tokenize one document into lower-cased lemmas for the vectorizers.

    The input is coerced to ``str`` and run through the module-level ``nlp``
    pipeline. Each token is replaced by its stripped, lower-cased lemma, or by
    its lower-cased surface form when the lemma is the "-PRON-" placeholder.
    Empty tokens, stop words and tokens made up solely of punctuation
    characters are dropped.

    Args:
        sentence: The document to tokenize; any object accepted by ``str()``.

    Returns:
        list[str]: The remaining tokens, in document order.
    """
    doc = nlp(str(sentence))
    lemmas = [
        word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_
        for word in doc
    ]
    # `punctuations` is a plain string, so `token not in punctuations` is a
    # substring test: it lets runs such as "..." or "!!" through and only drops
    # empty tokens by accident. Check the token's characters individually and
    # reject empty tokens explicitly instead.
    return [
        token
        for token in lemmas
        if token
        and token not in stop_words
        and not all(char in punctuations for char in token)
    ]

class predictors(TransformerMixin):
    """Pipeline step that passes every input document through ``clean_text``."""

    def fit(self, X, y=None, **fit_params):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X, **transform_params):
        """Return a list holding the cleaned version of each item of ``X``."""
        return list(map(clean_text, X))

    def get_params(self, deep=True):
        """Report an empty parameter dict; this transformer has no settings."""
        return {}

def clean_text(text):
    """Normalise one document before it is vectorized.

    Args:
        text: The raw document. Non-string values are coerced with ``str()``,
            as ``spacy_tokenizer`` already does, so a stray number or missing
            value does not raise ``AttributeError``.

    Returns:
        str: The text with surrounding whitespace removed, lower-cased.
    """
    return str(text).strip().lower()

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Two vectorizers that share the spaCy tokenizer; only the TF-IDF one is wired
# into the pipeline below.
bow_vector = CountVectorizer(
    tokenizer=spacy_tokenizer,
    ngram_range=(1, 1),
    stop_words=None,
)
tfidf_vector = TfidfVectorizer(
    tokenizer=spacy_tokenizer,
    stop_words=None,
)

# Logistic regression is the active model; RandomForestClassifier is imported
# as an alternative to swap in.
classifier = LogisticRegression()

# Steps run in order: text cleaning -> TF-IDF features -> classifier.
pipe = Pipeline(
    steps=[
        ("cleaner", predictors()),
        ("vectorizer", tfidf_vector),
        ("classifier", classifier),
    ]
)

#clean_text(X_train[1773])
#spacy_tokenizer(X_train[1773])
#mytokens = parser(X_train[1773])

# mytokens = str(X_train[1773])
# #mytokens = re.sub(r'[^A-Za-z0-9 ]+', '', mytokens)
# #mytokens = parser(mytokens)
# mytokens = nlp(mytokens)
# mytokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens ]
# print(mytokens)

In [None]:
# Fit every step of the pipeline on the training split.
pipe.fit(X_train, y_train)

In [None]:
from sklearn import metrics

# Predict labels for the held-out split.
predicted = pipe.predict(X_test)

# Print one score per line, in the order accuracy, precision, recall.
for label, score_fn in (
    ("Accuracy:", metrics.accuracy_score),
    ("Precision:", metrics.precision_score),
    ("Recall:", metrics.recall_score),
):
    print(label, score_fn(y_test, predicted))

In [None]:
# Wrap the held-out predictions in a DataFrame so they can be tabulated and plotted.
predicted_df = pd.DataFrame(predicted)
# Count how often each predicted value occurs.
predicted_df.value_counts()

In [None]:
# Histogram of the predictions stored in predicted_df. The frame's only column
# is the unnamed column 0, so the legend is dropped and the figure is given a
# title and x-axis label so that it can be read on its own.
ax = predicted_df.plot.hist(legend=False, title="Distribution of predicted labels")
ax.set_xlabel("Predicted label");  # trailing ';' hides the text repr of the return value

In [None]:
# Preview the first rows of the predictions frame.
predicted_df.head()

# Generate Submission

In [None]:
# Look at the first rows of the test frame before building the submission.
test.head()

In [None]:
# Preview the frame loaded from sample_submission.csv.
sample_submission.head()

In [None]:
# Display the predictions that were made for the held-out split (X_test).
predicted

In [None]:
# Display the test frame itself.
test

In [None]:
# Build the test inputs: the text, keyword and location columns joined by
# single spaces, then run them through the fitted pipeline.
submission_inputs = (
    test['text']
    + ' ' + test['keyword'].astype(str)
    + ' ' + test['location'].astype(str)
)
my_submission_preds = pipe.predict(submission_inputs)

# Pair each test id with its predicted target.
my_submission = pd.DataFrame({"id": test['id'], 'target': my_submission_preds})

In [None]:
# Preview the first rows of the submission frame.
my_submission.head()

In [None]:
# Number of rows in the submission frame.
len(my_submission)

In [None]:
# Write the submission to submission.csv (path relative to the current working
# directory), leaving out the DataFrame index.
my_submission.to_csv('submission.csv', index=False)