In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Show where the notebook is running from ...
working_dir = os.getcwd()
print(working_dir)

# ... and which entries are mounted under the input directory.
input_entries = os.listdir('/kaggle/input/')
print(input_entries)

## *Import libraries*

In [None]:
from nltk import TweetTokenizer
from nltk.stem import WordNetLemmatizer, PorterStemmer
from sklearn import svm
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt
from collections import Counter

%matplotlib inline

# *Data Manipulation*

In [None]:
# Read the training and test CSV files into DataFrames.
train_set = pd.read_csv(
    filepath_or_buffer='/kaggle/input/nlp-getting-started/train.csv',
)
test_set = pd.read_csv(
    filepath_or_buffer='/kaggle/input/nlp-getting-started/test.csv',
)

# Last expression of the cell: render the training frame.
train_set

## *Filter data*

In [None]:
# Columns used as model inputs; 'target' is the label column.
feature_columns = ['location', 'keyword', 'text']

# Drop training rows that have no label. dropna returns a new frame, so the
# result has to be bound to a name -- calling it without assignment (as the
# cell did before) filters nothing. train_set itself is left untouched.
train_labelled = train_set.dropna(axis=0, subset=['target'])

# Take explicit copies so that adding columns to these frames later does not
# write into a slice of the source frame (SettingWithCopyWarning).
train_x = train_labelled[feature_columns].copy()
train_y = train_labelled['target']

test_x = test_set[feature_columns].copy()

# Sanity checks. The tuple on the left must be parenthesised: without the
# parentheses everything after the comma is parsed as the assert *message*,
# and the check can never fail.
assert (len(train_x.columns), len(test_x.columns)) == (3, 3), \
    "train_x and test_x must each contain exactly the three feature columns"
assert len(train_x) == len(train_y), "features and labels must have the same number of rows"

In [None]:
# Preview the first rows of the selected feature columns.
train_x.head()

## *Cleaning data*

## *Tokenize*

<div style="width:100%; text-align:center">
<img align=middle src=https://cdn.analyticsvidhya.com/wp-content/uploads/2019/11/tokenization.png alt="tokenize" width=400px>
</div>

In [None]:
tokenizer = TweetTokenizer()

# Build one token list per entry of the 'text' column.
tokens = list(map(tokenizer.tokenize, train_x['text']))

# assign() returns a new frame, so train_x itself keeps its original columns.
train_x_tk = train_x.assign(tokens=tokens)

train_x_tk

## *Stem and Lemmatize*

<div style="width:100%; text-align:center">
<img align=middle src=https://miro.medium.com/max/1400/1*ES5bt7IoInIq2YioQp2zcQ.png alt="stem vs lemmatize" width=500px>
</div>

In [None]:
# Shared normaliser instances; lemmatize_stem_item below reads both of these
# module-level names.
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

# Turn tokens into lemmatized/stemmed strings
def lemmatize_stem_item(item):
    """Normalise a sequence of tokens and join them into one string.

    Each token is first passed through ``lemmatizer.lemmatize`` and the result
    is then passed through ``stemmer.stem``.

    Parameters
    ----------
    item : iterable of str
        The tokens of a single text.

    Returns
    -------
    str
        The processed tokens joined by single spaces (an empty string when
        ``item`` is empty).
    """
    return " ".join(
        stemmer.stem(lemmatizer.lemmatize(token))
        for token in item
    )

# The 'stemmed' marker column makes this cell safe to re-run: once it exists,
# the tokens column already holds joined strings rather than token lists, so
# the conversion must not be applied a second time.
if 'stemmed' not in train_x_tk:
    train_x_tk['tokens'] = [
        lemmatize_stem_item(token_list) for token_list in train_x_tk['tokens']
    ]
    train_x_tk['stemmed'] = True

train_x_tk

## *Vectorize*

<div style="width:100%; text-align:center">
<img align=middle src=https://user.oc-static.com/upload/2020/10/23/16034397439042_surfin%20bird%20bow.png alt="vectorization table" width=600px>
</div>

In [None]:
vectorizer = CountVectorizer()

# Fit the vocabulary on the processed training strings and encode them in one
# step; the fitted vectorizer is reused later to encode the test data.
train_x_vectors = vectorizer.fit_transform(train_x_tk['tokens'])

train_x_vectors

## *Train*

In [None]:
clf = svm.SVC(kernel='linear')

# Score the classifier with cross-validation and report the mean over folds.
cv_scores = cross_val_score(clf, train_x_vectors, train_y)
print("Mean accuracy in validation set: ", cv_scores.mean())

In [None]:
# Fit the classifier on the whole training matrix; this fitted clf is the
# one used for the test-set predictions below.
clf.fit(train_x_vectors, train_y)

## *Predict*

In [None]:
def extract_data(df):
    """Encode the ``text`` column of ``df`` as count vectors.

    Every text is tokenized with ``tokenizer``, normalised with
    ``lemmatize_stem_item`` and finally encoded with ``vectorizer.transform``
    (``transform``, not ``fit_transform``, so the vocabulary fitted on the
    training data is reused).

    The input frame is not modified. Intermediate values are kept in a local
    list instead of being written into a ``tokens`` column, because ``df`` may
    be a slice of another frame and assigning to it would raise pandas'
    SettingWithCopyWarning and alter the caller's data.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``text`` column.

    Returns
    -------
    The value returned by ``vectorizer.transform``: one row per row of ``df``.
    """
    processed_texts = [
        lemmatize_stem_item(tokenizer.tokenize(text))
        for text in df['text']
    ]
    return vectorizer.transform(processed_texts)

In [None]:
# Encode the test features, then predict one label per test row.
test_x_vectors = extract_data(test_x)
result = clf.predict(test_x_vectors)

result

## *Visualize*

In [None]:
fig, ax = plt.subplots(figsize=(4, 4), dpi=100)

# Count how often each class was predicted. Indexing a Counter yields 0 for a
# key that never occurred, whereas .get() yields None -- and a None wedge size
# makes ax.pie fail when the model predicts only one of the two classes.
class_counts = Counter(result)
y_plt = [class_counts[0], class_counts[1]]

ax.set_title('Predicted classes on the test set')
ax.set_xticks([0, 1])
ax.set_xlabel('target')
ax.set_ylabel('occurrence')
ax.pie(y_plt, explode=(0, 0.1), labels=('class 0', 'class 1'), autopct='%.2f%%', shadow=True)

plt.show()

## *Configure Output*

In [None]:
# Submission table: the test ids alongside the predicted target for each row.
final = test_set[['id']].assign(target=result)

final

In [None]:
# Write the id/target table to submission.csv in the current working
# directory; index=False keeps the DataFrame index out of the file.
final.to_csv('submission.csv', index=False)