In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sn

from pprint import pprint

from sklearn.metrics import auc, roc_curve, plot_roc_curve
from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import ComplementNB

import gc
import string
import re

# Reading the dataset

In [None]:
# Load train.csv and test.csv from the Kaggle input directory into DataFrames.
train = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')
test = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')

In [None]:
# Report the (rows, columns) shape of each frame as a quick sanity check on the load.
print('train shape:', train.shape)
print('test shape:', test.shape)

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Preview the first rows of the test frame.
test.head()

# Data preprocessing

Dealing with missing data

In [None]:
# Replace missing keyword/location values with empty strings so that the string
# concatenation building `final_text` does not propagate NaN.
#
# The result is assigned back instead of calling `fillna(..., inplace=True)` on a
# column selection: that form is a chained in-place operation, which emits a
# warning on pandas 2.x and does not update the parent frame under Copy-on-Write.
train[['keyword', 'location']] = train[['keyword', 'location']].fillna('')
test[['keyword', 'location']] = test[['keyword', 'location']].fillna('')

In [None]:
# Build one text field per row by joining keyword, text and location with single
# spaces; this combined column is what gets cleaned and vectorized further down.
train['final_text'] = train['keyword'] + ' ' + train['text'] + ' ' + train['location']
test['final_text'] = test['keyword'] + ' ' + test['text'] + ' ' + test['location']

Cleaning the text 

In [None]:
def remove_URL(text):
    """Return ``text`` with every ``http://``, ``https://`` or ``www.`` link removed.

    A link is taken to run up to the next whitespace character; the whitespace
    around it is left in place.
    """
    return re.sub(r'https?://\S+|www\.\S+', '', text)


def remove_html(text):
    """Return ``text`` with HTML tags and character entities deleted.

    Removes ``<...>`` tags (shortest match) and lowercase entities of the forms
    ``&name;``, ``&#123;`` and ``&#x1f;``. Matching is case-sensitive.
    """
    tag_or_entity = r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});'
    return re.compile(tag_or_entity).sub('', text)


def remove_punct(text):
    """Return ``text`` with every character of ``string.punctuation`` deleted."""
    # Mapping a code point to None tells str.translate to drop that character.
    drop_map = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(drop_map)

In [None]:
# Run the three cleaners over the combined text, in the same order for both frames:
# URLs first, then HTML tags/entities, and punctuation last, since stripping
# punctuation earlier would remove the characters the first two patterns match on.
train['final_text'] = (
    train['final_text']
    .apply(remove_URL)
    .apply(remove_html)
    .apply(remove_punct)
)

test['final_text'] = (
    test['final_text']
    .apply(remove_URL)
    .apply(remove_html)
    .apply(remove_punct)
)

# Applying vectorization to our data

In [None]:
# Token-count vectorizer created with all-default settings.
count_vectorizer = CountVectorizer()

In [None]:
# Fit the vectorizer on the cleaned training text and encode that text in one step.
train_vectors = count_vectorizer.fit_transform(train["final_text"])

In [None]:
# Encode the test text with the already-fitted vectorizer (transform only, no refit).
test_vectors = count_vectorizer.transform(test["final_text"])

In [None]:
# Labels for the training rows, taken from the 'target' column.
y = train['target']

In [None]:
# The raw frames are not referenced again in the cells below; drop the names and
# request a garbage collection pass to release their memory.
del train, test
gc.collect()

# Using Complement Naive Bayes Classifier

In [None]:
# Hold out 33% of the labelled rows for validation; random_state pins the shuffle
# so the split is reproducible.
#
# The count matrix is passed as-is rather than through .toarray(): train_test_split
# and ComplementNB both accept SciPy sparse input, so densifying only costs memory
# (one stored cell per document/vocabulary-term pair).
X_train, X_test, y_train, y_test = train_test_split(
    train_vectors, y, test_size=0.33, random_state=42)

In [None]:
# The full training matrix has been split above; drop the name and collect garbage.
del train_vectors
gc.collect()

In [None]:
# Complement Naive Bayes classifier created with default hyperparameters.
clf = ComplementNB()

In [None]:
# Fit the classifier on the training split.
clf.fit(X_train, y_train)

In [None]:
# The training split is not used after fitting; drop the names and collect garbage.
del X_train, y_train
gc.collect()

In [None]:
# Hard class-label predictions for the held-out split.
pred = clf.predict(X_test)

# Measuring performance

In [None]:
# Score the held-out split with the predicted probability of the second class
# rather than the hard 0/1 labels. roc_curve sweeps thresholds over a continuous
# score; hard labels collapse the curve to a single operating point, so the printed
# AUC would disagree with the curve plot_roc_curve draws from the classifier's scores.
# Column 1 corresponds to clf.classes_[1] -- assumes a binary 0/1 target where 1 is
# the positive label; confirm against the training data.
y_score = clf.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_score)
print(auc(fpr, tpr))

# NOTE(review): plot_roc_curve was removed in scikit-learn 1.2. On newer versions
# use RocCurveDisplay.from_estimator(clf, X_test, y_test) and update the import cell.
plot_roc_curve(clf, X_test, y_test)
plt.show()

In [None]:
# Evaluation is done; drop the held-out split and collect garbage.
del X_test, y_test
gc.collect()

# Exporting predictions to appropriate submission format

In [None]:
# Predict labels for the competition test set. The sparse matrix is passed directly:
# ComplementNB.predict accepts sparse input, so converting it with .toarray() would
# only allocate a dense copy of the whole document-term matrix.
y_pred = clf.predict(test_vectors)

In [None]:
# Start from the sample submission file, then overwrite its 'target' column with
# the model's predictions.
# NOTE(review): assumes the sample submission rows are in the same order as test.csv
# -- confirm against the competition files.
data = pd.read_csv('/kaggle/input/nlp-getting-started/sample_submission.csv')
data['target'] = y_pred
# `data` is already the result of read_csv; it is wrapped in DataFrame again here
# and the first rows are previewed.
df = pd.DataFrame(data=data)
df.head()

In [None]:
# Write the submission to submission.csv without the DataFrame index column.
df.to_csv('submission.csv', index=False)