In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Training Data

In [None]:
# Load the labelled training split; later cells read its `text`, `keyword`,
# `location` and `target` columns.
train_df = pd.read_csv('../input/nlp-getting-started/train.csv')

In [None]:
# Column dtypes and non-null counts, to spot columns with missing values.
train_df.info()

In [None]:
# Display the raw training frame for a first visual inspection.
train_df

In [None]:
# Character count of every text entry. The vectorised `.str.len()` accessor
# replaces `.apply(len)` and yields NaN instead of raising on a missing value.
train_df['text length'] = train_df['text'].str.len()

In [None]:
# Preview the first rows, now including the derived 'text length' column.
train_df.head()

In [None]:
# One facet per `target` value, each showing a 35-bin histogram of text length,
# so the length distributions of the two classes can be compared side by side.
g = sns.FacetGrid(data=train_df, col='target')
g.map(sns.histplot, 'text length', bins=35)

In [None]:
# Number of training rows without a location.
train_df['location'].isna().sum()

In [None]:
# Number of training rows without a keyword.
train_df['keyword'].isna().sum()

In [None]:
# Distinct keyword values, in order of first appearance.
pd.unique(train_df['keyword'])

In [None]:
# Class balance: number of rows per `target` value.
sns.countplot(x='target', data=train_df)

In [None]:
# Keywords contain URL-encoded spaces; turn '%20' back into a real space.
# `regex=False` states the literal-match intent explicitly, so the result does
# not depend on the pandas-version default for `regex`.
train_df['keyword'] = train_df['keyword'].str.replace('%20', ' ', regex=False)

In [None]:
# Count of distinct keywords after the '%20' normalisation
# (`nunique()` ignores missing values by default).
train_df['keyword'].nunique()

In [None]:
# Rows labelled with target 0.
train_df.loc[train_df['target'] == 0]

In [None]:
# Rows labelled with target 1.
train_df.loc[train_df['target'] == 1]

In [None]:
# Impute missing keywords with the most frequent keyword of the training set.
train_df['keyword'] = train_df['keyword'].fillna(
    value=train_df['keyword'].mode()[0]
)

In [None]:
# Impute missing locations with the most frequent location of the training set.
train_df['location'] = train_df['location'].fillna(
    value=train_df['location'].mode()[0]
)

In [None]:
# Visual check of the boolean missing-value mask after imputation:
# any remaining NaN shows up as a highlighted cell.
plt.figure(figsize=(12, 12))
sns.heatmap(data=train_df.isnull())

In [None]:
# Single model input: text, location and keyword joined by single spaces.
train_df['new text'] = train_df['text'].str.cat(
    [train_df['location'], train_df['keyword']],
    sep=' ',
)

In [None]:
# Replace '#' with a space in the combined text.
# `regex=False` makes the literal replacement explicit and independent of the
# pandas-version default for `regex`.
train_df['new text'] = train_df['new text'].str.replace('#', ' ', regex=False)

In [None]:
# Preview the training frame with the derived 'text length' and 'new text' columns.
train_df.head()

# Test Data

In [None]:
# Load the test split; later cells read its `text`, `keyword` and `location` columns.
test_df = pd.read_csv('../input/nlp-getting-started/test.csv')

In [None]:
# Display the raw test frame for a first visual inspection.
test_df

In [None]:
# Impute missing test locations with the most frequent *training* location.
# The training frame is filled with that same value, so "location missing" is
# encoded by one identical token in both splits; using the test set's own mode
# could introduce a different placeholder that the model never saw as such.
test_df['location'] = test_df['location'].fillna(train_df['location'].mode()[0])

In [None]:
# Impute missing test keywords with the most frequent *training* keyword, the
# same placeholder used for the training frame, so both splits encode a missing
# keyword identically instead of depending on the test set's own mode.
test_df['keyword'] = test_df['keyword'].fillna(train_df['keyword'].mode()[0])

In [None]:
# Turn URL-encoded spaces ('%20') in the keywords back into real spaces.
# `regex=False` states the literal-match intent explicitly, so the result does
# not depend on the pandas-version default for `regex`.
test_df['keyword'] = test_df['keyword'].str.replace('%20', ' ', regex=False)

In [None]:
# Count of distinct keywords in the test split after imputation and the
# '%20' normalisation.
test_df['keyword'].nunique()

In [None]:
# Remaining missing values per column of the test split.
test_df.isna().sum()

In [None]:
# Visual check of the boolean missing-value mask of the test split:
# any remaining NaN shows up as a highlighted cell.
plt.figure(figsize=(12, 12))
sns.heatmap(data=test_df.isnull())

In [None]:
# Single model input: text, location and keyword joined by single spaces
# (same construction as for the training frame).
test_df['new text'] = test_df['text'].str.cat(
    [test_df['location'], test_df['keyword']],
    sep=' ',
)

In [None]:
# Replace '#' with a space in the combined text.
# `regex=False` makes the literal replacement explicit and independent of the
# pandas-version default for `regex`.
test_df['new text'] = test_df['new text'].str.replace('#', ' ', regex=False)

In [None]:
# Preview the test frame with the derived 'new text' column.
test_df.head()

# Modelling

In [None]:
# Final look at the training frame that feeds the model ('new text' -> 'target').
train_df.head()

In [None]:
# Final look at the test frame whose 'new text' column will be scored.
test_df.head()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

In [None]:
# Model inputs are the combined text columns; the label is `target`.
X_train, y_train = train_df['new text'], train_df['target']
X_test = test_df['new text']

In [None]:
# Bag-of-words encoding: learn the vocabulary on the training text only, then
# apply that same vocabulary to the test text.
# The raw text is read from the DataFrames rather than from X_train/X_test,
# which this cell overwrites with the count matrices; that keeps the cell
# re-runnable (a second run would otherwise pass matrices to the vectorizer).
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(train_df['new text'])
X_test = vectorizer.transform(test_df['new text'])

In [None]:
# Multinomial Naive Bayes fitted on the vectorized training counts.
nb = MultinomialNB()
nb.fit(X=X_train, y=y_train)

In [None]:
# Text-classification pipeline: raw strings in, class predictions out.
pipeline = Pipeline(steps=[
    ('bow', CountVectorizer()),       # raw strings -> bag-of-words token counts
    ('classifier', MultinomialNB()),  # Naive Bayes on those raw counts (no TF-IDF step)
])

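The `X_train`/`X_test` variables from the earlier cells can't be reused here because they now hold vectorized count matrices. The pipeline runs its own `CountVectorizer`, so it needs the raw text again.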

In [None]:
# Point X_train / X_test back at the raw text columns; the pipeline does its
# own vectorization and therefore expects strings, not count matrices.
X_train, X_test = (frame['new text'] for frame in (train_df, test_df))
y_train = train_df['target']

In [None]:
# Fit vectorizer and classifier together on the raw training text.
pipeline.fit(X=X_train, y=y_train)

In [None]:
# Predicted labels for the test text (despite its name, `y_test` holds
# predictions, not ground truth).
y_test = pipeline.predict(X=X_test)

In [None]:
# Fill the sample-submission template with the predictions and write it out.
submission_df = pd.read_csv('../input/nlp-getting-started/sample_submission.csv')

# The predictions are assigned by position, so the template rows must follow
# the same id order as the test frame; fail loudly instead of silently
# attaching predictions to the wrong ids.
if not np.array_equal(submission_df['id'].to_numpy(), test_df['id'].to_numpy()):
    raise ValueError('sample_submission.csv ids do not match the row order of test.csv')

submission_df['target'] = y_test
submission_df.to_csv('./submission.csv', index=False)

In [None]:
# Sanity check: read the file that was just written back from disk and display it.
submission_df = pd.read_csv('submission.csv')
submission_df