In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

import re

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the competition's train and test splits, in that order.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/nlp-getting-started/train.csv',
        '../input/nlp-getting-started/test.csv',
    )
)

In [None]:
# Replace every missing value in both frames with the literal string 'NaN',
# so later string operations and dictionary lookups never see a float NaN.
train_data, test_data = (
    frame.fillna('NaN') for frame in (train_data, test_data)
)

In [None]:
# Preview the first five training rows.
train_data.head(n = 5)

In [None]:
# Show the raw text of the third training tweet.
train_data['text'].iloc[2]

In [None]:
# Patterns are compiled once and reused for every tweet.
link = re.compile(r'http')        # the substring 'http' anywhere in the tweet
caps = re.compile(r'[A-Z]{3,9}')  # a run of at least three capital letters


def _contains(texts, pattern):
    """Return a 0/1 Series flagging the entries of ``texts`` that match ``pattern``.

    Parameters
    ----------
    texts : pandas.Series of str
        The strings to inspect.
    pattern : compiled regular expression
        Matched anywhere in each string via ``pattern.search``.
    """
    return texts.apply(lambda text: 0 if pattern.search(text) is None else 1)


# Both flags are listed in `features`, which is also used to index test_data
# at prediction time, so they are derived for both frames. They must be
# computed while the text still has its original casing: `caps` can never
# match text that has been lower-cased.
for frame in (train_data, test_data):
    frame['contains_link'] = _contains(frame.text, link)
    frame['contains_caps'] = _contains(frame.text, caps)

print(train_data.contains_link.value_counts())
train_data.groupby(['contains_link', 'contains_caps']).target.value_counts(normalize = True)

In [None]:
# How often each (user-supplied) location string occurs in the training set.
train_data['location'].value_counts()

In [None]:
# Within each keyword, the share of tweets carrying each target value.
train_data.groupby('keyword')['target'].value_counts(sort = False, normalize = True)

Strategies for prediction
1. Use only the 'keyword' column for prediction.
2. Use only the tweet 'text' column for prediction, with MultinomialNB.
3. Combine both (how to combine?)

In [None]:
# Per-keyword class proportions in long form: one row per (keyword, target)
# pair, indexed by the target value. Kept because later cells read `table_1`.
table = train_data.groupby('keyword').target.value_counts(normalize = True, sort = False).reset_index(level = [0])

table_1 = table[table.index == 1]
table_0 = table[table.index == 0]

# keyword -> share of training tweets with that keyword that have target == 1.
# For a 0/1 target the group mean is exactly that share, and it is 0 for
# keywords that never occur with target == 1. Taking it straight from the
# groupby avoids looking up the value column of `table` by name; pandas >= 2.0
# names that column 'proportion' rather than 'target', which made the previous
# row-by-row lookup raise KeyError.
keyword_dict = train_data.groupby('keyword').target.mean().to_dict()



In [None]:
# Spot check: the stored target == 1 share for the keyword 'wreckage'.
keyword_dict['wreckage']

In [None]:
# Keywords that occur at least once with target == 1.
print(table_1.keyword)

In [None]:
# Replace every non-word character with a space, then lower-case.
# regex=True is required: pandas >= 2.0 treats the pattern as a literal string
# by default, which would leave punctuation untouched.
train_data.text = train_data.text.str.replace(r'\W', ' ', regex = True).str.lower()
train_data.head()

In [None]:
# Bag-of-words counts over unigrams and bigrams with English stop words removed.
# NOTE(review): a float min_df is presumably read as a fraction of documents;
# confirm that 0.0001 actually prunes anything at this corpus size.
vectorizer = CountVectorizer(
    stop_words = 'english',
    ngram_range = (1, 2),
    min_df = 0.0001,
    binary = False,
)

In [None]:
# 80/20 stratified split of the cleaned tweets. The seed is fixed so that
# Restart & Run All reproduces the same partition and the same scores.
X_train, X_test, y_train, y_test =  train_test_split(train_data.text, 
                                                     train_data.target,
                                                     train_size = 0.8,
                                                     stratify = train_data.target,
                                                     random_state = 42)

In [None]:
# Learn the vocabulary from the training tweets only, then encode both splits
# with it (the tuple is evaluated left to right, so fitting happens first).
train_vector, test_vector = (
    vectorizer.fit_transform(X_train),
    vectorizer.transform(X_test),
)

In [None]:
# Multinomial naive Bayes on the token counts of the training split.
classifier = MultinomialNB()
classifier.fit(X = train_vector, y = y_train)

In [None]:
# In-sample confusion matrix for the text classifier.
train_predictions = classifier.predict(train_vector)
print(confusion_matrix(y_train, train_predictions))

In [None]:
# Hold-out performance of the text classifier: confusion matrix, then the
# per-class precision/recall report.
y_pred = classifier.predict(test_vector)
for report in (confusion_matrix, classification_report):
    print(report(y_test, y_pred))

In [None]:
# Predicted probability of the second class (column 1) for each hold-out tweet.
holdout_proba = classifier.predict_proba(test_vector)
holdout_proba[:, 1]

In [None]:
# Text-model probability of class 1 for every training tweet, computed in one
# batch call instead of one transform + predict_proba round trip per row.
# NOTE(review): `classifier` was fitted on a subset of these same rows, so the
# values are in-sample for most of train_data.
train_data['naive_probability'] = classifier.predict_proba(vectorizer.transform(train_data.text))[:, 1]
train_data.head()

In [None]:
# Look up, for every training row, the target == 1 share stored for its keyword.
train_data['keyword_probability'] = [keyword_dict[keyword] for keyword in train_data.keyword]
train_data.head()

In [None]:
# Stand-alone one-hot encoder for the keyword column.
ohe = OneHotEncoder()

# Column transformer that one-hot encodes 'keyword' and drops every other column.
ct = ColumnTransformer(
    [('ohe', OneHotEncoder(handle_unknown = 'ignore'), ['keyword'])],
    remainder = 'drop',
)

In [None]:
# Inspect the keyword column that is about to be one-hot encoded.
train_data.keyword

In [None]:
# Encode unseen keywords as an all-zero row instead of raising at transform
# time; the encoding of the training rows is unaffected by this setting.
ohe.set_params(handle_unknown = 'ignore')
keyword_encoded = ohe.fit_transform(train_data[['keyword']])

# Naive Bayes on the one-hot keyword alone.
keyword_naive_bayes = MultinomialNB()
keyword_naive_bayes.fit(keyword_encoded, train_data.target)

train_data['encoded_keyword_probability'] = keyword_naive_bayes.predict_proba(keyword_encoded)[:,1]

# The same feature is needed on the test frame, because `features` is used to
# index test_data when the combined model predicts.
test_keyword_encoded = ohe.transform(test_data[['keyword']])
test_data['encoded_keyword_probability'] = keyword_naive_bayes.predict_proba(test_keyword_encoded)[:,1]

In [None]:
# Spot check a single entry (row 0, column 4) of the encoded keyword matrix.
keyword_encoded[0,4]

In [None]:
# Summary statistics of the numeric columns, including the engineered features.
train_data.describe()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Keyword-model probability (x) against text-model probability (y),
# coloured by the true target.
plt.scatter(
    x = train_data['encoded_keyword_probability'],
    y = train_data['naive_probability'],
    c = train_data.target,
    alpha = 0.2,
)

In [None]:
# Character length of each (cleaned) tweet.
train_data['temp_param'] = train_data.text.map(len)

In [None]:
# Columns fed to the combined classifier, in model-input order.
features = [
    'encoded_keyword_probability',
    'keyword_probability',
    'naive_probability',
    'contains_link',
    'contains_caps',
]

In [None]:
# Pairwise feature relationships, coloured by the target.
pairplot_columns = [*features, 'target']
sns.pairplot(train_data[pairplot_columns], hue = 'target')

In [None]:
# Text-model probability (x) against tweet length (y), coloured by the true target.
plt.scatter(
    x = train_data['naive_probability'],
    y = train_data['temp_param'],
    c = train_data.target,
    alpha = 0.2,
)

In [None]:
# 80/20 stratified split of the engineered features. The seed is fixed so that
# Restart & Run All reproduces the same partition and the same scores.
# NOTE(review): naive_probability and the keyword features were fitted on rows
# of train_data that can end up in this hold-out set, so the hold-out scores
# below are likely optimistic.
X_train, X_test, y_train, y_test =  train_test_split(train_data[features], 
                                                     train_data.target,
                                                     train_size = 0.8,
                                                     stratify = train_data.target,
                                                     random_state = 42)

In [None]:
# Logistic regression stacked on top of the engineered features.
combined_classifier = LogisticRegression()
combined_classifier.fit(X = X_train, y = y_train)

In [None]:
# In-sample performance of the combined classifier: confusion matrix, then the
# per-class precision/recall report.
y_pred = combined_classifier.predict(X_train)
for report in (confusion_matrix, classification_report):
    print(report(y_train, y_pred))

In [None]:
# Hold-out performance of the combined classifier: confusion matrix, then the
# per-class precision/recall report.
y_pred = combined_classifier.predict(X_test)
for report in (confusion_matrix, classification_report):
    print(report(y_test, y_pred))

In [None]:
# Inspect the fitted logistic-regression weights (one per column of the
# training features) and the intercept. Previously these statements were
# wrapped in a string literal, which the notebook merely echoed as output.
print(combined_classifier.coef_)
print(combined_classifier.intercept_)

In [None]:
# Apply the same cleaning as for the training text: replace every non-word
# character with a space, then lower-case.
# regex=True is required: pandas >= 2.0 treats the pattern as a literal string
# by default, which would leave punctuation untouched.
test_data.text = test_data.text.str.replace(r'\W', ' ', regex = True).str.lower()
test_data.head()

In [None]:
# Text-model probability of class 1 for every test tweet, computed in one
# batch call instead of one transform + predict_proba round trip per row.
test_data['naive_probability'] = classifier.predict_proba(vectorizer.transform(test_data.text))[:, 1]
test_data.head()

In [None]:
# Look up the stored target == 1 share for each test keyword. A keyword that
# is missing from keyword_dict falls back to the overall positive rate of the
# training set instead of raising KeyError.
fallback_probability = train_data.target.mean()
test_data['keyword_probability'] = test_data.keyword.map(keyword_dict).fillna(fallback_probability)
test_data.head()

In [None]:
# Predict the class of every test tweet from the engineered feature columns.
test_predictions = combined_classifier.predict(test_data[features])
test_data['target'] = test_predictions
test_data.head()

In [None]:
# Write the id/target pairs to the submission file, without the index column.
submission = test_data[['id', 'target']]
submission.to_csv('submission.csv', index = False)