In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Importing Libraries

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from collections import Counter
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from nltk import word_tokenize

# Preprocessing

In [None]:
train = pd.read_csv('../input/nlp-getting-started/train.csv')
test = pd.read_csv('../input/nlp-getting-started/test.csv')

In [None]:
train.head()

In [None]:
train.isna().sum()

In [None]:
train.info()

In [None]:
train.describe()

In [None]:
train.location.value_counts()

In [None]:
train.keyword.value_counts()

In [None]:
tokens = [word_tokenize(texts) for texts in train.text]
len_tokens=[]

for i in range(len(tokens)):
    len_tokens.append(len(tokens[i]))

In [None]:
train["text_tokens"] = len_tokens

In [None]:
# Normalise the training text.
#
# * URLs and HTML tags are removed *before* punctuation: once "://", "." and
#   "<>" have been stripped, the URL / tag patterns used by the cleaning helpers
#   further down can no longer match, and links would survive as tokens such as
#   "httptcoabc".
# * `regex=True` is explicit because pandas >= 2.0 treats the pattern as a
#   literal string by default, which would leave all punctuation in place.
# * Raw strings avoid the invalid-escape warning for `\w` / `\s`.
#
# NOTE(review): only `train` is lower-cased and stripped of punctuation here;
# `test` is left untouched by this cell.
train.text = (
    train.text
    .str.replace(r'https?://\S+|www\.\S+', '', regex=True)
    .str.replace(r'<.*?>', '', regex=True)
    .str.replace(r'[^\w\s]', '', regex=True)
    .str.lower()
)
train.head()

## Fill Missing Data

In [None]:
# Separate the features from the label and hold out 20% of the rows for validation.
X = train.drop(columns="target")
y = train["target"]

# A fixed seed keeps the split identical across "Restart & Run All" executions
# (55 matches the seed used for the modelling split later in the notebook).
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=55)
X_train.shape, y_train.shape, X_val.shape, y_val.shape

In [None]:
# Categorical columns whose missing values are replaced by the literal "missing".
cat = ["keyword", "location"]
imputer = SimpleImputer(strategy="constant", fill_value="missing")
transformer = ColumnTransformer([("imputer", imputer, cat)], remainder="passthrough")

# Fit the imputer on the training features only (the target is not part of X_train).
filled_X = transformer.fit_transform(X_train)

# ColumnTransformer outputs the transformed columns first and appends the
# passthrough (remainder) columns afterwards, so the array is ordered
# keyword, location, <everything else>. Label it in that order, then restore
# the original column order for display.
passthrough_cols = [col for col in X_train.columns if col not in cat]
filled = pd.DataFrame(filled_X, columns=cat + passthrough_cols)
filled = filled[list(X_train.columns)]
filled.head()

# Data Cleaning

In [None]:
# removing URLs
def remove_URL(text):
    """Return `text` with every http(s):// or www. link replaced by an empty string."""
    url_pattern = r'https?://\S+|www\.\S+'
    return re.sub(url_pattern, r'', text)

In [None]:
# remove html tags
def remove_html(text):
    """Return `text` with every `<...>` span (shortest match, single line) deleted."""
    tag_pattern = r'<.*?>'
    return re.sub(tag_pattern, r'', text)

In [None]:
# remove emojis
def remove_emoji(text):
    """Return `text` with every run of characters from the ranges below deleted.

    NOTE(review): the last range spans U+24C2..U+1F251, which covers far more
    than emoji (for example CJK ideographs); those characters are removed too.
    """
    code_point_ranges = (
        "\U0001F600-\U0001F64F",  # emoticons
        "\U0001F300-\U0001F5FF",  # symbols & pictographs
        "\U0001F680-\U0001F6FF",  # transport & map symbols
        "\U0001F1E0-\U0001F1FF",  # flags (iOS)
        "\U00002702-\U000027B0",
        "\U000024C2-\U0001F251",
    )
    character_class = "[" + "".join(code_point_ranges) + "]+"
    return re.sub(character_class, r'', text, flags=re.UNICODE)

In [None]:
# remove stopwords
def remove_stopwords(text, stop_words=None):
    """Return the words of `text` that are not stop words, as a list.

    Parameters
    ----------
    text : str or iterable of str
        A raw string (split on whitespace into words) or an already tokenised
        sequence of words. Iterating a raw string directly would compare single
        characters against the stop-word set, so strings are split first.
    stop_words : collection of str, optional
        Words to drop. Defaults to the notebook-level `stopwords` collection,
        which must have been defined before the call.

    Returns
    -------
    list of str
        The remaining words, in their original order.
    """
    if stop_words is None:
        stop_words = stopwords
    words = text.split() if isinstance(text, str) else text
    return [w for w in words if w not in stop_words]

In [None]:
import re

In [None]:
train['text'] = train['text'].apply(lambda x : remove_URL(x))
test['text'] = test['text'].apply(lambda x : remove_URL(x))
train.head()

In [None]:
train['text'] = train['text'].apply(lambda x : remove_html(x))
test['text'] = test['text'].apply(lambda x : remove_html(x))
train.head()

In [None]:
train['text'] = train['text'].apply(lambda x : remove_emoji(x))
test['text'] = test['text'].apply(lambda x : remove_emoji(x))
train.head()

## Word Cloud

In [None]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
stopwords = ENGLISH_STOP_WORDS

In [None]:
from wordcloud import WordCloud, STOPWORDS
stopwords = set(STOPWORDS)

In [None]:
stopwords = set(STOPWORDS)
text = " ".join(text for text in train.text)
cloud = WordCloud(stopwords=stopwords, background_color="white").generate(text)
plt.imshow(cloud);

## Data Visualization

In [None]:
# missing values
a = sns.heatmap(train.isnull())
plt.suptitle("Null Values");
plt.xticks(rotation=90);

In [None]:
zero = train[train.target==0].text_tokens
one = train[train.target==1].text_tokens

In [None]:
fig, (ax0, ax1) = plt.subplots(1,2, figsize=(10,5))
ax0.hist(zero,color='purple')
fig.suptitle('Number of Tokens in Text')
ax0.set_title("Not Disaster")
ax1.set_title("Disaster")
ax1.hist(one,color='blue');

In [None]:
zero = train[train.target==0].text.str.len()
one = train[train.target==1].text.str.len()

In [None]:
fig, (ax0, ax1) = plt.subplots(1,2, figsize=(10,5))
ax0.hist(zero,color='purple')
ax0.set_title("Not Disaster")
ax1.set_title("Disaster")
fig.suptitle("Number of Characters in Text")
ax1.hist(one,color='blue');

In [None]:
# Bar chart of how many tweets fall into each target class.
sns.catplot(data=train, x="target", kind="count")
plt.suptitle("Target Comparison");

In [None]:
# Pairwise correlation of the numeric columns only. Selecting them explicitly
# is required on pandas >= 2.0, where DataFrame.corr() no longer drops string
# columns silently and raises instead; on older versions the result is unchanged.
corr = train.select_dtypes(include="number").corr()
sns.set(rc={'figure.figsize':(5,5)})
sns.heatmap(corr)
plt.suptitle("Correlation");

In [None]:
zero = train[train.target==0].text
one = train[train.target==1].text

In [None]:
fig,(ax1,ax2) = plt.subplots(1,2,figsize=(10,5))

word = one.str.split().apply(lambda x : [len(i) for i in x])
sns.distplot(word.map(lambda x: np.mean(x)),ax = ax1,color='blue')
ax1.set_title('Disaster')

word = zero.str.split().apply(lambda x : [len(i) for i in x])
sns.distplot(word.map(lambda x: np.mean(x)),ax=ax2,color='red')
ax2.set_title('Not Disaster')
fig.suptitle('Average Word Length');

### TF-IDF

In [None]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

In [None]:
stopwords = ENGLISH_STOP_WORDS

In [None]:
vect = TfidfVectorizer(max_features = 10, stop_words=stopwords)
tfIdf = vect.fit(train.text)
X = vect.transform(train.text)
X_df = pd.DataFrame(X.toarray(), columns = vect.get_feature_names())
X_df

### Bag of Words

In [None]:
# Bag-of-words counts over uni-, bi- and tri-grams, limited to 100 features
# and ignoring terms that occur in more than 500 documents.
# scikit-learn >= 1.2 accepts only "english", a list or None for `stop_words`,
# so the stop-word collection is converted to a list.
vectorizer = CountVectorizer(ngram_range=(1,3), max_features = 100, max_df=500, stop_words=list(stopwords))
vectorizer.fit(train.text)
X = vectorizer.transform(train.text)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when it exists and fall back to the old name on older installations.
bow_feature_names = (
    vectorizer.get_feature_names_out()
    if hasattr(vectorizer, "get_feature_names_out")
    else vectorizer.get_feature_names()
)
BoW = pd.DataFrame(X.toarray(), columns= bow_feature_names)
BoW

# Fit a Model

In [None]:
X = BoW
y = train["target"]

In [None]:
X_train , X_test , y_train , y_test = train_test_split(X, y,test_size=0.20, random_state=55, shuffle =True)

In [None]:
from sklearn.tree import DecisionTreeClassifier



decisionTreeModel = DecisionTreeClassifier(criterion= 'entropy',
                                           max_depth = None, 
                                           splitter='best', 
                                           random_state=55)

decisionTreeModel.fit(X_train, y_train);


# ### Gradient Boosting




from sklearn.ensemble import GradientBoostingClassifier





gradientBoostingModel = GradientBoostingClassifier(loss = 'deviance',
                                                   learning_rate = 0.01,
                                                   n_estimators = 100,
                                                   max_depth = 30,
                                                   random_state=55)

gradientBoostingModel.fit(X_train,y_train);


# ### K-Nearest Neighbors




from sklearn.neighbors import KNeighborsClassifier





KNeighborsModel = KNeighborsClassifier(n_neighbors = 7,
                                       weights = 'distance',
                                      algorithm = 'brute')

KNeighborsModel.fit(X_train,y_train);


# ### Logistic Regression Model




from sklearn.linear_model import LogisticRegression





LogisticRegression = LogisticRegression(penalty='l2', 
                                        solver='saga', 
                                        random_state = 55)  

LogisticRegression.fit(X_train,y_train);


# ### Bernoulli Naive Bayes Model



from sklearn.naive_bayes import BernoulliNB




bernoulliNBModel = BernoulliNB(alpha=0.1)
bernoulliNBModel.fit(X_train,y_train);

In [None]:
from sklearn.metrics import f1_score



models = [decisionTreeModel, gradientBoostingModel, KNeighborsModel, LogisticRegression, bernoulliNBModel]

for model in models:
    print(type(model).__name__,' Train Score is   : ' ,model.score(X_train, y_train))
    print(type(model).__name__,' Test Score is    : ' ,model.score(X_test, y_test))
    
    y_pred = model.predict(X_test)
    print(type(model).__name__,' F1 Score is      : ' ,f1_score(y_test ,y_pred))
    print('********************************************************************')

In [None]:
test

In [None]:
TRAIN_FEATURES = ["id", "keyword", "location", "text", "target","text_tokens"]
TEST_FEATURES = ["id", "keyword", "location", "text"]

train[TRAIN_FEATURES].to_pickle('train.pkl')
test[TEST_FEATURES].to_pickle('test.pkl')

print('Training Set Shape = {}'.format(train[TRAIN_FEATURES].shape))
print('Training Set Memory Usage = {:.2f} MB'.format(train[TRAIN_FEATURES].memory_usage().sum() / 1024**2))
print('Test Set Shape = {}'.format(test[TEST_FEATURES].shape))
print('Test Set Memory Usage = {:.2f} MB'.format(test[TEST_FEATURES].memory_usage().sum() / 1024**2))