In [None]:
import re
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import nltk
from nltk.stem.snowball import SnowballStemmer

In [None]:
# Load the competition's train / test splits and the sample submission file.
INPUT_DIR = "../input/nlp-getting-started"
df_train, df_test, df_submission = (
    pd.read_csv(f"{INPUT_DIR}/{stem}.csv")
    for stem in ("train", "test", "sample_submission")
)

In [None]:
# Preview the first rows of the training frame.
df_train.head()

In [None]:
# Preview the first rows of the test frame.
df_test.head()

In [None]:
# (rows, columns) of each frame.
df_train.shape, df_test.shape

In [None]:
# Number of training rows that repeat an earlier row in every column.
df_train.duplicated().sum()

In [None]:
# Same duplicate-row count for the test frame.
df_test.duplicated().sum()

In [None]:
# Column names of the training frame.
df_train.columns


In [None]:
# Column names of the test frame.
df_test.columns

In [None]:
# Frequency of each distinct `location` value in the training frame.
df_train["location"].value_counts()

In [None]:
# NOTE: a later cell drops `location` from df_test, so this cell only works
# when it is run before that one.
df_test["location"].value_counts()

In [None]:
# Frequency of each distinct `keyword` value in the training frame.
df_train["keyword"].value_counts()

In [None]:
# Frequency of each distinct `keyword` value in the test frame.
df_test["keyword"].value_counts()

In [None]:
# Drop the `location` column from the test frame.
# Rebinding with errors="ignore" (instead of inplace=True) keeps the cell
# idempotent: re-running it once the column is gone no longer raises KeyError.
df_test = df_test.drop(columns=["location"], errors="ignore")

In [None]:
# Missing-value count per column of the test frame.
df_test.isnull().sum()

In [None]:
# Report how many training tweets carry each target label.
is_disaster = df_train["target"] == 1
is_not_disaster = df_train["target"] == 0
print(f"Disaster_Tweets_numbers: {len(df_train[is_disaster])}")
print(f"not Disaster_Tweets_numbers: {len(df_train[is_not_disaster])}")

In [None]:
from nltk.tokenize import RegexpTokenizer

# A token is a maximal run of word characters that are not digits; digits,
# whitespace and punctuation all act as separators.
tokenizer = RegexpTokenizer(r'[^\d\W]+')
df_train['cleaned'] = df_train['text'].map(tokenizer.tokenize)
df_test['cleaned'] = df_test['text'].map(tokenizer.tokenize)

In [None]:
# Stem every token with NLTK's English Snowball stemmer.
from nltk.stem.snowball import SnowballStemmer

stemmer = SnowballStemmer("english")


def _stem_tokens(tokens):
    """Return a new list holding the Snowball stem of each token in `tokens`."""
    return [stemmer.stem(token) for token in tokens]


df_train['cleaned'] = df_train['cleaned'].apply(_stem_tokens)
df_test['cleaned'] = df_test['cleaned'].apply(_stem_tokens)

In [None]:
# Inspect the training token lists after stemming.
df_train['cleaned']

In [None]:
from nltk.corpus import stopwords

# English stop-word list shipped with NLTK.
stop = stopwords.words('english')
# Set copy for constant-time membership tests while filtering.
_stop_lookup = set(stop)


def _drop_stopwords(tokens):
    """Return the tokens of one tweet with English stop words removed."""
    return [token for token in tokens if token not in _stop_lookup]


# The filter has to look at the individual tokens of every tweet. The previous
# comprehension iterated over the column's rows (each one a *list* of tokens)
# and tested `list not in stop`, which is always true, so nothing was removed.
# NOTE(review): the tokens were already stemmed by an earlier cell, so stop
# words whose stem differs from the dictionary form will not match here.
df_train['cleaned'] = df_train['cleaned'].apply(_drop_stopwords)
df_test['cleaned'] = df_test['cleaned'].apply(_drop_stopwords)


In [None]:
# Inspect the test token lists after the stop-word cell.
df_test['cleaned'] 

In [None]:
# Same inspection for the training frame.
df_train['cleaned']

In [None]:
# Remove single-character tokens and tokens of length 2.
def notT(text, min_length=3):
    """Drop very short tokens from every token list in a Series.

    Parameters
    ----------
    text : pandas.Series
        Series whose elements are iterables of string tokens.
    min_length : int, default 3
        Shortest token length that is kept. The default removes tokens of
        length 1 and 2, as the comment on this cell states; the earlier
        `len(item) > 3` test also discarded every 3-letter token.

    Returns
    -------
    pandas.Series
        New Series with the same index, each element a list of the tokens
        whose length is at least `min_length`. The input is not modified.
    """
    return text.apply(
        lambda tokens: [token for token in tokens if len(token) >= min_length]
    )
# Run the short-token filter over both frames.
df_train['cleaned'] = df_train['cleaned'].pipe(notT)
df_test['cleaned'] = df_test['cleaned'].pipe(notT)


In [None]:
# Inspect the test token lists after the short-token filter.
df_test['cleaned']

In [None]:
# Collapse every token list into a single ", "-separated string.
# Run once only: on a second run the elements are already strings, so the
# join would be applied to their individual characters.
df_train["cleaned"] = df_train["cleaned"].map(lambda tokens: ', '.join(tokens))
df_test["cleaned"] = df_test["cleaned"].map(lambda tokens: ', '.join(tokens))

In [None]:
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from random import choice


# Set copy of wordcloud's built-in STOPWORDS.
# NOTE(review): an earlier cell binds the name `stopwords` to the
# nltk.corpus stop-word reader; this assignment rebinds it to a plain set.
# plot_wordcloud below passes STOPWORDS directly and does not read this name.
stopwords = set(STOPWORDS)

def wordCloud(tokens, plot=1):
    """Lemmatize tokens as verbs, then either plot them or return them joined.

    Parameters
    ----------
    tokens : iterable of str
        Words to lemmatize with WordNetLemmatizer using the verb part of speech.
    plot : int, default 1
        When equal to 1 the lemmas are handed to `plot_wordcloud` and nothing
        is returned; for any other value the lemmas are returned as one
        space-separated string.
    """
    lemmatizer = WordNetLemmatizer()
    lemmas = [lemmatizer.lemmatize(token, wordnet.VERB) for token in tokens]
    if plot != 1:
        return ' '.join(lemmas)
    plot_wordcloud(lemmas)

def plot_wordcloud(text, bg_color='salmon', cmap='rainbow'):
    """Draw a word cloud beside a bar chart of the 20 most frequent tokens.

    Parameters
    ----------
    text : iterable of str
        Tokens to visualise; they are space-joined for the cloud and counted
        individually for the bar chart.
    bg_color : str, default 'salmon'
        Background colour of the word cloud.
    cmap : str, default 'rainbow'
        Colormap used to colour the words in the cloud.
    """
    # The bar-chart palette is picked at random on every call.
    palette_name = choice(['Paired','Set2','husl','Spectral','coolwarm'])
    fig, (cloud_ax, bar_ax) = plt.subplots(nrows=1, ncols=2, figsize=(25,10))

    cloud = WordCloud(
        width=3000, height=2000, background_color=bg_color, colormap=cmap,
        collocations=False, stopwords=STOPWORDS, random_state=51,
    )
    cloud_ax.imshow(cloud.generate(' '.join(text)))
    cloud_ax.axis('off')

    # Twenty most frequent tokens, most common first.
    top_counts = pd.Series(data=text).value_counts().head(20)
    sns.barplot(y=top_counts.index, x=top_counts, ax=bar_ax, palette=palette_name)

In [None]:
# Normalise the keywords of every training tweet: missing values become the
# sentinel 'blank', and each run of non-letter characters becomes '_'.
key_data = df_train['keyword'].fillna('blank').map(
    lambda raw: re.sub('[^a-zA-Z]+', '_', raw)
)
# Flatten to one list, leaving out the sentinel rows.
keywords_list = [
    part
    for keyword in key_data
    if keyword != 'blank'
    for part in keyword.split()
]

In [None]:
# Lemmatize the collected keywords and plot them (wordCloud defaults to plot=1).
wordCloud(keywords_list)

In [None]:
# Same keyword normalisation, restricted to tweets labelled target == 0.
non_disaster_keywords = df_train.loc[df_train['target'] == 0, 'keyword']
key_data = non_disaster_keywords.fillna('blank').map(
    lambda raw: re.sub('[^a-zA-Z]+', '_', raw)
)
# Flatten to one list, leaving out the 'blank' sentinel rows.
keywords_list = [
    part
    for keyword in key_data
    if keyword != 'blank'
    for part in keyword.split()
]

In [None]:
# Plot keywords_list as built by the preceding cell (target == 0 rows);
# plot_wordcloud is called directly, so no lemmatization is applied here.
plot_wordcloud(keywords_list)

In [None]:
# Same keyword normalisation, restricted to tweets labelled target == 1.
disaster_keywords = df_train.loc[df_train['target'] == 1, 'keyword']
key_data = disaster_keywords.fillna('blank').map(
    lambda raw: re.sub('[^a-zA-Z]+', '_', raw)
)
# Flatten to one list, leaving out the 'blank' sentinel rows.
keywords_list = [
    part
    for keyword in key_data
    if keyword != 'blank'
    for part in keyword.split()
]

In [None]:
# Plot keywords_list as built by the preceding cell (target == 1 rows);
# plot_wordcloud is called directly, so no lemmatization is applied here.
plot_wordcloud(keywords_list)

In [None]:
# Bar chart of the 20 most frequent `location` values across all training
# tweets, each bar annotated with its count.
plt.figure(figsize=(20,6))
data = df_train['location'].value_counts().head(20)
labels = data.index
ax = sns.barplot(x=labels, y=data)
for bar in ax.patches:
    bar_height = bar.get_height()
    # Place the count just above the bar, nudged right from its left edge.
    ax.annotate(str(int(bar_height)), (bar.get_x()+0.2, bar_height+0.5))
ax.set_title('Top 20 Country who tweets', fontsize=20)
ax.set_xticklabels(labels=labels, rotation=45);

In [None]:
# Bar chart of the 20 most frequent `location` values among tweets labelled
# target == 1, each bar annotated with its count.
plt.figure(figsize=(20,6))
data = df_train.loc[df_train['target']==1, 'location'].value_counts().head(20)
labels = data.index
ax = sns.barplot(x=labels, y=data)
for bar in ax.patches:
    bar_height = bar.get_height()
    # Place the count just above the bar, nudged right from its left edge.
    ax.annotate(str(int(bar_height)), (bar.get_x()+0.2, bar_height+0.5))
ax.set_title('Top 20 Country who tweets Disaster', fontsize=20)
ax.set_xticklabels(labels=labels, rotation=45);

In [None]:
# Bar chart of the 20 most frequent `location` values among tweets labelled
# target == 0, each bar annotated with its count.
plt.figure(figsize=(20,6))
data = df_train.loc[df_train['target']==0, 'location'].value_counts().head(20)
labels = data.index
ax = sns.barplot(x=labels, y=data)
for bar in ax.patches:
    bar_height = bar.get_height()
    # Place the count just above the bar, nudged right from its left edge.
    ax.annotate(str(int(bar_height)), (bar.get_x()+0.2, bar_height+0.5))
ax.set_title('Top 20 Country who tweets Non Disaster', fontsize=20)
ax.set_xticklabels(labels=labels, rotation=90);

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labelled tweets for validation (fixed seed).
# The feature is the raw `text` column, not the `cleaned` column built above.
X = df_train['text']
y = df_train['target']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF over unigrams and bigrams; the vocabulary is fitted on the training
# split only and then applied to the validation split.
# The matrices stay in the sparse form the vectorizer returns: .toarray()
# materialises a dense (n_tweets x n_ngrams) float array, and the estimators
# trained below (LogisticRegression, MultinomialNB, ComplementNB) all accept
# sparse input directly.
vectorizer = TfidfVectorizer(ngram_range=(1,2))
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [None]:
# Number of distinct n-grams learned by the vectorizer. `vocabulary_` is read
# instead of get_feature_names(), which was removed in scikit-learn 1.2.
len(vectorizer.vocabulary_)

In [None]:
def evaluate(y_true, y_predicted):
    """Score a set of predictions against the ground-truth labels.

    Relies on the module-level name `metrics` (imported from sklearn in a
    later cell) being bound before the first call.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_predicted : array-like
        Labels predicted by a classifier.

    Returns
    -------
    tuple
        (accuracy, precision, recall, f1), in that order.
    """
    # Score the argument that was passed in. The body used to read the global
    # `y_pred`, so the result depended on whichever cell last assigned it.
    scorers = (
        metrics.accuracy_score,
        metrics.precision_score,
        metrics.recall_score,
        metrics.f1_score,
    )
    return tuple(score(y_true, y_predicted) for score in scorers)

In [None]:
# Shell escape that installs mlxtend; the next cell imports
# plot_confusion_matrix from it.
# NOTE(review): the version is unpinned, and `%pip install` would target the
# running kernel's environment explicitly -- consider both.
!pip install mlxtend

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from mlxtend.plotting import plot_confusion_matrix


# Fit logistic regression on the TF-IDF features and print hold-out accuracy.
clf_LR = LogisticRegression()
clf_LR.fit(X_train_vec, y_train)
print(clf_LR.score(X_test_vec, y_test))

# Confusion matrix of the hold-out predictions.
y_pred = clf_LR.predict(X_test_vec)
cm = metrics.confusion_matrix(y_test, y_pred)
fig, ax = plot_confusion_matrix(conf_mat=cm)
plt.show()

# One-row score tables; `results_df` starts the model comparison table that
# the following model cells extend.
lr_scores = evaluate(y_test, y_pred)
score_columns = ['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score']

df_LR = pd.DataFrame(data=[["Logistic Regressor", *lr_scores]], columns=score_columns)

results_df = pd.DataFrame(
    data=[["LogisticRegression Classifier", *lr_scores]],
    columns=score_columns,
)

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Fit Multinomial Naive Bayes on the TF-IDF features and print hold-out accuracy.
clf_MNB = MultinomialNB().fit(X_train_vec, y_train)
print(clf_MNB.score(X_test_vec, y_test))

# Confusion matrix of the hold-out predictions.
y_pred = clf_MNB.predict(X_test_vec)
cm = metrics.confusion_matrix(y_test, y_pred)
fig, ax = plot_confusion_matrix(conf_mat=cm)
plt.show()

df_MNB = pd.DataFrame(data=[["Multinomial Naive Bayes Classifier", *evaluate(y_test, y_pred)]],
                      columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])
# DataFrame.append was removed in pandas 2.0; pd.concat works on every version.
# A row left by a previous run of this cell is dropped first, so re-running
# the cell does not duplicate the model in the comparison table.
results_df = pd.concat(
    [results_df[results_df['Model'] != df_MNB.loc[0, 'Model']], df_MNB],
    ignore_index=True,
)

In [None]:
from sklearn.naive_bayes import ComplementNB

# Fit Complement Naive Bayes on the TF-IDF features and print hold-out accuracy.
clf_CNB = ComplementNB().fit(X_train_vec, y_train)
print(clf_CNB.score(X_test_vec, y_test))

# Confusion matrix of the hold-out predictions.
y_pred = clf_CNB.predict(X_test_vec)
cm = metrics.confusion_matrix(y_test, y_pred)
fig, ax = plot_confusion_matrix(conf_mat=cm)
plt.show()

df_CNB = pd.DataFrame(data=[["Complement Naive Bayes Classifier", *evaluate(y_test, y_pred)]],
                      columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])
# Add *this* model's row: the cell used to append df_MNB again, so the
# comparison table listed Multinomial NB twice and never Complement NB.
# pd.concat replaces DataFrame.append (removed in pandas 2.0); a row left by a
# previous run of this cell is dropped first so re-runs do not duplicate it.
results_df = pd.concat(
    [results_df[results_df['Model'] != df_CNB.loc[0, 'Model']], df_CNB],
    ignore_index=True,
)

In [None]:
# Hold-out accuracy per model. Indexing by 'Model' labels each bar with the
# classifier name instead of the row number of results_df.
results_df.set_index('Model')['Accuracy'].plot(
    kind='barh', figsize=(12, 8), color=(0.2, 0.4, 0.6, 0.6), title='Accuracy by model'
);

In [None]:
# Hold-out F1 score per model. Indexing by 'Model' labels each bar with the
# classifier name instead of the row number of results_df. The colour is given
# as one RGB tuple so it cannot be read as a list of per-bar colours.
results_df.set_index('Model')['F1 Score'].plot(
    kind='barh', figsize=(12, 8), color=(0.2, 0.4, 0.6), title='F1 score by model'
);

In [None]:
# Model comparison table: one row per classifier with its hold-out metrics.
results_df

In [None]:
# Vectorise the competition test tweets with the vocabulary fitted on the
# training split, then predict with the Complement Naive Bayes model.
# The matrix is kept sparse: predict() accepts it as-is, and a dense copy of a
# (n_tweets x n_ngrams) matrix only costs memory.
test_vec = vectorizer.transform(df_test['text'])
predictions = clf_CNB.predict(test_vec)


In [None]:
# Build the submission table (index: id, single column: target) and write it
# to submission.csv.
df_submissionn = (
    pd.DataFrame(predictions, columns=['target'])
    .assign(id=df_test['id'])
    .set_index('id')
)

df_submissionn.to_csv('submission.csv')

In [None]:
# Number of predictions produced for the test set.
len(predictions)

In [None]:
# Peek at the sample submission file loaded near the top of the notebook.
df_submission.head()

In [None]:
# Display the test frame (it also carries the `cleaned` column added earlier).
df_test