In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and echo the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# 1. Load the dataset

In [None]:
# Read the train, test and sample-submission CSVs in one pass.
df_train, df_test, df_sample = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/nlp-getting-started/train.csv',
        '/kaggle/input/nlp-getting-started/test.csv',
        '/kaggle/input/nlp-getting-started/sample_submission.csv',
    )
)

In [None]:
# First five training rows (head() defaults to n=5).
df_train.head()

In [None]:
# First five test rows (head() defaults to n=5).
df_test.head()

In [None]:
# First five rows of the sample submission (head() defaults to n=5).
df_sample.head()

# 2. EDA

In [None]:
## check for null values
# A cell only displays its last expression, so evaluating the train and test
# counts on separate lines silently discards the train result. Combine both
# into one table (one column per dataset) so each count is actually shown.
# Columns that exist in only one dataset (e.g. the label) appear as NaN in the other.
pd.concat(
    [df_train.isnull().sum(), df_test.isnull().sum()],
    axis=1,
    keys=['train', 'test'],
)

In [None]:
## Most common keywords in train dataset
import matplotlib.pyplot as plt
import seaborn as sns

# Horizontal count plot restricted to the 15 most frequent keywords.
plt.figure(figsize=(9, 6))
sns.countplot(
    y=df_train['keyword'],
    order=df_train['keyword'].value_counts().head(15).index,
)
plt.title('Top 15 keywords')
plt.show()

In [None]:
## Bar chart of the missing-value count per column in the train data
df_train.isna().sum().plot.bar()
plt.title('Plot to checkout null values in train data')
plt.show()

In [None]:
## Bar chart of the missing-value count per column in the test data
df_test.isna().sum().plot.bar()
plt.title('Plot to checkout null values in test data')
plt.show()

Observation: `location` has the most missing values, followed by `keyword`, in both the train and test datasets. We do not need these features for prediction, so we will drop them from both datasets.

In [None]:
# Discard the two sparse columns; only the tweet text is used as a feature.
df_train = df_train.drop(columns=['location', 'keyword'])
df_test = df_test.drop(columns=['location', 'keyword'])

In [None]:
# Show the first five rows of both frames after the drop (head() defaults to n=5).
(df_train.head(), df_test.head())

In [None]:
## Class balance: number of training tweets per target value

sns.countplot(data=df_train, x='target')


In [None]:
## Histogram of tweet character lengths for the train and test sets

for frame, tag in ((df_train, 'train_tweets'), (df_test, 'test_tweets')):
    # .str.len() gives the character count of each tweet.
    plt.hist(frame['text'].str.len(), label=tag)
plt.legend()
plt.show()

In [None]:
## Check out a few disaster tweets
d_t = df_train.loc[df_train['target'] == 1, 'text']
# The filtered Series keeps the original row labels, so d_t[i] is a label
# lookup that raises KeyError whenever row i is not a disaster tweet.
# Iterate positionally over the first rows instead.
for tweet in d_t.head(5):
    print(tweet)

In [None]:
## Check out a few non-disaster tweets
nd_t = df_train.loc[df_train['target'] != 1, 'text']
print(nd_t.head())

In [None]:
## Word cloud of disaster and non-disaster tweets 
## to see most repeating word

from wordcloud import WordCloud


# Side-by-side word clouds, one per class. The original 150x50 inch canvas made
# the size-18 titles unreadable, so a conventional figure size is used.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(20, 7))

# Join tweets with a space: joining on '' glues the last word of one tweet
# to the first word of the next and produces bogus merged tokens.
wc1 = WordCloud().generate(' '.join(d_t))
ax1.imshow(wc1)
ax1.axis('off')
ax1.set_title('Disaster tweets', fontsize = 18)

wc2 = WordCloud().generate(' '.join(nd_t))
ax2.imshow(wc2)
ax2.axis('off')
ax2.set_title('Non Disaster tweets', fontsize = 18)

# Render the figure explicitly so the cell does not echo a Text(...) repr.
plt.show()



## Observation:

We observe that tokens such as `http`, `t`, `co` and `u^` are among the most prominent words in the tweets. They carry no meaning, so they have to be cleaned out to get more accurate results and fewer spurious tags.

# 2. Data cleaning

In [None]:
import re
import string
# Patterns are compiled once (as raw strings, so the backslashes are not
# treated as invalid string escapes) and reused for every tweet.
_BRACKETED_RE = re.compile(r'\[.*?\]')
_URL_RE = re.compile(r'https?://\S+|www\.\S+')
_DIGIT_WORD_RE = re.compile(r'\w*\d\w*')
_WHITESPACE_RE = re.compile(r'\s+')


def clean_text(t):
    """Normalize one tweet for downstream tokenization.

    Steps, in order:
      1. lower-case the text;
      2. replace text enclosed in square brackets with a space;
      3. replace http(s):// and www. links with a space;
      4. delete every word that contains a digit;
      5. collapse all whitespace runs (including line breaks and tabs)
         into a single space and strip the ends.

    Whitespace is normalized last so that the gaps left by the removals
    above do not survive as double spaces. Punctuation is left in place.

    Parameters
    ----------
    t : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned text.
    """
    t = t.lower()
    t = _BRACKETED_RE.sub(' ', t)
    t = _URL_RE.sub(' ', t)
    t = _DIGIT_WORD_RE.sub('', t)
    t = _WHITESPACE_RE.sub(' ', t)
    return t.strip()

## Try the cleaner on a single training tweet (row label 417)
test_str = df_train.loc[417, 'text']
print(f'Original text: {test_str}\n')
print(f'Original text after cleaning {clean_text(test_str)}')

In [None]:
## Clean the text column of both the train and test sets
for frame in (df_train, df_test):
    frame['text'] = frame['text'].apply(clean_text)

## Inspect the cleaned training text
df_train['text'].head()

# 3. Tokenization

Tokenize the cleaned sentences

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import RegexpTokenizer

# Split each tweet into runs of word characters.
tokenizer = RegexpTokenizer(r'\w+')
## Tokenize the text column of both the train and test sets
for frame in (df_train, df_test):
    frame['text'] = frame['text'].apply(tokenizer.tokenize)
## Inspect the resulting token lists
df_train['text'].head()


# 4. Stopwords

remove unnecessary words that do not carry any meaning

In [None]:
# Lazily populated cache of NLTK's English stopword list (see remove_stopwords).
_ENGLISH_STOPWORDS = None


def remove_stopwords(t, stop_words=None):
    """Return the tokens of ``t`` that are not stopwords.

    The previous version called ``stopwords.words('english')`` once per
    token, re-reading the corpus and scanning a list every time. The list
    is now loaded once, cached as a frozenset, and used for constant-time
    membership tests.

    Parameters
    ----------
    t : iterable of str
        Tokens of one document.
    stop_words : collection of str, optional
        Words to drop. Defaults to NLTK's English stopword list.

    Returns
    -------
    list of str
        The remaining tokens, in their original order.
    """
    global _ENGLISH_STOPWORDS
    if stop_words is None:
        if _ENGLISH_STOPWORDS is None:
            _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
        stop_words = _ENGLISH_STOPWORDS
    return [w for w in t if w not in stop_words]

# Filter stopwords out of every token list in both datasets.
for frame in (df_train, df_test):
    frame['text'] = frame['text'].apply(remove_stopwords)

## Inspect the training tokens once stopwords are gone
df_train['text'].head()

# 5.Lemmatization

Lemmatization is the process of grouping together the different inflected forms of a word so they can be analyzed as a single item.

Examples of lemmatization:

1. playing, plays and played: all 3 of these words will be converted to play after lemmatization

2. change, changing, changes, changed and changer: all of these words will be converted to change after lemmatization

In [None]:
def lem_words(t):
    """Lemmatize every token in ``t`` and return the results as a list."""
    lemmatizer = WordNetLemmatizer()
    return list(map(lemmatizer.lemmatize, t))

# Lemmatize the token lists of both datasets.
for frame in (df_train, df_test):
    frame['text'] = frame['text'].apply(lem_words)

## Inspect the lemmatized training tokens
df_train['text'].head()

In [None]:
## Transform tokens into sentences 

def combine_txt(t):
    """Join a sequence of tokens back into one space-separated string."""
    return ' '.join(t)

# Turn each token list back into a single string for the vectorizers.
for frame in (df_train, df_test):
    frame['text'] = frame['text'].apply(combine_txt)

## Inspect the re-joined training sentences
df_train['text'].head()

## 6. Vectorizing text

CountVectorizer is used to transform a given text into a vector on the basis of the frequency (count) of each word that occurs in the entire text. It involves counting the number of occurrences of each word in a document (text).

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
c = CountVectorizer()
# Learn the vocabulary from the training text only.
tr_v = c.fit_transform(df_train['text'])
# Reuse that vocabulary for the test text. Calling fit_transform here would
# refit on the test set and give te_v a different set of columns from tr_v,
# so a model trained on tr_v could not be applied to it.
te_v = c.transform(df_test['text'])

# Dense view of the first training row's count vector.
print(tr_v[0].todense())


# 7. TFIDF

TF-IDF stands for Term Frequency-Inverse Document Frequency. It is a technique to quantify a word in documents: we compute a weight for each word which signifies the importance of the word in the document and corpus.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Unigram + bigram TF-IDF features, with min_df/max_df bounding the document
# frequency of the terms that are kept.
tfidf = TfidfVectorizer(
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.5,
)
# Fit on the training text, then project the test text into the same space.
tr_t = tfidf.fit_transform(df_train['text'])
te_t = tfidf.transform(df_test['text'])



# 8. XGB Classifier

In [None]:
import xgboost as xg
from sklearn.model_selection import cross_val_score
import warnings
# Silence library warnings for the rest of the session.
warnings.filterwarnings("ignore")

# Gradient-boosted tree classifier shared by both evaluations below.
param = xg.XGBClassifier(
    n_estimators=500,
    max_depth=5,
    learning_rate=0.08,
    colsample_bytree=0.8,
    nthread=10,
)

# 5-fold F1 on the count-vector features.
vector_score = cross_val_score(param, tr_v, df_train['target'], scoring='f1', cv=5)
print(vector_score)

# 5-fold F1 on the TF-IDF features.
tfidf_score = cross_val_score(param, tr_t, df_train['target'], scoring='f1', cv=5)
print(tfidf_score)



[0.51104101 0.39851715 0.49324324 0.41164659 0.56161972]

[0.4730473  0.39309684 0.4193849  0.39918117 0.55879752]



# 8. Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with library defaults.
lg = LogisticRegression()

# 5-fold F1 on the count-vector features.
vector_score = cross_val_score(lg, tr_v, df_train['target'], scoring='f1', cv=5)
print(vector_score)

# 5-fold F1 on the TF-IDF features.
tfidf_score = cross_val_score(lg, tr_t, df_train['target'], scoring='f1', cv=5)
print(tfidf_score)

# 9. Naive bayes 

In [None]:
from sklearn.naive_bayes import MultinomialNB as mb

# Multinomial naive Bayes with library defaults.
m = mb()

# 5-fold F1 on the count-vector features.
vector_score = cross_val_score(m, tr_v, df_train['target'], scoring='f1', cv=5)
print(vector_score)

# 5-fold F1 on the TF-IDF features.
tfidf_score = cross_val_score(m, tr_t, df_train['target'], scoring='f1', cv=5)
print(tfidf_score)

## Observation:

Naive Bayes has the highest F1 scores on both the count-vector and TF-IDF features among the 3 algorithms.

# 10. Prediction

In [None]:
# Train on the full TF-IDF training matrix, then label the test tweets.
# fit() returns the fitted estimator, so the calls can be chained.
pred = m.fit(tr_t, df_train['target']).predict(te_t)

# Submission

In [None]:
# Build the submission with the same lowercase column names the data uses
# (df_test['id'], df_train['target']); the capitalized 'Id'/'Target' headers
# did not match them.
s = pd.DataFrame({'id': df_test['id'],
                  'target': pred})
# Guard against header drift relative to the provided sample submission.
assert list(s.columns) == list(df_sample.columns), 'submission columns differ from sample_submission.csv'
s.to_csv('s.csv', index=False)
# Read the file back to confirm what was actually written.
s = pd.read_csv('s.csv')
s.head(5)