In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import matplotlib.patches as mpatches
import seaborn as sns
import scipy.stats.distributions as dist

import nltk
from nltk.corpus import stopwords
from nltk.util import ngrams
from nltk.tokenize import word_tokenize

from wordcloud import WordCloud

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA , TruncatedSVD
from sklearn.metrics import classification_report, confusion_matrix

from collections import defaultdict , Counter
import string
import re
import os

from tqdm import tqdm
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding , LSTM , Dense , SpatialDropout1D , Dropout
from keras.initializers import Constant
from keras.optimizers import Adam

from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
import xgboost as xgb

# Use matplotlib's 'ggplot' style for the figures drawn after this point.
plt.style.use('ggplot')
# English stopwords from NLTK, stored as a set for fast membership tests.
# NOTE(review): presumably needs the NLTK "stopwords" corpus to be installed — confirm for the target environment.
stop = set(stopwords.words('english'))


In [None]:
# Load the training and test splits of the competition data.
# (The former bare `os.listdir()` call was dropped: it was not the last
# expression of the cell, so its result was never displayed.)
DATA_DIR = '../input/nlp-getting-started'

#Training data
dataset = pd.read_csv(DATA_DIR + '/train.csv')
test = pd.read_csv(DATA_DIR + '/test.csv')
print('Training data shape: ', dataset.shape)
dataset.head()

In [None]:
# Number of missing values in each column of the training frame.
dataset.isnull().sum()

In [None]:
# Relative frequency of each target label (normalize=True returns fractions, not counts).
dataset['target'].value_counts(normalize = True)

In [None]:
# Text of the tweets labelled as disasters (target == 1); show the second one as an example.
is_disaster = dataset.target == 1
disaster_tweets = dataset.loc[is_disaster , 'text']
disaster_tweets.iloc[1]

In [None]:
# Report the size of each split.
n_train_rows = len(dataset)
n_test_rows = len(test)
print('Number of rows in Training {}'.format(n_train_rows))
print('Number of rows in Test {}'.format(n_test_rows))

In [None]:
# Number of real (target == 1) and not-real (target == 0) tweets in the training data.

real = int((dataset.target == 1).sum())
not_real = int((dataset.target == 0).sum())

# Draw on an explicit Axes instead of relying on pyplot's implicit "current axes".
fig , ax = plt.subplots(figsize = (8,5))
sns.barplot(x = ['Disaster' , 'Not Disaster'] , y = [real , not_real] , ax = ax)
ax.set(title = 'Proportion of Disaster and Not Disaster Tweets' , ylabel = 'Frequency')
ax.grid(alpha = 1)

In [None]:
# Character count of every tweet, stored as a new 'length' column.
dataset['length'] = dataset['text'].map(len)

In [None]:
# Peek at the first few values of the new 'length' column.
dataset.length.head()

The distribution of tweet length is skewed left (a long tail toward short tweets): it is centered around 125 characters, most lengths fall between 90 and 140, the range is roughly 120, and a few outliers are present below 15.

In [None]:
# Overlay the tweet-length histograms of both classes on a single set of axes.
plt.figure(figsize = (15,7))
length_layers = [(1 , 'blue' , 0.8) , (0 , 'yellow' , 0.4)]  # (target, colour, opacity), drawn in this order
for label , colour , opacity in length_layers:
    length_ax = sns.distplot(dataset[dataset.target == label]['length'] , bins = 150 , color = colour , kde = False , hist_kws = {'alpha': opacity})
# Both layers share the same axes, so labelling the last returned handle labels the whole plot.
length_ax.set(ylabel = 'Numbers' , xlabel = 'Length', title = 'Distribution of Lengths for both Real and Non real tweets')
plt.show()

In [None]:
# Tweet-length histograms of each class, side by side.
# The earlier plt.title('Disaster Tweets') call was removed: the .set(title=...)
# on the same axes overwrote it immediately, so it never appeared in the figure.
fig , (ax_real , ax_not_real) = plt.subplots(1 , 2 , figsize = (15,5))

sns.distplot(dataset[dataset.target == 1]['length'] , kde = False , color = 'blue' , ax = ax_real).set(title = 'Real Tweets')
sns.distplot(dataset[dataset.target == 0]['length'] , kde = False , color = 'red' , ax = ax_not_real).set(title = 'Not Real Tweets')

plt.show()

In [None]:
# Box plots of tweet length for each class, side by side.
fig , (ax_real , ax_not_real) = plt.subplots(1 , 2 , figsize = (15,5))

# Pass the series as x= explicitly: newer seaborn releases treat the first
# positional argument as `data`, which changes how a bare Series is drawn.
sns.boxplot(x = dataset[dataset.target == 1]['length'] , color = 'blue' , ax = ax_real).set(title = 'Real Tweets')
sns.boxplot(x = dataset[dataset.target == 0]['length'] , color = 'red' , ax = ax_not_real).set(title = 'Not Real Tweets')

plt.show()

Research Question: What is the average length (in characters) of a real tweet?

Target Population: Tweets
Parameter of Interest: Mean length of real tweets

In [None]:
# Sample mean, standard deviation and size of tweet length for each class.
# Suffix 1 = real tweets (target == 1), suffix 2 = not-real tweets (target == 0).
length_real = dataset[dataset.target == 1]['length']
length_not_real = dataset[dataset.target == 0]['length']

mean1 , std1 , len1 = length_real.mean() , length_real.std() , length_real.shape[0]
mean2 , std2 , len2 = length_not_real.mean() , length_not_real.std() , length_not_real.shape[0]

((mean1 , std1 , len1) , (mean2 , std2 , len2))

###### From the given sample of data, with 95% confidence we estimate that the mean text length of real tweets is between 107.10 and 109.11 characters

In [None]:
# 95% normal-approximation confidence interval for the mean length of real tweets.
z_multiplier = 1.96  # two-sided 95% critical value of the standard normal
standard_error = np.sqrt(len1)
Margin_of_error = z_multiplier* std1/standard_error

lcb , ucb = mean1 - Margin_of_error , mean1 + Margin_of_error
(lcb , ucb)

###### From the given sample of data, with 95% confidence we estimate that the mean text length of not-real tweets is between 94.63 and 96.77 characters

In [None]:
# 95% normal-approximation confidence interval for the mean length of not-real tweets.
z_multiplier = 1.96  # two-sided 95% critical value of the standard normal
standard_error = np.sqrt(len2)
Margin_of_error = z_multiplier* std2/standard_error

lcb , ucb = mean2 - Margin_of_error , mean2 + Margin_of_error
(lcb , ucb)

Research Question: Is there a significant difference between the mean text length of real and not-real tweets?

Hypotheses: 

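$H_0: \mu_1 - \mu_2 = 0$

$H_1: \mu_1 - \mu_2 \neq 0$

where $\mu_1$ is the mean text length of real tweets (target = 1) and $\mu_2$ is the mean text length of not-real tweets (target = 0).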

alpha = 0.05, significance level


As the cell below shows, the p-value is less than 0.05. We therefore have enough evidence to reject the null hypothesis in favour of the alternative: there is a statistically significant difference between the mean text length of real and not-real tweets.

In [None]:
# Two-sample z-test for a difference in mean tweet length (unpooled standard error).
variance_term_real = std1**2/len1
variance_term_not_real = std2**2/len2
estimated_standard_error = np.sqrt(variance_term_real + variance_term_not_real)

test_statistic = (mean1 - mean2)/estimated_standard_error

# Two-sided p-value: twice the lower-tail probability of -|z|.
lower_tail = dist.norm.cdf(-np.abs(test_statistic))
p_val = 2*lower_tail
(test_statistic , p_val)

In [None]:
# Histogram of the number of whitespace-separated words per tweet: real (left) vs not real (right).
fig , (ax_real , ax_not_real) = plt.subplots(1 , 2 , figsize = (12,5))

words = dataset[dataset.target == 1]['text'].str.split().map(len)
sns.distplot(words , kde = False , color = 'blue' , ax = ax_real)

words = dataset[dataset.target == 0]['text'].str.split().map(len)
sns.distplot(words , kde = False , color = 'red' , ax = ax_not_real)

In [None]:
# Histogram of the mean word length per tweet: real (left) vs not real (right).
fig , (ax_real , ax_not_real) = plt.subplots(1 , 2 , figsize = (12,5))

words = dataset[dataset.target == 1]['text'].str.split().map(lambda tokens : np.mean([len(token) for token in tokens]))
sns.distplot(words , color = 'blue' , hist_kws = {'alpha':0.6} , kde = False , ax = ax_real)

words = dataset[dataset.target == 0]['text'].str.split().map(lambda tokens : np.mean([len(token) for token in tokens]))
sns.distplot(words , color = 'red' , hist_kws = {'alpha': 0.6} , kde = False , ax = ax_not_real)

In [None]:
def create_corpus(target):
    """Return all whitespace-separated tokens of the tweets with the given label.

    Reads the notebook-level `dataset` frame, keeps the rows whose `target`
    column equals `target`, and concatenates their tokens in row order.
    """
    tokens = []
    selected_text = dataset[dataset.target == target]['text']
    for words in selected_text.str.split():
        tokens.extend(words)
    return tokens

def create_corpus_df(tweet , target):
    """Return all whitespace-separated tokens of the rows of `tweet` with the given label.

    `tweet` must expose a `target` column and a `text` column; tokens are
    returned as one flat list in row order.
    """
    token_lists = tweet[tweet.target == target]['text'].str.split()
    return [token for tokens in token_lists for token in tokens]

#### Analyze not real tweets

In [None]:
# Most frequent English stopwords in the not-real (target == 0) tweets.
corpus = create_corpus(0)

# Only tokens that are stopwords are counted, hence "stopwords" in the title.
dic = Counter(word for word in corpus if word in stop)
top = dic.most_common(15)

plt.figure(figsize = (15,5))
x , y = zip(*top)
# x/y are passed by keyword: newer seaborn releases reject them as positional arguments.
sns.barplot(x = list(x) , y = list(y)).set(title = 'Frequency of Top 15 stopwords in non real tweets')

In [None]:
# Most frequent English stopwords in the real (target == 1) tweets.
plt.figure(figsize = (15,5))
corpus = create_corpus(1)

# Only tokens that are stopwords are counted, hence "stopwords" in the title.
dic = Counter(word for word in corpus if word in stop)
top = dic.most_common(15)

x, y = zip(*top)
# x/y are passed by keyword: newer seaborn releases reject them as positional arguments.
sns.barplot(x = list(x) , y = list(y)).set(title = 'Frequency of Top 15 stopwords for real tweets')
plt.show()

##### Analyze Punctuation

Let's first do it for real tweet

In [None]:
# Frequency of stand-alone punctuation tokens in the real (target == 1) tweets.
plt.figure(figsize = (12,6))
corpus = create_corpus(1)
# A set of single characters: testing `word in string.punctuation` directly is a
# substring test, which also matches multi-character tokens such as "()".
punctuation = set(string.punctuation)

dic = Counter(word for word in corpus if word in punctuation)
top = dic.most_common()

x,y = zip(*top)
# x/y are passed by keyword: newer seaborn releases reject them as positional arguments.
sns.barplot(x = list(x) , y = list(y)).set(title = 'Barplot for punctuation in real tweet')

In [None]:
# Frequency of stand-alone punctuation tokens in the not-real (target == 0) tweets.
plt.figure(figsize = (12,6))
corpus = create_corpus(0)
# A set of single characters: testing `word in string.punctuation` directly is a
# substring test, which also matches multi-character tokens such as "()".
punctuation = set(string.punctuation)

dic = Counter(word for word in corpus if word in punctuation)
top = dic.most_common()

x,y = zip(*top)
# x/y are passed by keyword: newer seaborn releases reject them as positional arguments.
sns.barplot(x = list(x) , y = list(y)).set(title = 'Barplot for punctuation in not real tweet')

In [None]:
def get_top_tweet_bigrams(corpus,n = None):
    """Return the most frequent bigrams of `corpus` as (bigram, count) pairs.

    `corpus` is an iterable of documents accepted by CountVectorizer.
    The pairs are sorted by descending count; `n` limits how many are
    returned (None returns all of them).
    """
    vectorizer = CountVectorizer(ngram_range=(2,2))
    vectorizer.fit(corpus)
    bigram_counts = vectorizer.transform(corpus)
    # Column sums give the total count of every bigram across all documents.
    totals = bigram_counts.sum(axis = 0)

    pairs = []
    for bigram , column in vectorizer.vocabulary_.items():
        pairs.append((bigram , totals[0 , column]))
    pairs.sort(key = lambda pair : pair[1] , reverse = True)
    return pairs[:n]


In [None]:
# Horizontal bar chart of the ten most frequent bigrams across all training tweets.
top_tweet_bigrams = get_top_tweet_bigrams(dataset['text'] , n = 10)
x = [pair[0] for pair in top_tweet_bigrams]  # bigram text
y = [pair[1] for pair in top_tweet_bigrams]  # occurrence count

plt.figure(figsize=(16,5))
sns.barplot(x=y,y=x)

In [None]:
def clean_text(text):
    '''Normalise a raw tweet string.

    Steps, in order: lowercase; drop text in square brackets; drop links
    (http(s)://... and www....); drop angle-bracket tags; drop ASCII
    punctuation; turn newlines into spaces; drop words containing digits.

    Parameters: text (str) - the raw text.
    Returns: str - the cleaned text. Whitespace left behind by removed
    pieces is not collapsed.
    '''
    text = text.lower()
    # Raw strings keep the regex escapes (\[, \S, \w, \d) from being read as
    # (invalid) Python string escapes.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    # Replace newlines with a space rather than deleting them, so the words on
    # either side are not glued together ("storm\n2020" must not become the
    # single digit-bearing word "storm2020", which the next rule would drop).
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

def combine_text(text):
    """Join an iterable of string tokens into one space-separated string."""
    separator = ' '
    return separator.join(text)

def text_preprocessing(text):
    """Clean a tweet, tokenize it and drop English stopwords.

    The text is passed through clean_text, split into runs of word
    characters, filtered against the notebook-level `stop` set, and joined
    back into a single space-separated string with combine_text.

    Parameters: text (str) - the raw tweet.
    Returns: str - the cleaned tokens joined by single spaces.
    """
    tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
    no_punc = clean_text(text)
    tokenized_text = tokenizer.tokenize(no_punc)
    # Use the precomputed `stop` set: calling stopwords.words('english') for
    # every token rebuilds the whole list each time and then scans it linearly.
    remove_stopwords = [w for w in tokenized_text if w not in stop]
    combined_text = combine_text(remove_stopwords)

    return combined_text

In [None]:
# Replace the 'text' column of both splits with its preprocessed form.
for frame in (dataset , test):
    frame['text'] = frame['text'].apply(text_preprocessing)

In [None]:
# The 20 most frequent keywords among real (target == 1) tweets.
top_keywords = dataset[dataset.target == 1].keyword.value_counts().head(20)

plt.figure(figsize = (12,5))
sns.barplot(y = top_keywords.index , x = top_keywords)

In [None]:
# The 20 most frequent keywords among not-real (target == 0) tweets.
top_keywords = dataset[dataset.target == 0].keyword.value_counts().head(20)

plt.figure(figsize = (12,5))
sns.barplot(y = top_keywords.index , x = top_keywords)

In [None]:
# Word clouds of the tweet text of each class, side by side.
# WordCloud is already imported in the notebook's import cell, so it is not re-imported here.
disaster_tweets = dataset[dataset.target == 1]['text']
not_disaster_tweet = dataset[dataset.target == 0]['text']

fig , (ax1 , ax2) = plt.subplots(1 , 2 , figsize = [26,8])

# A fixed random_state makes the word layout the same on every run.
wordcloud1 = WordCloud(background_color = 'black' , width = 600 , height = 400 , random_state = 42).generate(" ".join(disaster_tweets))
ax1.imshow(wordcloud1)
ax1.axis('off')
ax1.set_title('Disaster Tweets' , fontsize = 40)

wordcloud2 = WordCloud(background_color = 'black' , height = 400 , width = 600 , random_state = 42).generate(" ".join(not_disaster_tweet))
ax2.imshow(wordcloud2)
ax2.axis('off')
ax2.set_title('Not Disaster Tweets' , fontsize = 40)

In [None]:
# Bag-of-words features: the vocabulary is learned from the training text only,
# then the same fitted vectorizer encodes the test text.
count_vectorizer = CountVectorizer()
train_vectors = count_vectorizer.fit_transform(dataset['text'])
test_vectors = count_vectorizer.transform(test['text'])

In [None]:
# TF-IDF features over unigrams and bigrams (ngram_range=(1, 2)), with the
# document-frequency limits min_df=2 and max_df=0.5. Fitted on the training
# text only; the fitted vectorizer then encodes the test text.
tfidf_vectorizer = TfidfVectorizer(min_df = 2, max_df = 0.5, ngram_range = (1 , 2))
train_tfidf = tfidf_vectorizer.fit_transform(dataset['text'])
test_tfidf = tfidf_vectorizer.transform(test['text'])

In [None]:
# Logistic regression on the bag-of-words features: 5-fold cross-validated F1.
clf = LogisticRegression(C = 1.0)
score = model_selection.cross_val_score(
    estimator = clf ,
    X = train_vectors ,
    y = dataset['target'] ,
    cv = 5 ,
    scoring = 'f1' ,
)
score

In [None]:
# Fit the logistic regression on the full bag-of-words training matrix.
clf.fit(X = train_vectors , y = dataset['target'])

In [None]:
# Logistic regression on the TF-IDF features: 5-fold cross-validated F1.
clf_tfidf = LogisticRegression(C = 1.0)
scores = model_selection.cross_val_score(
    estimator = clf_tfidf ,
    X = train_tfidf ,
    y = dataset['target'] ,
    cv = 5 ,
    scoring = 'f1' ,
)
scores

In [None]:
# Multinomial naive Bayes on the bag-of-words features: 5-fold cross-validated F1.
clf_NB = MultinomialNB()
scores = model_selection.cross_val_score(
    estimator = clf_NB ,
    X = train_vectors ,
    y = dataset['target'] ,
    cv = 5 ,
    scoring = 'f1' ,
)
scores

In [None]:
# Fit the naive Bayes model on the full bag-of-words training matrix.
clf_NB.fit(X = train_vectors , y = dataset['target'])

In [None]:
# Multinomial naive Bayes on the TF-IDF features: 5-fold cross-validated F1.
clf_NB_tfidf = MultinomialNB()
scores = model_selection.cross_val_score(
    estimator = clf_NB_tfidf ,
    X = train_tfidf ,
    y = dataset['target'] ,
    cv = 5 ,
    scoring = 'f1' ,
)
scores

In [None]:
# Fit the naive Bayes model on the full TF-IDF training matrix.
clf_NB_tfidf.fit(X = train_tfidf , y = dataset['target'])

In [None]:
# Gradient-boosted trees on the bag-of-words features: 5-fold cross-validated F1.
xgb_params = dict(
    max_depth = 7 ,
    n_estimators = 200 ,
    colsample_bytree = 0.8 ,
    subsample = 0.8 ,
    nthread = 10 ,
    learning_rate = 0.1 ,
)
clf_xgb = xgb.XGBClassifier(**xgb_params)
scores = model_selection.cross_val_score(
    estimator = clf_xgb ,
    X = train_vectors ,
    y = dataset['target'] ,
    cv = 5 ,
    scoring = 'f1' ,
)
scores

In [None]:
# Gradient-boosted trees on the TF-IDF features: 5-fold cross-validated F1.
xgb_params = dict(
    max_depth = 7 ,
    n_estimators = 200 ,
    colsample_bytree = 0.8 ,
    subsample = 0.8 ,
    nthread = 10 ,
    learning_rate = 0.1 ,
)
clf_xgb_tfidf = xgb.XGBClassifier(**xgb_params)
scores = model_selection.cross_val_score(
    estimator = clf_xgb_tfidf ,
    X = train_tfidf ,
    y = dataset['target'] ,
    cv = 5 ,
    scoring = 'f1' ,
)
scores

In [None]:
def submission(submission_file_path,model,test_vectors,output_path="submission.csv"):
    """Write a submission CSV with the model's predictions.

    Parameters:
        submission_file_path: path of the sample submission CSV, used as the template.
        model: object with a `predict` method; it is called with `test_vectors`.
        test_vectors: features passed unchanged to `model.predict`.
        output_path: where the filled-in CSV is written (defaults to
            "submission.csv" in the working directory, as before).

    The template's "target" column is set to the predictions and the frame is
    saved without the index column. Returns None.
    """
    sample_submission = pd.read_csv(submission_file_path)
    sample_submission["target"] = model.predict(test_vectors)
    sample_submission.to_csv(output_path, index=False)

In [None]:
# Write the submission using the naive Bayes model trained on TF-IDF features.
submission_file_path = "../input/nlp-getting-started/sample_submission.csv"
# Pass the TF-IDF test matrix directly instead of rebinding `test_vectors`,
# which already names the bag-of-words test matrix built earlier in the notebook.
submission(submission_file_path,clf_NB_tfidf,test_tfidf)