### Import libraries

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

from sklearn.feature_extraction.text import CountVectorizer

from nltk.corpus import stopwords
from nltk.util import ngrams
from nltk.tokenize import word_tokenize



In [None]:
import re
import os
from tqdm import tqdm

import string
from collections import defaultdict
from collections import  Counter

In [None]:
import nltk
# nltk.download('stopwords')

In [None]:
# Apply the ggplot style to every matplotlib figure drawn below.
plt.style.use('ggplot')
# English stop words from NLTK, held in a set for fast membership tests.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be downloaded first — confirm.
stop=set(stopwords.words('english'))

### Basic EDA 

In [None]:
# Read the train and test splits from the local data directory.
tweets_train, tweets_test = (
    pd.read_csv(csv_path) for csv_path in ('data/train.csv', 'data/test.csv')
)
# Peek at the first three training rows.
tweets_train.head(3)

In [None]:
# Report the dimensions of both splits; the second line describes the test
# set (it was previously mislabelled as "train").
print('There are {} rows and {} columns in train'.format(*tweets_train.shape))
print('There are {} rows and {} columns in test'.format(*tweets_test.shape))

#### Class distribution

In [None]:
# Count how many training tweets fall into each target class.
x = tweets_train.target.value_counts()
# Pass the vectors by keyword: seaborn >= 0.12 no longer accepts x/y
# positionally. Plain values are used for the heights so the bars are
# paired with the class labels by position.
sns.barplot(x=x.index, y=x.values)
plt.gca().set_ylabel('samples')

#### Number of characters in tweets

In [None]:
# Side-by-side histograms of tweet length in characters, one panel per class.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))
panels = (
    (ax1, 1, 'red', 'disaster tweets'),
    (ax2, 0, 'green', 'Not disaster tweets'),
)
for axis, label, colour, title in panels:
    in_class = tweets_train['target'] == label
    tweet_len = tweets_train[in_class]['text'].str.len()
    axis.hist(tweet_len, color=colour)
    axis.set_title(title)
fig.suptitle('Characters in tweets')
plt.show()

In [None]:
# Side-by-side histograms of the number of whitespace-separated words per
# tweet, one panel per class.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))
panels = (
    (ax1, 1, 'red', 'disaster tweets'),
    (ax2, 0, 'green', 'Not disaster tweets'),
)
for axis, label, colour, title in panels:
    in_class = tweets_train['target'] == label
    tweet_len = tweets_train[in_class]['text'].str.split().map(len)
    axis.hist(tweet_len, color=colour)
    axis.set_title(title)
fig.suptitle('Words in a tweet')
plt.show()

In [None]:
# Distribution of the mean word length per tweet, one panel per class.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 5))
panels = (
    (ax1, 1, 'red', 'disaster'),
    (ax2, 0, 'green', 'Not disaster'),
)
for axis, label, colour, title in panels:
    tokens = tweets_train[tweets_train['target'] == label]['text'].str.split()
    # Each tweet becomes the list of its word lengths ...
    word = tokens.apply(lambda parts: [len(part) for part in parts])
    # ... which is then averaged to one number per tweet before plotting.
    sns.distplot(word.map(lambda lengths: np.mean(lengths)), ax=axis, color=colour)
    axis.set_title(title)
fig.suptitle('Average word length in each tweet')


#### Common stopwords in tweets

In [None]:
def create_corpus(target):
    """Collect every whitespace-separated token from tweets of one class.

    Reads the module-level ``tweets_train`` frame, keeps the rows whose
    ``target`` column equals ``target`` and splits their ``text`` on
    whitespace.

    Parameters
    ----------
    target : value compared against the ``target`` column.

    Returns
    -------
    list
        All tokens of the selected tweets, in row order, as one flat list.
    """
    class_texts = tweets_train.loc[tweets_train['target'] == target, 'text']
    return [token for tokens in class_texts.str.split() for token in tokens]

In [None]:
# Tokens of every tweet with target 0.
corpus = create_corpus(0)

# Tally only the tokens that are English stop words.
dic = defaultdict(int)
for word in corpus:
    if word not in stop:
        continue
    dic[word] += 1

# Ten most frequent stop words, highest count first (ties keep first-seen order).
top = sorted(dic.items(), key=lambda pair: -pair[1])[:10]

In [None]:
# Split the (word, count) pairs into bar labels and bar heights, then plot them.
x,y=zip(*top)
plt.bar(x,y)

In [None]:
# Tokens of every tweet with target 1.
corpus = create_corpus(1)

# Tally only the tokens that are English stop words.
dic = defaultdict(int)
for word in corpus:
    if word not in stop:
        continue
    dic[word] += 1

# Ten most frequent stop words, highest count first (ties keep first-seen order).
top = sorted(dic.items(), key=lambda pair: -pair[1])[:10]

# Split the (word, count) pairs into bar labels and bar heights, then plot them.
x, y = zip(*top)
plt.bar(x, y)

#### Punctuation analysis

In [None]:
import string

plt.figure(figsize=(10,5))
# Tokens of every tweet with target 1.
corpus = create_corpus(1)

special = string.punctuation

# Count punctuation character by character. Testing a whole token with
# `token in special` is a substring check against the punctuation string:
# it skips punctuation attached to words ("fire!") and treats runs such as
# "()" as their own key.
dic = defaultdict(int)
for token in corpus:
    for char in token:
        if char in special:
            dic[char] += 1

# zip(*...) cannot be unpacked into two names when nothing was counted.
if dic:
    x, y = zip(*dic.items())
    plt.bar(x, y)

In [None]:
import string

plt.figure(figsize=(10,5))
# Tokens of every tweet with target 0.
corpus = create_corpus(0)

special = string.punctuation

# Count punctuation character by character. Testing a whole token with
# `token in special` is a substring check against the punctuation string:
# it skips punctuation attached to words ("fire!") and treats runs such as
# "()" as their own key.
dic = defaultdict(int)
for token in corpus:
    for char in token:
        if char in special:
            dic[char] += 1

# zip(*...) cannot be unpacked into two names when nothing was counted.
if dic:
    x, y = zip(*dic.items())
    plt.bar(x, y, color='green')

### Data Cleaning 

Further data cleaning is done in the modelling notebooks.

In [None]:
# !pip install pyspellchecker

from spellchecker import SpellChecker

# Single spell-checker instance, read as a global by correct_spellings.
spell = SpellChecker()

def correct_spellings(text):
    """Return ``text`` with words unknown to the spell checker replaced.

    The text is split on whitespace, each word reported by
    ``spell.unknown`` is swapped for ``spell.correction(word)``, and the
    words are re-joined with single spaces.

    Parameters
    ----------
    text : str
        Text to correct.

    Returns
    -------
    str
        The corrected text; words without a suggestion are left unchanged.
    """
    words = text.split()
    misspelled_words = spell.unknown(words)
    corrected_text = [
        # correction() can return None when it has no candidate (seen in
        # newer pyspellchecker releases); fall back to the original word
        # so that the join below never receives None.
        (spell.correction(word) or word) if word in misspelled_words else word
        for word in words
    ]
    return " ".join(corrected_text)
        
# Quick check of the spelling correction on a deliberately misspelled phrase.
text = "corect me plese"
correct_spellings(text)

#### N-gram analysis

In [None]:
def get_top_tweet_ngrams(corpus, n=None, start_range = 1 , end_range = 1):
    """Return the most frequent n-grams in ``corpus`` with stop words removed.

    Parameters
    ----------
    corpus : iterable of str
        Documents to analyse, e.g. a pandas Series of tweet texts.
    n : int or None
        Number of (ngram, count) pairs to return; ``None`` returns all.
    start_range : int
        Lower bound of the n-gram size (first element of ``ngram_range``).
    end_range : int
        Upper bound of the n-gram size (second element of ``ngram_range``).

    Returns
    -------
    list of tuple
        ``(ngram, count)`` pairs sorted by count, highest first.
    """
    vec = CountVectorizer(
        # Honour the caller's range; it used to be hard-coded to (2, 2).
        ngram_range=(start_range, end_range),
        # Recent scikit-learn releases only accept 'english', a list or
        # None here, so the module-level stop-word set is converted.
        stop_words=list(stop),
    )
    # Learn the vocabulary and build the document-term matrix in one pass.
    bag_of_words = vec.fit_transform(corpus)
    # Column sums give the total occurrences of each n-gram in the corpus.
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]

In [None]:
plt.figure(figsize=(10, 5))
# Ten most frequent bigrams of the training tweets after stop-word removal.
top_tweet_bigrams = get_top_tweet_ngrams(
    tweets_train['text'], n=10, start_range=2, end_range=2
)
# Separate the (bigram, count) pairs into two lists for a horizontal bar plot.
x, y = (list(column) for column in zip(*top_tweet_bigrams))
sns.barplot(x=y, y=x)
plt.title('Top bigrams in the train text')

In [None]:
def get_top_tweet_ngrams_full(corpus, n=None, start_range = 1 , end_range = 1):
    """Return the most frequent n-grams in ``corpus``, stop words included.

    Parameters
    ----------
    corpus : iterable of str
        Documents to analyse, e.g. a pandas Series of tweet texts.
    n : int or None
        Number of (ngram, count) pairs to return; ``None`` returns all.
    start_range : int
        Lower bound of the n-gram size (first element of ``ngram_range``).
    end_range : int
        Upper bound of the n-gram size (second element of ``ngram_range``).

    Returns
    -------
    list of tuple
        ``(ngram, count)`` pairs sorted by count, highest first.
    """
    # Honour the caller's range; it used to be hard-coded to (2, 2).
    vec = CountVectorizer(ngram_range=(start_range, end_range))
    # Learn the vocabulary and build the document-term matrix in one pass.
    bag_of_words = vec.fit_transform(corpus)
    # Column sums give the total occurrences of each n-gram in the corpus.
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]

In [None]:
plt.figure(figsize=(10, 5))
# Ten most frequent bigrams of the raw training tweets (stop words kept).
top_tweet_bigrams = get_top_tweet_ngrams_full(
    tweets_train['text'], n=10, start_range=2, end_range=2
)
# Separate the (bigram, count) pairs into two lists for a horizontal bar plot.
x, y = (list(column) for column in zip(*top_tweet_bigrams))
sns.barplot(x=y, y=x)