# Importing Libraries 

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#Text Preprocessing libraries
import nltk
nltk.download('stopwords')
import re 
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from wordcloud import WordCloud,STOPWORDS
import en_core_web_sm
nlp = en_core_web_sm.load()


from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from sklearn.metrics import accuracy_score,confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

# Suppress warnings 
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Read the three competition CSV files from the Kaggle input directory.
INPUT_DIR = '/kaggle/input/nlp-getting-started'
train, test, submission = (
    pd.read_csv(f'{INPUT_DIR}/{stem}.csv')
    for stem in ('train', 'test', 'sample_submission')
)

In [None]:
# Preview the first ten rows of the training set.
train.iloc[:10]

In [None]:
# Preview the first ten rows of the test set.
test.iloc[:10]

In [None]:
# Show the raw text of the tweet whose index label is 152.
train.loc[152, 'text']

# Exploratory Data Analysis (EDA)

Exploratory Data Analysis refers to the critical process of performing initial investigations on data so as to discover patterns, spot anomalies, test hypotheses and check assumptions with the help of summary statistics and graphical representations.

Checking shape of train and test datasets. Note that the test dataset does not have 'target' column.

In [None]:
# Report (rows, columns) for both frames.
for caption, frame in (('Shape of Training data:-', train),
                       ('Shape of Test data:-', test)):
    print(caption, frame.shape)

Checking which columns contain NaN (missing) values. 'location' is missing in many rows of both the train and test data sets.



In [None]:
# Number of missing values in each column of the training set.
train.isna().sum()

In [None]:
# Number of missing values in each column of the test set.
test.isna().sum()

In [None]:
#train.drop(columns=['keyword','location'],axis='columns',inplace=True)
#test.drop(columns=['keyword','location'],axis='columns',inplace=True)

In [None]:
# How many tweets carry each target label.
train.target.value_counts()

In [None]:
# Summary statistics of the training frame, using pandas' default `describe()` settings.
train.describe()

In [None]:
# Prettier graphs: switch matplotlib to its 'ggplot' style sheet.
# The setting is global, so figures drawn after this call use that style.
plt.style.use('ggplot')

In [None]:
# Bar chart of the number of tweets per target class.
target_counts = train['target'].value_counts()
sns.barplot(x=target_counts.index, y=target_counts)
plt.title("Counting the values in target column")
plt.ylabel('Sample')
plt.xlabel('Target')


In [None]:
# Class balance as a pie chart.
# value_counts() orders its result by frequency, not by label, so the wedge
# order is pinned to the label values (0, 1). This keeps every wedge paired
# with the right name and colour even if the class balance were to flip.
class_order = [0, 1]
my_labels = ['Non-Disaster', 'Disaster']   # names for labels 0 and 1, in that order
my_color = ['Blue', 'Green']
class_sizes = train['target'].value_counts().reindex(class_order, fill_value=0)

plt.figure(figsize=(15, 7))
plt.pie(class_sizes, labels=my_labels, colors=my_color, autopct='%1.1f%%')
plt.legend()
plt.show()

In [None]:
# Text of the tweets labelled as disasters (target == 1); show the first ten.
my_disaster_tweets = train.loc[train['target'] == 1, 'text']
my_disaster_tweets.head(10)

In [None]:
# Text of the tweets labelled as non-disasters (target == 0); show the first ten.
non_disaster_tweets = train.loc[train['target'] == 0, 'text']
non_disaster_tweets.head(10)

# WordCloud

In [None]:
# Word cloud built from all tweets with target == 1, joined into one string.
disaster_text = " ".join(train.loc[train.target == 1, 'text'])
plt.figure(figsize=(15, 10))
wc = WordCloud(
    max_words=500,
    background_color='White',
    width=1000,
    height=500,
    stopwords=STOPWORDS,
).generate(disaster_text)
plt.imshow(wc, interpolation='bilinear')

In [None]:
# Word cloud built from all tweets with target == 0, joined into one string.
non_disaster_text = " ".join(train.loc[train.target == 0, 'text'])
plt.figure(figsize=(15, 10))
wc = WordCloud(
    max_words=500,
    background_color='White',
    width=1000,
    height=500,
    stopwords=STOPWORDS,
).generate(non_disaster_text)
plt.imshow(wc, interpolation='bilinear')

Let's start by analysing total number of characters in text.

# Character Length

In [None]:
# Histogram of tweet length in characters, one panel per class.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(18, 5))

char_len = train.loc[train['target'] == 1, 'text'].str.len()
ax1.hist(char_len, color='#db680f', edgecolor='black')
ax1.set_title('Disaster Tweets')

char_len2 = train.loc[train['target'] == 0, 'text'].str.len()
ax2.hist(char_len2, color='#03639e', edgecolor='black')
ax2.set_title('Non-Disaster Tweets')   # was misspelled 'Non-Disater'

# Label the axes so each panel can be read on its own.
for ax in (ax1, ax2):
    ax.set_xlabel('Characters per tweet')
    ax.set_ylabel('Number of tweets')

plt.suptitle("Length of Characters in text", fontsize=20)
plt.show()

# Analysing number of words in text.

In [None]:
# Histogram of the number of whitespace-separated words per tweet, one panel per class.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(18, 5))

# These series hold word counts (not character lengths), hence the names.
word_count_dis = train.loc[train['target'] == 1, 'text'].str.split().map(len)
ax1.hist(word_count_dis, color='#c40a0d', edgecolor='black')
ax1.set_title('Disaster Tweets')

word_count_ndis = train.loc[train['target'] == 0, 'text'].str.split().map(len)
ax2.hist(word_count_ndis, color='#0893a6', edgecolor='black')
ax2.set_title('Non-Disaster Tweets')   # was misspelled 'Non-Disater'

for ax in (ax1, ax2):
    ax.set_xlabel('Words per tweet')
    ax.set_ylabel('Number of tweets')

# The figure shows how many words a tweet has, not how long the words are.
plt.suptitle("Number of words in text", fontsize=20)
plt.show()

From the above histograms, it can be observed that the word counts of both disaster and non-disaster tweets are mostly in the range of 15-20.

# Average word length

In [None]:
# Distribution of the mean word length (characters per word) of each tweet,
# one panel per class.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(18, 5))

# For every tweet: split on whitespace, then record the length of each word.
char_len_dis = train.loc[train['target'] == 1, 'text'].str.split().apply(
    lambda words: [len(word) for word in words])
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; consider
# sns.histplot(..., kde=True) once the minimum seaborn version is confirmed.
sns.distplot(char_len_dis.map(np.mean), ax=ax1, color='green')
ax1.set_title("Disaster Tweets")

char_len_ndis = train.loc[train['target'] == 0, 'text'].str.split().apply(
    lambda words: [len(word) for word in words])
sns.distplot(char_len_ndis.map(np.mean), ax=ax2, color='red')
ax2.set_title("Non-Disaster Tweets")

for ax in (ax1, ax2):
    ax.set_xlabel('Mean word length (characters)')

# The plotted quantity is the average word length, not a word count.
plt.suptitle("Average word length", fontsize=20)
plt.show()

From the above distributions, it can be observed that the average word length for disaster tweets is in the range 7-7.5, while for non-disaster tweets it is in the range 4.5-5.

**Defaultdict** is a dictionary-like container in the `collections` module. It is a subclass of `dict` that creates a default value for a missing key (for example `0` for `defaultdict(int)`) instead of raising a `KeyError`.

"**Corpus** is a large collection of texts. It is a body of written or spoken material upon which a linguistic analysis is based. "

In [None]:
def sample_corpus(target, data=None):
  """Return every whitespace-separated token of the tweets in one class.

  Parameters
  ----------
  target : value compared against the 'target' column to select rows.
  data : DataFrame with 'text' and 'target' columns, optional.
      Defaults to the notebook-level ``train`` frame, so existing
      ``sample_corpus(target)`` calls keep working.

  Returns
  -------
  list of str
      Tokens in row order; empty when no row matches ``target``.
  """
  if data is None:
    data = train  # backward-compatible default: the global training frame
  # Rows with missing text are skipped; NaN cannot be split into tokens.
  texts = data.loc[data['target'] == target, 'text'].dropna()
  return [token for tokens in texts.str.split() for token in tokens]

In [None]:
from collections import Counter, defaultdict

def stopwords_analysis(data,func,target):
  """Plot the 20 most frequent English stop words for each target label.

  Parameters
  ----------
  data : unused; kept so existing calls keep working. The tokens come
      from ``func``.
  func : callable taking one target label and returning an iterable of tokens.
  target : sequence of target labels; one horizontal bar chart is drawn per
      label, left to right in the given order.

  Tokens are lower-cased and kept only if they appear in NLTK's English
  stop-word list, so the chart shows stop words as its title says.
  """
  stop_set = set(stopwords.words('english'))
  # Title and colour are looked up by label value rather than by position, so
  # passing the labels in another order cannot mislabel a panel.
  panel_titles = {0: "Non-Disaster Tweets", 1: "Disaster Tweets"}
  panel_colors = {0: 'b', 1: 'red'}

  # squeeze=False keeps `axes` two-dimensional even for a single label.
  fig, axes = plt.subplots(1, len(target), figsize=(15,8), squeeze=False)
  for ax, label in zip(axes[0], target):
    counts = Counter(
        token.lower() for token in func(label) if token.lower() in stop_set
    )
    top = counts.most_common(20)
    if top:  # zip(*[]) cannot be unpacked; leave the panel empty instead
      words, freqs = zip(*top)
      ax.barh(words, freqs, color=panel_colors.get(label, 'gray'))
    ax.set_title(panel_titles.get(label, "target = {}".format(label)))

  plt.suptitle("Top Stop words in text")
  plt.show()

stopwords_analysis(train,sample_corpus,[0,1])

In [None]:
import string

def punctuation_analysis(data,func,target):
    """Plot how often each punctuation character occurs for each target label.

    Parameters
    ----------
    data : unused; kept so existing calls keep working. The tokens come
        from ``func``.
    func : callable taking one target label and returning an iterable of tokens.
    target : sequence of target labels; one bar chart is drawn per label,
        left to right in the given order.

    Every character of every token is checked against ``string.punctuation``.
    Testing whole tokens with ``token in string.punctuation`` would be a
    substring test: it counts runs such as "=>" or "/:" and misses
    punctuation attached to words.
    """
    from collections import Counter  # local import keeps this cell self-contained

    punctuation_chars = set(string.punctuation)
    panel_titles = {0: "Non-Disaster Tweets", 1: "Disaster Tweets"}
    panel_colors = {0: "b", 1: "red"}

    # squeeze=False keeps `axes` two-dimensional even for a single label.
    fig, axes = plt.subplots(1, len(target), figsize=(15,5), squeeze=False)
    for ax, label in zip(axes[0], target):
        counts = Counter(
            char
            for token in func(label)
            for char in token
            if char in punctuation_chars
        )
        if counts:  # zip(*{}.items()) cannot be unpacked; leave the panel empty
            marks, freqs = zip(*counts.items())
            ax.bar(marks, freqs, color=panel_colors.get(label, "gray"),
                   edgecolor='black', linewidth=1.2)
        ax.set_title(panel_titles.get(label, "target = {}".format(label)))

    plt.suptitle("Punctuations in text")
    plt.show()



punctuation_analysis(train,sample_corpus,[0,1])

The above bar charts display the top 10 punctuation marks in tweets. From the bar charts, it is observed that the most frequently occurring punctuation mark in both disaster and non-disaster tweets is "-" (350+), while the least frequently occurring ones are "%", "/:", "$", "_" for non-disaster tweets and "=>", ")" for disaster tweets.

In [None]:
# Checking Null values: share of missing entries per column, train vs. test.
missing_train = train.isnull().sum()
missing_test = test.isnull().sum()
# Keep only columns that have missing values, smallest count first.
missing_train = missing_train[missing_train>0].sort_values()
missing_test = missing_test[missing_test>0].sort_values()

fig, (ax1,ax2) = plt.subplots(1,2,figsize=(15,5))
pie_panels = (
    (ax1, missing_train, "Null values present in Train Dataset", ['red','#afe84d']),
    (ax2, missing_test, "Null values present in Test Dataset", ['Red','#6c1985']),
)
for ax, missing, title, wedge_colors in pie_panels:
    if len(missing):
        # Labels and explode are derived from the data, so the chart stays
        # correct whichever columns have gaps. Only the first (smallest)
        # wedge is pulled out.
        explode = [0.9] + [0] * (len(missing) - 1)
        ax.pie(missing, autopct='%1.1f%%', startangle=30, explode=explode,
               labels=missing.index, colors=wedge_colors)
    ax.set_title(title)

plt.suptitle("Distribution of Null Values in Dataset")
plt.tight_layout()
plt.show()

In [None]:
!pip install contractions

In [None]:
import contractions
from nltk.stem import SnowballStemmer
nltk.download('wordnet')
nltk.download('punkt')

stop_words=nltk.corpus.stopwords.words('english')
wnl=WordNetLemmatizer()
stemmer=SnowballStemmer('english')  # not used below; kept for the optional stemming variant

STOP_WORD_SET = set(stop_words)  # set membership is O(1) per token
URL_RE = re.compile(r'https?://\S+|www\.\S+')
HTML_TAG_RE = re.compile(r'<.*?>')
# The flags must be passed as flags. The fourth positional argument of re.sub
# is `count`, so re.sub(pattern, '', doc, re.I|re.A) silently capped the number
# of replacements and applied no flags at all.
NON_LETTER_RE = re.compile(r'[^a-zA-Z\s]', flags=re.I|re.A)


def clean_text(doc):
  """Normalise one tweet and return the cleaned text as a single string.

  Steps, in order: drop URLs, drop HTML tags, drop every character that is
  not an ASCII letter or whitespace, lower-case and lemmatise each word,
  expand contractions, tokenise, and remove English stop words.
  """
  doc = URL_RE.sub('', doc)
  doc = HTML_TAG_RE.sub('', doc)
  doc = NON_LETTER_RE.sub('', doc)
  doc = ' '.join(wnl.lemmatize(word) for word in doc.lower().split())
  doc = contractions.fix(doc)
  tokens = nltk.word_tokenize(doc)
  return ' '.join(token for token in tokens if token not in STOP_WORD_SET)


# Assign whole columns. The element-wise `train.text[i] = doc` was chained
# assignment, which does not update the frame under pandas copy-on-write, and
# it used a positional counter as an index label.
train['text'] = train['text'].apply(clean_text)
test['text'] = test['text'].apply(clean_text)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer(ngram_range=(1,1))

#    ngram_range of (1, 1) means only unigrams, (1, 2) means unigrams and bigrams,
#    and (2, 2) means only bigrams.

# Bag-of-words counts: the vocabulary is learned from the training text only,
# then reused to encode the test text.
cv_matrix=cv.fit_transform(train.text).toarray()

# get_feature_names() was removed in scikit-learn 1.2. Use its replacement
# when available and fall back on older releases.
if hasattr(cv,'get_feature_names_out'):
    cv_features=cv.get_feature_names_out()
else:
    cv_features=cv.get_feature_names()

train_df=pd.DataFrame(cv_matrix,columns=cv_features)
test_df=pd.DataFrame(cv.transform(test.text).toarray(),columns=cv_features)
train_df.head()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf=TfidfVectorizer(ngram_range=(1,1),use_idf=True)

# TF-IDF weights: vocabulary and IDF are learned from the training text only,
# then reused to encode the test text.
mat=tfidf.fit_transform(train.text).toarray()

# get_feature_names() was removed in scikit-learn 1.2. Use its replacement
# when available and fall back on older releases.
if hasattr(tfidf,'get_feature_names_out'):
    tfidf_features=tfidf.get_feature_names_out()
else:
    tfidf_features=tfidf.get_feature_names()

train_df=pd.DataFrame(mat,columns=tfidf_features)
test_df=pd.DataFrame(tfidf.transform(test.text).toarray(),columns=tfidf_features)
train_df.head()

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

# Hold-out estimate. The training-set score below is measured on the same rows
# the model was fitted on, so on its own it says little about generalisation.
# The split is stratified and seeded so the number is reproducible.
# (The vectoriser was fitted on all training rows, so this estimate is still
# slightly optimistic.)
X_fit, X_val, y_fit, y_val = train_test_split(
    train_df, train.target, test_size=0.2, stratify=train.target, random_state=42
)
val_model = LogisticRegression().fit(X_fit, y_fit)
print('Validation F1:', f1_score(y_val, val_model.predict(X_val)))

# Final model: fitted on every training row and used for the submission.
model=LogisticRegression()
model.fit(train_df,train.target)
# f1_score expects (y_true, y_pred) in that order.
print('Training F1:', f1_score(train.target, model.predict(train_df)))
pred=model.predict(test_df)

In [None]:
# Pair each test id with its predicted label and write the submission file.
submission_frame = pd.DataFrame({'id': test.id, 'target': pred})
submission_frame.to_csv('submission.csv', index=False)