In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

import nltk
import re
import string
from nltk.corpus import stopwords
import gensim
from gensim import parsing
from wordcloud import WordCloud,STOPWORDS

from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer
from sklearn.metrics import accuracy_score,confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC


In [None]:
# Load the training CSV from the competition input directory.
train = pd.read_csv(filepath_or_buffer='/kaggle/input/nlp-getting-started/train.csv')

In [None]:
# Load the competition test CSV (used later to build the submission).
test = pd.read_csv(filepath_or_buffer="/kaggle/input/nlp-getting-started/test.csv")

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Summarise the columns, dtypes and non-null counts of the training frame.
train.info()

In [None]:
# Count the missing values in each column.
train.isna().sum()

In [None]:
# Drop the columns that are not used further in this notebook.
# Reassigning (instead of inplace=True) and ignoring columns that are already
# gone keeps this cell safe to re-run without a KeyError.
train = train.drop(columns=['keyword', 'location'], errors='ignore')

In [None]:
# Number of rows per target class.
train.target.value_counts()

**Data Visualization**

In [None]:
# Pass the column by keyword: recent seaborn releases treat the first positional
# argument as `data`, so a positional Series is no longer read as the x variable.
plot = sns.countplot(x='target', data=train)
plot.set_title("Count of disaster and non disaster tweets")

The dataset is balanced.

In [None]:
mylabels=["Non-Disaster", "Disaster"]
mycolors=['pink', 'blue']
# value_counts() orders classes by frequency, not by label. Reindex to [0, 1] so
# the first slice is always class 0 (non-disaster) and the second class 1
# (disaster), matching the order of `mylabels` regardless of which is larger.
class_counts = train['target'].value_counts().reindex([0, 1], fill_value=0)
plt.pie(class_counts, labels=mylabels, colors=mycolors, autopct='%1.1f%%')
plt.legend()
plt.show()

**WordCloud for Disaster Tweets**

In [None]:
# Word cloud built from the text of every row whose target is 1.
plt.figure(figsize=(15, 15))
disaster_text = " ".join(train.loc[train['target'] == 1, 'text'])
wc = WordCloud(
    stopwords=STOPWORDS,
    max_words=500,
    width=1000,
    height=500,
).generate(disaster_text)
plt.imshow(wc, interpolation='bilinear')

**WordCloud for Non-Disaster Tweets**

In [None]:
# Word cloud built from the text of every row whose target is 0.
plt.figure(figsize=(15, 15))
non_disaster_text = " ".join(train.loc[train['target'] == 0, 'text'])
wc = WordCloud(
    stopwords=STOPWORDS,
    max_words=500,
    width=1000,
    height=500,
).generate(non_disaster_text)
plt.imshow(wc, interpolation='bilinear')

**Data Preprocessing**

In [None]:
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return the NLTK English stop words as a frozenset, loading them only once."""
    if "english" not in _STOPWORD_CACHE:
        _STOPWORD_CACHE["english"] = frozenset(stopwords.words("english"))
    return _STOPWORD_CACHE["english"]


def transformText(text):
    """Normalise one piece of text for bag-of-words modelling.

    Steps, in order: lower-case; replace non-ASCII characters with a space;
    delete ``[...]`` spans; delete the literal substring ``http``; replace
    non-alphanumeric characters; collapse whitespace; drop English stop words
    and tokens shorter than 3 characters; strip punctuation and digits;
    collapse whitespace again; stem.

    Parameters
    ----------
    text : str
        Raw text.

    Returns
    -------
    str
        The cleaned, stemmed text (tokens separated by single spaces).
    """
    # Loaded once and reused; the original rebuilt the set on every call.
    stops = _english_stopwords()
    # Convert text to lower
    text = text.lower()
    # Replace non-ASCII chars with a space
    text = re.sub(r'[^\x00-\x7f]', r' ', text)
    # Remove bracketed spans. Raw string: '\[' in a normal string literal is an
    # invalid escape sequence and triggers a warning on recent Python versions.
    text = re.sub(r'\[[^]]*\]', '', text)
    # NOTE(review): this removes only the literal "http", so the rest of a URL
    # survives as tokens. Left unchanged here so results stay comparable.
    text = re.sub('http', '', text)
    text = gensim.parsing.preprocessing.strip_non_alphanum(text)
    # Strip multiple whitespaces
    text = gensim.parsing.preprocessing.strip_multiple_whitespaces(text)
    # Drop stop words and tokens with fewer than 3 characters. The length check
    # is done inline rather than via gensim.corpora.textcorpus.remove_short
    # (NOTE(review): that helper appears to be renamed in newer gensim - confirm).
    filtered_words = [
        word for word in text.split()
        if word not in stops and len(word) >= 3
    ]
    text = " ".join(filtered_words)
    # Remove the punctuation
    text = gensim.parsing.preprocessing.strip_punctuation2(text)
    # Strip all the numerics
    text = gensim.parsing.preprocessing.strip_numeric(text)
    # Strip multiple whitespaces
    text = gensim.parsing.preprocessing.strip_multiple_whitespaces(text)
    # Stemming
    return gensim.parsing.preprocessing.stem_text(text)

In [None]:
# Replace each raw tweet with its cleaned, stemmed form.
train['text'] = train['text'].map(transformText)

In [None]:
# Inspect the first ten rows of the training frame.
train.head(10)

**N-Gram Analysis**

In [None]:
# Join tweets with a space: joining with '' glued the last word of each tweet to
# the first word of the next one, creating bogus tokens and corrupting n-grams.
texts = ' '.join(train['text'])
# split() with no argument also discards empty tokens, which can appear when
# transformText returns an empty string for a tweet.
words = texts.split()

In [None]:
def draw_n_gram(words,i):
    """Plot the 15 most frequent n-grams of length ``i`` in ``words``.

    Parameters
    ----------
    words : sequence of str
        Token stream to build n-grams from.
    i : int
        n-gram length (1 = unigram, 2 = bigram, ...).

    Returns
    -------
    The object returned by ``sns.barplot`` (the bar chart of n-gram counts).
    Also prints the first rows of the count table.
    """
    top_counts = pd.Series(list(nltk.ngrams(words, i))).value_counts()[:15]
    # Build the frame with explicit column names instead of renaming whatever
    # reset_index() produces (those default names differ between pandas
    # versions), and turn each n-gram tuple into a readable "w1 w2 ..." label.
    n_gram_df = pd.DataFrame({
        "word": [" ".join(gram) for gram in top_counts.index],
        "count": top_counts.to_numpy(),
    })
    print(n_gram_df.head())
    plt.figure(figsize = (16,9))
    return sns.barplot(x='count',y='word', data=n_gram_df)


**Unigram Analysis**

In [None]:
# Most frequent single tokens.
draw_n_gram(words,1)

**Bi-gram Analysis**

In [None]:
# Most frequent pairs of consecutive tokens.
draw_n_gram(words,2)

**Tri-gram Analysis**

In [None]:
# Most frequent triples of consecutive tokens.
draw_n_gram(words,3)

In [None]:
# Features are the cleaned tweet text; labels are the target column.
X, y = train['text'], train['target']

**Splitting the labelled data into a training set and a held-out test set**

In [None]:
# Hold out 30% of the labelled data; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=1,
)

In [None]:
# Report the size of each part of the split, one shape per line.
for part in (X_train, X_test, y_train, y_test):
    print(part.shape)

# Feature Extraction with CountVectorizer and TfidfTransformer (Term Frequency-Inverse Document Frequency)

In [None]:
# Token-count extractor followed by TF-IDF re-weighting, both with default settings.
vectorizer, transformer = CountVectorizer(), TfidfTransformer()

**For train data**

In [None]:
# Fit the vectorizer and the TF-IDF transformer on the training split only.
X_train_vect = vectorizer.fit_transform(raw_documents=X_train)
X_train_trans = transformer.fit_transform(X=X_train_vect)

In [None]:
# Show only the first few rows: converting the whole sparse TF-IDF matrix to a
# dense array just to display it allocates rows x vocabulary floats for no benefit.
X_train_trans[:5].toarray()

**For test data**

In [None]:
# Reuse the already-fitted vectorizer and transformer on the held-out split (no refitting).
X_test_vect = vectorizer.transform(raw_documents=X_test)
X_test_trans = transformer.transform(X=X_test_vect)

In [None]:
# Show only the first few rows: converting the whole sparse TF-IDF matrix to a
# dense array just to display it allocates rows x vocabulary floats for no benefit.
X_test_trans[:5].toarray()

# Building Model 

In [None]:
# Candidate classifiers. The tree-based models are randomised, so they get a
# fixed seed (the same value used for the train/test split) to make the
# reported accuracies reproducible across runs.
lr=LogisticRegression()
dtc= DecisionTreeClassifier(random_state=1)
rfc= RandomForestClassifier(random_state=1)
svm= SVC()
knn= KNeighborsClassifier()
# Constructed but never fitted in this notebook (its fit call is commented out).
nb= GaussianNB()

In [None]:
# Train every candidate on the TF-IDF features of the training split.
for model in (lr, dtc, rfc, svm):
    model.fit(X_train_trans, y_train)
# knn is fitted last as a bare expression so the cell still displays its repr.
# nb is not fitted: its fit call was commented out in the original
# (NOTE(review): presumably because GaussianNB rejects sparse input - confirm).
knn.fit(X_train_trans, y_train)

**Making Predictions**

In [None]:
# Predict the held-out split with each fitted model, in the same order as before.
predict_lr, predict_dtc, predict_rfc, predict_svm, predict_knn = (
    clf.predict(X_test_trans) for clf in (lr, dtc, rfc, svm, knn)
)

**Checking model performance**

In [None]:
# accuracy_score expects (y_true, y_pred). The original passed them reversed,
# which is harmless for accuracy but silently wrong if the metric is ever
# swapped for an asymmetric one (precision, recall, confusion matrix).
acc_1 = accuracy_score(y_test, predict_lr)
print("Accuracy of LogisticRegression = " + str(acc_1))
acc_2 = accuracy_score(y_test, predict_dtc)
print("Accuracy of DecisionTreeClassifier = " + str(acc_2))
acc_3 = accuracy_score(y_test, predict_rfc)
print("Accuracy of RandomForestClassifier = " + str(acc_3))
acc_4 = accuracy_score(y_test, predict_svm)
print("Accuracy of SupportVectorClassifier = " + str(acc_4))
acc_5 = accuracy_score(y_test, predict_knn)
print("Accuracy of KNearestNeighbor = " + str(acc_5))

**Here, we see that SupportVectorClassifier gives the best accuracy score of 80.47%**

**Now, we predict for test data using SVC**

In [None]:
# Clean the competition test text with the same transformText used on the
# training text; the vectorizer was fitted on cleaned, stemmed tokens, so raw
# text would mostly miss its vocabulary.
# NOTE: this rebinds X_test, which previously held the validation split.
X_test = test['text'].apply(transformText)
X_test.head()

In [None]:
# Vectorise the competition test text with the already-fitted vectorizer and transformer.
X_test_vec = vectorizer.transform(raw_documents=X_test)
X_test_tran = transformer.transform(X=X_test_vec)

In [None]:
# Show only the first few rows: converting the whole sparse TF-IDF matrix to a
# dense array just to display it allocates rows x vocabulary floats for no benefit.
X_test_tran[:5].toarray()

In [None]:
# Predict a label for every competition test row with the fitted SVC, then display them.
predictions = svm.predict(X=X_test_tran)
predictions

In [None]:
# Start from an empty frame; its columns are filled in by the following cell.
submission = pd.DataFrame()

In [None]:
# Add the row identifiers and the predicted labels as the two submission columns.
submission = submission.assign(id=test['id'], target=predictions)

In [None]:
# Check how many rows were predicted for each class.
submission['target'].value_counts()

In [None]:
# index=False: without it pandas writes the row index as an extra unnamed first
# column, so the file would not be just the `id` and `target` columns.
submission.to_csv('solution.csv', index=False)