# Description of Dataset
The dataset consists of tweets that are either relevant or irrelevant to a weather disaster.

# Summary
We aim to distinguish tweets that are relevant to a weather disaster from those that are not. Later, we will be able to use those relevant tweets to predict a weather disaster, for example as a weather forecasting agency would. 
In this notebook we first conduct a comprehensive exploratory data analysis (EDA) of the dataset.

In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import euclidean_distances
import matplotlib.pyplot as plt

# Reading Data

In [None]:
# Load the training CSV into a DataFrame
df_tweet_train = pd.read_csv('../input/nlp-getting-started/train.csv')

In [None]:
# Load the test CSV into a DataFrame
df_tweet_test = pd.read_csv('../input/nlp-getting-started/test.csv')

# EDA

In [None]:
# Preview the first rows of the training DataFrame
df_tweet_train.head()

In [None]:
# (rows, columns) of the training DataFrame
df_tweet_train.shape

In [None]:
# (rows, columns) of the test DataFrame
df_tweet_test.shape

In [None]:
# Count how many training tweets fall under each target value (relevant vs. irrelevant)
df_tweet_train.groupby('target')[['text']].count()

As we can see above, it is an imbalanced dataset: the number of irrelevant tweets is considerably higher than the number of relevant ones.

In [None]:
# Store each tweet's length (number of characters) in a new 'tweet_len' column, then preview the frame
df_tweet_train['tweet_len'] = df_tweet_train['text'].map(len)
df_tweet_train.head()

In [None]:
# Overlay the tweet-length histograms of the two classes (target 0 in green, target 1 in red)
plt.figure(figsize=(10, 6))

df_tweet_train.loc[df_tweet_train['target'] == 0, 'tweet_len'].plot.hist(
    bins=40, color='green', label='irrelevant', alpha=0.6)
df_tweet_train.loc[df_tweet_train['target'] == 1, 'tweet_len'].plot.hist(
    bins=40, color='red', label='relevant', alpha=0.6)
plt.legend()
plt.xlabel("text Length")

As we can notice in the above plot relevant tweets are usually longer than irrelevant tweets. 

#  Text Pre-processing

In [None]:
#The following function will remove all stopwords (defined in the list of english stopwords in nltk) and punctuations from the text 
import string

from nltk.corpus import stopwords

# Translation table that deletes every character of string.punctuation in one pass.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)
# Holds the stopword set once loaded, so the NLTK corpus is read a single time
# instead of once per tweet.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stopwords as a frozenset, loading them on first use only."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def text_process(text):
    """
    Clean one piece of text and return it as a single string.

    Takes in a string of text, then performs the following:
    1. Remove all punctuation characters (those in string.punctuation).
    2. Split the result on whitespace and drop every word whose lowercase
       form is an English stopword.
    3. Join the remaining words, with their original casing, using single
       spaces.

    text: the string to clean.
    Returns the cleaned text as one string (not a list).
    """
    # NOTE(review): punctuation (including the apostrophe) is stripped before the
    # stopword check, so a contraction such as "don't" is compared as "dont";
    # confirm whether the stopword list covers those forms.
    nopunc = text.translate(_PUNCT_TABLE)

    # A set gives constant-time membership tests for the stopword filter.
    stop_words = _english_stopwords()
    return ' '.join(word for word in nopunc.split() if word.lower() not in stop_words)

In [None]:
# Run text_process over the "text" column of both frames and keep the result in a new 'clean_txt' column
df_tweet_train['clean_txt'] = df_tweet_train['text'].map(text_process)
df_tweet_test['clean_txt'] = df_tweet_test['text'].map(text_process)
df_tweet_train.head()

# Vectorization

In [None]:
# Extract the cleaned text (inputs) and the target labels as arrays for the CountVectorizer step.
# No y_test is built here.
X_train = df_tweet_train['clean_txt'].to_numpy()
y_train = df_tweet_train['target'].to_numpy()
X_test = df_tweet_test['clean_txt'].to_numpy()
# Print the shape of each array, one per line
print(X_train.shape, y_train.shape, X_test.shape, sep='\n')

Now we need to convert text documents to a matrix of token counts


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Create the count vectorizer and fit it on the training texts only
vect = CountVectorizer()
vect.fit(X_train)

In [None]:
# Inspect the fitted vocabulary (also called the dictionary) of the vectorizer
vect.vocabulary_

In [None]:
# turn the training texts into a document-term matrix using the vocabulary already fitted on vect
X_train_dtm = vect.transform(X_train)
# examine the document-term matrix
X_train_dtm

This training document-term matrix contains 7613 training tweets (rows) and 22,310 vocabulary terms (columns). Its data type is integer: each entry is the number of times a term occurs in a tweet (0, 1, or 2 if the term appears twice, and so on).


In [None]:
# transform the test texts into a document-term matrix using the vocabulary fitted on the training data
X_test_dtm = vect.transform(X_test)
X_test_dtm

This test document-term matrix contains 3263 test tweets (rows) and the same 22,310 fitted vocabulary terms (columns). 

# Modeling

In [None]:
# Import the Multinomial Naive Bayes classifier and create an instance with default settings
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()

In [None]:
from sklearn import model_selection

In [None]:
# Accuracy of the Naive Bayes model on each fold of a 3-fold cross validation over the training data
accuracy = model_selection.cross_val_score(
    estimator=nb,
    X=X_train_dtm,
    y=y_train,
    scoring="accuracy",
    cv=3,
)
accuracy

In [None]:
# Mean of the per-fold cross-validation accuracies of the nb model
accuracy.mean()

In [None]:
from sklearn.metrics import confusion_matrix
# Out-of-fold class predictions for every training tweet (3-fold cross validation)
y_pred = model_selection.cross_val_predict(
    estimator=nb,
    X=X_train_dtm,
    y=df_tweet_train["target"],
    cv=3,
)
# Confusion matrix of the true labels against the cross-validated predictions
conf_mat = confusion_matrix(y_true=y_train, y_pred=y_pred)
conf_mat

In [None]:
# Fit the Naive Bayes classifier on the full training document-term matrix and its labels
nb.fit(X_train_dtm, y_train)

In [None]:
# make class predictions for the test document-term matrix X_test_dtm
y_pred_class = nb.predict(X_test_dtm)
# show the first 10 predictions
y_pred_class[:10]

In [None]:
# calculate the cross-validated ROC AUC on the training data.
# AUC ranks examples by a continuous score, so it is computed from the out-of-fold
# probability of class 1 rather than from hard 0/1 predictions, which would reduce
# the ROC curve to a single threshold.
from sklearn.metrics import roc_auc_score
y_proba = model_selection.cross_val_predict(
    nb, X_train_dtm, y_train, cv=3, method="predict_proba"
)[:, 1]
roc_auc_score(y_train, y_proba)

In [None]:
# Load the sample submission file to use as the template for our predictions
sample_submission = pd.read_csv("../input/nlp-getting-started/sample_submission.csv")

In [None]:
# Overwrite the template's "target" column with the model's predictions for the test tweets
sample_submission["target"] = nb.predict(X_test_dtm)

In [None]:
# Spot-check the submission by displaying every fifth row
sample_submission.iloc[::5]

In [None]:
# Write the submission to submission.csv without the DataFrame index
sample_submission.to_csv("submission.csv", index=False)