In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# import required libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import seaborn as sns
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import xgboost as xgb
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn import model_selection

In [None]:
# load dataset in notebook by giving path
# Paths to the three competition CSV files, assigned together in one statement.
train_path, test_path, submission_path = (
    '../input/nlp-getting-started/train.csv',
    '../input/nlp-getting-started/test.csv',
    '../input/nlp-getting-started/sample_submission.csv',
)

In [None]:
# read dataset
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)
submission_sample = pd.read_csv(submission_path)

In [None]:
# first 5 lines of train dataset
train_df.head(5)

In [None]:
# first 5 lines of test dataset
test_df.head(5)

In [None]:
# first 5 lines of submission sample dataset
submission_sample.head(5)

# Exploratory Data Analysis

In [None]:
# shape of dataset
# Unpack (rows, columns) once per frame, then report them.
train_rows, train_cols = train_df.shape
test_rows, test_cols = test_df.shape
print("Total number of rows in train dataset are ", train_rows, 'and total number of columns in train dataset are', train_cols)
print("Total number of rows in test dataset are ", test_rows, 'and total number of columns in test dataset are', test_cols)

In [None]:
# basic info of train dataset
train_df.info()

In [None]:
#some basic info of test data
test_df.info()

In [None]:
#null values in train dataset
train_df.isnull().sum()

In [None]:
#null values in test dataset
test_df.isnull().sum()

In [None]:
# Bar chart of the missing-value count for each column of the training set.
train_null_counts = train_df.isna().sum()
train_null_counts.plot(kind="bar")
plt.title("no of null values in train data")
plt.show()

In [None]:
# Bar chart of the missing-value count for each column of the test set.
test_null_counts = test_df.isna().sum()
test_null_counts.plot(kind="bar")
plt.title("no of null values in test data")
plt.show()

**We do not need the location and keyword columns, so we are going to drop them.**

In [None]:
# drop location and keyword column
# Both frames lose the same two columns, so name the list once.
# NOTE(review): this cell rebinds train_df/test_df, so running it a second time
# on the already-reduced frames raises KeyError (the columns are gone).
unused_columns = ['location', 'keyword']
train_df = train_df.drop(columns=unused_columns)
test_df = test_df.drop(columns=unused_columns)

In [None]:
# train dataset after dropping location and keyword columns
train_df.head()

In [None]:
# test dataset after dropping location and keyword columns
test_df.head()

In [None]:
# finding percentage of 0 and 1 target
# Boolean mask of rows labelled 1; summing it counts those rows.
is_disaster = train_df["target"] == 1
real_tweets = int(is_disaster.sum())
real_tweets_percentage = real_tweets / train_df.shape[0] * 100
# Everything that is not labelled 1 makes up the remainder.
fake_tweets_percentage = 100 - real_tweets_percentage

#print
print("Real tweets percentage: ", real_tweets_percentage)
print("Fake tweets percentage: ", fake_tweets_percentage)

In [None]:
# plot of target values
sns.countplot(x='target',data=train_df)

In [None]:
# Character length of every tweet, computed the same way for both frames.
length_train, length_test = (frame['text'].str.len() for frame in (train_df, test_df))

# Draw both histograms on the current axes, train first and test on top.
for lengths, legend_label in ((length_train, "train_tweets"), (length_test, "test_tweets")):
    plt.hist(lengths, label=legend_label)
plt.legend()
plt.show()

In [None]:
# disaster tweets
# Text of every training row whose target is 1.
disaster_tweets = train_df[train_df['target'] == 1]['text']

# The filtered Series keeps train_df's original index labels, so disaster_tweets[i]
# is a label lookup that raises KeyError as soon as row i is not a disaster tweet.
# Slice by position instead; positions 1..9 mirror the original range(1, 10).
for tweet in disaster_tweets.iloc[1:10]:
    print(tweet)

In [None]:
# non-disaster tweets
non_disaster_tweets = train_df[train_df['target'] !=1 ]['text']

In [None]:
# word cloud of disaster and non-disaster tweets
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=[20, 5])


def _draw_cloud(axis, tweets, heading):
    """Render a word cloud built from `tweets` on `axis`, titled `heading`.

    The tweets are joined with single spaces into one string before being
    passed to WordCloud.generate. Returns the generated WordCloud object.
    """
    cloud = WordCloud(background_color='white', width=600, height=400)
    cloud = cloud.generate(" ".join(tweets))
    axis.imshow(cloud)
    axis.axis('off')
    axis.set_title(heading, fontsize=40)
    return cloud


# Left panel: disaster tweets; right panel: everything else.
wordcloud1 = _draw_cloud(ax1, disaster_tweets, 'Disaster Tweets')
wordcloud2 = _draw_cloud(ax2, non_disaster_tweets, 'Non Disaster Tweets')

# Data Cleaning

In [None]:
# cleaning the text

# Patterns are compiled once. Raw strings are used because '\[', '\S', '\w' and
# '\d' are invalid escape sequences in plain string literals and trigger warnings.
_SQUARE_BRACKETS_RE = re.compile(r'\[.*?\]')
_URL_RE = re.compile(r'https?://\S+|www\.\S+')
_HTML_TAG_RE = re.compile(r'<.*?>+')
_PUNCTUATION_RE = re.compile('[%s]' % re.escape(string.punctuation))
_WORD_WITH_DIGIT_RE = re.compile(r'\w*\d\w*')


def clean_text(text):
    '''Normalise one tweet for bag-of-words modelling.

    Steps, applied in this order:
      1. lowercase the text;
      2. delete anything enclosed in square brackets;
      3. delete links (http://, https:// or www. up to the next whitespace);
      4. delete angle-bracket tags such as <b>;
      5. delete every character in string.punctuation;
      6. delete newline characters;
      7. delete every word that contains a digit.

    Matches are deleted, not replaced by a space, so surrounding whitespace is
    left as-is and words separated only by punctuation or a newline are merged.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned text.
    '''
    text = text.lower()
    text = _SQUARE_BRACKETS_RE.sub('', text)
    # Links must go before punctuation is stripped, otherwise '://' and '.' vanish
    # and the URL pattern can no longer match.
    text = _URL_RE.sub('', text)
    text = _HTML_TAG_RE.sub('', text)
    text = _PUNCTUATION_RE.sub('', text)
    text = text.replace('\n', '')
    text = _WORD_WITH_DIGIT_RE.sub('', text)
    return text

# Clean the text column of both frames with the same function.
for frame in (train_df, test_df):
    frame['text'] = frame['text'].apply(clean_text)

# Preview the cleaned training text.
train_df['text'].head()

In [None]:
# Split each cleaned tweet into a list of tokens matching \w+.
tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
for frame in (train_df, test_df):
    frame['text'] = frame['text'].apply(tokenizer.tokenize)
train_df['text'].head()

# Stopwords

Stopwords are English words that do not add much meaning to a sentence. They are very commonly used words that we do not need,
so we can remove them.

In [None]:
# stopwords
stopwords.words('english')

In [None]:
len(stopwords.words('english'))

In [None]:
# removing stopwords
# Build the stopword lookup once. The original re-evaluated
# stopwords.words('english') for every single token and then scanned the
# resulting list; a frozenset gives constant-time membership tests.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


def remove_stopwords(text):
    """Return the tokens of `text` that are not English stopwords.

    Parameters
    ----------
    text : iterable of str
        Tokens of one tweet (the column holds token lists at this point).

    Returns
    -------
    list of str
        The remaining tokens, in their original order.
    """
    return [w for w in text if w not in _ENGLISH_STOPWORDS]


train_df['text'] = train_df['text'].apply(remove_stopwords)
test_df['text'] = test_df['text'].apply(remove_stopwords)
test_df.head()

# Lemmatization 

Lemmatization is the process of grouping  together the different inflected forms of a word so they can be analyzed as a single item. 

Examples of lemmatization:

1. **playing**, **plays** and **played**: all three words are converted to **play** after lemmatization.

2. **change**, **changing**, **changes**, **changed** and **changer**: all of these words are converted to **change** after lemmatization.

In [None]:
# lemmatization
lem = WordNetLemmatizer()


def lem_word(x):
    """Return a new list with lem.lemmatize applied to every token of `x`, in order.

    NOTE(review): lemmatize is called without a part-of-speech argument, so the
    lemmatizer's default is used - presumably noun; confirm against the NLTK docs.
    """
    return list(map(lem.lemmatize, x))

In [None]:
train_df['text'] = train_df['text'].apply(lem_word)
test_df['text'] = test_df['text'].apply(lem_word)

In [None]:
train_df['text'][:10]

In [None]:
def combine_text(list_of_text):
    """Join the strings in `list_of_text` into one string, separated by single spaces."""
    separator = ' '
    return separator.join(list_of_text)

# Turn each token list back into a single space-separated string, which is the
# input the vectorizers below are fitted on.
train_df['text'] = train_df['text'].apply(combine_text)
test_df['text'] = test_df['text'].apply(combine_text)

# Only the last expression of a cell is displayed, so the bare `train_df['text']`
# that used to precede this line had no effect and was removed.
train_df.head()

# Count-Vector

CountVectorizer transforms a given text into a vector based on the frequency (count) of each word that occurs in the entire text. It counts the number of occurrences of each word in a document (text).

In [None]:
count_vectorizer = CountVectorizer()

# The vectorizer is fitted on the training text only; the test text is
# transformed with that same fitted vectorizer.
train_vector = count_vectorizer.fit_transform(train_df['text'])
test_vector = count_vectorizer.transform(test_df['text'])

# Show the first training row as a dense matrix.
first_row_dense = train_vector[0].todense()
print(first_row_dense)

# TF-IDF

TF-IDF stands for Term Frequency-Inverse Document Frequency. It is a technique to quantify a word in documents: we compute a weight for each word that signifies the importance of the word in the document and the corpus.

In [None]:
# Unigrams and bigrams, with document-frequency bounds min_df=2 and max_df=0.5.
tfidf = TfidfVectorizer(
    min_df=2,
    max_df=0.5,
    ngram_range=(1, 2),
)

# Fitted on the training text only; the test text reuses the fitted vectorizer.
train_tfidf = tfidf.fit_transform(train_df['text'])
test_tfidf = tfidf.transform(test_df['text'])

# XGB Classifier

In [None]:
# Gradient-boosted classifier evaluated in the cross-validation cells below.
xgb_param = xgb.XGBClassifier(
    max_depth=5,
    n_estimators=500,
    colsample_bytree=0.8,
    nthread=10,
    learning_rate=0.05,
)

In [None]:
# 5-fold cross-validated F1 of the XGBoost model on the count-vector features.
scores_vector = model_selection.cross_val_score(
    xgb_param,
    train_vector,
    train_df['target'],
    cv=5,
    scoring='f1',
)
scores_vector

In [None]:
# 5-fold cross-validated F1 of the XGBoost model on the TF-IDF features.
scores_tfidf = model_selection.cross_val_score(
    xgb_param,
    train_tfidf,
    train_df['target'],
    cv=5,
    scoring='f1',
)
scores_tfidf

In [None]:
xgb_param.get_params()

# MultiNomial Naive Bayes

In [None]:
mnb = MultinomialNB(alpha=2.0)

# Both feature sets are scored with the same 10-fold F1 settings.
mnb_cv_options = dict(cv=10, scoring='f1')

scores_vector = model_selection.cross_val_score(mnb, train_vector, train_df['target'], **mnb_cv_options)
print("score:", scores_vector)

scores_tfidf = model_selection.cross_val_score(mnb, train_tfidf, train_df['target'], **mnb_cv_options)
print("score of tfidf:", scores_tfidf)

In [None]:
mnb.get_params()

# Logistic Regression

In [None]:
lg = LogisticRegression(C=1.0)

# Both feature sets are scored with the same 5-fold F1 settings.
lg_cv_options = dict(cv=5, scoring="f1")

scores_vector = model_selection.cross_val_score(lg, train_vector, train_df["target"], **lg_cv_options)
print("score:", scores_vector)

scores_tfidf = model_selection.cross_val_score(lg, train_tfidf, train_df["target"], **lg_cv_options)
print("score of tfidf:", scores_tfidf)

In [None]:
lg.get_params()

# Prediction

In [None]:
# Fit the Naive Bayes model on all TF-IDF training rows, then label the test rows.
# fit() returns the estimator itself, so the two calls can be chained.
y_pred = mnb.fit(train_tfidf, train_df["target"]).predict(test_tfidf)

In [None]:
y_pred

**Now we will prepare submission file**

# Submission

In [None]:
# The identifier column is named 'id' in test_df; keep the same lower-case
# header in the submission instead of 'Id'.
# NOTE(review): confirm the expected header against submission_sample.head().
submission_df2 = pd.DataFrame({'id': test_df['id'], 'target': y_pred})

In [None]:
submission_df2.to_csv('submission_df2.csv',index=False)

In [None]:
submission_df2 = pd.read_csv('submission_df2.csv')

In [None]:
submission_df2.head()