In [None]:
pip install beautifulsoup4

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import string
# Natural Language tool kit
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from bs4 import BeautifulSoup

In [None]:
# Both CSV files are read from the same input directory.
DATA_DIR = '../input/nlp-getting-started'
train_df = pd.read_csv(DATA_DIR + '/train.csv')
test_df = pd.read_csv(DATA_DIR + '/test.csv')

### Exploring the dataset

In [None]:
train_df.head()

In [None]:
test_df.head()

In [None]:
train_df.info()

In [None]:
train_df.isnull().sum()

In [None]:
sns.heatmap(train_df.isnull())

In [None]:
# Number of training rows; used as the denominator for the class percentages.
total = len(train_df)

We can see that most of the null values are in the `location` column and a few are in `keyword`.

In [None]:
# Fraction of missing values per column: null count divided by the row count.
train_df.isnull().sum().div(len(train_df))

We can see that about 33% of the `location` values are missing (a fraction of roughly 0.33). That is a large share and could cause large prediction errors; location data is also unlikely to be very useful here, so we drop the column.

In [None]:
# Reassign instead of mutating in place, and tolerate an already-missing column,
# so re-running this cell does not raise KeyError once 'location' is gone.
train_df = train_df.drop(columns=['location'], errors='ignore')

In [None]:
# Share of each target class: class counts divided by the number of rows.
p = train_df['target'].value_counts().div(len(train_df))

In [None]:

# Horizontal count plot of the target classes, each bar labelled with its share.
ax = sns.countplot(y='target', data=train_df)

# Row count is computed here so the cell does not depend on another cell's variable.
n_rows = len(train_df)

# The loop variable is named `patch` so it does not overwrite the `p` Series
# computed in the previous cell.
for patch in ax.patches:
    percent = '{:.1f}%'.format(100 * (patch.get_width() / n_rows))
    # Place the label at the end of the bar, vertically centred on it.
    x = patch.get_x() + patch.get_width()
    y = patch.get_y() + patch.get_height() / 2
    ax.annotate(percent, (x, y))


we can see that data is balanced

In [None]:
train_df[train_df['target'] == 0]

In [None]:
# Calling dropna(inplace=True) on `train_df['keyword']` only alters a temporary
# Series and leaves the DataFrame untouched. Fill the gaps explicitly instead;
# no rows are removed, so the text/target columns stay aligned.
train_df['keyword'] = train_df['keyword'].fillna('')

We can use the `text` column to train the model: the words in a tweet can indicate whether it describes a disaster or not, so we use a count vectorizer to count the words in each tweet and turn them into numeric features.

### Next Step is Preprocessing of data in NLP.

Preprocessing can vary from one NLP use case to another. As we can see above, the tweets contain special characters, e-mail addresses, Twitter user names and other complex strings that have to be cleaned. We first strip this noise from a tweet such as:

" @shawn Titanic tragedy could have been prevented Economic Times: Telegraph.co.ukTitanic tragedy could have been preve... http://bet.ly/tuN2wx"

From it we want to obtain a string like "Titanic tragedy could have been prevented Economic Times Telegraph co ukTitanic tragedy could have been preve"

In [None]:
import re


In [None]:
## Removing URLs, @mentions and special characters
# Compiled once at definition time. Raw strings keep regex escapes such as \w
# and \S from being parsed as (invalid) string escapes.
_NOISE_PATTERN = re.compile(
    r"(@[A-Za-z0-9_]+)"      # @mentions; handles may contain underscores
    r"|([^0-9A-Za-z \t])"    # anything that is not an ASCII letter, digit, space or tab
    r"|(\w+:\/\/\S+)"        # URLs such as http://t.co/abc
)


def remove_url(text):
    """Strip @mentions, URLs and non-alphanumeric characters from ``text``.

    Every match is replaced by a space; the result is then split on whitespace
    and re-joined, so words are separated by single spaces and there is no
    leading or trailing whitespace.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned text (an empty string if nothing is left).
    """
    return " ".join(_NOISE_PATTERN.sub(" ", text).split())

Since the model cannot make use of special characters, user names and e-mail addresses, we use a regular expression to remove them.

'(@[A-Za-z0-9]+)' matches a string that starts with @ followed by uppercase letters, lowercase letters or digits. The '+' means one or more occurrences of [A-Za-z0-9], so it matches, for example, @bbcmtd.

'([^0-9A-Za-z \t])' matches any single character that is not a digit, an uppercase or lowercase letter, a space or a tab ('\t'). It therefore matches all the special characters such as '.', ',', ':' and '-'.

'(\w+:\/\/\S+)' matches URLs. '\w' is a word character (letter, digit or underscore) and '+' means one or more of them; this is followed by ':' and two slashes, each written as '\/' because the backslash escapes the '/'. '\S' is a non-whitespace character and '+' again means one or more of them, so the pattern matches, for example, 'http://t.co/lHYXEOHY6C'.

To keep the remaining words separated, every match of the expressions above is replaced with a space " " using re.sub.

Splitting then gives the words as a list, and " ".join turns them back into a string separated by single spaces.


In [None]:
## Making text to Lower case
def lower_case(text):
    """Return a lower-cased copy of ``text``."""
    lowered = text.lower()
    return lowered

def butiful(text):
    """Return the text content of ``text`` with HTML markup stripped.

    The parser is named explicitly so the result does not depend on which
    optional parser libraries happen to be installed, and so BeautifulSoup
    does not warn about having to guess a parser.

    Parameters
    ----------
    text : str
        Text that may contain HTML tags or entities.

    Returns
    -------
    str
        The text as returned by ``BeautifulSoup.get_text()``.
    """
    return BeautifulSoup(text, "html.parser").get_text()

## Removing numbers
def remove_num(text):
    """Delete every run of digits from ``text`` and return the result."""
    digit_run = re.compile(r'\d+')
    return digit_run.sub("", text)

## removing punctuation
def remove_punc(text):
    """Return ``text`` with every character in ``string.punctuation`` removed."""
    # Mapping a code point to None tells str.translate to delete that character.
    deletions = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(deletions)


### Preprocessing NLP methods

The most common preprocessing methods are:

> Tokenize (tokenizing means splitting a string into words)

> Lemmatize (lemmatizing reduces a word to its base form, e.g. if we have the words learn and learning, learning can be lemmatized to learn.)

> Removing stop words (stop words are common connecting words used to join the text, like is, was, and, or, etc.)

In [None]:
#Tokenizing
def tokenize(text):
    """Split ``text`` into word tokens with NLTK's ``word_tokenize``."""
    return word_tokenize(text)

# Removing Stop Words
# Filled on first use rather than at definition time, so defining the function
# does not require the NLTK 'stopwords' corpus to be present yet.
_STOP_WORDS = None


def remove_stop(text):
    """Return the tokens of ``text`` that are not English stop words.

    The stop-word list is loaded once and cached as a frozenset, so each
    membership test is constant-time instead of rebuilding and scanning the
    whole word list for every token.

    Parameters
    ----------
    text : iterable of str
        Tokens to filter.

    Returns
    -------
    list of str
        The tokens that are not in NLTK's English stop-word list, in order.
    """
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return [token for token in text if token not in _STOP_WORDS]

#Lemmatize

word_lem = WordNetLemmatizer()
# Lemmatizing maps different forms of a word onto one base form, so that
# variants of the same word end up as a single token.
def Lemmatize(text):
    """Lemmatize every token in ``text`` and return the results as a list.

    Each token is passed to ``word_lem.lemmatize`` on its own, without a
    part-of-speech argument, so the lemmatizer's default applies.
    """
    return list(map(word_lem.lemmatize, text))

# Preprocessing

def preprocess(text):
    """Clean one raw text and return it as a space-joined string of tokens.

    Steps, in order: strip HTML, remove mentions/URLs/special characters,
    lower-case, remove digits, remove punctuation, tokenize, drop stop words,
    lemmatize, and join the tokens back with single spaces.

    Parameters
    ----------
    text : str
        Raw text of a single document.

    Returns
    -------
    str
        The cleaned, lemmatized tokens joined by spaces.
    """
    # HTML must be decoded first: remove_url deletes '&', ';', '<' and '>', so
    # running it earlier leaves nothing for the HTML parser to recognise and
    # entity names (e.g. "amp" from "&amp;") survive as bogus words.
    text = butiful(text)
    text = remove_url(text)
    text = lower_case(text)
    text = remove_num(text)
    text = remove_punc(text)
    tokens = tokenize(text)
    tokens = remove_stop(tokens)
    tokens = Lemmatize(tokens)
    return " ".join(tokens)

### Train Preprocessing

In [None]:
# The tokenizer, stop-word list and lemmatizer used by preprocess() need these
# NLTK resources. Fetch them before first use so the notebook runs from top to
# bottom on a fresh kernel.
for resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource, quiet=True)

# Clean every training tweet and store the result alongside the raw text.
train_preprocess = [preprocess(tweet) for tweet in train_df['text']]
train_df['processed_text'] = train_preprocess

The NLTK resources below are downloaded because NLTK reported an error when they were missing.

In [None]:
nltk.download('punkt')

In [None]:
nltk.download('stopwords')

In [None]:
nltk.download('wordnet')

### Test Data Preprocessing

In [None]:
# DataFrame.info() prints its report itself and returns None, so it is called
# directly instead of having its return value printed.
print(test_df.shape)
print(test_df.isnull().sum())
test_df.info()

In [None]:
# Reassign instead of mutating in place, and tolerate an already-missing column,
# so re-running this cell does not raise KeyError once 'location' is gone.
test_df = test_df.drop(columns=['location'], errors='ignore')

In [None]:
# Calling dropna(inplace=True) on `test_df['keyword']` only alters a temporary
# Series and leaves the DataFrame untouched. Fill the gaps explicitly instead;
# every test row is kept, so the predictions still line up with the submission file.
test_df['keyword'] = test_df['keyword'].fillna('')

In [None]:
# Clean every test tweet and store the result alongside the raw text.
test_preprocess = [preprocess(tweet) for tweet in test_df['text']]
test_df['processed_text'] = test_preprocess

In [None]:
train_df.head()

In [None]:
test_df.head()

Now we can vectorize the text data 

###### first we are using count vectorizer

In [None]:
from sklearn import feature_extraction,linear_model,preprocessing,model_selection
# Learn the vocabulary from the training text only, then encode both splits with it.
count_vect = feature_extraction.text.CountVectorizer()
count_vect.fit(train_df['processed_text'])
ex_train_vec = count_vect.transform(train_df['processed_text'])
ex_test_vec = count_vect.transform(test_df['processed_text'])

In [None]:
# A sparse matrix exposes .shape directly; converting it to dense first would
# allocate the full rows-by-vocabulary array just to read two numbers.
ex_train_vec.shape

In [None]:
# A sparse matrix exposes .shape directly; converting it to dense first would
# allocate the full rows-by-vocabulary array just to read two numbers.
ex_test_vec.shape

We can see there are 12840 unique tokens in the vocabulary (one column per token).

In [None]:
model = linear_model.RidgeClassifier()

In [None]:
score = model_selection.cross_val_score(model,ex_train_vec,train_df['target'],cv =3)

In [None]:
score

##### Let's solve it using tf-idf vectorisation

https://www.etutorialspoint.com/index.php/386-tf-idf-tfidfvectorizer-tutorial-with-examples

According to scikit-learn’s website, TfidfVectorizer is actually CountVectorizer followed by TfidfTransformer. CountVectorizer first takes our text documents and tokenizes them, as we did before (but then un-did because this function does not accept tokenized data as input). Once the data have been tokenized, CountVectorizer assembles a bag of words consisting of every unique token and assigning each a number. Finally, CountVectorizer represents the tokenized text data as a matrix of token counts, which looks like this:

This image shows the first six rows of the CountVectorizer matrix. These rows tell us that in document 0, the words 368, 3403, 4146, 5201, 8491, and 11223 all appear once. We are interested in these counts because if a word appears many times in a document, that word is probably very significant.
TfidfTransformer simply transforms this matrix of token counts to a term frequency-inverse document frequency (tf-idf) representation. Using tf-idf is important because simply using token counts can be misleading. Previously, we assumed that if a word appeared many times in a document, it was important. What if that word is extremely common in the entire corpus? Then its high frequency in our current document would be less significant, because the word appears so frequently elsewhere.
Tf-idf strikes a balance by taking the term frequency (basically the count) and multiplying it by the inverse document frequency (1/document frequency). This means that if word 1 appears once in document A but also once in the total corpus, while word 2 appears four times in document A but 16 in the total corpus, word 1 will have a tf-idf score of 1.0 while word 2 will only receive a score of 0.25. Word 2’s importance in document A is diluted by its high frequency in the corpus. (This is a simplified explanation of the actual tf-idf equation, which is more complicated.)
Hence, we arrive at this representation of document 0:

Notice that in the CountVectorizer representation, all the tokens in document 1 appeared only once. Now, in the tf-idf representation, some tokens have higher scores than others. Tf-idf has added a layer of nuance to our data.

Let's combine test_df['processed_text'] and train_df['processed_text'] into a single corpus.

By combining them, the vectorizer sees every word that occurs in either dataset, so word occurrences are counted over the whole corpus.

In [None]:
# Processed training texts followed by processed test texts, as one flat list.
train_list = train_df['processed_text'].tolist()
test_list = test_df['processed_text'].tolist()
corpus = [*train_list, *test_list]

In [None]:
corpus[1:5]

In [None]:
"""from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
fit_vec = tfidf.fit(corpus)
train_trasform = tfidf.transform(train_df['processed_text'])
y = train_df['target']
test_transform = tfidf.transform(test_df['processed_text'])"""

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
# Fit the vocabulary on the combined train+test corpus, then encode each split.
count = CountVectorizer()
fit_count = count.fit(corpus)
# The count matrices are kept sparse: almost every entry is zero, and the
# estimator and cross-validation used below accept sparse input, so a dense
# rows-by-vocabulary array would only waste memory.
train_transform = count.transform(train_df['processed_text'])
y = train_df['target']
test_transform = count.transform(test_df['processed_text'])

##### Logistic Regression

In [None]:
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
# 3-fold cross-validation of a default logistic regression on the count features.
reg = LogisticRegression()
cross_val = model_selection.cross_val_score(estimator=reg, X=train_transform, y=y, cv=3)

In [None]:
cross_val

In [None]:
# Train on the full training set, then label the test set; fit() returns the
# fitted estimator, so the two calls can be chained.
predict = reg.fit(train_transform, y).predict(test_transform)

In [None]:
sub = pd.read_csv('../input/nlp-getting-started/sample_submission.csv')

In [None]:
sub['target'] = predict

In [None]:
sub.to_csv('submission.csv',index=False)