# Imports

In [None]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer

In [None]:
# Read the labelled training split and the unlabelled test split from the
# Kaggle input directory.
train_dataset, test_dataset = (
    pd.read_csv('../input/nlp-getting-started/train.csv'),
    pd.read_csv('../input/nlp-getting-started/test.csv'),
)

# Dataset

In [None]:
# Preview the first rows of the raw training frame.
train_dataset.head()

In [None]:
# (rows, columns) of the training frame.
train_dataset.shape

> `keyword` has only a few **NULL** values, whereas `location` has quite a lot of **NULL** values.

In [None]:
# Column dtypes and non-null counts, used to spot missing values.
train_dataset.info()

In [None]:
# Replace every missing value with an empty string, in place, so the text
# columns can be concatenated later without producing NaN.
train_dataset.fillna(value="", inplace=True)
test_dataset.fillna(value="", inplace=True)

In [None]:
# Preview the training frame again at this stage of the notebook.
train_dataset.head()

In [None]:
# Preview the first rows of the test frame.
test_dataset.head()

# Data Visualization

- **1 = tweet about a real disaster**
- **0 = tweet not about a real disaster**

In [None]:
# Number of training rows per target label.
train_dataset['target'].value_counts()

In [None]:
# Bar chart of how many training tweets carry each target label.
# The column is passed by keyword: recent seaborn releases treat the first
# positional argument as `data`, so a bare Series is no longer read as `x`
# (and the global warnings filter at the top of the notebook would hide the
# deprecation notice older releases emit).
plt.figure(figsize=(15,7))
sns.countplot(x='target', data=train_dataset);

In [None]:
# Frequency of each distinct value in the `location` column.
train_dataset['location'].value_counts()

In [None]:
# Frequency of each distinct value in the `keyword` column.
train_dataset['keyword'].value_counts()

# NLP Processing and Data Cleaning

In [None]:
# Merge keyword, location and tweet text into one space-separated field per
# row, for the training and the test data alike.
dataset = pd.DataFrame({
    'all_combined': train_dataset['keyword'] + " " + train_dataset['location'] + " " + train_dataset['text'],
})
test_dataset_cleaned = pd.DataFrame({
    'all_combined': test_dataset['keyword'] + " " + test_dataset['location'] + " " + test_dataset['text'],
})

print(dataset.shape)
test_dataset_cleaned.shape

In [None]:
def clean(data):
    """Normalise one text string to lowercase ASCII words separated by single spaces.

    Steps, in order: lowercase the text, drop URLs, turn every character
    that is not an ASCII letter into a space, collapse runs of spaces and
    strip leading/trailing whitespace.

    Parameters
    ----------
    data : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned text; an empty string when no letters remain.
    """
    data = data.lower()
    # Raw strings keep `\S` and `\.` as regex escapes instead of invalid
    # Python string escapes.
    data = re.sub(r'https?://\S+|www\.\S+', ' ', data)
    # One pass covers punctuation, digits, underscores, '#', '@', newlines
    # and non-ASCII characters.
    data = re.sub(r'[^a-zA-Z]', ' ', data)
    # Collapse last, so the spaces introduced above do not survive as runs.
    data = re.sub(r' +', ' ', data)
    return data.strip()

In [None]:
# Run every combined string through clean(); the cast to str guards against
# any non-string values in the column.
dataset['all_cleaned'] = dataset['all_combined'].astype(str).map(clean)
test_dataset_cleaned['all_cleaned'] = test_dataset_cleaned['all_combined'].astype(str).map(clean)

In [None]:
# Carry the labels over so the working frame holds both features and target.
dataset['target'] = train_dataset['target']

In [None]:
# Inspect the first 100 rows of the working training frame.
dataset.head(100)

In [None]:
# Inspect the last 100 rows of the working test frame.
test_dataset_cleaned.tail(100)

## Stopwords Processing

In [None]:
# Show NLTK's English stop-word list.
print(stopwords.words('english'))

In [None]:
# English stop words from NLTK, held in a set; remove_stopwords tests
# membership against it.
stop = set(stopwords.words('english'))
def remove_stopwords(data, stop_words=None):
    """Split ``data`` on whitespace, lowercase each token and drop stop words.

    Parameters
    ----------
    data : str
        Text to filter.
    stop_words : collection of str, optional
        Words to discard. When omitted, the module-level ``stop`` set is used.

    Returns
    -------
    list of str
        The lowercased tokens of ``data`` that are not stop words, in their
        original order.
    """
    if stop_words is None:
        stop_words = stop
    # Filter whole tokens: iterating over the string itself would compare
    # single characters against the stop-word list.
    lowered = (token.lower() for token in data.split())
    return [token for token in lowered if token not in stop_words]

In [None]:
# Series.apply returns a new Series, so the result must be assigned back for
# the step to take effect. remove_stopwords returns a list of tokens; they
# are re-joined so the column keeps holding plain strings for later cells
# that call .split() on it.
dataset['all_cleaned'] = dataset['all_cleaned'].apply(remove_stopwords).apply(' '.join)
test_dataset_cleaned['all_cleaned'] = test_dataset_cleaned['all_cleaned'].apply(remove_stopwords).apply(' '.join)

In [None]:
# Inspect the first 520 rows of the working training frame.
dataset.head(520)

In [None]:
# Inspect the first 520 rows of the working test frame.
test_dataset_cleaned.head(520)

## Lemmatization

Reducing a word to its root form (its lemma).

watches, watched --> watch (root form)

In [None]:
from nltk.stem import WordNetLemmatizer
# Single lemmatizer instance shared by the lemmatization() helper.
# NOTE(review): lemmatize() presumably needs the NLTK 'wordnet' corpus to be
# available locally — confirm it is installed in this environment.
lemmatizer = WordNetLemmatizer()

In [None]:
def lemmatization(data):
    """Lemmatize each whitespace-separated token of ``data``.

    Every token is passed through the shared ``lemmatizer`` and the results
    are joined back into one string with single spaces.
    """
    return " ".join(lemmatizer.lemmatize(token) for token in data.split())

# Series.apply returns a new Series; assign it back so the lemmatized text
# actually replaces the column contents.
dataset['all_cleaned'] = dataset['all_cleaned'].apply(lemmatization)
test_dataset_cleaned['all_cleaned'] = test_dataset_cleaned['all_cleaned'].apply(lemmatization)

In [None]:
# Inspect the first 520 rows of the working training frame.
dataset.head(520)

## Tokenization

In [None]:
def tokenize(string):
    """Return the whitespace-separated tokens of ``string`` as a list."""
    return string.split()
# Replace each cleaned string with its list of tokens.
dataset['all_cleaned'] = dataset['all_cleaned'].apply(tokenize)
test_dataset_cleaned['all_cleaned'] = test_dataset_cleaned['all_cleaned'].apply(tokenize)

In [None]:
# Inspect the first 520 rows of the working training frame.
dataset.head(520)

## Vectorization

In [None]:
# CountVectorizer is fed strings below, so each row's tokens are glued back
# together with single spaces.
dataset['all_cleaned'] = dataset['all_cleaned'].apply(lambda tokens: ' '.join(map(str, tokens)))
test_dataset_cleaned['all_cleaned'] = test_dataset_cleaned['all_cleaned'].apply(lambda tokens: ' '.join(map(str, tokens)))

In [None]:
# Inspect the first 520 rows of the working training frame.
dataset.head(520)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Fit the vectorizer on the training text only, then convert the resulting
# matrix to a dense array.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(dataset['all_cleaned']).toarray()
print(X.shape)

# The test text is transformed with the already-fitted vectorizer.
preparing_test_df = vectorizer.transform(test_dataset_cleaned['all_cleaned']).toarray()
print(preparing_test_df.shape)

# Model Creation and Evaluation

## Preparing training and test sets

In [None]:
from sklearn.linear_model import LogisticRegression

# np.asarray hands back an existing ndarray unchanged, whereas np.array
# would allocate a second full copy of each dense feature matrix.
X_train = np.asarray(X)
print(X_train.shape)
y_train = dataset['target']
print(y_train.shape)
X_test = np.asarray(preparing_test_df)
print(X_test.shape)

# Fit a logistic-regression classifier on the full training matrix.
clf = LogisticRegression(solver='liblinear')
clf.fit(X_train, y_train)

# Prediction

In [None]:
# Predict a label for every row of the test matrix.
prediction = clf.predict(X_test)

In [None]:
# Display the predicted labels.
prediction

# Preparing to submit

In [None]:
# Two-column submission frame: the test-set ids next to the predicted labels.
submission = pd.DataFrame({"id":test_dataset["id"],"target":prediction})

In [None]:
# Preview the first rows of the submission frame.
submission.head()

In [None]:
# (rows, columns) of the submission frame.
submission.shape

In [None]:
# Write the submission CSV to the working directory, omitting the DataFrame index.
submission.to_csv('./disaster_tweet_prediction_submission.csv', index = False)

> Took model inspiration from the kernel [here](https://www.kaggle.com/manasvardhan/a-beginner-s-guide-to-sentiment-analysis).