<a href="https://colab.research.google.com/github/stgran/Coursework/blob/master/Practical%20Data%20Science/Preprocessing_Text_Data_in_Python.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Real or Not? NLP with Disaster Tweets

### Columns
- ```id```  - a unique identifier for each tweet 
- ```text``` - the text of the tweet 
- ```location``` - the location the tweet was sent from (may be blank)
- ```keyword``` - a particular keyword from the tweet (may be blank)
- ```target``` - in train.csv only, this denotes whether a tweet is about a real - 
- ```disaster``` (1) or not (0)


# Package Import

In [125]:
## importing libraries
import pandas as pd # our main data management package
import matplotlib.pyplot as plt # our main display package
import string # used for preprocessing
import re # used for preprocessing
import numpy as np # used for managing NaNs
from nltk.tokenize import word_tokenize # our tokenizer
from nltk.corpus import stopwords # used for preprocessing
from nltk.stem import WordNetLemmatizer # used for preprocessing
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression # our model
from sklearn.model_selection import train_test_split
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Data Import

In [0]:
data_urls = ['https://raw.githubusercontent.com/minerva-spring-pds/KaggleDisasterTweetsChallenge/master/Data/%s.csv'%ds for ds in ['train', 'test', 'sample_submission']]

train = pd.read_csv(data_urls[0])
test = pd.read_csv(data_urls[1])
sample_submission = pd.read_csv(data_urls[2])

# Data Exploration

In [80]:
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [81]:
train.count()

id          7613
keyword     7552
location    5080
text        7613
target      7613
dtype: int64

In [0]:
train = train.drop(['keyword', 'location'], axis = 1)
test = test.drop(['keyword', 'location'], axis = 1)

## Overview
The columns containing text are 'keyword', 'location', and 'text'. 'keyword' contains, ideally, the keyword in the text that made the tweet noteworthy in the context of disasters. It has 7552 of 7613 rows. 'location' refers to the location of the tweet and has only 5080 of 7613 rows. 'text' is the text of the tweet and has all 7613 rows.

# Preprocessing part 1
### Removing URLs/handles, making text lowercase, removing numbers, removing punctuation, tokenizing, removing stopwords, and lemmatizing

In [0]:
# remove urls, handles, and the hashtag from hashtags (taken from https://stackoverflow.com/questions/8376691/how-to-remove-hashtag-user-link-of-a-tweet-using-regular-expression)
def remove_urls(text):
  new_text = ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)"," ",text).split())
  return new_text

# make all text lowercase
def text_lowercase(text): 
  return text.lower()

# remove numbers
def remove_numbers(text): 
  result = re.sub(r'\d+', '', text) 
  return result

# remove punctuation
def remove_punctuation(text): 
  translator = str.maketrans('', '', string.punctuation)
  return text.translate(translator)

# tokenize
def tokenize(text):
  text = word_tokenize(text)
  return text

# remove stopwords
stop_words = set(stopwords.words('english'))
def remove_stopwords(text):
  text = [i for i in text if not i in stop_words]
  return text

# lemmatize
lemmatizer = WordNetLemmatizer()
def lemmatize(text):
  text = [lemmatizer.lemmatize(token) for token in text]
  return text

def preprocessing(text):
  text = text_lowercase(text)
  text = remove_urls(text)
  text = remove_numbers(text)
  text = remove_punctuation(text)
  text = tokenize(text)
  text = remove_stopwords(text)
  text = lemmatize(text)
  text = ' '.join(text)
  return text

In [0]:
# preprocessing the text columns
# train data
pp_text_train = [] # our preprocessed text column
for text_data in train['text']:
    pp_text_data = preprocessing(text_data)
    pp_text_train.append(pp_text_data)

train['pp_text'] = pp_text_train # add the preprocessed text as a column

# test data
pp_text_test = [] # our preprocessed text column
for text_data in test['text']:
    pp_text_data = preprocessing(text_data)
    pp_text_test.append(pp_text_data)

test['pp_text'] = pp_text_test # add the preprocessed text as a column

In [128]:
train.head()

Unnamed: 0,id,text,target,pp_text
0,1,Our Deeds are the Reason of this #earthquake M...,1,deed reason earthquake may allah forgive u
1,4,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask canada
2,5,All residents asked to 'shelter in place' are ...,1,resident asked shelter place notified officer ...
3,6,"13,000 people receive #wildfires evacuation or...",1,people receive wildfire evacuation order calif...
4,7,Just got sent this photo from Ruby #Alaska as ...,1,got sent photo ruby alaska smoke wildfire pour...


# Preprocessing part 2

In [0]:
# combining the train and test text data into a single corpus,
# which we will need to train the vectorizer
train_text_data = list(train['pp_text'])
test_text_data = list(test['pp_text'])

corpus = train_text_data + test_text_data

In [0]:
tf=TfidfVectorizer()

# the vectorizer must be fit onto the entire corpus
fitted_vectorizer = tf.fit(corpus)

# now we vectorize the train and test data separately
# train
train_transform = fitted_vectorizer.transform(train['pp_text'])
y = train['target'] 

# test
test_transform = fitted_vectorizer.transform(test['pp_text'])

In [131]:
print(train_transform)

  (0, 11223)	0.3565094534235048
  (0, 8491)	0.30689576472023705
  (0, 5201)	0.47724964027208006
  (0, 4146)	0.33382310221398653
  (0, 3403)	0.5053201351576412
  (0, 368)	0.42914155882309796
  (1, 12013)	0.4999968670933836
  (1, 11772)	0.4999968670933836
  (1, 9245)	0.30535331214468475
  (1, 7658)	0.3540085679476195
  (1, 5192)	0.2947343443136663
  (1, 5011)	0.21426166605848176
  (1, 2026)	0.38557380501712757
  (2, 12386)	0.5666750369385926
  (2, 11511)	0.2678768546920339
  (2, 10450)	0.4588681678344489
  (2, 9868)	0.22129413712583343
  (2, 9685)	0.2286107019766729
  (2, 9522)	0.3359683721273607
  (2, 4660)	0.25719090399439354
  (2, 4563)	0.21106286302334237
  (2, 766)	0.2678768546920339
  (3, 15310)	0.3614357147352493
  (3, 11245)	0.5801828834474566
  (3, 10256)	0.3039464200345903
  :	:
  (7610, 14732)	0.5019800665003111
  (7610, 7558)	0.4786761820583672
  (7610, 6109)	0.5497529715916672
  (7611, 13945)	0.263100092463339
  (7611, 13371)	0.2781595707273074
  (7611, 12260)	0.244890061570

# Testing our model on a validation set

In [0]:
X_train, X_test, y_train, y_test = train_test_split(train_transform, y)

In [0]:
scikit_log_reg = LogisticRegression()
model=scikit_log_reg.fit(X_train, y_train)

In [0]:
predictions = model.predict(X_test)

In [135]:
count = 0
for guess, answer in zip(predictions, y_test):
    if guess == answer:
        count += 1
print(count/len(y_test))

0.8025210084033614


# Kaggle Submission

In [60]:
sample_submission.head()

Unnamed: 0,id,target
0,0,0
1,2,0
2,3,0
3,9,0
4,11,0


In [0]:
# getting predictions for the test set
test_predictions = model.predict(test_transform)

In [0]:
# creating a dataframe
final_predictions = pd.DataFrame(test['id'])
final_predictions['target'] = test_predictions

In [63]:
final_predictions.head()

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,0
4,11,1


In [0]:
from google.colab import files

In [0]:
# exporting the data
final_predictions.to_csv('logit_guesses_mark_i.csv', index=False)
files.download('logit_guesses_mark_i.csv')
# this file is ready to submit to Kaggle