In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import nltk
from sklearn import feature_extraction, linear_model, model_selection
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for file_path in (os.path.join(root, name) for name in names):
        print(file_path)

# NLP with disaster tweets


This notebook is an attempt to create a submission to the *Real or not? NLP with disaster tweets* competition.

As I have little experience with natural language processing, this will be more or less a trial-and-error exercise.



1. Loading the data
2. Data overview
3. Data cleanup
4. Running the model


# 1. Loading the data

In [None]:
# Read the labelled training set and print a summary of its columns.
train_csv_path = "/kaggle/input/nlp-getting-started/train.csv"
train_data = pd.read_csv(train_csv_path)
train_data.info()

In [None]:
# Read the test set (the tweets to predict on) and print a summary of its columns.
test_csv_path = "/kaggle/input/nlp-getting-started/test.csv"
test_data = pd.read_csv(test_csv_path)
test_data.info()

# 2. Training dataset overview

So, as we now have both datasets loaded, we can take a closer look at the training data.

In [None]:
# Preview the first five rows of the training set.
train_data.head(n=5)

We have to handle the missing values in the 'keyword' and 'location' columns. For both of them, a good way to do this is to just put "No data" in the empty cells.

In [None]:
# Replace every missing value with the "No data" placeholder,
# then re-check the column summary to confirm no nulls remain.
train_data = train_data.fillna('No data')
train_data.info()

Now that the dataset looks good, we can take a closer look at it.

In [None]:
# Bar chart of how many tweets fall into each target class.
target_counts = train_data['target'].value_counts()
target_counts.plot.bar()

In [None]:
# Ten most frequent real locations in the training set.
# value_counts() already returns counts in descending order, so no extra sort is needed.
location_counts = train_data['location'].value_counts()
# Remove the missing-value placeholder *before* taking the top ten so it does not
# occupy one of the slots; errors='ignore' avoids a KeyError if the placeholder
# is absent from the counts.
most_freq_location = location_counts.drop('No data', errors='ignore').head(10)
most_freq_location

It looks like we have countries and cities mixed together. I don't want to clean this up yet, as we might need the most precise location available, but it looks like the vast majority of tweets are from the US.

... to be expanded

# 3. Processing tweet text for prediction


Before working on the tweet text, we're going to clean it up: remove punctuation and URLs. (Stop-word removal was also tried, but it is currently disabled.)

In [None]:
#from nltk.corpus import stopwords
#stop_words = stopwords.words('english')

# Removing stop words seemed to make the model perform worse, so this part is commented out for now.

In [None]:
#train_data['text'] = train_data['text'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))#


def clean_text(texts):
    """Return a copy of ``texts`` with URLs and punctuation removed.

    Parameters
    ----------
    texts : pd.Series
        Series of strings to clean.

    Returns
    -------
    pd.Series
        The same strings with every ``http://`` / ``https://`` URL removed and
        every character that is neither a word character nor whitespace removed.
    """
    # URLs must be removed first: stripping punctuation beforehand would delete
    # the "://" and "." characters, and the URL pattern could then never match.
    without_urls = texts.str.replace(r"https?://\S+", "", regex=True)
    # regex=True is passed explicitly; without it, recent pandas versions treat
    # the pattern as a literal string and nothing would be removed.
    return without_urls.str.replace(r"[^\w\s]", "", regex=True)


# Apply identical cleaning to both datasets so the vocabulary learned from the
# training tweets matches the tokens that appear in the test tweets.
train_data['text'] = clean_text(train_data['text'])
test_data['text'] = clean_text(test_data['text'])
print(train_data.text)

In order to use a logistic regression model, we have to vectorize our tweets first. We're going to do that using scikit-learn's CountVectorizer.

In [None]:
# Token-count vectorizer created with its default settings.
CountVectorizer = feature_extraction.text.CountVectorizer
count_vectorizer = CountVectorizer()

In [None]:
# Learn the vocabulary from the training tweets only, then reuse that same
# vocabulary to encode the test tweets.
train_texts, test_texts = train_data["text"], test_data["text"]
train_vectors = count_vectorizer.fit_transform(train_texts)
test_vectors = count_vectorizer.transform(test_texts)

In [None]:
# Features are the vectorized tweets; labels come from the 'target' column.
train_x, train_y = train_vectors, train_data["target"]

# 4. Model fitting and prediction 

Now that we've established our training subsets, we can set up our model. Because our dependent variable is binary, I am going to try and evaluate a logistic regression model.

In [None]:
# Hold out part of the training data for validation. Fixing random_state makes
# the split, the fitted model and the reported score identical on every run.
x_train, x_val, y_train, y_val = model_selection.train_test_split(
    train_x, train_y, random_state=0
)
clf = linear_model.LogisticRegression(random_state=0, max_iter=150).fit(x_train, y_train)
# Mean accuracy on the held-out validation split.
clf.score(x_val, y_val)

OK, the validation score (mean accuracy on our validation subset 'x_val' and labels 'y_val') is pretty high. Now, on to the prediction:

In [None]:
# Classify the vectorized test tweets with the fitted model.
predictions = clf.predict(X=test_vectors)

In [None]:
# Pair each test tweet id with its predicted label and write the submission file.
submission_columns = {'id': test_data['id'], 'target': predictions}
output = pd.DataFrame(submission_columns)
output.to_csv('submission.csv', index=False)
print('submission saved!')