In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from sklearn.naive_bayes import MultinomialNB
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics.pairwise import euclidean_distances
import matplotlib.pyplot as plt
import seaborn as sns 
import string
from nltk.corpus import stopwords

In [None]:
# Directory holding the competition CSV files; change it here to run the
# notebook against a different copy of the data.
DATA_DIR = '../input/nlp-getting-started'

train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')
sample = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')

In [None]:
# Report (rows, columns) for both data frames
for label, frame in (
    ("Shape of train data : ", train),
    ("Shape of test data : ", test),
):
    print(label, frame.shape)

Let's check what the data looks like:

In [None]:
# Preview the first rows of the freshly loaded training frame
train.head()

In [None]:
# Per-column count of missing values in each data frame
for label, frame in (
    ("Missing value in train data :\n", train),
    ("\nMissing value in test data :\n", test),
):
    print(label, frame.isna().sum())

As we can see, there are missing values in the `keyword` and `location` columns.

## 2. Looking at Class Imbalance
It looks like we have 7,613 training samples. Let's see how many of these tweets describe a real disaster and how many do not. What we're checking is whether the two classes — tweets about real disasters and tweets that are not — are reasonably balanced.

In [None]:
# Number of relevant and irrelevant tweets among the training samples
train.groupby('target')[['id']].count()

In [None]:
# Tweets per class as a one-column frame.  The column is named explicitly
# because the name pandas gives the value_counts() result differs between
# versions ("target" before 2.0, "count" from 2.0 on), so renaming a column
# called "target" is not reliable.
counts = (
    train["target"]
    .value_counts()
    .rename("Samples")
    .rename(index={0: "Not Real", 1: "Real"})
    .to_frame()
)
ax = sns.barplot(x=counts.index, y=counts["Samples"])
# Write each bar's count centred just above the bar
for p in ax.patches:
    height = p.get_height()
    ax.text(
        x=p.get_x() + (p.get_width() / 2),
        y=height,
        s=round(height),
        ha="center"
    )


As we can see above, the dataset is imbalanced: the number of irrelevant tweets is considerably higher than the number of relevant ones.

In [None]:
# Character length of every tweet, computed with the vectorised string
# accessor instead of a Python-level row-by-row apply.
# Note: a missing text would yield NaN here rather than raising.
train['tweet_len'] = train['text'].str.len()
test['tweet_len'] = test['text'].str.len()

In [None]:
# Preview the test frame, which now carries the tweet_len column
test.head()

In [None]:
# Overlaid histograms of tweet length, one per class (0 = irrelevant, 1 = relevant)
plt.figure(figsize=(10, 6))

for target_value, colour, legend_label in (
    (0, 'blue', 'irrelevant'),
    (1, 'red', 'relevant'),
):
    class_lengths = train.loc[train.target == target_value, 'tweet_len']
    class_lengths.plot(kind='hist', bins=40, color=colour,
                       label=legend_label, alpha=0.6)

plt.legend()
plt.xlabel("Length of text")
plt.show()

Before vectorization, each tweet is cleaned by a helper that takes in a string of text, then performs the following:

1. Removes all punctuation
2. Removes all stopwords
3. Returns the cleaned text as a single string

In [None]:
def text_cleaning_process(text, stop_words=None):
    """Strip punctuation and stopwords from a piece of text.

    Parameters
    ----------
    text : str
        Raw text to clean.
    stop_words : collection of str, optional
        Lower-cased words to drop. When omitted, the NLTK English stopword
        list is used; it is fetched once and cached on the function instead
        of calling ``stopwords.words`` again for every input.

    Returns
    -------
    str
        The surviving words, in their original casing, joined by single
        spaces (an empty string when nothing survives).
    """
    if stop_words is None:
        stop_words = getattr(text_cleaning_process, "_default_stopwords", None)
        if stop_words is None:
            # frozenset gives constant-time membership tests
            stop_words = frozenset(stopwords.words('english'))
            text_cleaning_process._default_stopwords = stop_words

    # Delete every character of string.punctuation in a single pass
    nopunc = text.translate(str.maketrans('', '', string.punctuation))
    # Punctuation is removed *before* the stopword lookup, so a stopword that
    # is spelled with punctuation can never match its stripped form.
    return ' '.join(word for word in nopunc.split() if word.lower() not in stop_words)

In [None]:
# Clean every tweet; applying the helper to the text column directly avoids
# building a full row object for each record.
train['clean_text'] = train['text'].apply(text_cleaning_process)
test['clean_text'] = test['text'].apply(text_cleaning_process)

In [None]:
# Preview the training frame again to inspect the new clean_text column
train.head()

In [None]:
# Inputs (cleaned text) and labels as arrays, ready for the vectorizers below
X_train, y_train = train['clean_text'].values, train['target'].values
X_test = test['clean_text'].values

# Shape of each array
for arr in (X_train, y_train, X_test):
    print(arr.shape)

In [None]:
# Hold out 30% of the labelled data for validation, keeping the class
# proportions of y_train in both parts; the fixed seed makes the split repeatable.
xtrain, xvalid, ytrain, yvalid = train_test_split(
    X_train,
    y_train,
    test_size=0.3,
    shuffle=True,
    stratify=y_train,
    random_state=42,
)

In [None]:
# Sizes of the training and validation parts
for split in (xtrain, xvalid):
    print(split.shape)

### Building Basic Models
Let's start building our very first model.

Our very first model is a simple TF-IDF (Term Frequency - Inverse Document Frequency) followed by a simple Logistic Regression.

In [None]:
# Always start with these features. They work (almost) everytime!
tfv = TfidfVectorizer(min_df=3,  max_features=None, 
            strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 3), use_idf=1,smooth_idf=1,sublinear_tf=1,
            stop_words = 'english')

# Fitting TF-IDF to both training and test sets (semi-supervised learning)
tfv.fit(list(xtrain) + list(xvalid))
xtrain_tfv =  tfv.transform(xtrain) 
xvalid_tfv = tfv.transform(xvalid)

In [None]:
# Baseline classifier: logistic regression trained on the TF-IDF features
clf = LogisticRegression(C=1.0).fit(xtrain_tfv, ytrain)

# Class probabilities for the validation part, scored with log-loss
predictions = clf.predict_proba(xvalid_tfv)
print("logloss: %0.3f " % log_loss(y_true=yvalid, y_pred=predictions))

And there we go. We have our first model with a logloss of 0.490.

But we are greedy and want a better score. Let's look at the same model with different features.

Instead of using TF-IDF, we can also use word counts as features. This can be done easily using CountVectorizer from scikit-learn.

In [None]:
# Raw word counts over 1- to 3-grams, with English stop words removed
ctv = CountVectorizer(
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 3),
    stop_words='english',
)

# The vocabulary is learned from the training and validation texts together
# (labels are not used); each part is then transformed separately.
ctv.fit([*xtrain, *xvalid])
xtrain_ctv = ctv.transform(xtrain)
xvalid_ctv = ctv.transform(xvalid)

In [None]:
# Same logistic regression, this time trained on the word-count features
clf = LogisticRegression(C=1.0).fit(xtrain_ctv, ytrain)

# Class probabilities for the validation part, scored with log-loss
predictions = clf.predict_proba(xvalid_ctv)
print("logloss: %0.3f " % log_loss(y_true=yvalid, y_pred=predictions))

We just improved our first model by 0.027!!!
