In [None]:
import os
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn import svm
import matplotlib.pyplot as plt
import re #regex
import nltk
import string
# nltk.download('stopwords')
from nltk.corpus import stopwords
import seaborn as sns
from scipy.sparse import coo_matrix

# Read the competition CSVs from the relative ../input directory, in order:
# training tweets, test tweets, then the sample submission template.
train_data, test_data, sample = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tweet-sentiment-extraction/train.csv',
        '../input/tweet-sentiment-extraction/test.csv',
        '../input/tweet-sentiment-extraction/sample_submission.csv',
    )
)

In [None]:
# Pre-processing step - from https://www.kaggle.com/rajaram1988/ignored-stop-words-using-only-word-counts

# Patterns are raw strings so `\d`, `\[` and `\s` reach the regex engine as
# written instead of relying on Python tolerating invalid string escapes.
_NEWLINE_RE = re.compile(r'\n')
# NOTE(review): the text is lower-cased before this pattern is applied, so the
# capital "U" can never match; confirm whether a case-insensitive match was meant.
_USER_TAG_RE = re.compile(r"\[\[User.*")
_IP_ADDRESS_RE = re.compile(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}")
# Matches an http:// link up to and including the next whitespace character,
# or up to the end of the string when no whitespace follows.
_HTTP_LINK_RE = re.compile(r"(http://.*?\s)|(http://.*)")


def _clean_text(text):
    """Return `text` lower-cased with newlines, IP-like tokens and http links removed.

    The same cleaning is applied to the training and the test tweets so both
    are vectorised and scored on identically prepared text.
    """
    text = str(text).lower()
    text = _NEWLINE_RE.sub(' ', text)
    text = _USER_TAG_RE.sub('', text)
    text = _IP_ADDRESS_RE.sub('', text)
    text = _HTTP_LINK_RE.sub('', text)
    return text


# drop rows that have a missing value in any column (this includes rows with no text)
train_data = train_data.dropna()
test_data = test_data.dropna()

# clean the tweet text; the http-link removal now runs on the training set too
# (it previously re-applied the IP-address pattern there by mistake)
train_data['text'] = train_data['text'].map(_clean_text)
test_data['text'] = test_data['text'].map(_clean_text)

# create a list of stop words and add custom stop words from the data set
stop_words = set(stopwords.words("english"))
print("# of stop words before join", len(stop_words))
# get 20 most common whitespace-separated tokens of the training text and their counts
most_common = pd.Series(
    ' '.join(train_data['text']).split()).value_counts()[:20]

# add most common words to stop_words
stop_words = stop_words.union(most_common.index)
print("# of stop words after join", len(stop_words))

In [None]:
# Hold out 20% of the cleaned training rows for validation; the fixed
# random_state keeps the split repeatable between runs.
x_train, x_val = train_test_split(train_data, train_size=0.8, random_state=0)

# Partition the training split by sentiment label.
positive_tweets = x_train.loc[x_train['sentiment'].eq('positive')]
negative_tweets = x_train.loc[x_train['sentiment'].eq('negative')]
neutral_tweets = x_train.loc[x_train['sentiment'].eq('neutral')]

# Number of whitespace-separated words in 'selected_text' for the
# non-neutral tweets.
pos_selected_lengths = positive_tweets['selected_text'].map(str.split).map(len)
neg_selected_lengths = negative_tweets['selected_text'].map(str.split).map(len)

# Overlay the two length distributions (blue = positive, red = negative).
plt.figure(figsize=(12, 6))
p1 = sns.kdeplot(pos_selected_lengths, color="b", shade=True).set_title(
    'Selected Text lengths across Positive and Negative Sentiments')
p2 = sns.kdeplot(neg_selected_lengths, color="r", shade=True)
plt.legend(labels=['positive', 'negative'])
plt.show()
plt.clf()
# Observation recorded by the author from this plot: selected_text is more
# frequently shorter in positive tweets.

In [None]:
# Number of whitespace-separated words in the full 'text' of the
# non-neutral tweets.
pos_lengths = positive_tweets['text'].map(str.split).map(len)
neg_lengths = negative_tweets['text'].map(str.split).map(len)

# Overlay the tweet-length distributions (blue = positive, red = negative).
plt.figure(figsize=(12,6))
p1 = sns.kdeplot(pos_lengths, color="b", shade=True).set_title(
    'Text Lengths across Positive and Negative Sentiments')
p2 = sns.kdeplot(neg_lengths, color="r", shade=True)
plt.legend(labels=['positive', 'negative'])
plt.show()
plt.clf()
# Observation recorded by the author from this plot: not very useful, tweet
# lengths look similarly distributed for positive and negative tweets.

In [None]:
# Range of 'selected_text' word counts over the positive and negative training
# tweets, intended as n-gram size bounds so a feature could be a word or phrase
# usable as 'selected_text'.
# max_ngram is the longest selection seen in either class.
max_ngram = max(max(pos_selected_lengths), max(neg_selected_lengths))
# min_ngram is the shortest selection seen in either class (the previous
# expression kept the larger of the two per-class minima).
min_ngram = min(min(pos_selected_lengths), min(neg_selected_lengths))
# The author notes this approach ended up being fruitless.

In [None]:
vectorizer = CountVectorizer(
    max_df=0.85,
    min_df=2,
    # scikit-learn documents stop_words as 'english', a list, or None, so the
    # set built during pre-processing is converted (sorted for a stable order).
    stop_words=sorted(stop_words),
    max_features=10000,
)

# let's remove all neutral tweets from x_train so that the SVM is trained on
# the positive-vs-negative distinction only
non_neutral = x_train[x_train['sentiment'] != 'neutral']

# fit the vectorizer to the non_neutral training data
train_vectors = vectorizer.fit_transform(non_neutral['text'])
# count vectors of each class on the fitted vocabulary
x_pos = vectorizer.transform(positive_tweets['text'])
x_neg = vectorizer.transform(negative_tweets['text'])

# re-weight the raw counts with TF-IDF before fitting the classifier
tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
tfidf_vectors = tfidf_transformer.fit_transform(train_vectors)

# a linear kernel exposes one weight per vocabulary entry through coef_
supportVector = svm.SVC(kernel='linear')
supportVector.fit(tfidf_vectors, non_neutral['sentiment'])

# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# when it exists and fall back for older installations.
if hasattr(vectorizer, 'get_feature_names_out'):
    features = list(vectorizer.get_feature_names_out())
else:
    features = vectorizer.get_feature_names()

# coef_ may be a sparse matrix when the model was fitted on sparse input;
# densify it once instead of indexing it element by element.
_coef = supportVector.coef_
if hasattr(_coef, 'toarray'):
    _coef = _coef.toarray()
_coef_row = np.asarray(_coef)[0]

# feature (word) -> weight learned by the linear SVM
weights_dict = dict(zip(features, _coef_row))

# (word, weight) pairs for the whole vocabulary, largest weight first
weights_list = [(word, weights_dict[word])
              for word in features]
weights_sorted = sorted(weights_list, key=lambda x: x[1],
                    reverse=True)

In [None]:
# Bar chart of the 50 highest-weighted features (weights_sorted is ordered
# from largest to smallest weight).
top_50_words = weights_sorted[:50]
weight_top_df = pd.DataFrame(top_50_words, columns=["Word", "Weight"])

sns.set(rc={'figure.figsize': (13, 8)})
g = sns.barplot(data=weight_top_df, x="Word", y="Weight")
# tilt the word labels so they stay readable
g.set_xticklabels(g.get_xticklabels(), rotation=60)
plt.show()
plt.clf()

In [None]:
# get the 50 lowest-weighted words and plot them
# A negative slice start keeps every entry when there are fewer than 50
# features (len(...) - 50 would go negative and silently drop some of them).
bot_50_words = weights_sorted[-50:]
weight_bot_df = pd.DataFrame(bot_50_words)
weight_bot_df.columns = ["Word", "Weight"]

sns.set(rc={'figure.figsize': (13, 8)})
g = sns.barplot(x="Word", y="Weight", data=weight_bot_df)
# tilt the word labels so they stay readable
g.set_xticklabels(g.get_xticklabels(), rotation=60)
plt.show()
plt.clf()

In [None]:
# The author's reading of the two bar charts is that the sign of a word's
# weight depends on the sentiment it signals, so selected_text should be scored
# per sentiment. Build a sign-flipped copy of the weight table for that purpose.
# NOTE(review): calc_selected_text scores positive tweets with weights_dict and
# negative tweets with this negated copy - confirm that matches the sign
# convention observed in the plots.
inv_weights_dict = {key: weights_dict[key] * -1 for key in weights_dict.keys()}

In [None]:
def calc_selected_text(x, tol, a):
    """Pick the contiguous run of words in a tweet with the best weight score.

    Parameters
    ----------
    x : mapping with 'text' and 'sentiment' entries (e.g. a DataFrame row).
    tol : margin by which a candidate's score must exceed the current best
        score before it replaces the current selection.
    a : scale of the per-word penalty subtracted from each matched word's weight.

    Returns
    -------
    str
        The tweet itself for 'neutral' (or any unrecognised) sentiment.
        Otherwise the selected words joined by single spaces, or all words of
        the tweet joined by single spaces when no candidate beats the threshold.

    Reads the module-level `weights_dict` (positive tweets) and
    `inv_weights_dict` (negative tweets).
    """
    tweet = x['text']
    sentiment = x['sentiment']

    if sentiment == 'positive':
        weights = weights_dict
    elif sentiment == 'negative':
        weights = inv_weights_dict
    else:
        # 'neutral' returns the whole tweet. Any other label is treated the
        # same way instead of failing later on an unbound `weights`.
        return tweet

    text = tweet.split()
    text_len = len(text)

    # Score every word once: punctuation is stripped before the lookup, and
    # words without a weight contribute nothing.
    strip_punctuation = str.maketrans('', '', string.punctuation)
    word_scores = []
    for word in text:
        cleaned = word.translate(strip_punctuation)
        if cleaned in weights:
            # We noticed that our selected strings were ~375% longer than they should be, so we implemented a "cost function"
            # to encourage smaller strings
            # NOTE(review): len(cleaned) is the character count of this one
            # word, not the number of words in the candidate - confirm intent.
            word_scores.append(weights[cleaned] - a * (len(cleaned) / text_len))
        else:
            word_scores.append(0)

    score = 0
    selected = ''
    # running[start] is the score of the span that begins at `start` and has
    # the length currently being visited. Spans are visited shortest first and
    # left to right within a length, so among near-ties the earliest,
    # shortest candidate wins.
    running = [0] * text_len
    for length in range(1, text_len + 1):
        for start in range(text_len - length + 1):
            running[start] += word_scores[start + length - 1]
            if running[start] > score + tol:
                score = running[start]
                selected = text[start:start + length]

    # nothing scored above the threshold: fall back to the whole tweet
    if len(selected) == 0:
        selected = text

    return ' '.join(selected)

# from https://www.kaggle.com/rajaram1988/ignored-stop-words-using-only-word-counts
def jaccard(str1, str2):
    """Return the Jaccard similarity of the word sets of two strings.

    Both strings are lower-cased and split on whitespace; the result is
    |intersection| / |union| of the two resulting word sets.

    Returns 1.0 when neither string contains any word: the two (empty) word
    sets are identical, and this avoids a division by zero.
    """
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    union_size = len(a | b)
    if union_size == 0:
        return 1.0
    return float(len(a & b)) / union_size


In [None]:
# Silence pandas' chained-assignment warnings for the cells that follow.
pd.options.mode.chained_assignment = None

# Parameters handed to calc_selected_text: `tol` is the margin a candidate must
# beat the current best score by, `a` scales the per-word penalty.
tol, a = 0.0015, 5
print("tol = {}".format(tol))
print("a = {}".format(a))

In [None]:
# make predictions on training set
# Predictions are computed in one pass and assigned positionally, so each row
# keeps its own result and the frame is not modified while it is iterated
# (no per-row scan of the whole frame for a matching textID).
x_train['prediction'] = [
    calc_selected_text(row, tol, a) for _, row in x_train.iterrows()
]

# word-level Jaccard similarity between the labelled and predicted selections
x_train['jaccard'] = [
    jaccard(truth, predicted)
    for truth, predicted in zip(x_train['selected_text'], x_train['prediction'])
]

print('Jaccard for training set = ', np.mean(x_train['jaccard']))

In [None]:
# make predictions on validation set
# Predictions are computed in one pass and assigned positionally, so each row
# keeps its own result and the frame is not modified while it is iterated
# (no per-row scan of the whole frame for a matching textID).
x_val['prediction'] = [
    calc_selected_text(row, tol, a) for _, row in x_val.iterrows()
]

# word-level Jaccard similarity between the labelled and predicted selections
x_val['jaccard'] = [
    jaccard(truth, predicted)
    for truth, predicted in zip(x_val['selected_text'], x_val['prediction'])
]

print('Jaccard for validation set = ', np.mean(x_val['jaccard']))

In [None]:
# make final submission
# Predict once per test tweet and keep the result on test_data as well.
test_data['prediction'] = [
    calc_selected_text(row, tol, a) for _, row in test_data.iterrows()
]

# Copy the predictions into the submission template by textID. Template rows
# whose textID has no prediction (e.g. rows dropped from test_data by dropna)
# keep whatever value the template already had.
_prediction_by_id = dict(zip(test_data['textID'], test_data['prediction']))
sample['selected_text'] = (
    sample['textID'].map(_prediction_by_id).fillna(sample['selected_text'])
)

sample.to_csv('submission.csv', index=False)