# Creating a CNN baseline model with word embedding

In [1]:
import pandas as pd
import numpy as np

## Creating labels and max_retweets

In [2]:
# Load the pipe-delimited tweet dataset; the first column of the file is used
# as the row index.
df = pd.read_csv(
    "../../data/Organic_extended_finalv2.csv",
    sep="|",
    index_col=0,
)
# Display the column names that were read.
df.columns

Index(['tweet_id', 'created_time', 'count', '1', '2', '3', '4', '5', '6',
       'user_id', 'screen_name', 'url', 'follower_count', 'title', 'content',
       'expanded_url', 'title_len', 'content_len'],
      dtype='object')

In [3]:
# Columns whose name is a numeral from "1" to "100" feed the max_retweets
# computation (presumably per-snapshot retweet counts; confirm against the
# data description). Keep only the ones actually present in the frame.
possible_cols = {str(x) for x in range(1,101)}
actual_cols = set(df.columns).intersection(possible_cols)
print(actual_cols)
# A set is not a valid column indexer: pandas deprecated it in 1.5 and rejects
# it with a TypeError in 2.x. Select with a list instead; sorting numerically
# also makes the column order deterministic.
snapshot_cols = sorted(actual_cols, key=int)
# Row-wise maximum over the selected columns.
df['max_retweets']= df[snapshot_cols].max(axis=1)

{'1', '4', '2', '5', '6', '3'}


In [4]:
# Binary target: 1 when a row's max_retweets reaches the median max_retweets
# of the same user_id, otherwise 0.
# 'label' is created first so that it stays ahead of 'median' in column order.
df['label'] = 0
df['median'] = df.groupby('user_id')['max_retweets'].transform('median')
reaches_median = df['max_retweets'] >= df['median']
df['label'] = reaches_median.astype('int64')
df[['user_id','max_retweets','median','label']].head(15)

Unnamed: 0,user_id,max_retweets,median,label
0,16664681.0,466.0,18.0,1
1,28785486.0,163.0,77.0,1
2,759251.0,936.0,141.0,1
3,807095.0,2381.0,113.0,1
4,16664681.0,267.0,18.0,1
5,5392522.0,938.0,117.0,1
6,16664681.0,33.0,18.0,1
7,16664681.0,72.0,18.0,1
8,16664681.0,48.0,18.0,1
9,16664681.0,70.0,18.0,1


In [5]:
# Look for titles containing a tab character: if no rows come back, "\t" is
# safe to use as a field delimiter.
tab_in_title = df['title'].str.contains("\t")
df.loc[tab_in_title]

Unnamed: 0,tweet_id,created_time,count,1,2,3,4,5,6,user_id,...,url,follower_count,title,content,expanded_url,title_len,content_len,max_retweets,label,median


## Creating a training and a validation split
Creating a 75/25 split: 75% of the rows go to `train`, and the held-out 25% are stored in the `test` frame.

In [6]:
# Draw 75% of the rows with a fixed seed for the training set; every row whose
# index label was not drawn goes to the test set.
train = df.sample(random_state=10, frac=0.75)
test = df.drop(index=train.index)

In [7]:
# Replace the sampled row labels with a fresh 0..n-1 index (old labels are
# discarded), then preview the first five rows.
train = train.reset_index(drop=True)
train.head(5)

Unnamed: 0,tweet_id,created_time,count,1,2,3,4,5,6,user_id,...,url,follower_count,title,content,expanded_url,title_len,content_len,max_retweets,label,median
0,1275526557214412808,Tue Jun 23 20:30:04 +0000 2020,8,213,213.0,213.0,213.0,212.0,212.0,807095.0,...,https://nyti.ms/2B5Iqal,46861283.0,Trump Family Asks Court to Stop Publication of...,Officials at the Trump Organization declined t...,https://www.nytimes.com/2020/06/23/us/politics...,12,231,213.0,1,113.0
1,1274359312375255041,Sat Jun 20 15:11:51 +0000 2020,11,33,34.0,33.0,33.0,33.0,33.0,759251.0,...,https://cnn.it/2YiQWvJ,48817600.0,Boss Files with Poppy Harlow,As restaurants across the country are being or...,https://www.cnn.com/audio/podcasts/boss-files?...,5,104,34.0,0,141.0
2,1273583769547653131,Thu Jun 18 11:50:07 +0000 2020,13,19,19.0,18.0,18.0,18.0,18.0,1652541.0,...,https://reut.rs/3efFRAV,22072396.0,"Breakingviews - Corona Capital: Carnival, Chea...",NEW YORK/MILAN/HONG KONG/LONDON (Reuters Break...,https://www.reuters.com/article/us-health-coro...,8,975,19.0,0,32.0
3,1273208688585707521,Wed Jun 17 10:59:41 +0000 2020,14,30,30.0,30.0,30.0,30.0,30.0,1652541.0,...,https://reut.rs/2YHwS4Z,22072395.0,Beijing says COVID-19 cases could stay at curr...,FILE PHOTO: Medical workers in protective suit...,https://www.reuters.com/article/us-health-coro...,12,88,30.0,0,32.0
4,1273715264731918336,Thu Jun 18 20:32:38 +0000 2020,13,31,31.0,31.0,31.0,31.0,30.0,14511951.0,...,http://huffp.st/ZOky0Eo,11454122.0,"Unilever Calls Out Racism, But Still Sells Ski...",It’s a sign of a changing culture that so many...,https://www.huffpost.com/entry/unilever-skin-l...,12,1105,31.0,0,37.5


In [8]:
# Replace the remaining row labels with a fresh 0..n-1 index (old labels are
# discarded), then preview the first five rows.
test = test.reset_index(drop=True)
test.head(5)

Unnamed: 0,tweet_id,created_time,count,1,2,3,4,5,6,user_id,...,url,follower_count,title,content,expanded_url,title_len,content_len,max_retweets,label,median
0,1272220034065186817,2020-06-14 17:31:07+00:00,17,910,927.0,929.0,933.0,934.0,936.0,759251.0,...,https://cnn.it/3d4Az9Y,48817611.0,Woman becomes first observant Sikh to graduate...,(CNN) A woman has made history by becoming the...,https://www.cnn.com/2020/06/13/us/first-sikh-w...,15,190,936.0,1,141.0
1,1272219784743202816,2020-06-14 17:30:08+00:00,17,2352,2377.0,2381.0,2378.0,2376.0,2373.0,807095.0,...,https://nyti.ms/2YDi6vY,46861284.0,"As Social Distancing Wanes, Cuomo Warns of Ano...","michael barbaro\n\nDonald, the pandemic feels ...",https://www.nytimes.com/2020/06/14/world/coron...,9,4174,2381.0,1,113.0
2,1272226052639686659,2020-06-14 17:55:02+00:00,17,630,634.0,633.0,632.0,632.0,631.0,28785486.0,...,https://abcn.ws/2UJ1PVa,15735968.0,What to know about police reforms after George...,What to know about police reforms after George...,https://abcnews.go.com/US/police-reforms-georg...,17,2438,634.0,1,77.0
3,1272242929973841928,2020-06-14 19:02:06+00:00,17,116,117.0,116.0,114.0,114.0,114.0,14173315.0,...,https://nbcnews.to/3d3ThP7,7668571.0,IndyCar flagman fired after criticizing NASCAR...,Get breaking news alerts and special reports. ...,https://www.nbcnews.com/news/us-news/indycar-f...,10,352,117.0,1,73.5
4,1272249963234373632,2020-06-14 19:30:03+00:00,17,187,190.0,189.0,189.0,189.0,189.0,759251.0,...,https://cnn.it/2zARe7A,48817609.0,"For black NASCAR fans, the Confederate flag ba...","(CNN) Growing up in the middle of nowhere, Mar...",https://www.cnn.com/2020/06/14/us/nascar-black...,13,1697,190.0,1,141.0


## Metrics on the train and test sets

In [9]:
# Number of training rows.
print(train.shape[0])
# Per-account row count plus median and mean of max_retweets.
train_retweets = train.groupby('screen_name')['max_retweets']
train_retweets.agg(['count', 'median','mean'])

9256


Unnamed: 0_level_0,count,median,mean
screen_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ABC News,757,78.0,220.840159
CNN,855,139.0,267.301754
Fox News,77,152.0,231.233766
HuffPost,321,38.0,74.919003
Los Angeles Times,1126,18.0,67.48579
NBC News,550,73.0,153.552727
NPR,329,122.0,319.851064
Reuters,2455,32.0,52.905906
TIME,479,59.0,121.271399
The Associated Press,221,154.0,489.638009


In [10]:
# Number of test rows.
print(test.shape[0])
# Per-account row count plus median and mean of max_retweets.
test_retweets = test.groupby('screen_name')['max_retweets']
test_retweets.agg(['count', 'median','mean'])

3085


Unnamed: 0_level_0,count,median,mean
screen_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ABC News,248,74.5,182.346774
CNN,288,143.0,267.204861
Fox News,19,243.0,243.736842
HuffPost,97,35.0,68.783505
Los Angeles Times,381,18.0,54.742782
NBC News,174,74.5,179.011494
NPR,93,90.0,265.397849
Reuters,868,33.0,52.313364
TIME,170,60.5,113.517647
The Associated Press,75,148.0,370.386667


In [11]:
# Persist both splits as pipe-delimited CSV files in the working directory.
for split_frame, split_path in ((train, "train.csv"), (test, "test.csv")):
    split_frame.to_csv(split_path, sep="|")
# Tab-separated alternative, currently disabled:
# train.to_csv("train.tsv",sep="\t")
# test.to_csv("test.tsv",sep="\t")

## Creating Vocabulary

In [12]:
import pandas as pd
import numpy as np
import re
import string
from nltk.corpus import stopwords
from collections import Counter

In [13]:
# Reload the training split written earlier (pipe-delimited, first column is
# the row index).
df = pd.read_csv("./train.csv",sep="|", index_col=0)
# Import the corpus reader under an alias so that binding the word list to the
# name `stopwords` does not overwrite the reader itself. The previous
# `stopwords = stopwords.words('english')` raised AttributeError whenever this
# cell was executed a second time, because `stopwords` was already a list.
from nltk.corpus import stopwords as nltk_stopwords
stopwords = nltk_stopwords.words('english')
# Show the available columns and a sample of the stop-word list.
df.columns, stopwords[:5]

(Index(['tweet_id', 'created_time', 'count', '1', '2', '3', '4', '5', '6',
        'user_id', 'screen_name', 'url', 'follower_count', 'title', 'content',
        'expanded_url', 'title_len', 'content_len', 'max_retweets', 'label',
        'median'],
       dtype='object'),
 ['i', 'me', 'my', 'myself', 'we'])

In [14]:
def clean_text(text):
    """Convert raw text into a list of lowercase word tokens.

    Steps, in order:
      1. drop whitespace-separated words that appear verbatim in ``stopwords``
         (this first pass is case-sensitive);
      2. replace every run of non-ASCII characters with a space;
      3. lowercase, then replace punctuation, digits, carriage returns, tabs
         and newlines with spaces;
      4. keep tokens of at least two characters that are not in ``stopwords``.

    Relies on the module-level ``stopwords`` collection.

    Args:
        text: The string to clean.

    Returns:
        The list of remaining tokens, in their original order.
    """
    kept_words = []
    for word in text.split():
        if word not in stopwords:
            kept_words.append(word)
    ascii_only = re.sub(r"[^\x00-\x7F]+", " ", " ".join(kept_words))

    strip_pattern = re.compile('[' + re.escape(string.punctuation) + '0-9\\r\\t\\n]')
    normalized = strip_pattern.sub(" ", ascii_only.lower())

    # Splitting on the removed characters can leave one-letter fragments and
    # freshly exposed stop words; both are filtered out here.
    tokens = []
    for token in normalized.split():
        if len(token) < 2 or token in stopwords:
            continue
        tokens.append(token)
    return tokens

In [15]:
# Token frequencies over all training titles.
counter = Counter()
tokenizer = {"UNK":0}


def update_counter(text):
    """Add the cleaned tokens of ``text`` to the shared ``counter``."""
    counter.update(clean_text(text))


# Feed every title through the counter, one at a time.
for title in df['title']:
    update_counter(title)

print(counter.most_common(5))
# The vocabulary is every distinct token, in first-seen order.
vocab = list(counter.keys())
print(vocab[:5],len(vocab))

[('trump', 1079), ('coronavirus', 979), ('says', 668), ('new', 628), ('covid', 625)]
['trump', 'family', 'asks', 'court', 'stop'] 9946


In [16]:
# Write the vocabulary to disk, one word per line (no trailing newline).
vocab_text = "\n".join(vocab)
with open("vocab.txt","w+") as file:
    file.write(vocab_text)

## Creating a tokenizer

In [17]:
import numpy as np

In [18]:
class Tokenizer:
    """Map whitespace-separated words to integer ids using a vocabulary file.

    Id 0 is reserved for the "UNK" entry; the words read from the file receive
    the ids 1, 2, 3, ... in file order.
    """

    def __init__(self, vocab_path):
        """Load the vocabulary.

        Args:
            vocab_path: Path of a text file holding one word per line.
                Surrounding whitespace is stripped, and a word that is
                repeated keeps the id of its first occurrence.
        """
        self._tokens = {"UNK" : 0}
        with open(vocab_path, 'r') as file:
            for line in file:
                word = line.strip()
                if word not in self._tokens:
                    # The mapping only grows, so its size is the next free id.
                    self._tokens[word] = len(self._tokens)

    def encode(self, sentence):
        """Return the ids of the words in ``sentence``.

        The sentence is split on whitespace only; no lowercasing or other
        cleaning is applied here. Words missing from the vocabulary map to the
        integer id of "UNK" (0), so the result is always a list of ints.

        Args:
            sentence: The text to encode.

        Returns:
            A list with one integer id per whitespace-separated word.
        """
        # Fall back to the integer id, not the string 'UNK', for unknown words.
        unknown_id = self._tokens["UNK"]
        return [self._tokens.get(word, unknown_id) for word in sentence.split()]
# Build a tokenizer from the saved vocabulary and encode a sample sentence.
sample_sentence = "trump asks court stay lovely"
tokenizer = Tokenizer("./vocab.txt")
tokenizer.encode(sample_sentence)

[1, 3, 4, 26, 38]