# Eluv.io Data Science/ML Challenge

#### Sourya Sarthak Reddy Sane

# Importing Packages

In [1]:
import pickle
import matplotlib as plt
import pandas as pd
import numpy as np
import ast
import tensorflow as tf
from gsdmm import MovieGroupProcess
import re
import nltk

# Reading the data

In [2]:
df = pd.read_csv('Eluvio_DS_Challenge.csv')

In [3]:
df.head()

Unnamed: 0,time_created,date_created,up_votes,down_votes,title,over_18,author,category
0,1201232046,2008-01-25,3,0,Scores killed in Pakistan clashes,False,polar,worldnews
1,1201232075,2008-01-25,2,0,Japan resumes refuelling mission,False,polar,worldnews
2,1201232523,2008-01-25,3,0,US presses Egypt on Gaza border,False,polar,worldnews
3,1201233290,2008-01-25,1,0,Jump-start economy: Give health care to all,False,fadi420,worldnews
4,1201274720,2008-01-25,4,0,Council of Europe bashes EU&UN terror blacklist,False,mhermans,worldnews


In [4]:
df.tail()

Unnamed: 0,time_created,date_created,up_votes,down_votes,title,over_18,author,category
509231,1479816764,2016-11-22,5,0,Heil Trump : Donald Trump s alt-right white...,False,nonamenoglory,worldnews
509232,1479816772,2016-11-22,1,0,There are people speculating that this could b...,False,SummerRay,worldnews
509233,1479817056,2016-11-22,1,0,Professor receives Arab Researchers Award,False,AUSharjah,worldnews
509234,1479817157,2016-11-22,1,0,Nigel Farage attacks response to Trump ambassa...,False,smilyflower,worldnews
509235,1479817346,2016-11-22,1,0,Palestinian wielding knife shot dead in West B...,False,superislam,worldnews


# Exploratory Data Analysis (EDA)

In [5]:
df.dtypes

time_created     int64
date_created    object
up_votes         int64
down_votes       int64
title           object
over_18           bool
author          object
category        object
dtype: object

converting date_created to datetime format

In [6]:
df['date_created'] = pd.to_datetime(df['date_created'])

In [7]:
df.shape

(509236, 8)

The dataset contains 509236 rows and 8 columns

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 509236 entries, 0 to 509235
Data columns (total 8 columns):
 #   Column        Non-Null Count   Dtype         
---  ------        --------------   -----         
 0   time_created  509236 non-null  int64         
 1   date_created  509236 non-null  datetime64[ns]
 2   up_votes      509236 non-null  int64         
 3   down_votes    509236 non-null  int64         
 4   title         509236 non-null  object        
 5   over_18       509236 non-null  bool          
 6   author        509236 non-null  object        
 7   category      509236 non-null  object        
dtypes: bool(1), datetime64[ns](1), int64(3), object(3)
memory usage: 27.7+ MB


In [11]:
df[['up_votes','down_votes']].describe()

Unnamed: 0,up_votes,down_votes
count,509236.0,509236.0
mean,112.236283,0.0
std,541.694675,0.0
min,0.0,0.0
25%,1.0,0.0
50%,5.0,0.0
75%,16.0,0.0
max,21253.0,0.0


There are absolutely no down votes

In [13]:
df.isnull().sum().sum()

0

There are no null values in our dataset.

In [14]:
df['category'].value_counts()

worldnews    509236
Name: category, dtype: int64

There is only one category in the dataset.

In [15]:
df['over_18'].value_counts()

False    508916
True        320
Name: over_18, dtype: int64

There are only 320 over_18 articles out of 509236

In [18]:
df['author'].value_counts().nlargest(10)

davidreiss666    8897
anutensil        5730
DoremusJessup    5037
maxwellhill      4023
igeldard         4013
readerseven      3170
twolf1           2923
madam1           2658
nimobo           2564
madazzahatter    2503
Name: author, dtype: int64

The authors with highest number of articles written from the dataset

In [19]:
df.groupby(['author'])['up_votes'].sum().nlargest(10)

author
maxwellhill        1985416
anutensil          1531544
Libertatea          832102
DoremusJessup       584380
Wagamaga            580121
NinjaDiscoJesus     492582
madazzahatter       428966
madam1              390541
davidreiss666       338306
kulkke              333311
Name: up_votes, dtype: int64

Though davidreiss666 has written highest number of artiles, maxwellhill is the person with highest number of upvotes.

Finding the number of characters in title:

In [23]:
df['len'] = df['title'].apply(len)
display(df['len'].describe())
display(df.head())
display(df[df['len'] == 320]['title'])

count    509236.000000
mean         89.446082
std          59.701217
min           1.000000
25%          52.000000
50%          66.000000
75%         100.000000
max         320.000000
Name: len, dtype: float64

Unnamed: 0,time_created,date_created,up_votes,down_votes,title,over_18,author,category,len
0,1201232046,2008-01-25,3,0,Scores killed in Pakistan clashes,False,polar,worldnews,33
1,1201232075,2008-01-25,2,0,Japan resumes refuelling mission,False,polar,worldnews,32
2,1201232523,2008-01-25,3,0,US presses Egypt on Gaza border,False,polar,worldnews,31
3,1201233290,2008-01-25,1,0,Jump-start economy: Give health care to all,False,fadi420,worldnews,44
4,1201274720,2008-01-25,4,0,Council of Europe bashes EU&UN terror blacklist,False,mhermans,worldnews,47


163718    If Mexico wants to launch anti-corruption crus...
Name: title, dtype: object

There are 320 charcters in the largest title. The biggest title has ID - 163718

In [24]:
df.groupby(['date_created'])['title'].count().nlargest(10)

date_created
2015-11-24    458
2015-11-18    418
2014-08-05    417
2015-11-16    417
2014-04-24    416
2014-03-05    415
2014-07-30    415
2014-09-02    415
2013-12-11    413
2015-12-01    409
Name: title, dtype: int64

Highest number of articles were published/created on 2015-11-24

In [25]:
df.groupby(df['date_created'].dt.to_period('Q'))['title'].count().nlargest(10)

date_created
2015Q4    26691
2014Q1    25293
2014Q3    25161
2015Q1    24922
2013Q2    23787
2013Q3    23708
2016Q2    23571
2013Q4    23322
2016Q1    22891
2016Q3    22181
Freq: Q-DEC, Name: title, dtype: int64

Highest number of articles were published/created on 2015 Quarted 4 followed by Quarter 1 of 2014.

In [27]:
df.groupby(df['date_created'].dt.to_period('Q'))['up_votes'].sum().nlargest(10)

date_created
2015Q4    4382531
2016Q3    4369915
2016Q2    4213514
2016Q1    3888922
2015Q3    3888102
2015Q1    3604233
2015Q2    3406622
2014Q3    3187639
2014Q4    3094400
2016Q4    2432456
Freq: Q-DEC, Name: up_votes, dtype: int64

Highest number of up votes were received for articles published on Quarted 4 of 2015 followed by Quarter 3 of 2016. Also, after further analysis, it was clear that all the articles posted on Quarter 4 of every year (2008 - 2016) have receieved highest number of votes as compared to other three quarters

Did not use any kind of visualisations as we are dealing with BIG data and a simple bar plot would take a very long time to get plotted. It is always a feasible solution to make use of BI tools such as Tableau or Power BI to visualise and perform visualisations on Big Data.

# Topic Modelling

## Objective

The goal is to perform topic modelling on title. This would help us categorize the articles based on the title.

## Process

The most popular Topic Modeling algorithm is LDA, known as Latent Dirichlet Allocation. Though LDA is widely used of topic modelling, it performs well only on medium or large texts (over 50 words).

The dataset we have contain very short or short texts. So, I am using Gibbs Sampling Dirichlet Mixture Model (GSDMM) model which is an  altered LDA algorithm which showed great results on STTM (Short Text Topic Modelling) tasks.

## Pre-processing Data

Converting text to lower case

In [29]:
df['tokenized_sents'] = df.apply(lambda row: row['title'].lower(), axis=1)

Removing Numerical values from the text

In [30]:

def no_num(row):
    input_str = row['tokenized_sents']
    result = re.sub(r'\d+', '', input_str)
    return result

df['tokenized_sents'] = df.apply(no_num, axis=1)

Tokenizing the data

In [31]:
def identify_tokens(row):
    tokenized_sents = row['tokenized_sents']
    tokens = nltk.word_tokenize(tokenized_sents)
    # taken only words (not punctuation)
    token_words = [w for w in tokens if w.isalpha()]
    return token_words

df['tokenized_sents'] = df.apply(identify_tokens, axis=1)

In [32]:
df

Unnamed: 0,time_created,date_created,up_votes,down_votes,title,over_18,author,category,len,tokenized_sents
0,1201232046,2008-01-25,3,0,Scores killed in Pakistan clashes,False,polar,worldnews,33,"[scores, killed, in, pakistan, clashes]"
1,1201232075,2008-01-25,2,0,Japan resumes refuelling mission,False,polar,worldnews,32,"[japan, resumes, refuelling, mission]"
2,1201232523,2008-01-25,3,0,US presses Egypt on Gaza border,False,polar,worldnews,31,"[us, presses, egypt, on, gaza, border]"
3,1201233290,2008-01-25,1,0,Jump-start economy: Give health care to all,False,fadi420,worldnews,44,"[economy, give, health, care, to, all]"
4,1201274720,2008-01-25,4,0,Council of Europe bashes EU&UN terror blacklist,False,mhermans,worldnews,47,"[council, of, europe, bashes, eu, un, terror, ..."
...,...,...,...,...,...,...,...,...,...,...
509231,1479816764,2016-11-22,5,0,Heil Trump : Donald Trump s alt-right white...,False,nonamenoglory,worldnews,88,"[heil, trump, donald, trump, s, white, nationa..."
509232,1479816772,2016-11-22,1,0,There are people speculating that this could b...,False,SummerRay,worldnews,67,"[there, are, people, speculating, that, this, ..."
509233,1479817056,2016-11-22,1,0,Professor receives Arab Researchers Award,False,AUSharjah,worldnews,41,"[professor, receives, arab, researchers, award]"
509234,1479817157,2016-11-22,1,0,Nigel Farage attacks response to Trump ambassa...,False,smilyflower,worldnews,55,"[nigel, farage, attacks, response, to, trump, ..."


Stemming

In [33]:
from nltk.stem import PorterStemmer
stemming = PorterStemmer()

def stem_list(row):
    my_list = row['tokenized_sents']
    stemmed_list = [stemming.stem(word) for word in my_list]
    return (stemmed_list)

df['tokenized_sents'] = df.apply(stem_list, axis=1)

Removing Stop Words

In [34]:
from nltk.corpus import stopwords
stops = set(stopwords.words("english"))                  

def remove_stops(row):
    my_list = row['tokenized_sents']
    meaningful_words = [w for w in my_list if not w in stops]
    return (meaningful_words)

df['tokenized_sents'] = df.apply(remove_stops, axis=1)

In [35]:
df.head()

Unnamed: 0,time_created,date_created,up_votes,down_votes,title,over_18,author,category,len,tokenized_sents
0,1201232046,2008-01-25,3,0,Scores killed in Pakistan clashes,False,polar,worldnews,33,"[score, kill, pakistan, clash]"
1,1201232075,2008-01-25,2,0,Japan resumes refuelling mission,False,polar,worldnews,32,"[japan, resum, refuel, mission]"
2,1201232523,2008-01-25,3,0,US presses Egypt on Gaza border,False,polar,worldnews,31,"[us, press, egypt, gaza, border]"
3,1201233290,2008-01-25,1,0,Jump-start economy: Give health care to all,False,fadi420,worldnews,44,"[economi, give, health, care]"
4,1201274720,2008-01-25,4,0,Council of Europe bashes EU&UN terror blacklist,False,mhermans,worldnews,47,"[council, europ, bash, eu, un, terror, blacklist]"


## Modelling

In [36]:
#print("Max number of token:", np.max(df['tokenized_sents']))
#print("Mean number of token:", round(np.mean(df.nb_token),2)

# Input format for the model : list of strings (list of tokens)
docs = df['tokenized_sents'].tolist()
vocab = set(x for doc in docs for x in doc)
n_terms = len(vocab)

print("Voc size:", n_terms)
print("Number of documents:", len(docs))

Voc size: 58328
Number of documents: 509236


In [24]:
mgp = MovieGroupProcess(K=30, alpha=0.1, beta=0.1, n_iters=10)

vocab = set(x for doc in docs for x in doc)
n_terms = len(vocab)
n_docs = len(docs)

# Fit the model 
y = mgp.fit(docs, n_terms)

# Save model

filename = 'f'
pickle.dump(mgp, open(filename, 'wb'))

In stage 0: transferred 481906 clusters with 30 clusters populated
In stage 1: transferred 410079 clusters with 30 clusters populated
In stage 2: transferred 250457 clusters with 30 clusters populated
In stage 3: transferred 167520 clusters with 30 clusters populated
In stage 4: transferred 137578 clusters with 30 clusters populated
In stage 5: transferred 124391 clusters with 30 clusters populated
In stage 6: transferred 118354 clusters with 30 clusters populated
In stage 7: transferred 114945 clusters with 30 clusters populated
In stage 8: transferred 112039 clusters with 30 clusters populated
In stage 9: transferred 109902 clusters with 30 clusters populated


Model took more than 1 hour to train. Saving the model to disk to save time in the future.

In [39]:
mgp = pickle.load(open('f', 'rb'))

In [40]:
def top_words(cluster_word_distribution, top_cluster, values):
    for cluster in top_cluster:
        sort_dicts =sorted(mgp.cluster_word_distribution[cluster].items(), key=lambda k: k[1], reverse=True)[:values]
        print('Cluster %s : %s'%(cluster,sort_dicts))
        print(" — — — — — — — — — ")

In [41]:
doc_count = np.array(mgp.cluster_doc_count)
print('Number of documents per topics :', doc_count)
print('*'*120)

# Topics sorted by document inside
top_index = doc_count.argsort()[-50:][::-1]
print('Most important clusters (by number of docs inside):', top_index)
print('*'*120)


# Show the top 5 words by cluster, it helps to make the topic_dict below
top_words(mgp.cluster_word_distribution, top_index, 5)

Number of documents per topics : [21823 11484 10224 28756 12238 23566 15894 13369 16797 30193  4795 22870
 12132 10762  9672 19142 14188 15229 20591  7747 19564 16259 20854 22331
 18744 10649 12574 25008 21301 20480]
************************************************************************************************************************
Most important clusters (by number of docs inside): [ 9  3 27  5 11 23  0 28 22 18 29 20 15 24  8 21  6 17 16  7 26  4 12  1
 13 25  2 14 19 10]
************************************************************************************************************************
Cluster 9 : [('syria', 10109), ('syrian', 4814), ('us', 3688), ('say', 3635), ('rebel', 3003)]
 — — — — — — — — — 
Cluster 3 : [('kill', 11730), ('attack', 5311), ('bomb', 3728), ('syria', 2600), ('milit', 2508)]
 — — — — — — — — — 
Cluster 27 : [('china', 4666), ('oil', 3443), ('world', 2639), ('global', 1813), ('energi', 1760)]
 — — — — — — — — — 
Cluster 5 : [('israel', 8705), ('iran', 5101

Creating a topic dictionary

In [42]:
topic_dict = {}
topic_names = ['Syria',
              'Syria',
              'Global Climate & Energy',
              'Middle East',
              'Law',
              'Crime',
              'Crime',
              'Law',
              'Terrorism',
              'Europe',
              'Russia',
              'Asia',
              'USA',
              'World News',
              'Health',
              'Middle East',
              'Syria',
              'World News',
              'Accident',
              'Protest',
              'Europe',
              'Crime',
              'Finance',
              'Science',
              'Natural Disaster',
              'Korea',
              'Asia',
              'Politics',
              'Crime',
              'Asia']

for i, topic_num in enumerate(top_index):
    topic_dict[topic_num]=topic_names[i]

In [43]:
def best_label(row):
    my_list = row['tokenized_sents']
    label, _ = mgp.choose_best_label(my_list)
    topic = topic_dict.get(label)
    return topic

df['topic'] = df.apply(best_label, axis=1)

In [44]:
df[['title', 'topic']].head(20)

Unnamed: 0,title,topic
0,Scores killed in Pakistan clashes,Syria
1,Japan resumes refuelling mission,Asia
2,US presses Egypt on Gaza border,Middle East
3,Jump-start economy: Give health care to all,World News
4,Council of Europe bashes EU&UN terror blacklist,Middle East
5,Hay presto! Farmer unveils the illegal mock-...,Finance
6,"Strikes, Protests and Gridlock at the Poland-U...",Protest
7,The U.N. Mismanagement Program,Middle East
8,Nicolas Sarkozy threatens to sue Ryanair,Europe
9,US plans for missile shields in Polish town me...,Syria


In [45]:
df[['title', 'topic']].tail(20)

Unnamed: 0,title,topic
509216,Turkish court issues arrest warrant for PYD he...,Middle East
509217,US issues Europe travel alert after France foi...,Terrorism
509218,"Over 15,500 public personnel dismissed with ne...",Middle East
509219,Four killed in New Caledonia landslides: Frenc...,Protest
509220,Philippine opposition senator accusing preside...,Crime
509221,"A soft Brexit is best for Norway, trade minist...",Europe
509222,Trump s pullout of TPP opens way for China,Global Climate & Energy
509223,Partnerships between US and Chinese scientists...,Global Climate & Energy
509224,"The Cooper Hewitt, Smithsonian Design Museum i...",World News
509225,"U.S. strike destroys bridge, restricts Islamic...",Syria


We do not have a test set or an already existing data with topic modelling to test our model's accuracy.

### --- END ---