In [2]:
# importing the file

import pandas as pd
import numpy as np
df=pd.read_csv("Eluvio_DS_Challenge.csv")

In [3]:
# all the basic imports

import matplotlib.pyplot as plt
import nltk
import re
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, accuracy_score

In [4]:
# removing irrelevant attributes

df = df.drop("category", axis = 1)
df = df.drop("down_votes", axis = 1)
df = df.drop("time_created", axis = 1)
df = df.drop("date_created", axis = 1)
df = df.drop("over_18", axis = 1)
df = df.drop("author", axis = 1)


df.head()


Unnamed: 0,up_votes,title
0,3,Scores killed in Pakistan clashes
1,2,Japan resumes refuelling mission
2,3,US presses Egypt on Gaza border
3,1,Jump-start economy: Give health care to all
4,4,Council of Europe bashes EU&UN terror blacklist


# Text preprocessing

In [5]:
# code for defining the stemmer

import nltk
nltk.download('punkt')
nltk.download('stopwords')

from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sahith\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sahith\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:

# To get the stems of words in a sentence.
def tokenize_and_stem(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems

# To get the words themself in a sentence.
def tokenize_only(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    return filtered_tokens

In [7]:
title = df.title.str.lower()

In [8]:
# Get full stems and tokens to build vocabulary
def tokenized_stemmed(title):
    totalvocab_stemmed = []
    totalvocab_tokenized = []
    for i in title:
        allwords_stemmed = tokenize_and_stem(i) 
        totalvocab_stemmed.extend(allwords_stemmed) 

        allwords_tokenized = tokenize_only(i)
        totalvocab_tokenized.extend(allwords_tokenized)
    return totalvocab_stemmed, totalvocab_tokenized

In [9]:

totalvocab_stemmed_, totalvocab_tokenized_ = tokenized_stemmed(title)

In [10]:
print(len(totalvocab_stemmed_))

7194609


In [11]:
# Rule out repetitions of stem-token pairs
totalvocab = zip(totalvocab_stemmed_, totalvocab_tokenized_)
totalvocab = list(set(totalvocab))
totalvocab_stemmed, totalvocab_tokenized = zip(*totalvocab)

In [12]:
print(len(totalvocab_stemmed))


114842


In [13]:
vocab_frame = pd.DataFrame({'words': totalvocab_tokenized}, index = totalvocab_stemmed)

In [14]:
# Building the stop words set
import sklearn.feature_extraction.text as text
stopwords = nltk.corpus.stopwords.words('english')
my_stop_words = text.ENGLISH_STOP_WORDS.union(stopwords)

# Vectorizing text using Tf-idf 

In [15]:

# tf-idf vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(min_df =10**-3 ,analyzer = 'word', max_features=len(set(totalvocab_stemmed)), stop_words=my_stop_words, tokenizer=tokenize_and_stem, ngram_range=(1,3))

tfidf_matrix = tfidf_vectorizer.fit_transform(title)

print(tfidf_matrix.shape)

  'stop_words.' % sorted(inconsistent))


(509236, 1814)


In [16]:
tfidf_matrix

<509236x1814 sparse matrix of type '<class 'numpy.float64'>'
	with 3565430 stored elements in Compressed Sparse Row format>

# Preparing data for applying ML Classifiers

In [17]:
thre = np.quantile(df['up_votes'], 0.8)
y = [1 if i > thre else 0 for i in df['up_votes']]
y = np.array(y)
X_train, X_test, y_train, y_test = train_test_split(tfidf_matrix, y, test_size = 0.2, shuffle = True, random_state = 42)

In [18]:
# using Multinomial Naive Bayes
clf = MultinomialNB()
clf.fit(X_train, y_train)
y_predict = clf.predict(X_test)
clf.score(X_test, y_test)
print(classification_report(y_test, y_predict))
print(accuracy_score(y_test, y_predict))

              precision    recall  f1-score   support

           0       0.81      1.00      0.89     81988
           1       0.56      0.00      0.00     19860

    accuracy                           0.81    101848
   macro avg       0.68      0.50      0.45    101848
weighted avg       0.76      0.81      0.72    101848

0.8050624459979577


In [19]:
# using LogisticRegression
LR = LogisticRegression(C=1.0, penalty='l2', tol=0.01)
LR.fit(X_train, y_train)
y_predict = LR.predict(X_test)
LR.score(X_test, y_test)
print(classification_report(y_test, y_predict))
print(accuracy_score(y_test, y_predict))

              precision    recall  f1-score   support

           0       0.81      0.99      0.89     81988
           1       0.54      0.04      0.07     19860

    accuracy                           0.81    101848
   macro avg       0.68      0.52      0.48    101848
weighted avg       0.76      0.81      0.73    101848

0.8061817610556908


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [20]:
# By using Gradient Boosting Algorithm
gbdt = GradientBoostingClassifier()
gbdt.fit(X_train, y_train)
y_predict = gbdt.predict(X_test)
gbdt.score(X_test, y_test)
print(classification_report(y_test, y_predict))
print(accuracy_score(y_test, y_predict))

              precision    recall  f1-score   support

           0       0.81      1.00      0.89     81988
           1       0.69      0.00      0.01     19860

    accuracy                           0.81    101848
   macro avg       0.75      0.50      0.45    101848
weighted avg       0.78      0.81      0.72    101848

0.805317728379546


# Conclusion 

The title was preprocessed using NLTK built in packages and used Tf-idf vectorizer to vectorize the text. Tf-idf was used because Tf-idf calculates the weight of a particular word rather than relying on number of occurances which other approaches do. Also, Tf-idf comes in handy when dealing with large amount of data as the repetition of words is seen often and relying on just frequency of words doesn't work. 

Data is split in 80% for training and 20% for testing using sklearn

Multinomial Naive Bayes, Logistic Regression and Gradient Boosting algorithms are working pretty well with the data provided. The accuracy of around 8.05 strenghtens our argument that there is a strong relation between the title and the up votes. The up votes are mostly related to the topics that audience are interested in. Also, to get a deeper understanding of the model, I've further trained an Recurrent Neural Network architecture on Google TPU (as I was asked to treat the problem as large-scale and will not fit into my RAM) to exploit the relationship between the titles and up votes. Please review the results and my approach of "Deep_Learning.ipynb"