# PROBLEM STATEMENT: What Year is it?

## Using NLP techniques on date_created and title, predict the probability of when an event occurred

In [1]:
import pandas as pd
import re
import nltk
nltk.download('wordnet')
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
df = pd.read_csv('Eluvio_DS_Challenge.csv')
df

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/quentinpompliano/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/quentinpompliano/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,time_created,date_created,up_votes,down_votes,title,over_18,author,category
0,1201232046,2008-01-25,3,0,Scores killed in Pakistan clashes,False,polar,worldnews
1,1201232075,2008-01-25,2,0,Japan resumes refuelling mission,False,polar,worldnews
2,1201232523,2008-01-25,3,0,US presses Egypt on Gaza border,False,polar,worldnews
3,1201233290,2008-01-25,1,0,Jump-start economy: Give health care to all,False,fadi420,worldnews
4,1201274720,2008-01-25,4,0,Council of Europe bashes EU&UN terror blacklist,False,mhermans,worldnews
...,...,...,...,...,...,...,...,...
509231,1479816764,2016-11-22,5,0,Heil Trump : Donald Trump s alt-right white...,False,nonamenoglory,worldnews
509232,1479816772,2016-11-22,1,0,There are people speculating that this could b...,False,SummerRay,worldnews
509233,1479817056,2016-11-22,1,0,Professor receives Arab Researchers Award,False,AUSharjah,worldnews
509234,1479817157,2016-11-22,1,0,Nigel Farage attacks response to Trump ambassa...,False,smilyflower,worldnews


## Data preprocessing

In [2]:
lemmatizer = WordNetLemmatizer()

def data_preprocessing(title): 
    title = title.lower()
    title = [lemmatizer.lemmatize(word) for word in title]
    title = ''.join(title)
    
    return title

def get_year_preprocessing(date):
    date = date[0:4]
    date = int(date)
    return date

df['preprocessed_title'] = df['title'].apply(lambda title: data_preprocessing(title))
df['year'] = df['date_created'].apply(lambda date: get_year_preprocessing(date))

new_df = df[['preprocessed_title', 'year']].copy()
new_df

Unnamed: 0,preprocessed_title,year
0,scores killed in pakistan clashes,2008
1,japan resumes refuelling mission,2008
2,us presses egypt on gaza border,2008
3,jump-start economy: give health care to all,2008
4,council of europe bashes eu&un terror blacklist,2008
...,...,...
509231,heil trump : donald trump s alt-right white...,2016
509232,there are people speculating that this could b...,2016
509233,professor receives arab researchers award,2016
509234,nigel farage attacks response to trump ambassa...,2016


## Training and test groups

In [28]:
from sklearn.model_selection import train_test_split

data = new_df.copy()
y = data['year'].values
data.drop(['year'], axis=1, inplace=True) # data now only contains preprocessed_title

X_train, X_test, y_train, y_test = train_test_split(data, y, test_size = 0.3, stratify=y)
                                                   #(preprocessed_title, year, %30 test size)
print("Train data:", X_train.shape, y_train.shape)
print("Test data:", X_test.shape, y_test.shape)

Train data: (356465, 1) (356465,)
Test data: (152771, 1) (152771,)


## Most frequent occuring n-grams in data

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter

vect = TfidfVectorizer(ngram_range=(2,5), stop_words='english')
#summaries = ''.join(new_df['preprocessed_title'])
summaries = ''.join(X_train['preprocessed_title'])
ngram_summaries = vect.build_analyzer()(summaries)

Counter(ngram_summaries).most_common(20)

[('prime minister', 3050),
 ('north korea', 2826),
 ('year old', 2635),
 ('islamic state', 2592),
 ('human rights', 2211),
 ('saudi arabia', 1396),
 ('united states', 1388),
 ('al qaeda', 1364),
 ('climate change', 1226),
 ('south korea', 1122),
 ('south china', 1084),
 ('middle east', 971),
 ('west bank', 947),
 ('hong kong', 912),
 ('china sea', 887),
 ('foreign minister', 852),
 ('boko haram', 835),
 ('death toll', 826),
 ('north korean', 807),
 ('chemical weapons', 795)]

## Vectorize using TF-IDF Vectorizer and create X matrix

In [22]:
from sklearn.linear_model import LogisticRegression

title_text = X_train['preprocessed_title'].values

tfv = TfidfVectorizer(ngram_range=(2,5), max_features=2000)
X = tfv.fit_transform(title_text).todense()
print(X.shape)

(356465, 2000)


In [24]:
y = pd.Series(y_train).values
print(y)

[2016 2015 2014 ... 2014 2016 2015]


## Logistic Regression Testing (using newton-cg optimizer)


In [27]:
estimator = LogisticRegression(solver='newton-cg')
estimator.fit(X,y)

source_test = ["barack obama elected", "paris attacks", "osama bin laden", 
               "world cup", "lybian civil war", "edward snowden", "bitcoin surge", "ebola outbreak"]

Xtest = tfv.transform(source_test)
pd.DataFrame(estimator.predict_proba(Xtest), columns=["Prob_2008", "Prob_2009", "Prob_2010", "Prob_2011", "Prob_2012",
                                                     "Prob_2013", "Prob_2014", "Prob_2015", "Prob_2016"])

Unnamed: 0,Prob_2008,Prob_2009,Prob_2010,Prob_2011,Prob_2012,Prob_2013,Prob_2014,Prob_2015,Prob_2016
0,0.231237,0.209235,0.030093,0.006137,0.046844,0.073055,0.126996,0.1054,0.171003
1,0.000651,0.00057,0.000406,0.000527,0.000508,0.000577,0.000551,0.84094,0.155271
2,0.055942,0.086314,0.07752,0.391085,0.100468,0.067164,0.061979,0.080097,0.079432
3,0.002493,0.006383,0.065919,0.010625,0.006013,0.106739,0.547792,0.202974,0.051062
4,0.042087,0.022721,0.030087,0.097101,0.128286,0.279229,0.154225,0.128696,0.117568
5,0.001034,0.000784,0.000533,0.000738,0.000674,0.769732,0.160108,0.048029,0.018368
6,0.062876,0.057577,0.041263,0.072942,0.076312,0.168715,0.176515,0.17863,0.165169
7,0.003382,0.002936,0.002104,0.002808,0.053552,0.003724,0.841663,0.071425,0.018406


## Conclusions:
>From the analysis above, we can make the following insights:
>> 1. Check the probability trends to see when news goes "out of date"
>> 2. Predict societal uptrends and downtrends based on news headlines (market, fads, etc.)
>> 3. Predict the likelihood of an event occuring in a certain year

>Improvements to make:
>> 1. If LR model cannot make a prediction, it seems to assign generic probabilities (6, 7, 8, 9)
>>> Possible solutions: 
>>>> Tune hyperparameters of model,
>>>> take a larger subset of the data
>>>> change ML model (SVM, KNN, or LVQ may work better)
>> 2. Runtime for LR model takes about 10 minutes
>>> Possible solutions:
>>>> Tune hyperparameters,
>>>> find a faster optimizer,
>>>> take a smaller subset of the data