# Introduction to Natural Language Processing

The data set consists of submissions users made to Hacker News from 2006 to 2015, with the following columns:
- `submission_time` - When the article was submitted
- `upvotes` - The number of upvotes the article received
- `url` - The base URL of the article
- `headline` - The article's headline

We'll be predicting the number of upvotes the articles received, based on their headlines.

In [31]:
import pandas as pd
import numpy as np

### Exploring the Data

In [3]:
# Load the Hacker News submissions; the following cell assigns column names.
# NOTE(review): read_csv treats the first line of the file as a header row by
# default. If sel_hn_stories.csv has no header line, its first record is being
# consumed as column labels and lost -- confirm, and pass header=None if so.
submissions = pd.read_csv("sel_hn_stories.csv")

In [4]:
# Replace whatever labels read_csv produced with descriptive column names.
# The list order must match the column order in the CSV file.
submissions.columns = ["submission_time", "upvotes", "url", "headline"]

In [5]:
# Drop every row with a missing value in any column, then renumber the
# surviving rows 0..n-1. Without the reset the rows keep their original,
# now gappy, labels, while the token-count matrix built later is indexed
# 0..n-1 -- any label-aligned operation between the two would silently
# pair up the wrong rows.
submissions = submissions.dropna().reset_index(drop=True)

In [6]:
submissions.shape

(2800, 4)

In [7]:
submissions.head()

Unnamed: 0,submission_time,upvotes,url,headline
0,2010-02-17T16:57:59Z,1,blog.jonasbandi.net,Software: Sadly we did adopt from the construc...
1,2014-02-04T02:36:30Z,1,blogs.wsj.com,Google’s Stock Split Means More Control for L...
2,2011-10-26T07:11:29Z,1,threatpost.com,SSL DOS attack tool released exploiting negoti...
3,2011-04-03T15:43:44Z,67,algorithm.com.au,Immutability and Blocks Lambdas and Closures
4,2013-01-13T16:49:20Z,1,winmacsofts.com,Comment optimiser la vitesse de Wordpress?


### Tokenizing the headlines

In [11]:
# Break each headline into whitespace-separated tokens, producing one token
# list per headline in the same order as the rows of `submissions`.
tokenized_headlines = [headline.split() for headline in submissions["headline"]]

In [12]:
tokenized_headlines[:5]

[['Software:',
  'Sadly',
  'we',
  'did',
  'adopt',
  'from',
  'the',
  'construction',
  'analogy'],
 ['Google’s',
  'Stock',
  'Split',
  'Means',
  'More',
  'Control',
  'for',
  'Larry',
  'and',
  'Sergey'],
 ['SSL',
  'DOS',
  'attack',
  'tool',
  'released',
  'exploiting',
  'negotiation',
  'overhead'],
 ['Immutability', 'and', 'Blocks', 'Lambdas', 'and', 'Closures'],
 ['Comment', 'optimiser', 'la', 'vitesse', 'de', 'Wordpress?']]

### Preprocessing tokens to increase accuracy

In [13]:
# Single characters that are stripped out of every token during cleaning.
# Includes the curly apostrophe (’) as well as the ASCII one.
punctuation = list(",:;.'\"’?/-+&()")

In [14]:
def _normalize(word):
    """Lowercase ``word`` and delete every string listed in ``punctuation``."""
    word = word.lower()
    for mark in punctuation:
        word = word.replace(mark, "")
    return word


# One cleaned token list per headline. A token made up only of punctuation
# becomes an empty string and is kept, so list lengths are unchanged.
clean_tokenized = [
    [_normalize(word) for word in headline]
    for headline in tokenized_headlines
]

clean_tokenized[:5]

[['software',
  'sadly',
  'we',
  'did',
  'adopt',
  'from',
  'the',
  'construction',
  'analogy'],
 ['googles',
  'stock',
  'split',
  'means',
  'more',
  'control',
  'for',
  'larry',
  'and',
  'sergey'],
 ['ssl',
  'dos',
  'attack',
  'tool',
  'released',
  'exploiting',
  'negotiation',
  'overhead'],
 ['immutability', 'and', 'blocks', 'lambdas', 'and', 'closures'],
 ['comment', 'optimiser', 'la', 'vitesse', 'de', 'wordpress']]

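### Creating a matrix of unique words

Despite its name, `unique_tokens` holds only the tokens that occur at least twice across all headlines; tokens seen a single time are left out. The matrix has one row per headline and one column per such token.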

In [16]:
# single_tokens: every distinct token, in order of first appearance.
# unique_tokens: tokens that occur at least twice, in order of their second
#                appearance (these become the columns of the count matrix).
unique_tokens = []
single_tokens = []

# Companion sets give constant-time membership checks; testing membership on
# the growing lists themselves made this loop quadratic in vocabulary size.
_seen_once = set()
_seen_again = set()

for tokens in clean_tokenized:
    for token in tokens:
        if token not in _seen_once:
            _seen_once.add(token)
            single_tokens.append(token)
        elif token not in _seen_again:
            # Reaching this branch already means the token was seen before.
            _seen_again.add(token)
            unique_tokens.append(token)

In [20]:
unique_tokens[:20]

['and',
 'for',
 'as',
 'you',
 'is',
 'the',
 'split',
 'good',
 'how',
 'what',
 '',
 'of',
 'de',
 'in',
 'a',
 'with',
 'amazon',
 'cloud',
 'at',
 'google']

In [21]:
single_tokens[:10]

['software',
 'sadly',
 'we',
 'did',
 'adopt',
 'from',
 'the',
 'construction',
 'analogy',
 'googles']

In [18]:
# Zero-filled count matrix: one row per headline (labelled 0..n-1) and one
# column per entry of `unique_tokens`.
n_headlines = len(clean_tokenized)
row_labels = np.arange(n_headlines)
counts = pd.DataFrame(0, index=row_labels, columns=unique_tokens)
counts.head()

Unnamed: 0,and,for,as,you,is,the,split,good,how,what,...,frameworks,animated,walks,auctions,clouds,hammer,autonomous,vehicle,crowdsourcing,disaster
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Counting tokens

In [23]:
# Fill the matrix: row i is headline i, each column is a vocabulary token.
# NOTE: this increments in place, so re-running the cell without rebuilding
# `counts` first would double every count.

# Set lookup instead of scanning the `unique_tokens` list for every token.
vocabulary = set(unique_tokens)

for i, item in enumerate(clean_tokenized):
    for token in item:
        if token in vocabulary:
            # Single-step label access. The previous `counts.iloc[i][token] += 1`
            # was chained assignment: it only worked when `iloc[i]` happened to
            # return a view, and under pandas Copy-on-Write it updates a
            # temporary row and leaves `counts` all zeros.
            counts.at[i, token] += 1

In [25]:
counts.head(10)

Unnamed: 0,and,for,as,you,is,the,split,good,how,what,...,frameworks,animated,walks,auctions,clouds,hammer,autonomous,vehicle,crowdsourcing,disaster
0,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,1,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,1,2,2,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


### Removing extra columns to increase accuracy

In [27]:
# Column sums of the count matrix: total occurrences of each token across
# all headlines. The bare name on the last line displays the Series.
word_counts = counts.sum(axis=0)
word_counts

and              289
for              298
as                47
you              100
is               158
the              604
split              2
good              13
how              140
what              62
                 192
of               342
de                 9
in               276
a                336
with             158
amazon            19
cloud             18
at                59
google            83
to               477
status             2
back              14
raises            12
faster             4
an                73
on               167
2014               8
out               39
show              64
                ... 
adapter            2
diversity          2
asking             3
link               2
deploying          2
plate              2
healthcare         2
term               2
gist               2
saving             2
devops             2
improved           2
practical          2
celebrate          2
thomas             2
sabo               2
club         

In [29]:
# Keep only tokens whose total count lies in [5, 100] (both ends inclusive):
# tokens below 5 occurrences and tokens above 100 are both dropped.
frequent_enough = word_counts >= 5
not_too_common = word_counts <= 100
counts = counts.loc[:, frequent_enough & not_too_common]
counts.head()

Unnamed: 0,as,you,good,what,de,amazon,cloud,at,google,back,...,uk,preview,compiler,manager,sharing,sale,competition,diet,reasons,nike
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Splitting the Data into train and test sets

In [None]:
from sklearn.model_selection import train_test_split

In [32]:
# Hold out 20% of the headlines for evaluation; random_state=1 pins the
# shuffle so the split can be reproduced. Features are the filtered token
# counts, the target is the upvote count.
# NOTE(review): assumes `counts` and `submissions` have the same number of
# rows in the same order -- confirm nothing re-sorts either one upstream.
X_train, X_test, y_train, y_test = train_test_split(counts, submissions["upvotes"], test_size=0.2, random_state=1)

In [33]:
X_train.shape

(2240, 661)

In [35]:
y_train.shape

(2240,)

In [34]:
X_test.shape

(560, 661)

In [36]:
y_test.shape

(560,)

### Making Predictions

In [37]:
from sklearn.linear_model import LinearRegression

In [38]:
# Fit a linear regression on the training token counts, then predict upvotes
# for the held-out headlines.
lr = LinearRegression().fit(X_train, y_train)
predictions = lr.predict(X_test)

In [39]:
# Mean squared error on the test set: the sum of squared differences between
# predicted and actual upvotes, divided by the number of predictions.
squared_errors = (predictions - y_test) ** 2
mse = squared_errors.sum() / len(predictions)
mse

2651.1457056689651