In [151]:
import numpy as np
import pandas as pd
import nltk
from nltk import word_tokenize
import re
import matplotlib.pyplot as plt
import string
from nltk.corpus import stopwords
from sklearn.cross_validation import train_test_split
from sklearn.linear_model import LinearRegression
print('imported necessary libraries')
print('')


# Load the first `row_subset` rows of the stories file. Column names are
# supplied explicitly through `names`.
# `nrows` stops the parser after `row_subset` rows. The previous label-based
# slice `.loc[0:row_subset, :]` is end-inclusive, so it kept row_subset + 1
# rows while the message below claimed row_subset.
row_subset=20000
stories=pd.read_csv('G:\\stories.csv',names=['id', 'created_at', 'created_at_i', 'author', 'points', 'url_hostname', 'num_comments', 'title'],nrows=row_subset)
# Report the number of rows actually loaded (the file may hold fewer than
# row_subset rows).
print('\'stories\' dataframe has {} rows from the stories.csv file'.format(len(stories)))
print('')
print('first 5 rows for inspection')
print('')
print(stories.head())
print('')


# Sanity checks on the loaded frame: is `id` unique, and are there missing
# values? Rows without a title are dropped; other missing values are kept.
print('data sense checks...')
total_rows = stories.shape[0]
if stories['id'].nunique() == total_rows:
    print('story id column unique')
else:
    print('story id column not unique')
print('')

# DataFrame.count() returns the number of non-null cells per column, so a
# column whose count is below the row total contains missing values.
non_null_per_column = stories.count()
if non_null_per_column.min() == total_rows:
    print('no missing values present')
else:
    print('missing values present')
    print('columns and number of missing values:')
    print(total_rows - non_null_per_column)
    print('')
    stories = stories.dropna(subset=['title'])
    print('dropped all rows where title was NULL')
    print('ignore rows where url_hostname is NULL')
print('')


# Normalise the title text: lower-case it, then delete every character listed
# in string.punctuation.
regex_comp = re.compile('[%s]' % re.escape(string.punctuation))


def _strip_punctuation(text):
    """Return *text* with all characters matched by regex_comp removed."""
    return regex_comp.sub('', text)


stories.loc[:, 'title'] = stories['title'].str.lower()
stories.loc[:, 'title'] = stories['title'].apply(_strip_punctuation)
for message in (
    'converted all title text to lower case and removed all punctuations',
    '',
    '5 rows for inspection',
    '',
):
    print(message)
print(stories.head())
print('')

# Split each cleaned title into a list of tokens, stored in a new column.
stories['tokenized_title'] = stories['title'].apply(word_tokenize)
print('tokenized title into individual words in a new column - tokenized title')
print('')
# Count how often each token occurs across all tokenized titles.
#
# The tally is built with collections.Counter and converted to a Series once
# at the end. Growing a pandas Series one label at a time, as before, makes
# every new word an enlargement of the Series and is needlessly slow.
from collections import Counter

# Raw string: '\w' inside a plain string literal is an invalid escape sequence.
word_pattern = re.compile(r'\w+')
word_tally = Counter(
    word
    for tokenized_title in stories['tokenized_title']
    for word in tokenized_title
    # re.match anchors at the start, so this keeps tokens that begin with a
    # word character.
    if word_pattern.match(word)
)
# Passing index and values explicitly keeps the Counter's iteration order
# (first-seen order on Python 3.7+). The explicit dtype keeps the counts
# integral even when no token matched.
word_counts = pd.Series(
    list(word_tally.values()),
    index=list(word_tally.keys()),
    name='word_counts',
    dtype='int64',
)
print('compiled series of distinct words and their counts in series word_counts')
print('')
print('first 5 rows of word_counts ')
print('')
print(word_counts.head(5))
word_counts = word_counts[word_counts > 1]
print('removed words that occur only once')
print('')
# Remove NLTK's English stop words from the vocabulary, then keep only words
# that occur at least 20 times; the survivors become the model's features.
df_stopwords = pd.DataFrame(stopwords.words('english'), columns=['word'])
df_word_counts = pd.DataFrame({'word': word_counts.index, 'counts': word_counts.values})
# Left join with indicator=True adds a '_merge' column. Rows marked
# 'left_only' are words with no match in the stop-word list, so keeping only
# those rows removes the stop words.
joined_df = pd.merge(
    df_word_counts,
    df_stopwords,
    how='left',
    on='word',
    suffixes=('_wc', '_sw'),
    indicator=True,
)
joined_df = joined_df.loc[joined_df['_merge'] == 'left_only', :]
# The previous literal used '\(' and '\)', which are invalid escape sequences
# and printed the backslashes verbatim.
print('removed stop words (default stopwords from nltk) from word_counts')
print('')
joined_df = joined_df.loc[joined_df['counts'] > 19, :]
print('removed words that repeat less than 20 times')
print('')
no_of_words = len(joined_df)
print('we now have {} words to use as features'.format(no_of_words))

# Build the bag-of-words matrix `counts_df`: one row per story (in the order
# of `stories`), one column per feature word, each cell holding how many times
# that word occurs in the story's tokenized title.
column_names = list(joined_df['word'].unique())
# Map each word to its column position once, so the inner loop does a dict
# lookup instead of scanning the `column_names` list for every token.
column_position = {word: position for position, word in enumerate(column_names)}

tokenized_titles = list(stories['tokenized_title'])
# Accumulate into a NumPy array and wrap it in a DataFrame afterwards.
# The previous `counts_df[word].iloc[index] += 1` was chained assignment,
# which pandas does not guarantee to write back to the frame and which is
# very slow when executed once per token.
count_matrix = np.zeros((len(tokenized_titles), len(column_names)), dtype=np.int64)
for index, title_list in enumerate(tokenized_titles):
    for word in title_list:
        position = column_position.get(word)
        if position is not None:
            count_matrix[index, position] += 1

counts_df = pd.DataFrame(count_matrix, index=np.arange(len(stories)), columns=column_names)
# The '{}' placeholder was previously printed literally because .format()
# was never called.
print('created dataframe \'counts_df\' with the {} words/features as columns to implement bag of words model'.format(len(column_names)))
print('')
print('first 5 rows of the \'counts_df\' dataframe for inspection')
# The rows announced above were never actually printed before.
print(counts_df.head())

# Fit a linear regression that predicts a story's points from its
# bag-of-words counts, then report the root mean squared error on the
# held-out 20% of the rows.
X_train, X_test, y_train, y_test = train_test_split(counts_df, stories["points"], test_size=0.2, random_state=1)
print('split \'counts_df\' into 80% train and 20% test sets')
print('')
clf = LinearRegression()
clf.fit(X_train, y_train)
print('trained linear regression model')
print('')
predictions = clf.predict(X_test)
print('predicted upvotes for the test set')
print('')
# Squared residuals, their mean (MSE), and the square root of that (RMSE).
diff = (y_test - predictions) ** 2
mse = diff.mean()
# The value printed is the root mean *squared* error; the old label omitted
# "squared".
print('Root mean squared error is:')
# np.sqrt avoids `** (1/2)`, whose exponent is 0 under integer division
# (Python 2), which would always yield 1.
print(np.sqrt(mse))
print('')



imported necessary libraries

'stories' dataframe has 20000 rows from the stories.csv file

first 5 rows for inspection

        id                created_at  created_at_i        author  points  \
0  9079978  2015-02-20T11:29:58.000Z    1424431798    Immortalin       2   
1  9079983  2015-02-20T11:34:22.000Z    1424432062     Rutger24s       1   
2  9079986  2015-02-20T11:35:32.000Z    1424432132  AndrewDucker       3   
3  9079988  2015-02-20T11:36:18.000Z    1424432178     davidiach       1   
4  9080000  2015-02-20T11:41:06.000Z    1424432466       CiaranR       1   

          url_hostname  num_comments  \
0                  NaN             0   
1  startupjuncture.com             0   
2   blog.erratasec.com             0   
3          twitter.com             0   
4  phpconference.co.uk             0   

                                               title  
0       Ask HN: Simple SaaS as first Golang web app?  
1   24sessions: live business advice over video-chat  
2               