# Preparing the Data

In [67]:
import pandas as pd
import numpy as np

data = pd.DataFrame.from_csv('../data/training_data.csv')

replacements = {
   'title_neutrality': {
      None: np.mean(data.title_neutrality)
}}

data=data.replace(replacements)

In [68]:
data.head()

Unnamed: 0,text,title,website,authors,image_url,published_date,typo_counts,text_subjectivity,text_positivity,text_negativity,title_neutrality,fake_or_real
0,Challenges for a nut-free diet\n\nIf you’re co...,Nut-free recipes and information,http://www.bbc.co.uk,,http://static.bbci.co.uk/food/1.37.152/assets/...,,0,0.059809,0.038278,0.043062,1.0,real
1,Lamb in Britain is called lamb if it’s markete...,Lamb recipes,http://www.bbc.co.uk,,http://static.bbci.co.uk/food/1.37.152/assets/...,,0,0.050186,0.05948,0.02974,1.0,real
2,This site is optimised for modern web browsers...,Irish cream and chocolate cheesecake,http://www.bbc.co.uk,"Simon Rimmer, Mary Berry, Rob Burns",http://ichef.bbci.co.uk/food/ic/food_16x9_448/...,,1,0.115385,0.153846,0.0,1.0,real
3,Preheat the oven to 180C/350F/Gas 4. Grease an...,Easy chocolate cake,http://www.bbc.co.uk,"Rachel Manley, James Martin, The Hairy Bikers,...",http://ichef.bbci.co.uk/food/ic/food_16x9_448/...,,0,0.075949,0.075949,0.044304,0.408,real
4,Preheat the oven to 180C/350F/Gas 4 and lin 2 ...,Chocolate fairy cakes,http://www.bbc.co.uk,"Mary Berry, Sarah Brown, Harvey Bertram-brown",http://ichef.bbci.co.uk/food/ic/food_16x9_448/...,,0,0.038462,0.067308,0.009615,1.0,real


In [69]:
import numpy as np

np.random.seed(10101010)

In [75]:
labels_to_vec = np.where(data.fake_or_real == 'fake', 1, 0)
data['label'] = labels_to_vec

# Logistic Regression

In [91]:
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn.cross_validation import cross_val_score
from patsy import dmatrices

In [92]:
model = LogisticRegression()

In [93]:
training_rows = np.random.randint(0,high=data.shape[0],size=1500)
testing_rows = [row for row in range(data.shape[0]) if row not in training_rows]

y, X = dmatrices('label ~ typo_counts + text_subjectivity + text_positivity + text_negativity + title_neutrality',
                  data.iloc[list(training_rows)], return_type="dataframe")
print(X.columns)
y = np.ravel(y)

model.fit(X,y)

Index(['Intercept', 'typo_counts', 'text_subjectivity', 'text_positivity',
       'text_negativity', 'title_neutrality'],
      dtype='object')


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [94]:
model.score(X,y)

0.57999999999999996

In [95]:
y.mean()

0.5093333333333333

In [96]:
# examine the coefficients
pd.DataFrame(list(zip(X.columns, np.transpose(model.coef_))))

Unnamed: 0,0,1
0,Intercept,[-0.119705960656]
1,typo_counts,[1.07791605659]
2,text_subjectivity,[1.02779136752]
3,text_positivity,[-0.78790409251]
4,text_negativity,[2.53935725209]
5,title_neutrality,[0.0120311224572]


In [97]:
y_test, X_test = dmatrices('label ~ typo_counts + text_subjectivity + text_positivity + text_negativity + title_neutrality',
                  data.iloc[testing_rows], return_type="dataframe")
print(X_test.columns)
y_test = np.ravel(y_test)

Index(['Intercept', 'typo_counts', 'text_subjectivity', 'text_positivity',
       'text_negativity', 'title_neutrality'],
      dtype='object')


In [103]:
predicted = model.predict(X_test)
probs = model.predict_proba(X_test)
print(probs[:10])

[[ 0.5216661   0.4783339 ]
 [ 0.53671988  0.46328012]
 [ 0.29987135  0.70012865]
 [ 0.55390604  0.44609396]
 [ 0.52900241  0.47099759]
 [ 0.55924301  0.44075699]
 [ 0.29987135  0.70012865]
 [ 0.52518225  0.47481775]
 [ 0.29797374  0.70202626]
 [ 0.53639303  0.46360697]]


In [100]:
print(metrics.accuracy_score(y_test, predicted))
print(metrics.roc_auc_score(y_test, predicted))

0.588178294574
0.584655806424


In [101]:
# evaluate the model using 10-fold cross-validation
scores = cross_val_score(LogisticRegression(), X, y, scoring='accuracy', cv=10)
print( scores)
print( scores.mean())

[ 0.59602649  0.54966887  0.61589404  0.52980132  0.61333333  0.59333333
  0.58389262  0.59731544  0.55033557  0.55704698]
0.578664799917
