# Preparing the Data

In [1]:
import pandas as pd
import numpy as np

data = pd.DataFrame.from_csv('../data/training_data.csv')

replacements = {
   'title_neutrality': {
      None: np.mean(data.title_neutrality)
}}

data=data.replace(replacements)

In [68]:
data.head()

Unnamed: 0,text,title,website,authors,image_url,published_date,typo_counts,text_subjectivity,text_positivity,text_negativity,title_neutrality,fake_or_real
0,Challenges for a nut-free diet\n\nIf you’re co...,Nut-free recipes and information,http://www.bbc.co.uk,,http://static.bbci.co.uk/food/1.37.152/assets/...,,0,0.059809,0.038278,0.043062,1.0,real
1,Lamb in Britain is called lamb if it’s markete...,Lamb recipes,http://www.bbc.co.uk,,http://static.bbci.co.uk/food/1.37.152/assets/...,,0,0.050186,0.05948,0.02974,1.0,real
2,This site is optimised for modern web browsers...,Irish cream and chocolate cheesecake,http://www.bbc.co.uk,"Simon Rimmer, Mary Berry, Rob Burns",http://ichef.bbci.co.uk/food/ic/food_16x9_448/...,,1,0.115385,0.153846,0.0,1.0,real
3,Preheat the oven to 180C/350F/Gas 4. Grease an...,Easy chocolate cake,http://www.bbc.co.uk,"Rachel Manley, James Martin, The Hairy Bikers,...",http://ichef.bbci.co.uk/food/ic/food_16x9_448/...,,0,0.075949,0.075949,0.044304,0.408,real
4,Preheat the oven to 180C/350F/Gas 4 and lin 2 ...,Chocolate fairy cakes,http://www.bbc.co.uk,"Mary Berry, Sarah Brown, Harvey Bertram-brown",http://ichef.bbci.co.uk/food/ic/food_16x9_448/...,,0,0.038462,0.067308,0.009615,1.0,real


In [3]:
import numpy as np

np.random.seed(10101010)

In [4]:
labels_to_vec = np.where(data.fake_or_real == 'fake', 1, 0)
data['label'] = labels_to_vec

# Logistic Regression

In [5]:
from sklearn.linear_model import LogisticRegressionCV
from sklearn.cross_validation import train_test_split
from sklearn import metrics
from sklearn.cross_validation import cross_val_score
from patsy import dmatrices



In [6]:
model = LogisticRegressionCV()

In [93]:
training_rows = np.random.randint(0,high=data.shape[0],size=2500)
testing_rows = [row for row in range(data.shape[0]) if row not in training_rows]

y, X = dmatrices('label ~ typo_counts + text_subjectivity + text_positivity + text_negativity + title_neutrality',
                  data.iloc[list(training_rows)], return_type="dataframe")
print(X.columns)
y = np.ravel(y)

model.fit(X,y)

Index(['Intercept', 'typo_counts', 'text_subjectivity', 'text_positivity',
       'text_negativity', 'title_neutrality'],
      dtype='object')


LogisticRegressionCV(Cs=10, class_weight=None, cv=None, dual=False,
           fit_intercept=True, intercept_scaling=1.0, max_iter=100,
           multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
           refit=True, scoring=None, solver='lbfgs', tol=0.0001, verbose=0)

In [94]:
model.score(X,y)

0.61639999999999995

In [95]:
y.mean()

0.50439999999999996

In [96]:
# examine the coefficients
pd.DataFrame(list(zip(X.columns, np.transpose(model.coef_))))

Unnamed: 0,0,1
0,Intercept,[-0.0562611229385]
1,typo_counts,[0.947530474993]
2,text_subjectivity,[-0.548011406455]
3,text_positivity,[-6.54967269649]
4,text_negativity,[13.7991609012]
5,title_neutrality,[0.101244847278]


In [97]:
y_test, X_test = dmatrices('label ~ typo_counts + text_subjectivity + text_positivity + text_negativity + title_neutrality',
                  dict(data.iloc[2]), return_type="dataframe")
print(X_test.columns)
y_test = np.ravel(y_test)

Index(['Intercept', 'typo_counts', 'text_subjectivity', 'text_positivity',
       'text_negativity', 'title_neutrality'],
      dtype='object')


In [103]:
predicted = model.predict(X_test)
probs = model.predict_proba(X_test)
print(probs[:10])

[[ 0.58751097  0.41248903]]


In [100]:
print(metrics.accuracy_score(y_test, predicted))


1.0


In [101]:
# evaluate the model using 10-fold cross-validation
scores = cross_val_score(LogisticRegressionCV(), X, y, scoring='accuracy', cv=10)
print( scores)
print( scores.mean())

[ 0.66135458  0.612       0.636       0.584       0.62        0.564       0.556
  0.648       0.624       0.65461847]
0.615997305557


# Linear SVM

In [53]:
from sklearn import svm

clf = svm.SVC(kernel='linear',C=2000)

In [54]:
clf.fit(X,y)

SVC(C=2000, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [55]:
clf.score(X,y)

0.6048

In [57]:
scores = cross_val_score(svm.SVC(),X, y, scoring='accuracy', cv=10)

In [58]:
print(scores.mean())

0.562795443127


# RandomForest

In [59]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier()

In [60]:
clf.fit(X,y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)

In [61]:
clf.score(X,y)

0.98240000000000005

In [62]:
scores = cross_val_score(RandomForestClassifier(),X, y, scoring='accuracy', cv=10)

In [64]:
print(scores)

[ 0.95219124  0.956       0.964       0.94        0.948       0.96        0.968
  0.944       0.928       0.95983936]
