In [53]:
import re
import numpy as np
import pandas as pd
import nltk
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split

fname = open("Political-media-DFE.csv")
df = pd.read_csv(fname, error_bad_lines=False, encoding="latin-1")
new_df = pd.DataFrame()

def cleaner(text):
    try:
        text = re.sub('<[^>]*>', ' ', text)    # removes HTML from tweets
        text = re.sub('(http|https)://[^ ]+ ', '', text)    # removes all the hyperlinks
        text = re.sub('\s\s+', '', text)    # removes all the extra whitespaces
        emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P|[^T_T])', text)    #find all emoticons
        text = re.sub('[\W]+', ' ', text.lower()) + ''.join(emoticons).replace('-', '')  # appends emmoticons at the end.
    except:
        text = text
    try:
        text=unicode(text)
    except:
        pass
    return text

new_df['embed'] = df['embed'].apply(cleaner)
print new_df.head()

def type_to_token(tag):
	if tag=='neutral':
		return 0
	else:
		return 1

for i in range(len((df))):
    new_df.ix[i, 'bias'] = type_to_token(df.ix[i, 'bias'])
    
# Processing into tokens
porter = PorterStemmer()

def tokenizer(text):
    return text.split()

def tokenizer_porter(text):
#     return [porter.stem(word) for word in text.split()]
    for word in text.split():
        try:
            return porter.stem(word)
        except Exception:
            return word

nltk.download("stopwords")
from nltk.corpus import stopwords
stop = stopwords.words('english')

                                               embed
0  rt nowthisnews rep trey radel r fl slams obama...
1  video obamacare full of higher costs and broke...
2  please join me today in remembering our fallen...
3  rt senatorleahy 1st step toward senate debate ...
4   amazon delivery drones show need to update la...


.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate_ix


[nltk_data] Downloading package stopwords to /home/pc/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [59]:
X_train, X_test, y_train, y_test = train_test_split(new_df['embed'], new_df['bias'], test_size=0.33, random_state=42) 

print X_test.head()
classes = np.array([0,1])
vect = HashingVectorizer(decode_error='ignore',
                         n_features = 2**21,
                         preprocessor = None,
                         tokenizer=tokenizer
                        )
clf = SGDClassifier(loss='log', random_state=1, n_iter=1)
X_train = vect.transform(X_train)
clf.partial_fit(X_train, y_train, classes=classes)	

def token_to_type(num):
	if num == 1:
		return "partial"
	return "bias"

# print("The prediction : {}".format(token_to_type(clf.predict(X_test[]))))
X_test = vect.transform(X_test)
print('Accuracy: {:.2f}%'.format(clf.score(X_test, y_test)*100))


1501    speaking on the house floor tonight to honor m...
2586     function d s id var js fjs d getelementsbytag...
2653     function d s id var js fjs d getelementsbytag...
1055    as a social worker i know impact a teacher can...
705     rt robrobinson way to go nashville via usatoda...
Name: embed, dtype: object
Accuracy: 73.82%


In [60]:
from sklearn.neural_network import MLPClassifier

nn_clf = MLPClassifier(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(5, 2), random_state=1)
nn_clf.fit(X_train, y_train)
print('Accuracy: {:.2f}%'.format(nn_clf.score(X_test, y_test)*100))


Accuracy: 73.27%


In [61]:
# print X_test[4] 
print nn_clf.predict(X_test[4])

[ 0.]


In [65]:
# Added version check for recent scikit-learn 0.18 checks
from distutils.version import LooseVersion as Version
from sklearn import __version__ as sklearn_version

X_train, X_test, y_train, y_test = train_test_split(new_df['embed'], new_df['bias'], test_size=0.33, random_state=42) 

from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
if Version(sklearn_version) < '0.18':
    from sklearn.grid_search import GridSearchCV
else:
    from sklearn.model_selection import GridSearchCV

tfidf = TfidfVectorizer(strip_accents=None,
                        lowercase=False,
                        preprocessor=None)

param_grid = [{'vect__ngram_range': [(1, 1)],
               'vect__stop_words': [stop, None],
               'vect__tokenizer': [tokenizer, tokenizer_porter],
               'clf__penalty': ['l1', 'l2'],
               'clf__C': [1.0, 10.0, 100.0]},
              {'vect__ngram_range': [(1, 1)],
               'vect__stop_words': [stop, None],
               'vect__tokenizer': [tokenizer, tokenizer_porter],
               'vect__use_idf':[False],
               'vect__norm':[None],
               'clf__penalty': ['l1', 'l2'],
               'clf__C': [1.0, 10.0, 100.0]},
              ]

lr_tfidf = Pipeline([('vect', tfidf),
                     ('clf', LogisticRegression(random_state=0))])

gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid,
                           scoring='accuracy',
                           cv=5,
                           verbose=1,
                           n_jobs=-1,
                           error_score=5,
                          )

In [66]:
gs_lr_tfidf.fit(X_train, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    1.6s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:   10.5s
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed:   13.3s finished


GridSearchCV(cv=5, error_score=5,
       estimator=Pipeline(steps=[('vect', TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=False, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm=u'l2', preprocessor=None, smooth_idf=Tru...nalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))]),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid=[{'vect__ngram_range': [(1, 1)], 'vect__tokenizer': [<function tokenizer at 0x7f0a78304ed8>, <function tokenizer_porter at 0x7f0a78304b18>], 'clf__penalty': ['l1', 'l2'], 'clf__C': [1.0, 10.0, 100.0], 'vect__stop_words': [[u'i', u'me', u'my', u'myself', u'we', u'our', u'ours', u'ourselves...0a78304b18>], 'vect__use_idf': [False], 'clf__C': [1.0, 10.0, 100.0], 'clf__penalty': ['l1', 'l2']}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       sco

In [67]:
lgr_clf = gs_lr_tfidf.best_estimator_
print('Accuracy: {:.2f}%'.format(lgr_clf.score(X_test, y_test)*100))


Accuracy: 76.48%


In [71]:
import pickle, os

dest = os.path.join('pkl_objects')
if not os.path.exists(dest):
	os.makedirs(dest)

pickle.dump(
	clf, open(os.path.join(dest, 'classifier.pkl'), 'wb')
	)