In [4170]:
from sklearn.naive_bayes import MultinomialNB
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.externals import joblib
from sklearn.model_selection import GridSearchCV
# plt.style.use('ggplot')
# % matplotlib inline

In [4171]:
# Labeled tweets for the 2012 Colorado wildfires (CrisisLexT26).
df = pd.read_csv('CrisisLexT26/2012_Colorado_wildfires/2012_Colorado_wildfires-tweets_labeled.csv')  # change the file location if needed

# Remove the spaces from every column name; later cells read the columns as
# attributes (df.Informativeness, df.InformationSource, df.TweetText).
col_dict = {name: name.replace(' ', '') for name in df.columns.values}
df.rename(columns=col_dict, inplace=True)

# Class distribution before any filtering.
df.Informativeness.value_counts()

Related and informative          685
Related - but not informative    268
Not related                      238
Not applicable                     9
Name: Informativeness, dtype: int64

In [4172]:
# Keep only rows whose source is not 'Government' and whose informativeness is
# not 'Not applicable', then renumber the remaining rows from 0.
keep_rows = (df.InformationSource != 'Government') & (df.Informativeness != 'Not applicable')
df = df[keep_rows].reset_index(drop=True)
df.Informativeness.value_counts()

Related and informative          622
Related - but not informative    257
Not related                      238
Name: Informativeness, dtype: int64

In [4173]:
# Collapse the three informativeness levels into a binary target:
# 'Related and informative' versus everything else.
# The list is positional; it lines up with df because the index was reset to
# 0..n-1 when the rows were filtered.
is_informative = (df.Informativeness == 'Related and informative').values
label = [
    'Related and informative' if flag else 'Not related or not informative'
    for flag in is_informative
]
df['label'] = pd.Series(label)

df.head()

Unnamed: 0,TweetID,TweetText,InformationSource,InformationType,Informativeness,label
0,211040709124440064,#Intern #US #TATTOO #Wisconsin #Ohio #NC #PA #...,Not labeled,Not labeled,Not related,Not related or not informative
1,211111710294163457,RT @Jack4Ward: Get in on the fun every Thursda...,Not labeled,Not labeled,Not related,Not related or not informative
2,211157222699433985,Welcome to our newest STUDENTathlete- Reagan B...,Not labeled,Not labeled,Not related,Not related or not informative
3,211162553659830272,Denver Post: #Colorado governor signs bill cre...,Not labeled,Not labeled,Not related,Not related or not informative
4,211216962162933761,Pretty sure I'm going to live in Manitou Sprin...,Not labeled,Not labeled,Not related,Not related or not informative


In [4174]:
from nltk.tokenize import TweetTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import regexp_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import HashingVectorizer
import preprocessor as p

# Emoji matches per tweet.
# NOTE(review): `emojis` is not read anywhere else in the visible notebook.
emoji_re = "['\U0001F300-\U0001F5FF'|'\U0001F600-\U0001F64F'|'\U0001F680-\U0001F6FF'|'\u2600-\u26FF\u2700-\u27BF']"
emojis = [regexp_tokenize(t, emoji_re) for t in df.TweetText]

# Clean each raw tweet with the URL / EMOJI / SMILEY options, then lower-case it.
p.set_options(p.OPT.URL, p.OPT.EMOJI, p.OPT.SMILEY)
all_tweets = [p.clean(t).lower() for t in df.TweetText]

# Tokenize with the tweet-aware tokenizer.
tknzr = TweetTokenizer()
all_tokens = [tknzr.tokenize(t) for t in all_tweets]
# en_stop = set(stopwords.words('english'))

# Lemmatize every token and glue the tokens of each tweet back together with
# single spaces; the result is one preprocessed string per tweet.
lemmatizer = WordNetLemmatizer()
processed_texts = [
    ' '.join(lemmatizer.lemmatize(token) for token in tokens)
    for tokens in all_tokens
]

df['ProcessedText'] = pd.Series(processed_texts)
df.head()

Unnamed: 0,TweetID,TweetText,InformationSource,InformationType,Informativeness,label,ProcessedText
0,211040709124440064,#Intern #US #TATTOO #Wisconsin #Ohio #NC #PA #...,Not labeled,Not labeled,Not related,Not related or not informative,#intern #us #tattoo #wisconsin #ohio #nc #pa #...
1,211111710294163457,RT @Jack4Ward: Get in on the fun every Thursda...,Not labeled,Not labeled,Not related,Not related or not informative,rt @jack4ward : get in on the fun every thursd...
2,211157222699433985,Welcome to our newest STUDENTathlete- Reagan B...,Not labeled,Not labeled,Not related,Not related or not informative,welcome to our newest studentathlete - reagan ...
3,211162553659830272,Denver Post: #Colorado governor signs bill cre...,Not labeled,Not labeled,Not related,Not related or not informative,denver post : #colorado governor sign bill cre...
4,211216962162933761,Pretty sure I'm going to live in Manitou Sprin...,Not labeled,Not labeled,Not related,Not related or not informative,pretty sure i'm going to live in manitou sprin...


In [4175]:
# Hold out a quarter of the wildfire tweets for testing; the seed is fixed so
# the split is the same on every run.
split_percentage = 0.25
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    df.ProcessedText,
    df.label,
    test_size=split_percentage,
    random_state=42
)

# Word-level hashing features. The same `hash_pp_vec` object is reused by the
# later cells to transform the other disaster datasets.
hash_pp_vec = HashingVectorizer(analyzer='word', non_negative=True)
X_hash_pp_train = hash_pp_vec.fit_transform(X_train2)
X_hash_pp_test = hash_pp_vec.transform(X_test2)



In [4176]:
# Baseline classifier trained on the hashed wildfire features.
# The two settings are passed by keyword: positionally they were alpha=0.1 and
# fit_prior=False, and scikit-learn releases with keyword-only constructor
# arguments reject the positional form.
mnb_hash_pp = MultinomialNB(alpha=0.1, fit_prior=False)
mnb_hash_pp.fit(X_hash_pp_train, y_train2)

# Disabled hyper-parameter search, kept for reference.
# fit_prior is searched over real booleans: the strings 'True' and 'False'
# are both truthy, so the earlier grid never actually tried fit_prior=False.
# mnb_hash_pp = MultinomialNB()
# parameters = {'fit_prior': (True, False), 'alpha': list(np.arange(0.1, 2, 0.1))}
# clf = GridSearchCV(mnb_hash_pp, parameters)
# clf.fit(X_hash_pp_train, y_train2)
# mnb_hash_pp.fit(X_hash_pp_train, y_train2)
# mnb_hash_pp = clf.best_estimator_
# print(clf.best_params_)


from sklearn.cross_validation import cross_val_score, KFold 
from scipy.stats import sem 

def evaluate_cross_validation(clf, X, y, K):
    """Print K-fold cross-validation scores for ``clf`` and their mean.

    Args:
        clf: estimator handed to ``cross_val_score``.
        X: feature matrix handed to ``cross_val_score``.
        y: labels; ``len(y)`` is passed to ``KFold`` as the sample count.
        K: number of folds.

    Prints the per-fold scores, then the mean score together with the
    standard error of the mean. Returns nothing.
    """
    # Shuffled folds with a fixed seed so repeated runs use the same folds.
    cv = KFold(len(y), K, shuffle=True, random_state=0)
    scores = cross_val_score(clf, X, y, cv=cv)
    # Single-argument print(...) behaves the same as a statement and as a
    # function. The string is formatted *inside* the call: the former
    # `print (...).format(...)` calls .format on print's return value (None)
    # as soon as print is a function.
    print(scores)
    print("Mean score: {0:.4f} (+/-{1:.4f})".format(np.mean(scores), sem(scores)))

# 10-fold cross-validation on the training split.
evaluate_cross_validation(mnb_hash_pp, X_hash_pp_train, y_train2, 10)

# Plain accuracy on the training split and on the held-out split.
train_accuracy = mnb_hash_pp.score(X_hash_pp_train, y_train2)
test_accuracy = mnb_hash_pp.score(X_hash_pp_test, y_test2)
print('Accuracy for training: {}'.format(train_accuracy))
print('Accuracy for testing: {}'.format(test_accuracy))

[0.78571429 0.85714286 0.73809524 0.8452381  0.85714286 0.72619048
 0.82142857 0.77108434 0.71084337 0.84337349]
Mean score: 0.7956 (+/-0.0179)
Accuracy for training: 0.972520908005
Accuracy for testing: 0.789285714286


In [4177]:
# Predict the held-out wildfire tweets and summarise per-class quality.
y_hash_pp_predict = mnb_hash_pp.predict(X_hash_pp_test)

print("Classification Report:")
print(metrics.classification_report(y_test2, y_hash_pp_predict, digits=4))
print("Confusion Matrix:")
print(metrics.confusion_matrix(y_test2, y_hash_pp_predict))

Classification Report:
                                precision    recall  f1-score   support

Not related or not informative     0.9333    0.6131    0.7401       137
       Related and informative     0.7211    0.9580    0.8228       143

                   avg / total     0.8249    0.7893    0.7823       280

Confusion Matrix:
[[ 84  53]
 [  6 137]]


In [4178]:
# Display the labels predicted for the held-out wildfire tweets.
y_hash_pp_predict

array(['Related and informative', 'Related and informative',
       'Related and informative', 'Not related or not informative',
       'Not related or not informative', 'Related and informative',
       'Not related or not informative', 'Related and informative',
       'Related and informative', 'Not related or not informative',
       'Not related or not informative', 'Related and informative',
       'Not related or not informative', 'Related and informative',
       'Related and informative', 'Related and informative',
       'Related and informative', 'Related and informative',
       'Related and informative', 'Not related or not informative',
       'Related and informative', 'Related and informative',
       'Not related or not informative', 'Related and informative',
       'Not related or not informative', 'Related and informative',
       'Not related or not informative', 'Not related or not informative',
       'Related and informative', 'Not related or not informative',
 

In [4179]:
# Display the class probabilities for the held-out wildfire tweets
# (one row per tweet, one column per class).
mnb_hash_pp.predict_proba(X_hash_pp_test)

array([[7.77340927e-02, 9.22265907e-01],
       [4.94231612e-01, 5.05768388e-01],
       [1.53396484e-01, 8.46603516e-01],
       [7.38261596e-01, 2.61738404e-01],
       [5.11806249e-01, 4.88193751e-01],
       [4.26884489e-03, 9.95731155e-01],
       [7.68225188e-01, 2.31774812e-01],
       [1.90460655e-01, 8.09539345e-01],
       [1.90184980e-02, 9.80981502e-01],
       [5.80285103e-01, 4.19714897e-01],
       [8.31354149e-01, 1.68645851e-01],
       [2.44262907e-01, 7.55737093e-01],
       [6.82393023e-01, 3.17606977e-01],
       [3.75019923e-01, 6.24980077e-01],
       [3.35346535e-01, 6.64653465e-01],
       [1.91803232e-01, 8.08196768e-01],
       [8.01199725e-02, 9.19880027e-01],
       [1.83341271e-01, 8.16658729e-01],
       [1.96778211e-02, 9.80322179e-01],
       [6.58650726e-01, 3.41349274e-01],
       [9.89312781e-03, 9.90106872e-01],
       [8.07883008e-02, 9.19211699e-01],
       [6.58306708e-01, 3.41693292e-01],
       [1.30801895e-01, 8.69198105e-01],
       [5.338970

In [4180]:
# Persist the fitted wildfire baseline; later cells reload it with
# joblib.load('original.pkl') before adapting it to another event.
joblib.dump(mnb_hash_pp, 'original.pkl') 

['original.pkl']

## Colorado floods

In [4181]:
# Labeled tweets for the 2013 Colorado floods (CrisisLexT26).
df = pd.read_csv('CrisisLexT26/2013_Colorado_floods/2013_Colorado_floods-tweets_labeled.csv')  # change the file location if needed

# Remove the spaces from every column name; later cells read the columns as
# attributes (df.Informativeness, df.InformationSource, df.TweetText).
col_dict = {name: name.replace(' ', '') for name in df.columns.values}
df.rename(columns=col_dict, inplace=True)

# Class distribution before any filtering.
df.Informativeness.value_counts()

Related and informative          768
Related - but not informative    157
Not related                       70
Not applicable                     5
Name: Informativeness, dtype: int64

In [4182]:
# Keep only rows whose source is not 'Government' and whose informativeness is
# not 'Not applicable', then renumber the remaining rows from 0.
keep_rows = (df.InformationSource != 'Government') & (df.Informativeness != 'Not applicable')
df = df[keep_rows].reset_index(drop=True)
df.Informativeness.value_counts()

Related and informative          691
Related - but not informative    156
Not related                       70
Name: Informativeness, dtype: int64

In [4183]:
# Collapse the three informativeness levels into a binary target:
# 'Related and informative' versus everything else.
# The list is positional; it lines up with df because the index was reset to
# 0..n-1 when the rows were filtered.
is_informative = (df.Informativeness == 'Related and informative').values
label = [
    'Related and informative' if flag else 'Not related or not informative'
    for flag in is_informative
]
df['label'] = pd.Series(label)

df.head()

Unnamed: 0,TweetID,TweetText,InformationSource,InformationType,Informativeness,label
0,376843697943769088,#Longmont #CO The Tiny Tim Center is now #hiri...,Not labeled,Not labeled,Not related,Not related or not informative
1,378011169883037697,"RT @dlfluegge: Crazy Flooding in Boulder, Colo...",Media,Sympathy and support,Related - but not informative,Not related or not informative
2,378020179214491649,Here's the #boulderflood video that's circulat...,Outsiders,Other Useful Information,Related and informative,Related and informative
3,378026101588496385,RT @passantino: Video: Severe flooding hits ne...,Media,Other Useful Information,Related and informative,Related and informative
4,378029784204206080,"Crazy Flooding in Boulder, Colorado http://t.c...",Media,Other Useful Information,Related and informative,Related and informative


In [4184]:
# Clean each raw tweet with the URL / EMOJI / SMILEY options, then lower-case it.
p.set_options(p.OPT.URL, p.OPT.EMOJI, p.OPT.SMILEY)
all_tweets = [p.clean(t).lower() for t in df.TweetText]

# Tokenize with the tweet-aware tokenizer.
tknzr = TweetTokenizer()
all_tokens = [tknzr.tokenize(t) for t in all_tweets]
#en_stop = set(stopwords.words('english'))

# Lemmatize every token and glue the tokens of each tweet back together with
# single spaces; the result is one preprocessed string per tweet.
lemmatizer = WordNetLemmatizer()
processed_texts = [
    ' '.join(lemmatizer.lemmatize(token) for token in tokens)
    for tokens in all_tokens
]

df['ProcessedText'] = pd.Series(processed_texts)
# df = df.sample(frac=1)
df.head()

Unnamed: 0,TweetID,TweetText,InformationSource,InformationType,Informativeness,label,ProcessedText
0,376843697943769088,#Longmont #CO The Tiny Tim Center is now #hiri...,Not labeled,Not labeled,Not related,Not related or not informative,#longmont #co the tiny tim center is now #hiri...
1,378011169883037697,"RT @dlfluegge: Crazy Flooding in Boulder, Colo...",Media,Sympathy and support,Related - but not informative,Not related or not informative,"rt @dlfluegge : crazy flooding in boulder , co..."
2,378020179214491649,Here's the #boulderflood video that's circulat...,Outsiders,Other Useful Information,Related and informative,Related and informative,here's the #boulderflood video that's circulat...
3,378026101588496385,RT @passantino: Video: Severe flooding hits ne...,Media,Other Useful Information,Related and informative,Related and informative,rt @passantino : video : severe flooding hit n...
4,378029784204206080,"Crazy Flooding in Boulder, Colorado http://t.c...",Media,Other Useful Information,Related and informative,Related and informative,"crazy flooding in boulder , colorado"


In [4185]:
# Self-training on the Colorado floods tweets: the wildfire baseline predicts
# every tweet, and predictions whose class probability reaches `threshold` are
# fed back into the model as if they were true labels (partial_fit).

# Start from the saved baseline, as the other adaptation cells do, so that
# re-running this cell does not stack further updates on an adapted model.
mnb_hash_pp = joblib.load('original.pkl')
step = 1         # number of consecutive chunks the tweets are processed in
threshold = 0.6  # minimum class probability for a prediction to become a pseudo-label

n_rows = df.shape[0]
sublen = n_rows // step  # floor division keeps the slice bounds integral
marks = [i * sublen for i in range(step)]
# Slice ends are exclusive, so the final mark must be n_rows itself;
# using n_rows - 1 silently left the last tweet out of the final chunk.
marks.append(n_rows)

for i in range(step):
    test = df.ProcessedText[marks[i]:marks[i + 1]]
    test_hash = hash_pp_vec.transform(test)
    predict_hash = mnb_hash_pp.predict(test_hash)
    probability = mnb_hash_pp.predict_proba(test_hash)

    # A tweet qualifies when either of the two class probabilities reaches the threshold.
    confident = (probability[:, 0] >= threshold) | (probability[:, 1] >= threshold)
    if not confident.any():
        # Nothing confident enough in this chunk: leave the model untouched
        # instead of calling partial_fit with zero samples.
        continue

    new_text = test.values[confident]
    new_label = predict_hash[confident]
    new_text_hash = hash_pp_vec.transform(new_text)
    mnb_hash_pp.partial_fit(new_text_hash, new_label)



In [4186]:
# Score the adapted model on every Colorado floods tweet against the true labels.
# These are the same tweets the model was adapted on (with predicted labels).
X_hash_pp_test = hash_pp_vec.transform(df.ProcessedText)
y_hash_pp_predict = mnb_hash_pp.predict(X_hash_pp_test)

print("Classification Report:")
print(metrics.classification_report(df.label, y_hash_pp_predict, digits=4))
print("Confusion Matrix:")
print(metrics.confusion_matrix(df.label, y_hash_pp_predict))
print("Accuracy:")
print(metrics.accuracy_score(df.label, y_hash_pp_predict))

Classification Report:
                                precision    recall  f1-score   support

Not related or not informative     0.7527    0.3097    0.4389       226
       Related and informative     0.8107    0.9667    0.8818       691

                   avg / total     0.7964    0.8048    0.7727       917

Confusion Matrix:
[[ 70 156]
 [ 23 668]]
Accuracy:
0.8047982551799345




## Australia fire

In [4187]:
# Labeled tweets for the 2013 Australia bushfire (CrisisLexT26).
df = pd.read_csv('CrisisLexT26/2013_Australia_bushfire/2013_Australia_bushfire-tweets_labeled.csv')  # change the file location if needed

# Remove the spaces from every column name; later cells read the columns as
# attributes (df.Informativeness, df.InformationSource, df.TweetText).
col_dict = {name: name.replace(' ', '') for name in df.columns.values}
df.rename(columns=col_dict, inplace=True)

# Class distribution before any filtering.
df.Informativeness.value_counts()

Related and informative          704
Related - but not informative    245
Not related                      242
Not applicable                     8
Name: Informativeness, dtype: int64

In [4188]:
# Keep only rows whose source is not 'Government' and whose informativeness is
# not 'Not applicable', then renumber the remaining rows from 0.
keep_rows = (df.InformationSource != 'Government') & (df.Informativeness != 'Not applicable')
df = df[keep_rows].reset_index(drop=True)
df.Informativeness.value_counts()

Related and informative          602
Not related                      242
Related - but not informative    235
Name: Informativeness, dtype: int64

In [4189]:
# Collapse the three informativeness levels into a binary target:
# 'Related and informative' versus everything else.
# The list is positional; it lines up with df because the index was reset to
# 0..n-1 when the rows were filtered.
is_informative = (df.Informativeness == 'Related and informative').values
label = [
    'Related and informative' if flag else 'Not related or not informative'
    for flag in is_informative
]
df['label'] = pd.Series(label)

df.head()

Unnamed: 0,TweetID,TweetText,InformationSource,InformationType,Informativeness,label
0,388923108922781697,"It's ridiculous,,,RT @ThatGuyTiisetso: #redoct...",Not labeled,Not labeled,Not related,Not related or not informative
1,388929052247347200,A Total Fire Ban has been declared for the Gre...,Media,Caution and advice,Related and informative,Related and informative
2,388970580026089472,This #RedOctober shit is just the beginning of...,Not labeled,Not labeled,Not related,Not related or not informative
3,388972203242704896,You can't be supporting #RedOctober mr white m...,Not labeled,Not labeled,Not related,Not related or not informative
4,388986124107927553,RT @smh: Do not light fires in the open: RFS w...,Media,Caution and advice,Related and informative,Related and informative


In [4190]:
# Clean each raw tweet with the URL / EMOJI / SMILEY options, then lower-case it.
p.set_options(p.OPT.URL, p.OPT.EMOJI, p.OPT.SMILEY)
all_tweets = [p.clean(t).lower() for t in df.TweetText]

# Tokenize with the tweet-aware tokenizer.
tknzr = TweetTokenizer()
all_tokens = [tknzr.tokenize(t) for t in all_tweets]
#en_stop = set(stopwords.words('english'))

# Lemmatize every token and glue the tokens of each tweet back together with
# single spaces; the result is one preprocessed string per tweet.
lemmatizer = WordNetLemmatizer()
processed_texts = [
    ' '.join(lemmatizer.lemmatize(token) for token in tokens)
    for tokens in all_tokens
]

df['ProcessedText'] = pd.Series(processed_texts)
# df = df.sample(frac=1)
df.head()

Unnamed: 0,TweetID,TweetText,InformationSource,InformationType,Informativeness,label,ProcessedText
0,388923108922781697,"It's ridiculous,,,RT @ThatGuyTiisetso: #redoct...",Not labeled,Not labeled,Not related,Not related or not informative,"it's ridiculous , , , rt @thatguytiisetso : #r..."
1,388929052247347200,A Total Fire Ban has been declared for the Gre...,Media,Caution and advice,Related and informative,Related and informative,a total fire ban ha been declared for the grea...
2,388970580026089472,This #RedOctober shit is just the beginning of...,Not labeled,Not labeled,Not related,Not related or not informative,this #redoctober shit is just the beginning of...
3,388972203242704896,You can't be supporting #RedOctober mr white m...,Not labeled,Not labeled,Not related,Not related or not informative,you can't be supporting #redoctober mr white m...
4,388986124107927553,RT @smh: Do not light fires in the open: RFS w...,Media,Caution and advice,Related and informative,Related and informative,rt @smh : do not light fire in the open : rf w...


In [4191]:
# Self-training on the Australia bushfire tweets: the wildfire baseline predicts
# every tweet, and predictions whose class probability reaches `threshold` are
# fed back into the model as if they were true labels (partial_fit).

# Start from the saved baseline rather than from a model adapted to another event.
mnb_hash_pp = joblib.load('original.pkl')
step = 1          # number of consecutive chunks the tweets are processed in
threshold = 0.81  # minimum class probability for a prediction to become a pseudo-label

n_rows = df.shape[0]
sublen = n_rows // step  # floor division keeps the slice bounds integral
marks = [i * sublen for i in range(step)]
# Slice ends are exclusive, so the final mark must be n_rows itself;
# using n_rows - 1 silently left the last tweet out of the final chunk.
marks.append(n_rows)

for i in range(step):
    test = df.ProcessedText[marks[i]:marks[i + 1]]
    test_hash = hash_pp_vec.transform(test)
    predict_hash = mnb_hash_pp.predict(test_hash)
    probability = mnb_hash_pp.predict_proba(test_hash)

    # A tweet qualifies when either of the two class probabilities reaches the threshold.
    confident = (probability[:, 0] >= threshold) | (probability[:, 1] >= threshold)
    if not confident.any():
        # Nothing confident enough in this chunk: leave the model untouched
        # instead of calling partial_fit with zero samples.
        continue

    new_text = test.values[confident]
    new_label = predict_hash[confident]
    new_text_hash = hash_pp_vec.transform(new_text)
    mnb_hash_pp.partial_fit(new_text_hash, new_label)



In [4192]:
# Score the adapted model on every Australia bushfire tweet against the true labels.
# These are the same tweets the model was adapted on (with predicted labels).
X_hash_pp_test = hash_pp_vec.transform(df.ProcessedText)
y_hash_pp_predict = mnb_hash_pp.predict(X_hash_pp_test)

print("Classification Report:")
print(metrics.classification_report(df.label, y_hash_pp_predict, digits=4))
print("Confusion Matrix:")
print(metrics.confusion_matrix(df.label, y_hash_pp_predict))
print("Accuracy:")
print(metrics.accuracy_score(df.label, y_hash_pp_predict))

Classification Report:
                                precision    recall  f1-score   support

Not related or not informative     0.9664    0.4822    0.6434       477
       Related and informative     0.7063    0.9867    0.8233       602

                   avg / total     0.8213    0.7637    0.7437      1079

Confusion Matrix:
[[230 247]
 [  8 594]]
Accuracy:
0.7636700648748842




## Australia flood

In [4193]:
# Labeled tweets for the 2013 Queensland floods (CrisisLexT26).
df = pd.read_csv('CrisisLexT26/2013_Queensland_floods/2013_Queensland_floods-tweets_labeled.csv')  # change the file location if needed

# Remove the spaces from every column name; later cells read the columns as
# attributes (df.Informativeness, df.InformationSource, df.TweetText).
col_dict = {name: name.replace(' ', '') for name in df.columns.values}
df.rename(columns=col_dict, inplace=True)

# Class distribution before any filtering.
df.Informativeness.value_counts()

Related and informative          728
Not related                      261
Related - but not informative    191
Not applicable                    20
Name: Informativeness, dtype: int64

In [4194]:
# Keep only rows whose source is not 'Government' and whose informativeness is
# not 'Not applicable', then renumber the remaining rows from 0.
keep_rows = (df.InformationSource != 'Government') & (df.Informativeness != 'Not applicable')
df = df[keep_rows].reset_index(drop=True)
df.Informativeness.value_counts()

Related and informative          665
Not related                      261
Related - but not informative    186
Name: Informativeness, dtype: int64

In [4195]:
# Collapse the three informativeness levels into a binary target:
# 'Related and informative' versus everything else.
# The list is positional; it lines up with df because the index was reset to
# 0..n-1 when the rows were filtered.
is_informative = (df.Informativeness == 'Related and informative').values
label = [
    'Related and informative' if flag else 'Not related or not informative'
    for flag in is_informative
]
df['label'] = pd.Series(label)

df.head()

Unnamed: 0,TweetID,TweetText,InformationSource,InformationType,Informativeness,label
0,291852896990023680,RT @AdmireAriana: Heat wave in Australia. Floo...,Not labeled,Not labeled,Not related,Not related or not informative
1,291853232538537984,RT @AdmireAriana: Heat wave in Australia. Floo...,Outsiders,Other Useful Information,Related - but not informative,Not related or not informative
2,291921947787395072,Jakarta floods leave hundreds of thousands hom...,Media,Not applicable,Related and informative,Related and informative
3,292248541500420096,"Bush fires in Australia, deadly flooding in In...",Outsiders,Other Useful Information,Related and informative,Related and informative
4,293071229055799297,VIDEO - A world of contrasts. We freeze whilst...,Outsiders,Affected individuals,Related - but not informative,Not related or not informative


In [4196]:
# Clean each raw tweet with the URL / EMOJI / SMILEY options, then lower-case it.
p.set_options(p.OPT.URL, p.OPT.EMOJI, p.OPT.SMILEY)
all_tweets = [p.clean(t).lower() for t in df.TweetText]

# Tokenize with the tweet-aware tokenizer.
tknzr = TweetTokenizer()
all_tokens = [tknzr.tokenize(t) for t in all_tweets]
#en_stop = set(stopwords.words('english'))

# Lemmatize every token and glue the tokens of each tweet back together with
# single spaces; the result is one preprocessed string per tweet.
lemmatizer = WordNetLemmatizer()
processed_texts = [
    ' '.join(lemmatizer.lemmatize(token) for token in tokens)
    for tokens in all_tokens
]

df['ProcessedText'] = pd.Series(processed_texts)
# df = df.sample(frac=1)
df.head()

Unnamed: 0,TweetID,TweetText,InformationSource,InformationType,Informativeness,label,ProcessedText
0,291852896990023680,RT @AdmireAriana: Heat wave in Australia. Floo...,Not labeled,Not labeled,Not related,Not related or not informative,rt @admireariana : heat wave in australia . fl...
1,291853232538537984,RT @AdmireAriana: Heat wave in Australia. Floo...,Outsiders,Other Useful Information,Related - but not informative,Not related or not informative,rt @admireariana : heat wave in australia . fl...
2,291921947787395072,Jakarta floods leave hundreds of thousands hom...,Media,Not applicable,Related and informative,Related and informative,jakarta flood leave hundred of thousand homele...
3,292248541500420096,"Bush fires in Australia, deadly flooding in In...",Outsiders,Other Useful Information,Related and informative,Related and informative,"bush fire in australia , deadly flooding in in..."
4,293071229055799297,VIDEO - A world of contrasts. We freeze whilst...,Outsiders,Affected individuals,Related - but not informative,Not related or not informative,video - a world of contrast . we freeze whilst...


In [4197]:
# Self-training on the Queensland floods tweets: the wildfire baseline predicts
# every tweet, and predictions whose class probability reaches `threshold` are
# fed back into the model as if they were true labels (partial_fit).

# Start from the saved baseline rather than from a model adapted to another event.
mnb_hash_pp = joblib.load('original.pkl')
step = 1          # number of consecutive chunks the tweets are processed in
threshold = 0.73  # minimum class probability for a prediction to become a pseudo-label

n_rows = df.shape[0]
sublen = n_rows // step  # floor division keeps the slice bounds integral
marks = [i * sublen for i in range(step)]
# Slice ends are exclusive, so the final mark must be n_rows itself;
# using n_rows - 1 silently left the last tweet out of the final chunk.
marks.append(n_rows)

for i in range(step):
    test = df.ProcessedText[marks[i]:marks[i + 1]]
    test_hash = hash_pp_vec.transform(test)
    predict_hash = mnb_hash_pp.predict(test_hash)
    probability = mnb_hash_pp.predict_proba(test_hash)

    # A tweet qualifies when either of the two class probabilities reaches the threshold.
    confident = (probability[:, 0] >= threshold) | (probability[:, 1] >= threshold)
    if not confident.any():
        # Nothing confident enough in this chunk: leave the model untouched
        # instead of calling partial_fit with zero samples.
        continue

    new_text = test.values[confident]
    new_label = predict_hash[confident]
    new_text_hash = hash_pp_vec.transform(new_text)
    mnb_hash_pp.partial_fit(new_text_hash, new_label)



In [4198]:
# Score the adapted model on every Queensland floods tweet against the true labels.
# These are the same tweets the model was adapted on (with predicted labels).
X_hash_pp_test = hash_pp_vec.transform(df.ProcessedText)
y_hash_pp_predict = mnb_hash_pp.predict(X_hash_pp_test)

print("Classification Report:")
print(metrics.classification_report(df.label, y_hash_pp_predict, digits=4))
print("Confusion Matrix:")
print(metrics.confusion_matrix(df.label, y_hash_pp_predict))
print("Accuracy:")
print(metrics.accuracy_score(df.label, y_hash_pp_predict))

Classification Report:
                                precision    recall  f1-score   support

Not related or not informative     0.8386    0.5928    0.6946       447
       Related and informative     0.7714    0.9233    0.8405       665

                   avg / total     0.7984    0.7905    0.7819      1112

Confusion Matrix:
[[265 182]
 [ 51 614]]
Accuracy:
0.7904676258992805


