In [361]:
import pandas as pd
import unicodedata
from nltk.corpus import stopwords
import nltk
import re
import math
import spotlight
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import WordNetLemmatizer 
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tag import StanfordNERTagger
from nltk.tokenize import word_tokenize
from io import StringIO
import os
import spacy
from spacy import displacy
from collections import Counter
from pprint import pprint
import en_core_web_sm

java_path = "C:\\Program Files\\Java\\jdk1.8.0_231\\bin\\java.exe"
os.environ['JAVAHOME'] = java_path

nlp = en_core_web_sm.load()
nltk.download('wordnet')
nltk.download("stopwords")
lemmatizer = WordNetLemmatizer()
pd.set_option('display.max_colwidth', -1)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\kadss\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kadss\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [362]:
for_pd = StringIO()
with open('Data/2CVTweets/boston2C.csv',encoding="utf8") as f:
    for line in f:
        new_line = re.sub(r'NO;', 'NO',line)
        print (new_line, file=for_pd)

for_pd.seek(0)

df_city = pd.read_csv(for_pd, sep=';', header=None)


In [363]:

#read data of city

#df_city = pd.read_csv('Data/2CVTweets/df_city2C.csv', sep=";",header=None)
df_city = df_city.drop(df_city.columns[0], axis=1) ## index column
df_city.columns=['tweet','Y']


#convert class to upppercase and chang name

In [364]:
df_city.head(20)

Unnamed: 0,tweet,Y
0,"RT @CommuterBoston: ACCIDENT (Wellesley, MA): RT-9 East near Cedar St - involving a motorcycle and a car - possible lane restrictions",YES
1,RT @CommuterBoston: ACCIDENT: I-290 East before I-495 (Exit #26) - 2-car accident with possible lane blockages,YES
2,Opened report via iPhone at 19 Standard St http://t.co/8vc761aCHT. Road kill on sidewalk.,NO
3,RT @CommuterBoston: ACCIDENT REPORTED: I-93 North before Granite Av (Exit #11) - police investigating,YES
4,Duduk lebih dari 7 jam sehari berisiko depresi 47 persen lebih tinggi dibandingkan yang hanya duduk 4 jam sehari [HuffingtonPost],NO
5,"RT @CommuterBoston: ACCIDENT (Providence, RI): I-195 West near Gano St - right shoulder blocked",YES
6,Dedham Police Looking for Suspects After Car Crashes into Verizon Store http://t.co/u3zrVjliqB,YES
7,@NotifyBoston We need a don''t block the box ordinance... And traffic needs to be fixed on Atlantic Ave btwn SStation and the offramps,NO
8,# T-MOBILE NOWE HORYZONTY: Punk''s not dead - Wielkie oglądanie filmów we Wrocławiu trwa w najleps... http://t.co/UhtgxQES8x,NO
9,"ACCIDENT REPORTED (Danvers, MA): RT-1 North off-ramp to I-95 North - use caution",YES


## Pre-Processing

In [365]:
#unicode conversion
df_city['tweet_org']=df_city['tweet']
df_city['tweet'] = (df_city['tweet'].map(lambda x: unicodedata.normalize('NFKD', x))
              .str.encode('ascii', 'ignore'))

In [366]:
# slang conversion

In [367]:


from bs4 import BeautifulSoup
import requests, json
resp = requests.get('http://www.netlingo.com/acronyms.php')
soup = BeautifulSoup(resp.text, "html.parser")
slangdict= {}
key=""
value=""
for div in soup.findAll('div', attrs={'class':'list_box3'}):
    
    for li in div.findAll('li'):
        for a in li.findAll('a'):
            key =a.text
            value = li.text.split(key)[1]
            slangdict[key.lower()]=value

In [368]:
def slang_to_formal(input):
    i=1
    formal=[]
    for slang in input.split():
        i=i+1
        if slang.lower() in slangdict:
            formal.append(slangdict[slang])
        elif slang.isdigit():
            
            formal.append('D')  #not working
        else:
            formal.append(slang)
       
        
    return  " ".join(str(x) for x in formal)
            
        

In [369]:
df_city['tweet'] = df_city.apply(lambda row: slang_to_formal(row['tweet']), axis=1)

In [370]:
#replacing URL and digits

In [183]:
def replace_URL(tweet):
    return re.sub('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\), ]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', 'URL', tweet)

    
   


df_city['tweet'] = df_city.apply(lambda row: replace_URL(row['tweet']), axis=1)

In [184]:
#remove stop words

stop = stopwords.words('english')
df_city['tweet'] = df_city.apply(lambda x: [item for item in x if item not in stop])

In [185]:
#tokenisation

df_city['tweet_tok_lem'] = df_city.apply(lambda row: nltk.word_tokenize(row['tweet']), axis=1)

In [186]:
#lemmatize
def lemmatize(s):
       
    s = [lemmatizer.lemmatize(word) for word in s]
    return s

df_city['tweet_tok_lem'] = df_city.apply(lambda row: lemmatize(row['tweet_tok_lem']), axis=1)


## Feature Extraction

In [187]:
# unigram and bigram features

# freq vector unigram

def initialize_ngram(corpus):
    v1 = CountVectorizer()
    v2 = CountVectorizer(2)
    
    v1.fit(corpus)
    v2.fit(corpus)
    return v1,v2

def convert_ngram_vec(text,v):
    text=' '.join(text)
    s=[]
    s.append(text)
    return v.transform(s).toarray()
    
all_tweets_corpus=df_city['tweet'].tolist()

v1,v2=initialize_ngram(all_tweets_corpus)

df_city['unigram_vec'] = df_city.apply(lambda row: convert_ngram_vec(row['tweet_tok_lem'],v1), axis=1)

df_city['bigram_vec'] = df_city.apply(lambda row: convert_ngram_vec(row['tweet_tok_lem'],v2), axis=1)


In [188]:
#tfidf scores

In [189]:
   
def compute_num_tokens(tweet_token_list):
    num_tokens = dict.fromkeys(uniqueWords, 0)
    for token in tweet_token_list:
        num_tokens[token] += 1
    return num_tokens

def apply_tfidf(tweet_token_list,idfs):
    num_tokens=compute_num_tokens(tweet_token_list)
    tfDict=computeTF(num_tokens,tweet_token_list)  
    
    return computeTFIDF(tfDict,idfs)

def computeTF(wordDict, bagOfWords):
    tfDict = {}
    bagOfWordsCount = len(bagOfWords)
    for word, count in wordDict.items():
        tfDict[word] = count / float(bagOfWordsCount)
    return tfDict

def computeIDF(documents):

    N = len(documents)
    
    idfDict = dict.fromkeys(documents[0].keys(), 0)
    for document in documents:
        for word, val in document.items():
            if val > 0:
                idfDict[word] += 1
    
    for word, val in idfDict.items():
        idfDict[word] = math.log(N / float(val))
    return idfDict

def computeTFIDF(tfBagOfWords, idfs):
    tfidf = {}
    for word, val in tfBagOfWords.items():
        tfidf[word] = val * idfs[word]
    return tfidf


In [190]:

uniqueWords=set()
for tweet in df_city['tweet_tok_lem']:
    uniqueWords = uniqueWords.union(set(tweet))

In [191]:
idfs = computeIDF([compute_num_tokens(tweet) for tweet in df_city['tweet_tok_lem']])

In [192]:
df_city['tfid'] = df_city.apply(lambda row: apply_tfidf(row['tweet_tok_lem'],idfs), axis=1)

In [193]:
#syntactic features

In [194]:
df_city['no_ques_marks']=df_city.apply(lambda row: (row['tweet'].count("?")), axis=1)
df_city['no_excl_marks']=df_city.apply(lambda row: (row['tweet'].count("!")), axis=1)

df_city['no_uppercase']=df_city.apply(lambda row: sum(map(str.isupper, row['tweet'].split())), axis=1)

In [195]:
df_city[df_city.Y == 'NO']

Unnamed: 0,tweet,Y,tweet_org,tweet_tok_lem,unigram_vec,bigram_vec,tfid,no_ques_marks,no_excl_marks,no_uppercase
2,b'Opened' b'report' b'via' b'iPhone' b'at' D b'Standard' b'St' b'URL,NO,b'Opened report via iPhone at 19 Standard St http://t.co/8vc761aCHT. Road kill on sidewalk.',"[b'Opened, ', b'report, ', b'via, ', b'iPhone, ', b'at, ', D, b'Standard, ', b'St, ', b'URL]","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]]","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]]","{'b'Munnell': 0.0, 'b'Scar': 0.0, 'b'M/A': 0.0, 'b'Reall': 0.0, 'b'reach': 0.0, 'b'wenn': 0.0, 'b'number': 0.0, 'b'situations': 0.0, 'b'Blunt': 0.0, 'b'11am-1pm': 0.0, 'b'nearby': 0.0, 'b'Rogers': 0.0, 'MithayChawl': 0.0, 'b'filled': 0.0, 'b'Immediately': 0.0, 'Training': 0.0, 'b'Department': 0.0, 'July': 0.0, 'b'break': 0.0, 'kinda': 0.0, 'b'WILL': 0.0, 'b'Pearl': 0.0, 'b'hell': 0.0, 'b'olahraga': 0.0, 'b'Nearly': 0.0, 'b'roommates': 0.0, 'Landeskoging': 0.0, 'b'Niemal': 0.0, 'GKMTNtwits': 0.0, 'b'ascent': 0.0, 'b'praying': 0.0, 'artraffic': 0.0, 'b'ootd': 0.0, 'b'A1': 0.0, 'BOS': 0.0, 'b'grains': 0.0, 'b'Parmale': 0.0, 'b'construction': 0.0, 'b'moze': 0.0, 'PMC2014': 0.0, 'b'predict': 0.0, 'b'devs': 0.0, 'b'Electric': 0.0, 'b'grad': 0.0, 'b'important': 0.0, 'andy_willmer': 0.0, 'b'nan': 0.0, 'I-84': 0.0, 'b'Combustible': 0.0, 'the_real_bowman': 0.0, 'b'tornado': 0.0, 'b'manchen': 0.0, 'b'JURY': 0.0, 'b'arrows': 0.0, 'b'message': 0.0, 'b'Tow': 0.0, 'b'Ugh': 0.0, 'Normile10': 0.0, 'doe': 0.0, 'b'EBW': 0.0, 'm': 0.0, 'b'Tried': 0.0, 'b'ruled': 0.0, 'b'asleep': 0.0, 'b'pours': 0.0, '46': 0.0, 'b'Blows': 0.0, 'bomani_jones': 0.0, 'b'to': 0.0, 'b'presented': 0.0, 'b'Co-patrocinado': 0.0, 'b'Zamku': 0.0, 'b'include': 0.0, 'b'nearly': 0.0, 'b'image': 0.0, 'b'complete': 0.0, 'b'recipe': 0.0, 'b'realize': 0.0, 'illBBock': 0.0, 'NITM': 0.0, 'b'Closet': 0.0, 'b'smells': 0.0, 'b'Carbon': 0.0, 'whiplash': 0.0, 'b'roku': 0.0, 'b'Homeowners': 0.0, 'b'Guessing': 0.0, 'b'looking': 0.0, 'Mowafag_libya': 0.0, 'b'widz': 0.0, 'b'WATCH': 0.0, 'b'WHAT': 0.0, 'b'Chokehold': 0.0, 'b'gritty': 0.0, 'b'tapped': 0.0, 'b'Rental': 0.0, 'b'received': 0.0, 'b'Robberies': 0.0, 'b'Five': 0.0, 'b'Karma': 0.0, ...}",0,0,1
4,b'Duduk' b'lebih' b'dari' D b'jam' b'sehari' b'berisiko' b'depresi' D b'persen' b'lebih' b'tinggi' b'dibandingkan' b'yang' b'hanya' b'duduk' D b'jam' b'sehari' b'[HuffingtonPost]',NO,b'Duduk lebih dari 7 jam sehari berisiko depresi 47 persen lebih tinggi dibandingkan yang hanya duduk 4 jam sehari [HuffingtonPost]',"[b'Duduk, ', b'lebih, ', b'dari, ', D, b'jam, ', b'sehari, ', b'berisiko, ', b'depresi, ', D, b'persen, ', b'lebih, ', b'tinggi, ', b'dibandingkan, ', b'yang, ', b'hanya, ', b'duduk, ', D, b'jam, ', b'sehari, ', b, ', [, HuffingtonPost, ], ']","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]]","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]]","{'b'Munnell': 0.0, 'b'Scar': 0.0, 'b'M/A': 0.0, 'b'Reall': 0.0, 'b'reach': 0.0, 'b'wenn': 0.0, 'b'number': 0.0, 'b'situations': 0.0, 'b'Blunt': 0.0, 'b'11am-1pm': 0.0, 'b'nearby': 0.0, 'b'Rogers': 0.0, 'MithayChawl': 0.0, 'b'filled': 0.0, 'b'Immediately': 0.0, 'Training': 0.0, 'b'Department': 0.0, 'July': 0.0, 'b'break': 0.0, 'kinda': 0.0, 'b'WILL': 0.0, 'b'Pearl': 0.0, 'b'hell': 0.0, 'b'olahraga': 0.0, 'b'Nearly': 0.0, 'b'roommates': 0.0, 'Landeskoging': 0.0, 'b'Niemal': 0.0, 'GKMTNtwits': 0.0, 'b'ascent': 0.0, 'b'praying': 0.0, 'artraffic': 0.0, 'b'ootd': 0.0, 'b'A1': 0.0, 'BOS': 0.0, 'b'grains': 0.0, 'b'Parmale': 0.0, 'b'construction': 0.0, 'b'moze': 0.0, 'PMC2014': 0.0, 'b'predict': 0.0, 'b'devs': 0.0, 'b'Electric': 0.0, 'b'grad': 0.0, 'b'important': 0.0, 'andy_willmer': 0.0, 'b'nan': 0.0, 'I-84': 0.0, 'b'Combustible': 0.0, 'the_real_bowman': 0.0, 'b'tornado': 0.0, 'b'manchen': 0.0, 'b'JURY': 0.0, 'b'arrows': 0.0, 'b'message': 0.0, 'b'Tow': 0.0, 'b'Ugh': 0.0, 'Normile10': 0.0, 'doe': 0.0, 'b'EBW': 0.0, 'm': 0.0, 'b'Tried': 0.0, 'b'ruled': 0.0, 'b'asleep': 0.0, 'b'pours': 0.0, '46': 0.0, 'b'Blows': 0.0, 'bomani_jones': 0.0, 'b'to': 0.0, 'b'presented': 0.0, 'b'Co-patrocinado': 0.0, 'b'Zamku': 0.0, 'b'include': 0.0, 'b'nearly': 0.0, 'b'image': 0.0, 'b'complete': 0.0, 'b'recipe': 0.0, 'b'realize': 0.0, 'illBBock': 0.0, 'NITM': 0.0, 'b'Closet': 0.0, 'b'smells': 0.0, 'b'Carbon': 0.0, 'whiplash': 0.0, 'b'roku': 0.0, 'b'Homeowners': 0.0, 'b'Guessing': 0.0, 'b'looking': 0.0, 'Mowafag_libya': 0.0, 'b'widz': 0.0, 'b'WATCH': 0.0, 'b'WHAT': 0.0, 'b'Chokehold': 0.0, 'b'gritty': 0.0, 'b'tapped': 0.0, 'b'Rental': 0.0, 'b'received': 0.0, 'b'Robberies': 0.0, 'b'Five': 0.0, 'b'Karma': 0.0, ...}",0,0,3
7,"b'@NotifyBoston' b'We' b'need' b'a' b""don''t"" b'block' b'the' b'box' b'ordinance...' b'And' b'traffic' b'needs' b'to' b'be' b'fixed' b'on' b'Atlantic' b'Ave' b'btwn' b'SStation' b'and' b'the' b'offramps'",NO,"b""@NotifyBoston We need a don''t block the box ordinance... And traffic needs to be fixed on Atlantic Ave btwn SStation and the offramps""","[b, ', @, NotifyBoston, ', b'We, ', b'need, ', b, ', a, ', b, '', don, '', t, '', b'block, ', b'the, ', b'box, ', b'ordinance, ..., ', b'And, ', b'traffic, ', b'needs, ', b'to, ', b'be, ', b'fixed, ', b'on, ', b'Atlantic, ', b'Ave, ', b'btwn, ', b'SStation, ', b'and, ', b'the, ', b'offramps, ']","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]]","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]]","{'b'Munnell': 0.0, 'b'Scar': 0.0, 'b'M/A': 0.0, 'b'Reall': 0.0, 'b'reach': 0.0, 'b'wenn': 0.0, 'b'number': 0.0, 'b'situations': 0.0, 'b'Blunt': 0.0, 'b'11am-1pm': 0.0, 'b'nearby': 0.0, 'b'Rogers': 0.0, 'MithayChawl': 0.0, 'b'filled': 0.0, 'b'Immediately': 0.0, 'Training': 0.0, 'b'Department': 0.0, 'July': 0.0, 'b'break': 0.0, 'kinda': 0.0, 'b'WILL': 0.0, 'b'Pearl': 0.0, 'b'hell': 0.0, 'b'olahraga': 0.0, 'b'Nearly': 0.0, 'b'roommates': 0.0, 'Landeskoging': 0.0, 'b'Niemal': 0.0, 'GKMTNtwits': 0.0, 'b'ascent': 0.0, 'b'praying': 0.0, 'artraffic': 0.0, 'b'ootd': 0.0, 'b'A1': 0.0, 'BOS': 0.0, 'b'grains': 0.0, 'b'Parmale': 0.0, 'b'construction': 0.0, 'b'moze': 0.0, 'PMC2014': 0.0, 'b'predict': 0.0, 'b'devs': 0.0, 'b'Electric': 0.0, 'b'grad': 0.0, 'b'important': 0.0, 'andy_willmer': 0.0, 'b'nan': 0.0, 'I-84': 0.0, 'b'Combustible': 0.0, 'the_real_bowman': 0.0, 'b'tornado': 0.0, 'b'manchen': 0.0, 'b'JURY': 0.0, 'b'arrows': 0.0, 'b'message': 0.0, 'b'Tow': 0.0, 'b'Ugh': 0.0, 'Normile10': 0.0, 'doe': 0.0, 'b'EBW': 0.0, 'm': 0.0, 'b'Tried': 0.0, 'b'ruled': 0.0, 'b'asleep': 0.0, 'b'pours': 0.0, '46': 0.0, 'b'Blows': 0.0, 'bomani_jones': 0.0, 'b'to': 0.024132993389102186, 'b'presented': 0.0, 'b'Co-patrocinado': 0.0, 'b'Zamku': 0.0, 'b'include': 0.0, 'b'nearly': 0.0, 'b'image': 0.0, 'b'complete': 0.0, 'b'recipe': 0.0, 'b'realize': 0.0, 'illBBock': 0.0, 'NITM': 0.0, 'b'Closet': 0.0, 'b'smells': 0.0, 'b'Carbon': 0.0, 'whiplash': 0.0, 'b'roku': 0.0, 'b'Homeowners': 0.0, 'b'Guessing': 0.0, 'b'looking': 0.0, 'Mowafag_libya': 0.0, 'b'widz': 0.0, 'b'WATCH': 0.0, 'b'WHAT': 0.0, 'b'Chokehold': 0.0, 'b'gritty': 0.0, 'b'tapped': 0.0, 'b'Rental': 0.0, 'b'received': 0.0, 'b'Robberies': 0.0, 'b'Five': 0.0, 'b'Karma': 0.0, ...}",0,0,0
8,"b'#' b'T-MOBILE' b'NOWE' b'HORYZONTY:' b""Punk''s"" b'not' b'dead' b'-' b'Wielkie' b'ogladanie' b'filmow' b'we' b'Wrocawiu' b'trwa' b'w' b'najleps...' b'URL",NO,"b""# T-MOBILE NOWE HORYZONTY: Punk''s not dead - Wielkie ogladanie filmow we Wrocawiu trwa w najleps... http://t.co/UhtgxQES8x""","[b, ', #, ', b'T-MOBILE, ', b'NOWE, ', b'HORYZONTY, :, ', b, '', Punk, '', s, '', b'not, ', b'dead, ', b'-, ', b'Wielkie, ', b'ogladanie, ', b'filmow, ', b'we, ', b'Wrocawiu, ', b'trwa, ', b, ', w, ', b'najleps, ..., ', b'URL]","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]]","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]]","{'b'Munnell': 0.0, 'b'Scar': 0.0, 'b'M/A': 0.0, 'b'Reall': 0.0, 'b'reach': 0.0, 'b'wenn': 0.0, 'b'number': 0.0, 'b'situations': 0.0, 'b'Blunt': 0.0, 'b'11am-1pm': 0.0, 'b'nearby': 0.0, 'b'Rogers': 0.0, 'MithayChawl': 0.0, 'b'filled': 0.0, 'b'Immediately': 0.0, 'Training': 0.0, 'b'Department': 0.0, 'July': 0.0, 'b'break': 0.0, 'kinda': 0.0, 'b'WILL': 0.0, 'b'Pearl': 0.0, 'b'hell': 0.0, 'b'olahraga': 0.0, 'b'Nearly': 0.0, 'b'roommates': 0.0, 'Landeskoging': 0.0, 'b'Niemal': 0.0, 'GKMTNtwits': 0.0, 'b'ascent': 0.0, 'b'praying': 0.0, 'artraffic': 0.0, 'b'ootd': 0.0, 'b'A1': 0.0, 'BOS': 0.0, 'b'grains': 0.0, 'b'Parmale': 0.0, 'b'construction': 0.0, 'b'moze': 0.0, 'PMC2014': 0.0, 'b'predict': 0.0, 'b'devs': 0.0, 'b'Electric': 0.0, 'b'grad': 0.0, 'b'important': 0.0, 'andy_willmer': 0.0, 'b'nan': 0.0, 'I-84': 0.0, 'b'Combustible': 0.0, 'the_real_bowman': 0.0, 'b'tornado': 0.0, 'b'manchen': 0.0, 'b'JURY': 0.0, 'b'arrows': 0.0, 'b'message': 0.0, 'b'Tow': 0.0, 'b'Ugh': 0.0, 'Normile10': 0.0, 'doe': 0.0, 'b'EBW': 0.0, 'm': 0.0, 'b'Tried': 0.0, 'b'ruled': 0.0, 'b'asleep': 0.0, 'b'pours': 0.0, '46': 0.0, 'b'Blows': 0.0, 'bomani_jones': 0.0, 'b'to': 0.0, 'b'presented': 0.0, 'b'Co-patrocinado': 0.0, 'b'Zamku': 0.0, 'b'include': 0.0, 'b'nearly': 0.0, 'b'image': 0.0, 'b'complete': 0.0, 'b'recipe': 0.0, 'b'realize': 0.0, 'illBBock': 0.0, 'NITM': 0.0, 'b'Closet': 0.0, 'b'smells': 0.0, 'b'Carbon': 0.0, 'whiplash': 0.0, 'b'roku': 0.0, 'b'Homeowners': 0.0, 'b'Guessing': 0.0, 'b'looking': 0.0, 'Mowafag_libya': 0.0, 'b'widz': 0.0, 'b'WATCH': 0.0, 'b'WHAT': 0.0, 'b'Chokehold': 0.0, 'b'gritty': 0.0, 'b'tapped': 0.0, 'b'Rental': 0.0, 'b'received': 0.0, 'b'Robberies': 0.0, 'b'Five': 0.0, 'b'Karma': 0.0, ...}",0,0,0
11,"b'Look,' b'your' b'voice' b'is' b'lovely' b'but' b'that' b'breath' b'is' b'not.' b'Please' b'stop' b'serenading' b'the' b'subway' b'car' b'at' b'8am' b'with' b'that' b'vial' b'stench.' b'#commuterproblems'",NO,"b'Look, your voice is lovely but that breath is not. Please stop serenading the subway car at 8am with that vial stench. #commuterproblems'","[b'Look, ,, ', b'your, ', b'voice, ', b'is, ', b'lovely, ', b'but, ', b'that, ', b'breath, ', b'is, ', b'not, ., ', b'Please, ', b'stop, ', b'serenading, ', b'the, ', b'subway, ', b'car, ', b'at, ', b'8am, ', b'with, ', b'that, ', b'vial, ', b'stench, ., ', b, ', #, commuterproblems, ']","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]]","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]]","{'b'Munnell': 0.0, 'b'Scar': 0.0, 'b'M/A': 0.0, 'b'Reall': 0.0, 'b'reach': 0.0, 'b'wenn': 0.0, 'b'number': 0.0, 'b'situations': 0.0, 'b'Blunt': 0.0, 'b'11am-1pm': 0.0, 'b'nearby': 0.0, 'b'Rogers': 0.0, 'MithayChawl': 0.0, 'b'filled': 0.0, 'b'Immediately': 0.0, 'Training': 0.0, 'b'Department': 0.0, 'July': 0.0, 'b'break': 0.0, 'kinda': 0.0, 'b'WILL': 0.0, 'b'Pearl': 0.0, 'b'hell': 0.0, 'b'olahraga': 0.0, 'b'Nearly': 0.0, 'b'roommates': 0.0, 'Landeskoging': 0.0, 'b'Niemal': 0.0, 'GKMTNtwits': 0.0, 'b'ascent': 0.0, 'b'praying': 0.0, 'artraffic': 0.0, 'b'ootd': 0.0, 'b'A1': 0.0, 'BOS': 0.0, 'b'grains': 0.0, 'b'Parmale': 0.0, 'b'construction': 0.0, 'b'moze': 0.0, 'PMC2014': 0.0, 'b'predict': 0.0, 'b'devs': 0.0, 'b'Electric': 0.0, 'b'grad': 0.0, 'b'important': 0.0, 'andy_willmer': 0.0, 'b'nan': 0.0, 'I-84': 0.0, 'b'Combustible': 0.0, 'the_real_bowman': 0.0, 'b'tornado': 0.0, 'b'manchen': 0.0, 'b'JURY': 0.0, 'b'arrows': 0.0, 'b'message': 0.0, 'b'Tow': 0.0, 'b'Ugh': 0.0, 'Normile10': 0.0, 'doe': 0.0, 'b'EBW': 0.0, 'm': 0.0, 'b'Tried': 0.0, 'b'ruled': 0.0, 'b'asleep': 0.0, 'b'pours': 0.0, '46': 0.0, 'b'Blows': 0.0, 'bomani_jones': 0.0, 'b'to': 0.0, 'b'presented': 0.0, 'b'Co-patrocinado': 0.0, 'b'Zamku': 0.0, 'b'include': 0.0, 'b'nearly': 0.0, 'b'image': 0.0, 'b'complete': 0.0, 'b'recipe': 0.0, 'b'realize': 0.0, 'illBBock': 0.0, 'NITM': 0.0, 'b'Closet': 0.0, 'b'smells': 0.0, 'b'Carbon': 0.0, 'whiplash': 0.0, 'b'roku': 0.0, 'b'Homeowners': 0.0, 'b'Guessing': 0.0, 'b'looking': 0.0, 'Mowafag_libya': 0.0, 'b'widz': 0.0, 'b'WATCH': 0.0, 'b'WHAT': 0.0, 'b'Chokehold': 0.0, 'b'gritty': 0.0, 'b'tapped': 0.0, 'b'Rental': 0.0, 'b'received': 0.0, 'b'Robberies': 0.0, 'b'Five': 0.0, 'b'Karma': 0.0, ...}",0,0,0
18,"b'Is' b'Tyler' b'Zeller' b'a' b'Legitimate' b'Building' b'Block' b'for' b'Boston' b""Celtics''"" b'Future?' b'-' b'Bleacher' b'Report' b'URL#NBA' b'#BostonCeltics'",NO,"b""Is Tyler Zeller a Legitimate Building Block for Boston Celtics'' Future? - Bleacher Report http://t.co/60v8mmseIX #NBA #BostonCeltics""","[b'Is, ', b'Tyler, ', b'Zeller, ', b, ', a, ', b'Legitimate, ', b'Building, ', b'Block, ', b'for, ', b'Boston, ', b, '', Celtics, '', '', b'Future, ?, ', b'-, ', b'Bleacher, ', b'Report, ', b'URL, #, NBA, ', b, ', #, BostonCeltics, ']","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]]","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]]","{'b'Munnell': 0.0, 'b'Scar': 0.0, 'b'M/A': 0.0, 'b'Reall': 0.0, 'b'reach': 0.0, 'b'wenn': 0.0, 'b'number': 0.0, 'b'situations': 0.0, 'b'Blunt': 0.0, 'b'11am-1pm': 0.0, 'b'nearby': 0.0, 'b'Rogers': 0.0, 'MithayChawl': 0.0, 'b'filled': 0.0, 'b'Immediately': 0.0, 'Training': 0.0, 'b'Department': 0.0, 'July': 0.0, 'b'break': 0.0, 'kinda': 0.0, 'b'WILL': 0.0, 'b'Pearl': 0.0, 'b'hell': 0.0, 'b'olahraga': 0.0, 'b'Nearly': 0.0, 'b'roommates': 0.0, 'Landeskoging': 0.0, 'b'Niemal': 0.0, 'GKMTNtwits': 0.0, 'b'ascent': 0.0, 'b'praying': 0.0, 'artraffic': 0.0, 'b'ootd': 0.0, 'b'A1': 0.0, 'BOS': 0.0, 'b'grains': 0.0, 'b'Parmale': 0.0, 'b'construction': 0.0, 'b'moze': 0.0, 'PMC2014': 0.0, 'b'predict': 0.0, 'b'devs': 0.0, 'b'Electric': 0.0, 'b'grad': 0.0, 'b'important': 0.0, 'andy_willmer': 0.0, 'b'nan': 0.0, 'I-84': 0.0, 'b'Combustible': 0.0, 'the_real_bowman': 0.0, 'b'tornado': 0.0, 'b'manchen': 0.0, 'b'JURY': 0.0, 'b'arrows': 0.0, 'b'message': 0.0, 'b'Tow': 0.0, 'b'Ugh': 0.0, 'Normile10': 0.0, 'doe': 0.0, 'b'EBW': 0.0, 'm': 0.0, 'b'Tried': 0.0, 'b'ruled': 0.0, 'b'asleep': 0.0, 'b'pours': 0.0, '46': 0.0, 'b'Blows': 0.0, 'bomani_jones': 0.0, 'b'to': 0.0, 'b'presented': 0.0, 'b'Co-patrocinado': 0.0, 'b'Zamku': 0.0, 'b'include': 0.0, 'b'nearly': 0.0, 'b'image': 0.0, 'b'complete': 0.0, 'b'recipe': 0.0, 'b'realize': 0.0, 'illBBock': 0.0, 'NITM': 0.0, 'b'Closet': 0.0, 'b'smells': 0.0, 'b'Carbon': 0.0, 'whiplash': 0.0, 'b'roku': 0.0, 'b'Homeowners': 0.0, 'b'Guessing': 0.0, 'b'looking': 0.0, 'Mowafag_libya': 0.0, 'b'widz': 0.0, 'b'WATCH': 0.0, 'b'WHAT': 0.0, 'b'Chokehold': 0.0, 'b'gritty': 0.0, 'b'tapped': 0.0, 'b'Rental': 0.0, 'b'received': 0.0, 'b'Robberies': 0.0, 'b'Five': 0.0, 'b'Karma': 0.0, ...}",1,0,0
19,"b'ROADWORK' b'(Groton,' b'CT):' b'I-95' b'SB' b'between' b'Clarence' b'B.' b'Sharp' b'Hwy' b'(Exit' b'#87)' b'and' b'U.S.' b'Sub' b'Base/Gales' b'Ferry' b'(Exit' b'#86)' b'-three' b'right' b'lanes' b'blocked'",NO,"b'ROADWORK (Groton, CT): I-95 SB between Clarence B. Sharp Hwy (Exit #87) and U.S. Sub Base/Gales Ferry (Exit #86) -three right lanes blocked'","[b'ROADWORK, ', b, ', (, Groton, ,, ', b'CT, ), :, ', b, ', I-95, ', b'SB, ', b'between, ', b'Clarence, ', b, ', B, ., ', b'Sharp, ', b'Hwy, ', b, ', (, Exit, ', b, ', #, 87, ), ', b'and, ', b, ', U.S, ., ', b'Sub, ', b'Base/Gales, ', b'Ferry, ', b, ', (, Exit, ', b, ', #, 86, ), ', b'-three, ', b'right, ', b'lanes, ', b'blocked, ']","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]]","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]]","{'b'Munnell': 0.0, 'b'Scar': 0.0, 'b'M/A': 0.0, 'b'Reall': 0.0, 'b'reach': 0.0, 'b'wenn': 0.0, 'b'number': 0.0, 'b'situations': 0.0, 'b'Blunt': 0.0, 'b'11am-1pm': 0.0, 'b'nearby': 0.0, 'b'Rogers': 0.0, 'MithayChawl': 0.0, 'b'filled': 0.0, 'b'Immediately': 0.0, 'Training': 0.0, 'b'Department': 0.0, 'July': 0.0, 'b'break': 0.0, 'kinda': 0.0, 'b'WILL': 0.0, 'b'Pearl': 0.0, 'b'hell': 0.0, 'b'olahraga': 0.0, 'b'Nearly': 0.0, 'b'roommates': 0.0, 'Landeskoging': 0.0, 'b'Niemal': 0.0, 'GKMTNtwits': 0.0, 'b'ascent': 0.0, 'b'praying': 0.0, 'artraffic': 0.0, 'b'ootd': 0.0, 'b'A1': 0.0, 'BOS': 0.0, 'b'grains': 0.0, 'b'Parmale': 0.0, 'b'construction': 0.0, 'b'moze': 0.0, 'PMC2014': 0.0, 'b'predict': 0.0, 'b'devs': 0.0, 'b'Electric': 0.0, 'b'grad': 0.0, 'b'important': 0.0, 'andy_willmer': 0.0, 'b'nan': 0.0, 'I-84': 0.0, 'b'Combustible': 0.0, 'the_real_bowman': 0.0, 'b'tornado': 0.0, 'b'manchen': 0.0, 'b'JURY': 0.0, 'b'arrows': 0.0, 'b'message': 0.0, 'b'Tow': 0.0, 'b'Ugh': 0.0, 'Normile10': 0.0, 'doe': 0.0, 'b'EBW': 0.0, 'm': 0.0, 'b'Tried': 0.0, 'b'ruled': 0.0, 'b'asleep': 0.0, 'b'pours': 0.0, '46': 0.0, 'b'Blows': 0.0, 'bomani_jones': 0.0, 'b'to': 0.0, 'b'presented': 0.0, 'b'Co-patrocinado': 0.0, 'b'Zamku': 0.0, 'b'include': 0.0, 'b'nearly': 0.0, 'b'image': 0.0, 'b'complete': 0.0, 'b'recipe': 0.0, 'b'realize': 0.0, 'illBBock': 0.0, 'NITM': 0.0, 'b'Closet': 0.0, 'b'smells': 0.0, 'b'Carbon': 0.0, 'whiplash': 0.0, 'b'roku': 0.0, 'b'Homeowners': 0.0, 'b'Guessing': 0.0, 'b'looking': 0.0, 'Mowafag_libya': 0.0, 'b'widz': 0.0, 'b'WATCH': 0.0, 'b'WHAT': 0.0, 'b'Chokehold': 0.0, 'b'gritty': 0.0, 'b'tapped': 0.0, 'b'Rental': 0.0, 'b'received': 0.0, 'b'Robberies': 0.0, 'b'Five': 0.0, 'b'Karma': 0.0, ...}",0,0,0
20,"b'Iced' b'coffee' b'from' b'@DunkinDonuts' b'boat' b'shoes' b'because' b""there''s"" b'a' b'boat' b'involved' b'and' b'we' b'are' b'good' b'to' b'go' b'#sendsorboni' b'#whales' b'URL",NO,"b""Iced coffee from @DunkinDonuts boat shoes because there''s a boat involved and we are good to go #sendsorboni #whales http://t.co/spRgajNxZL""","[b'Iced, ', b'coffee, ', b'from, ', b, ', @, DunkinDonuts, ', b'boat, ', b'shoes, ', b'because, ', b, '', there, '', s, '', b, ', a, ', b'boat, ', b'involved, ', b'and, ', b'we, ', b'are, ', b'good, ', b'to, ', b'go, ', b, ', #, sendsorboni, ', b, ', #, whale, ', b'URL]","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]]","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]]","{'b'Munnell': 0.0, 'b'Scar': 0.0, 'b'M/A': 0.0, 'b'Reall': 0.0, 'b'reach': 0.0, 'b'wenn': 0.0, 'b'number': 0.0, 'b'situations': 0.0, 'b'Blunt': 0.0, 'b'11am-1pm': 0.0, 'b'nearby': 0.0, 'b'Rogers': 0.0, 'MithayChawl': 0.0, 'b'filled': 0.0, 'b'Immediately': 0.0, 'Training': 0.0, 'b'Department': 0.0, 'July': 0.0, 'b'break': 0.0, 'kinda': 0.0, 'b'WILL': 0.0, 'b'Pearl': 0.0, 'b'hell': 0.0, 'b'olahraga': 0.0, 'b'Nearly': 0.0, 'b'roommates': 0.0, 'Landeskoging': 0.0, 'b'Niemal': 0.0, 'GKMTNtwits': 0.0, 'b'ascent': 0.0, 'b'praying': 0.0, 'artraffic': 0.0, 'b'ootd': 0.0, 'b'A1': 0.0, 'BOS': 0.0, 'b'grains': 0.0, 'b'Parmale': 0.0, 'b'construction': 0.0, 'b'moze': 0.0, 'PMC2014': 0.0, 'b'predict': 0.0, 'b'devs': 0.0, 'b'Electric': 0.0, 'b'grad': 0.0, 'b'important': 0.0, 'andy_willmer': 0.0, 'b'nan': 0.0, 'I-84': 0.0, 'b'Combustible': 0.0, 'the_real_bowman': 0.0, 'b'tornado': 0.0, 'b'manchen': 0.0, 'b'JURY': 0.0, 'b'arrows': 0.0, 'b'message': 0.0, 'b'Tow': 0.0, 'b'Ugh': 0.0, 'Normile10': 0.0, 'doe': 0.0, 'b'EBW': 0.0, 'm': 0.0, 'b'Tried': 0.0, 'b'ruled': 0.0, 'b'asleep': 0.0, 'b'pours': 0.0, '46': 0.0, 'b'Blows': 0.0, 'bomani_jones': 0.0, 'b'to': 0.025026807959068934, 'b'presented': 0.0, 'b'Co-patrocinado': 0.0, 'b'Zamku': 0.0, 'b'include': 0.0, 'b'nearly': 0.0, 'b'image': 0.0, 'b'complete': 0.0, 'b'recipe': 0.0, 'b'realize': 0.0, 'illBBock': 0.0, 'NITM': 0.0, 'b'Closet': 0.0, 'b'smells': 0.0, 'b'Carbon': 0.0, 'whiplash': 0.0, 'b'roku': 0.0, 'b'Homeowners': 0.0, 'b'Guessing': 0.0, 'b'looking': 0.0, 'Mowafag_libya': 0.0, 'b'widz': 0.0, 'b'WATCH': 0.0, 'b'WHAT': 0.0, 'b'Chokehold': 0.0, 'b'gritty': 0.0, 'b'tapped': 0.0, 'b'Rental': 0.0, 'b'received': 0.0, 'b'Robberies': 0.0, 'b'Five': 0.0, 'b'Karma': 0.0, ...}",0,0,0
21,"b'When' b""you''re"" b'dead,' b'you' b""don''t"" b'know' b""you''re"" b'dead.' b""It''s"" b'the' b'same' b'way' b'when' b""you''re"" b'stupid.'",NO,"b""When you''re dead, you don''t know you''re dead. It''s the same way when you''re stupid.""","[b'When, ', b, '', you, '', re, '', b'dead, ,, ', b'you, ', b, '', don, '', t, '', b'know, ', b, '', you, '', re, '', b'dead, ., ', b, '', It, '', s, '', b'the, ', b'same, ', b'way, ', b'when, ', b, '', you, '', re, '', b'stupid, ., ']","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]]","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]]","{'b'Munnell': 0.0, 'b'Scar': 0.0, 'b'M/A': 0.0, 'b'Reall': 0.0, 'b'reach': 0.0, 'b'wenn': 0.0, 'b'number': 0.0, 'b'situations': 0.0, 'b'Blunt': 0.0, 'b'11am-1pm': 0.0, 'b'nearby': 0.0, 'b'Rogers': 0.0, 'MithayChawl': 0.0, 'b'filled': 0.0, 'b'Immediately': 0.0, 'Training': 0.0, 'b'Department': 0.0, 'July': 0.0, 'b'break': 0.0, 'kinda': 0.0, 'b'WILL': 0.0, 'b'Pearl': 0.0, 'b'hell': 0.0, 'b'olahraga': 0.0, 'b'Nearly': 0.0, 'b'roommates': 0.0, 'Landeskoging': 0.0, 'b'Niemal': 0.0, 'GKMTNtwits': 0.0, 'b'ascent': 0.0, 'b'praying': 0.0, 'artraffic': 0.0, 'b'ootd': 0.0, 'b'A1': 0.0, 'BOS': 0.0, 'b'grains': 0.0, 'b'Parmale': 0.0, 'b'construction': 0.0, 'b'moze': 0.0, 'PMC2014': 0.0, 'b'predict': 0.0, 'b'devs': 0.0, 'b'Electric': 0.0, 'b'grad': 0.0, 'b'important': 0.0, 'andy_willmer': 0.0, 'b'nan': 0.0, 'I-84': 0.0, 'b'Combustible': 0.0, 'the_real_bowman': 0.0, 'b'tornado': 0.0, 'b'manchen': 0.0, 'b'JURY': 0.0, 'b'arrows': 0.0, 'b'message': 0.0, 'b'Tow': 0.0, 'b'Ugh': 0.0, 'Normile10': 0.0, 'doe': 0.0, 'b'EBW': 0.0, 'm': 0.0, 'b'Tried': 0.0, 'b'ruled': 0.0, 'b'asleep': 0.0, 'b'pours': 0.0, '46': 0.0, 'b'Blows': 0.0, 'bomani_jones': 0.0, 'b'to': 0.0, 'b'presented': 0.0, 'b'Co-patrocinado': 0.0, 'b'Zamku': 0.0, 'b'include': 0.0, 'b'nearly': 0.0, 'b'image': 0.0, 'b'complete': 0.0, 'b'recipe': 0.0, 'b'realize': 0.0, 'illBBock': 0.0, 'NITM': 0.0, 'b'Closet': 0.0, 'b'smells': 0.0, 'b'Carbon': 0.0, 'whiplash': 0.0, 'b'roku': 0.0, 'b'Homeowners': 0.0, 'b'Guessing': 0.0, 'b'looking': 0.0, 'Mowafag_libya': 0.0, 'b'widz': 0.0, 'b'WATCH': 0.0, 'b'WHAT': 0.0, 'b'Chokehold': 0.0, 'b'gritty': 0.0, 'b'tapped': 0.0, 'b'Rental': 0.0, 'b'received': 0.0, 'b'Robberies': 0.0, 'b'Five': 0.0, 'b'Karma': 0.0, ...}",0,0,0
22,"b'RT' b'@sorbonified:' b'Iced' b'coffee' b'from' b'@DunkinDonuts' b'boat' b'shoes' b'because' b""there''s"" b'a' b'boat' b'involved' b'and' b'we' b'are' b'good' b'to' b'go' b'#sendsorboni' b'#whales' b'http:...'",NO,"b""RT @sorbonified: Iced coffee from @DunkinDonuts boat shoes because there''s a boat involved and we are good to go #sendsorboni #whales http:...""","[b'RT, ', b, ', @, sorbonified, :, ', b'Iced, ', b'coffee, ', b'from, ', b, ', @, DunkinDonuts, ', b'boat, ', b'shoes, ', b'because, ', b, '', there, '', s, '', b, ', a, ', b'boat, ', b'involved, ', b'and, ', b'we, ', b'are, ', b'good, ', b'to, ', b'go, ', b, ', #, sendsorboni, ', b, ', #, whale, ', b'http, :, ..., ']","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]]","[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...]]","{'b'Munnell': 0.0, 'b'Scar': 0.0, 'b'M/A': 0.0, 'b'Reall': 0.0, 'b'reach': 0.0, 'b'wenn': 0.0, 'b'number': 0.0, 'b'situations': 0.0, 'b'Blunt': 0.0, 'b'11am-1pm': 0.0, 'b'nearby': 0.0, 'b'Rogers': 0.0, 'MithayChawl': 0.0, 'b'filled': 0.0, 'b'Immediately': 0.0, 'Training': 0.0, 'b'Department': 0.0, 'July': 0.0, 'b'break': 0.0, 'kinda': 0.0, 'b'WILL': 0.0, 'b'Pearl': 0.0, 'b'hell': 0.0, 'b'olahraga': 0.0, 'b'Nearly': 0.0, 'b'roommates': 0.0, 'Landeskoging': 0.0, 'b'Niemal': 0.0, 'GKMTNtwits': 0.0, 'b'ascent': 0.0, 'b'praying': 0.0, 'artraffic': 0.0, 'b'ootd': 0.0, 'b'A1': 0.0, 'BOS': 0.0, 'b'grains': 0.0, 'b'Parmale': 0.0, 'b'construction': 0.0, 'b'moze': 0.0, 'PMC2014': 0.0, 'b'predict': 0.0, 'b'devs': 0.0, 'b'Electric': 0.0, 'b'grad': 0.0, 'b'important': 0.0, 'andy_willmer': 0.0, 'b'nan': 0.0, 'I-84': 0.0, 'b'Combustible': 0.0, 'the_real_bowman': 0.0, 'b'tornado': 0.0, 'b'manchen': 0.0, 'b'JURY': 0.0, 'b'arrows': 0.0, 'b'message': 0.0, 'b'Tow': 0.0, 'b'Ugh': 0.0, 'Normile10': 0.0, 'doe': 0.0, 'b'EBW': 0.0, 'm': 0.0, 'b'Tried': 0.0, 'b'ruled': 0.0, 'b'asleep': 0.0, 'b'pours': 0.0, '46': 0.0, 'b'Blows': 0.0, 'bomani_jones': 0.0, 'b'to': 0.020791501996764963, 'b'presented': 0.0, 'b'Co-patrocinado': 0.0, 'b'Zamku': 0.0, 'b'include': 0.0, 'b'nearly': 0.0, 'b'image': 0.0, 'b'complete': 0.0, 'b'recipe': 0.0, 'b'realize': 0.0, 'illBBock': 0.0, 'NITM': 0.0, 'b'Closet': 0.0, 'b'smells': 0.0, 'b'Carbon': 0.0, 'whiplash': 0.0, 'b'roku': 0.0, 'b'Homeowners': 0.0, 'b'Guessing': 0.0, 'b'looking': 0.0, 'Mowafag_libya': 0.0, 'b'widz': 0.0, 'b'WATCH': 0.0, 'b'WHAT': 0.0, 'b'Chokehold': 0.0, 'b'gritty': 0.0, 'b'tapped': 0.0, 'b'Rental': 0.0, 'b'received': 0.0, 'b'Robberies': 0.0, 'b'Five': 0.0, 'b'Karma': 0.0, ...}",0,0,0


In [196]:

#Number same URI (NER-DBPedia)

In [197]:
from collections import Counter

def duplicates(values):
    
    dups = Counter(values) - Counter(set(values))
    return sum(dups.values()) + len(dups)

In [198]:
#DBPedia api hit
def get_annotation(tweet):
    try:
        annotations = spotlight.annotate('http://api.dbpedia-spotlight.org/en/annotate',tweet,
                              confidence=0.4, support=20)
    except Exception:
        return {'URI':[],'types':[]}
    
    URI_lst=[]
    types_lst=[]
    
    for ann_dict in annotations:
        URI_lst.append(ann_dict['URI'])
        types_lst.append(ann_dict['types'])
    
    return {'URI':URI_lst,'types':types_lst}

In [199]:
#annotation features

#global 
accident_types=set()
non_accident_types=set()
    
def create_accident_types(ann_types,Y):
  
       
    for types in ann_types:
        if Y=='YES':
            accident_types.add(types)
        else:
            non_accident_types.add(types)
    
   # return accident_types,non_accident_types



In [200]:
df_city['annotations'] = df_city.apply(lambda row: get_annotation(row['tweet_org']), axis=1)



In [201]:
for ann,Y in zip(df_city.annotations, df_city.Y):
    if 'types' in ann:
        create_accident_types(ann['types'],Y)
print(non_accident_types)                                  

{'', 'Wikidata:Q486972,Schema:Place,DBpedia:Settlement,DBpedia:PopulatedPlace,DBpedia:Place,DBpedia:Location', 'Http://xmlns.com/foaf/0.1/Person,Wikidata:Q5,Wikidata:Q483501,Wikidata:Q24229398,Wikidata:Q215627,DUL:NaturalPerson,DUL:Agent,Schema:Person,DBpedia:Person,DBpedia:Artist,DBpedia:Agent', 'Wikidata:Q43229,Wikidata:Q24229398,Wikidata:Q163740,DUL:SocialPerson,DUL:Agent,Schema:Organization,DBpedia:Organisation,DBpedia:Non-ProfitOrganisation,DBpedia:Agent', 'Wikidata:Q43229,Wikidata:Q24229398,DUL:SocialPerson,DUL:Agent,Schema:Organization,DBpedia:Organisation,DBpedia:Company,DBpedia:BusCompany,DBpedia:Agent', 'Schema:Place,DBpedia:Venue,DBpedia:Place,DBpedia:Location,DBpedia:ArchitecturalStructure', 'Wikidata:Q386724,Schema:WebPage,Schema:CreativeWork,DBpedia:Work,DBpedia:Website', 'Wikidata:Q7397,Wikidata:Q386724,Schema:CreativeWork,DBpedia:Work,DBpedia:Software', 'Wikidata:Q1248784,Schema:Place,Schema:Airport,DBpedia:Place,DBpedia:Location,DBpedia:Infrastructure,DBpedia:Architect

In [202]:
df_city['num_same_URI'] = df_city.apply(lambda row: duplicates(row['annotations']['URI']), axis=1)
df_city['acc_overlap_types'] = df_city.apply(lambda row: (len(accident_types.intersection(row['annotations']['types']))/len(accident_types)), axis=1)
df_city['non_acc_overlap_types'] = df_city.apply(lambda row: (len(non_accident_types.intersection(row['annotations']['types']))/len(non_accident_types)), axis=1)

In [223]:

st = StanfordNERTagger('Data/english.all.3class.distsim.crf.ser.gz',
					   'Data/stanford-ner.jar',
					   encoding='utf-8')


def get_location_count(text):
    doc = nlp(text)
  #  pprint([(X.text, X.label_) for X in doc.ents])
   # tokenized_text = word_tokenize(text)
   # classified_text = st.tag(tokenized_text)
    
    
    return sum([1  for X in doc.ents if X.label_=='GPE'])


In [224]:
df_city['num_locations'] = df_city.apply(lambda row: get_location_count(row['tweet']), axis=1)

In [264]:
features = df_city[['unigram_vec','bigram_vec','tfid','no_ques_marks', 'no_excl_marks', 'no_uppercase', 'annotations',
       'num_same_URI', 'acc_overlap_types', 'non_acc_overlap_types',
       'num_locations']]

labels = df_city.loc[:, 'Y']
processed_features=[]

for col in features.columns:
    processed_features.append(features[col].tolist())
    


In [251]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(processed_features, labels, test_size=0.2, random_state=0)

ValueError: Found input variables with inconsistent numbers of samples: [11, 2820]

In [271]:
## converting to transform objects

In [241]:
vocabulary = "a list of words I want to look for in the documents".split()
vect = TfidfVectorizer(sublinear_tf=True, max_df=0.5, analyzer='word', 
           stop_words='english', vocabulary=vocabulary)

2820

In [319]:
df_city[['tweet']]

Unnamed: 0,tweet
0,b'RT' b' CommuterBoston ' b'ACCIDENT' b' Wellesley ' b'MA ' b'RT-9' b'East' b'near' b'Cedar' b'St' b'-' b'involving' b'a' b'motorcycle' b'and' b'a' b'car' b'-' b'possible' b'lane' b'restrictions'
1,b'RT' b' CommuterBoston ' b'ACCIDENT ' b'I-290' b'East' b'before' b'I-495' b' Exit' b' 26 ' b'-' b'2-car' b'accident' b'with' b'possible' b'lane' b'blockages'
2,b'Opened' b'report' b'via' b'iPhone' b'at' D b'Standard' b'St' b'URL
3,b'RT' b' CommuterBoston ' b'ACCIDENT' b'REPORTED ' b'I-93' b'North' b'before' b'Granite' b'Av' b' Exit' b' 11 ' b'-' b'police' b'investigating'
4,b'Duduk' b'lebih' b'dari' D b'jam' b'sehari' b'berisiko' b'depresi' D b'persen' b'lebih' b'tinggi' b'dibandingkan' b'yang' b'hanya' b'duduk' D b'jam' b'sehari' b'[HuffingtonPost]'
5,b'RT' b' CommuterBoston ' b'ACCIDENT' b' Providence ' b'RI ' b'I-195' b'West' b'near' b'Gano' b'St' b'-' b'right' b'shoulder' b'blocked'
6,b'Dedham' b'Police' b'Looking' b'for' b'Suspects' b'After' b'Car' b'Crashes' b'into' b'Verizon' b'Store' b'URL
7,"b' NotifyBoston' b'We' b'need' b'a' b""don''t"" b'block' b'the' b'box' b'ordinance ' b'And' b'traffic' b'needs' b'to' b'be' b'fixed' b'on' b'Atlantic' b'Ave' b'btwn' b'SStation' b'and' b'the' b'offramps'"
8,"b' ' b'T-MOBILE' b'NOWE' b'HORYZONTY ' b""Punk''s"" b'not' b'dead' b'-' b'Wielkie' b'ogladanie' b'filmow' b'we' b'Wrocawiu' b'trwa' b'w' b'najleps ' b'URL"
9,b'ACCIDENT' b'REPORTED' b' Danvers ' b'MA ' b'RT-1' b'North' b'off-ramp' b'to' b'I-95' b'North' b'-' b'use' b'caution'


ValueError: setting an array element with a sequence.

In [293]:
import string
def remove_punctuation(tweet):
    clean = re.sub(r"""
               [,:.;@([#)]?!&$]+  # Accept one or more copies of punctuation
               \ *           # plus zero or more copies of a space,
               """,
               " ",          # and replace it with a single space
               tweet, flags=re.VERBOSE)
    return(clean)
df_city['tweet'] = df_city.apply(lambda row: remove_punctuation(row['tweet']), axis=1)



In [296]:
all_tweets_corpus=df_city['tweet'].tolist()
vocabulary = [item for sublist in all_tweets_corpus for item in sublist.split()]

In [299]:
vect = TfidfVectorizer(sublinear_tf=True, max_df=0.5, analyzer='word', 
           stop_words='english')

In [301]:
vect.fit(all_tweets_corpus)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=0.5, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words='english', strip_accents=None,
                sublinear_tf=True, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [308]:
from sklearn.pipeline import FeatureUnion
from sklearn.decomposition import PCA, TruncatedSVD
union = FeatureUnion([('TfIdf', TfidfVectorizer(min_df=1, max_df=0.9, strip_accents='unicode', norm='l2')),('TfIdf2', TfidfVectorizer(min_df=1, max_df=0.9, ngram_range=(2,2), strip_accents='unicode', norm='l2'))])


union.fit_transform(all_tweets_corpus) 

<2820x33866 sparse matrix of type '<class 'numpy.float64'>'
	with 84031 stored elements in Compressed Sparse Row format>

In [316]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
clf = SVC(gamma='scale',kernel='rbf')
#clf = SVR(kernel='linear')
#clf.fit(union, labels)




feature_pipeline = Pipeline([('union', union),('rf',RandomForestClassifier())]) 

p=feature_pipeline.fit(all_tweets_corpus) 




TypeError: Singleton array array(None, dtype=object) cannot be considered a valid collection.

In [330]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
#from xgboost import XGBClassifier
from nltk.corpus import stopwords
nltk.download("stopwords")

stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kadss\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [350]:
classifier = Pipeline([
    ('features', FeatureUnion([
        ('tfidf_reduced', Pipeline([
            ('colext', TextSelector('tweet')),
            ('tfidf', TfidfVectorizer(tokenizer=Tokenizer, stop_words=stop_words,
                     min_df=.0025, max_df=0.25, ngram_range=(1,2))),
            ('svd', TruncatedSVD(algorithm='randomized', n_components=300)), #for XGB
        ])),
        ('question_marks', Pipeline([
            ('wordext', NumberSelector('no_ques_marks')),
            ('wscaler', StandardScaler()),
        ])),
         ('no_excl_marks', Pipeline([
            ('wordext', NumberSelector('no_excl_marks')),
            ('wscaler', StandardScaler()),
        ])),
         ('no_uppercase', Pipeline([
            ('wordext', NumberSelector('no_uppercase')),
            ('wscaler', StandardScaler()),
        ])),
         ('num_locations', Pipeline([
            ('wordext', NumberSelector('num_locations')),
            ('wscaler', StandardScaler()),
        ])),
         ('num_same_URI', Pipeline([
            ('wordext', NumberSelector('num_same_URI')),
            ('wscaler', StandardScaler()),
        ])),
         ('acc_overlap_types', Pipeline([
            ('wordext', NumberSelector('acc_overlap_types')),
        
        ])),
         ('non_acc_overlap_types', Pipeline([
            ('wordext', NumberSelector('non_acc_overlap_types')),
           
        ])),
    ])),
#    ('clf', XGBClassifier(max_depth=3, n_estimators=300, learning_rate=0.1)),
    ('clf', RandomForestClassifier()),
    ])

In [349]:
from sklearn.base import BaseEstimator, TransformerMixin
class TextSelector(BaseEstimator, TransformerMixin):
    def __init__(self, field):
        self.field = field
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[self.field]
class NumberSelector(BaseEstimator, TransformerMixin):
    def __init__(self, field):
        self.field = field
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[[self.field]]
    
    
import nltk
def Tokenizer(str_input):
    words = re.sub(r"[^A-Za-z0-9\-]", " ", str_input).lower().split()
    porter_stemmer=nltk.PorterStemmer()
   # words = [porter_stemmer.stem(word) for word in words]
    return words

In [351]:
X =df_city[['tweet','unigram_vec','bigram_vec','tfid','no_ques_marks', 'no_excl_marks', 'no_uppercase', 'annotations',
       'num_same_URI', 'acc_overlap_types', 'non_acc_overlap_types',
       'num_locations']]
Y = labels
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25)


classifier.fit(X_train, y_train)
preds = classifier.predict(X_test)



In [353]:
from sklearn.metrics import accuracy_score, precision_score, classification_report, confusion_matrix
print("Accuracy:", accuracy_score(y_test, preds))
#print("Precision:", precision_score(y_test, preds))
print(classification_report(y_test, preds))
print(confusion_matrix(y_test, preds))

Accuracy: 0.8609929078014185
              precision    recall  f1-score   support

          NO       0.85      0.98      0.91       526
         YES       0.92      0.50      0.64       179

    accuracy                           0.86       705
   macro avg       0.88      0.74      0.78       705
weighted avg       0.87      0.86      0.85       705

[[518   8]
 [ 90  89]]


In [355]:
from sklearn.metrics import f1_score

f1_score(y_test, preds, average='weighted') 

0.8453691331376602