# Cleansing csv and storing in Pandas DataFrame

In [1]:
import pandas as pd
import numpy as np
import re
filename = '/home/b/Downloads/sts_gold_tweet(1).csv'

### Scrubbing the data to return ID, polarity and tweet as a Pandas Series
 - Removes double quotes
 - Splits the string into id, polarity and tweet with ';' as the separator
 - Collapses each run of whitespace in the tweet into a single space

In [2]:
def cleanser(i):
    """Split one raw CSV line into its id, polarity and tweet fields.

    Double quotes are dropped and the line is cut at the first two ';'
    only, so any further ';' stay inside the tweet. Every run of
    whitespace in the tweet is then collapsed to a single space.

    Returns a pandas Series ordered as [id, polarity, tweet].
    Raises ValueError when the line holds fewer than three fields.
    """
    unquoted = i.replace('"', '')
    # maxsplit=2 keeps everything after the second ';' together as the tweet
    ID, polarity, tweet = unquoted.split(';', 2)
    return pd.Series([ID, polarity, re.sub(r'\s+', ' ', tweet)])

### Reading the CSV and applying cleanser to all rows

In [3]:
# Read the CSV, clean every row and name the resulting columns in one chain.
# The single column selected here is named after the whole header line, and
# each of its cells is handed to cleanser as one string to be split on ';'.
df = (
    pd.read_csv(filename)['id;"polarity";"tweet"']
    .apply(cleanser)
    .rename(columns={0: 'id', 1: 'polarity', 2: 'tweets'})
)

df.head()

Unnamed: 0,id,polarity,tweets
0,1467933112,0,the angel is going to miss the athlete this we...
1,2323395086,0,It looks as though Shaq is getting traded to C...
2,1467968979,0,@clarianne APRIL 9TH ISN'T COMING SOON ENOUGH
3,1990283756,0,drinking a McDonalds coffee and not understand...
4,1988884918,0,So dissapointed Taylor Swift doesnt have a Twi...


# Feature Extraction with spaCy

In [4]:
# Loading spaCy
import spacy
# `nlp` is the shared pipeline object: later cells call nlp(text) to tokenize
# tweets and use nlp.vocab to flag additional stopwords.
nlp=spacy.load('en')

In [5]:
# Appending additional stopwords (besides spaCy's in-built set of stopwords)
def add_stopwords(words):
    """Mark each string in `words` as a stopword in the spaCy vocabulary.

    Every word is converted with unicode(), looked up in nlp.vocab, and the
    resulting entry gets is_stop set to True.
    """
    for word in words:
        lexeme = nlp.vocab[unicode(word)]
        lexeme.is_stop = True

add_stopwords([
    "n't", "u", "&", "'s", "'ve", "is", "am", "is", "was", "were",
    "'m", "'re", "m", "ai", "#",
])


## Generating a vocabulary from given tweets
- Ignores punctuations and smileys by default
- Ignores stopwords
- Ignores twitter handles that appear only once (Checks lowercase strings since twitter handles are case-insensitive)
- Presumes sentiment of hashtag to be the same as the word enclosed
    Hence #Sony is considered to be Sony
- Generates **word value** from words
    - Function of POS as well as word lemma
    - Helps distinguish homonyms (since homonyms have the same lemma but different POS)

### Data Structures used : 
- **vocab** - Dictionary mapping word value to document frequency (Number of times the word appears in the document)
- **dictionary** - Dictionary mapping word value to word
- **handles** - Set of twitter handles



In [6]:
vocab ={}
dictionary ={}
handles=set([])
def generate_vocabulary(sent,ignore_punctuation=True,ignore_links = True):
    """Add the words of one tweet to the module-level vocabulary.

    Updates three globals in place:
      vocab      - word value -> number of occurrences counted so far
      dictionary - word value -> most recent spaCy token seen with that value
      handles    - lowercased '@' tokens that have been seen at least once

    sent               : tweet text; skipped when unicode() cannot convert it.
    ignore_punctuation : skip tokens tagged as punctuation.
    ignore_links       : skip tokens starting with http:// or https://.

    Returns None.
    """
    try:
        sent = unicode(sent)
    except UnicodeError:
        # Best effort: a tweet that cannot be decoded contributes nothing.
        # Only decoding problems are swallowed; other errors now surface.
        return
    for token in nlp(sent):
        # NOTE(review): replaces the hard-coded POS id 95, assumed to stand
        # for PUNCT in the spaCy version this was written against - confirm.
        if ignore_punctuation and token.pos_ == 'PUNCT':
            continue
        if ignore_links and token.text.startswith(('http://', 'https://')):
            continue
        if token.is_stop:
            continue
        init_count = 0
        if token.text.startswith('@'):
            handle = token.text.lower()
            if handle not in handles:
                # First sighting of a handle: remember it but do not count it
                handles.add(handle)
                continue
            # Seen before: a new vocab entry starts at 1 to make up for the
            # sighting that was skipped above
            init_count = 1
        # Word value combines the lemma id and the POS id in one integer
        word_val = token.lemma*100+token.pos
        vocab[word_val]=vocab.get(word_val,init_count)+1
        dictionary[word_val] = token

df['tweets'].apply(generate_vocabulary)
print 'Length of vocabulary :'.ljust(25),len(vocab)
print 'Length of dictionary :'.ljust(25),len(dictionary)
print 'Number of handles found :'.ljust(25),len(handles)

Length of vocabulary :    4336
Length of dictionary :    4336
Number of handles found : 619


### Picking the right features
- Sorts features (words) by document frequency
- Ignores the first **h** features with the greatest document frequency
- Ignores the first **l** features with the least document frequency
- Returns list of word values

In [7]:
def get_feature_ids(h,l):
    """Return word values ordered by descending count, trimmed at both ends.

    h : number of most frequent word values to drop from the front.
    l : number of least frequent word values to drop from the back;
        0 keeps the whole tail.

    Reads the module-level `vocab` (word value -> count) without modifying
    it. Returns a list of word values; empty when h + l covers everything.
    """
    # sorted() is stable, so equal counts keep vocab's own iteration order
    keys = sorted(vocab, key=vocab.get, reverse=True)
    # keys[h:-l] would yield [] for l == 0, so the end index is computed
    # explicitly and clamped at 0 for l larger than the vocabulary
    end = max(len(keys) - l, 0)
    return keys[h:end]

#for i in get_feature_ids(1,4002):print i,dictionary[i],vocab[i]

## Converting sentences to count vectors
- Use **get_feature_ids** to fetch important feature-IDs/word-values
- Create a numpy **m\*n** array *vector* where m: Number of sentences and n:Number of features
- Increments the count of a wordval if word is present in a sentence
- verbose = True prints the Feature Words and index of wordval in the vector
- Returns *vector*

In [8]:
def get_features(sentences,high=0,low = 500,verbose=False):
    global vocab
    feature_ids = get_feature_ids(high,low)
    #print feature_ids
    number_of_features = len(feature_ids)
    vector = np.array([[[0]*number_of_features]*len(sentences)])
    counter = 0
    for sent in sentences:
        #print sent
        try : uni = unicode(sent)
        except UnicodeDecodeError: continue
        if verbose:
            print "\n\nTweet :",sent,'\nFeature Words :',
        for i in nlp(uni):
            lemma_val,pos_val = i.lemma,i.pos
            word_val = lemma_val*100+pos_val
            #print i , word_val
            if word_val in feature_ids:
                index = feature_ids.index(word_val)
                if verbose : 
                    print i.text+" ("+str(index)+") ",
                vector[0][counter][index]+=1
        counter +=1
    return vector

#get_features(df.loc[:1000]['tweets'].values,verbose=True)

## Obtaining final set of features by applying TF-IDF on word-count vector
 - Balances the weight given to frequently occurring words and rare words
 - **getTermFreq** converts word-count vector to tfidf-vector
 - **tweets2features** converts a list of tweets to a scipy sparse matrix

In [15]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_model = None

def getTermFreq(counts,idf):
    """Convert a word-count matrix to TF-IDF weights using a cached model.

    counts : word-count matrix to transform; on the first call it is also the
             data the model is fitted on.
    idf    : passed as use_idf when the model is first created. It is ignored
             on later calls, because the already-fitted model is reused.

    The first call fits a TfidfTransformer and stores it in the module-level
    `tfidf_model`; every call returns tfidf_model.transform(counts).
    """
    global tfidf_model
    if tfidf_model is None:
        # Fit before publishing: if fit() raises, tfidf_model stays None
        # instead of caching an unfitted transformer for every later call.
        fitted = TfidfTransformer(use_idf=idf)
        fitted.fit(counts)
        tfidf_model = fitted
    return tfidf_model.transform(counts)

In [17]:
from scipy import sparse


def tweets2features(data,vb,idf=True):
    counts = get_features(data,verbose=vb)[0]
    #print np.shape(counts)
    sparse_counts = sparse.csr_matrix(counts)
    print "\n\n Word-Counts as a sparse matrix"
    print sparse_counts
    train_tf = getTermFreq(sparse_counts,idf)
    return train_tf

tfidf_vector = tweets2features(df['tweets'].values ,vb=True,idf=True)
print "\n\n Features extracted"
print tfidf_vector



Tweet : the angel is going to miss the athlete this weekend  
Feature Words : angel (1324)  is (3020)  going (0)  miss (19)  athlete (3270)  weekend (153)  

Tweet : It looks as though Shaq is getting traded to Cleveland to play w/ LeBron... Too bad for Suns' fans. The Big Cactus is no more  
Feature Words : looks (21)  Shaq (727)  is (3020)  getting (1)  Cleveland (548)  play (38)  w/ (3573)  LeBron (35)  bad (26)  Suns (422)  ' (339)  fans (140)  Big (142)  Cactus (2483)  is (3020)  

Tweet : @clarianne APRIL 9TH ISN'T COMING SOON ENOUGH  
Feature Words : APRIL (1307)  9TH (3293)  ISN'T (2195)  COMING (2838)  SOON (2441)  

Tweet : drinking a McDonalds coffee and not understanding why someone would hurt me for no apparent reason.  
Feature Words : drinking (284)  McDonalds (55)  coffee (167)  not (10)  understanding (686)  would (194)  hurt (151)  apparent (2085)  reason (489)  

Tweet : So dissapointed Taylor Swift doesnt have a Twitter  
Feature Words : dissapointed (1642)  Taylo

Feature Words : Chrisette (1539)  Michelle (1122)  came (17)  ipod (92)  's (3020)  mellow (3344)  

Tweet : Lakers! Going to the finals!! Weee!  
Feature Words : Lakers (52)  Going (0)  finals (134)  Weee (1612)  

Tweet : Getting my hair cut while texting with my brother and getting updates about Iran &amp; the baseball game. God 
Feature Words : Getting (1)  hair (291)  cut (1558)  texting (2996)  brother (228)  getting (1)  updates (209)  Iran (1801)  amp (29)  baseball (3354)  game (44)  God (1149)  

Tweet : @ILUVNKOTB he wants u to follow who he follows on twitter. some very nice organizations  
Feature Words : wants (4)  u (488)  follow (180)  follows (180)  twitter (113)  nice (72)  organizations (3665)  

Tweet : @dmf71 rrrrrr you so very sweet a big hi to you!!!!!!!  
Feature Words : rrrrrr (1827)  sweet (300)  big (142)  hi (1244)  

Tweet : @scrambledeggos Seattle is very nice. So green it reminds me of Germany  
Feature Words : Seattle (45)  is (3020)  nice (72)  green (6

Feature Words : have (69)  sore (395)  throat (321)  think (8)  have (69)  fever (27)  not (10)  good (2)  

Tweet : Ashton is going to be Oprah! All hail the Twitter King! LOL! 
Feature Words : Ashton (2959)  is (3020)  going (0)  be (3020)  Oprah (99)  hail (2709)  Twitter (161)  King (1287)  LOL (184)  

Tweet : Listening to Bjork's All is full of Love... crying  
Feature Words : Listening (145)  is (3020)  Love (1144)  crying (154)  

Tweet : @ImajicArt 3 month cancer battle.  
Feature Words : 3 (46)  month (232)  cancer (14)  battle (453)  

Tweet : Is it raining in the north end? Too bad 
Feature Words : Is (3020)  raining (225)  north (2597)  end (714)  bad (26)  

Tweet : this week is not going as i had hoped  
Feature Words : week (98)  is (3020)  not (10)  going (0)  had (69)  hoped (189)  

Tweet : I'm really cold. I don't want to go to sleep yet but there's nothing to do  
Feature Words : 'm (3020)  cold (330)  n't (10)  want (4)  go (0)  sleep (306)  's (3020)  

Tweet : @

Feature Words : Hey (103)  tell (90)  use (263)  Xbox (59)  360 (267)  use (263)  Twitter (161)  would (194)  be (3020)  Sweet (386)  

Tweet : haha realized today my dad says &quot;back home&quot; not &quot;in England&quot; I like that he has his priorities  
Feature Words : haha (136)  realized (1109)  today (5)  dad (246)  says (79)  quot;back (2048)  home&quot (2120)  not (10)  quot;in (2189)  England&quot (2322)  like (96)  has (69)  priorities (1036)  

Tweet : Listening to love story by taylor swift in the car and singing along  
Feature Words : Listening (145)  love (11)  story (289)  taylor (68)  swift (170)  car (230)  singing (2633)  

Tweet : Youtube and Facebook ftw!  
Feature Words : Youtube (89)  Facebook (81)  ftw (1233)  

Tweet : Red Devils champion of England for the 3rd time in a row. Next stop CL title  
Feature Words : Red (682)  Devils (2243)  champion (3349)  England (51)  3rd (1164)  time (13)  row (664)  stop (530)  CL (2485)  title (790)  

Tweet : @BrianVilo

Tweet : Is lying in bed with a babe  
Feature Words : Is (3020)  lying (558)  bed (65)  babe (1452)  

Tweet : @ashleyxtellez watching that 70s waiting for my headache to go away lasted all day... 
Feature Words : watching (6)  70s (1964)  waiting (20)  headache (7)  go (0)  away (115)  day (3)  

Tweet : England trip update: just saw stonehenge and now heading to london!  
Feature Words : England (51)  trip (172)  update (209)  saw (64)  stonehenge (3485)  heading (262)  london (152)  

Tweet : @imagejennation @whitrt we found a great Chinese place to hang out at  
Feature Words : found (39)  great (41)  Chinese (1751)  place (171)  hang (229)  

Tweet : Just got done watching the new House episode.Definitely one of the saddest episodes ever.  
Feature Words : got (1)  watching (6)  new (18)  House (296)  episode (506)  Definitely (304)  saddest (9)  episodes (506)  

Tweet : Was intending to finish editing my 536-page novel manuscript tonight 
Feature Words : Was (3020)  intending (1

Feature Words : yeah (107)  guys (120)  will (121)  totally (245)  bring (252)  dublin (2197)  party (277)  would (194)  love (11)  company (467)  

Tweet : Home sweet home after a long day in D.C. with Bennett 
Feature Words : Home (95)  sweet (300)  home (77)  long (159)  day (3)  D.C. (3538)  Bennett (2054)  

Tweet : psyched about my starbucks card. thanks lynn!  
Feature Words : psyched (1276)  starbucks (36)  card (290)  thanks (61)  lynn (3188)  

Tweet : @jaspreetgill http://twitpic.com/6ubr9 - woooho! they get better and better! I'm watching ur videos on YouTube right now haha 
Feature Words : woooho (1521)  get (1)  better (2)  better (2)  'm (3020)  watching (6)  ur (763)  videos (78)  YouTube (89)  right (139)  haha (687)  

Tweet : @Oprah np gurl! that was a great show!  
Feature Words : @Oprah (157)  np (3529)  gurl (1266)  was (3020)  great (41)  show (626)  

Tweet : Goodmorning cali hi to my vegas family  
Feature Words : Goodmorning (859)  cali (1677)  hi (1244)  vega

Feature Words : @ddlovato (355)  HAVE (69)  SURPRISE (2526)  U (488)  COME (2464)  LONDON (48)  wait (20)  

Tweet : Dropped my iPhone 
Feature Words : Dropped (260)  iPhone (24)  

Tweet : wow 
Feature Words : wow (131)  

Tweet : @jokerrrr It stillllll hasn't arrived  
Feature Words : stillllll (3385)  hasn't (2758)  arrived (1278)  

Tweet : i think all musicians should release instrumental versions of their previous albums 
Feature Words : think (8)  musicians (867)  versions (415)  previous (2024)  albums (360)  

Tweet : I'm tired. I feel like crap. And the world feels all crummy. Make me happy 
Feature Words : 'm (3020)  tired (132)  feel (12)  like (25)  crap (200)  world (276)  feels (12)  crummy (3466)  Make (112)  happy (60)  

Tweet : okay; my computer officially hates me.. -.- can't go on the internet; can't render vids on sony vegas.. ugh..  
Feature Words : okay (781)  computer (683)  officially (266)  hates (333)  n't (10)  go (0)  internet (372)  n't (10)  render (1786

Feature Words : is (3020)  wishing (15)  was (3020)  going (0)  be (3020)  LA (342)  w/ (2731)  da (819)  fam (1285)  LAKERS (40)  Parade (3828)  got (1)  ta (425)  stay (628)  grind (2644)  operation (1009)  get (1)  diego (1057)  

Tweet : Xbox 360 RROD'd - again! Two times already 
Feature Words : Xbox (59)  360 (267)  RROD'd (2062)  times (13)  

Tweet : My sticker is about to fall off. my Breast Cancer sticker I got from Danny's party - I stuck it on my phone.  
Feature Words : sticker (1382)  is (3020)  fall (169)  Breast (393)  Cancer (14)  sticker (1382)  got (1)  Danny (1055)  party (277)  stuck (313)  phone (162)  

Tweet : obama 
Feature Words : obama (240)  

Tweet : @nrg07 i knooww.. i once watched marley and me and oprah at the same day 
Feature Words : knooww (2206)  watched (6)  marley (1649)  oprah (1473)  day (3)  

Tweet : @mattmacdonaldis epic loss I was disappointed in LeBron 
Feature Words : epic (1770)  loss (550)  was (3020)  disappointed (947)  LeBron (35)  

T

Feature Words : way (119)  vegas (108)  looking (21)  fierce (2274)  amp (1113)  bronzed (2277)  

Tweet : England winning by 2 goals to nil right now  
Feature Words : England (51)  winning (49)  2 (31)  goals (1027)  nil (2548)  right (139)  

Tweet : I want to meet Kevin Jonas in person. and Coco Martin. and Taylor Swift.  
Feature Words : want (4)  meet (141)  Kevin (2999)  Jonas (419)  person (738)  Coco (2265)  Martin (3441)  Taylor (34)  Swift (37)  

Tweet : @TDLQ Well it was just so so close the whole time! But at least they pulled it out! WOOT LAKERS!  
Feature Words : was (3020)  close (417)  time (13)  pulled (482)  WOOT (2040)  LAKERS (52)  

Tweet : Good night and good day twitters!  
Feature Words : Good (2)  night (16)  good (2)  day (3)  twitters (113)  

Tweet : This is awesome 
Feature Words : is (3020)  awesome (128)  

Tweet : Great stuff this wk: my bike ride in the wild on Sun.; Obama's speech in Cairo; last episode of Lost 3; Lakers 1st win; my wife's smile  
Fe

Feature Words : have (69)  bird (1388)  living (74)  have (69)  find (39)  ways (119)  kill (237)  damn (160)  things (105)  exterminator (3496)  vegas (108)  sucks (91)  

Tweet : I updated to OS 3.0 on my iPod Touch. A few apps have stopped working. http://bit.ly/17nwOf 
Feature Words : updated (347)  OS (2080)  3.0 (188)  iPod (87)  Touch (611)  apps (344)  have (69)  stopped (191)  working (30)  

Tweet : bedtime 
Feature Words : bedtime (967)  

Tweet : new music videos today in my YouTube Channel. thanks and please subscribe. lol. http://bit.ly/17NsuD 
Feature Words : new (18)  music (143)  videos (78)  today (5)  YouTube (89)  Channel (2952)  thanks (61)  subscribe (1022)  lol (53)  

Tweet : i'm thinking i'm blessed that ive a cmputr &amp; a fireplace on ths cold evening. Feeling content 
Feature Words : 'm (3020)  thinking (8)  'm (3020)  blessed (911)  ve (69)  cmputr (2600)  amp (29)  fireplace (3752)  cold (330)  evening (432)  Feeling (12)  content (2718)  

Tweet : @jazre

Feature Words : Right (634)  saw (64)  Vegas (50)  end (714)  April (1307)  were (3020)  awesome (128)  usual (749)  

Tweet : @RoundSparrow Awesome - hope it was great! I am about 2 hours away from seeing it at the Sydney (now second world viewing) premiere  
Feature Words : Awesome (2794)  hope (106)  was (3020)  great (41)  am (3020)  2 (31)  hours (80)  away (115)  seeing (64)  Sydney (58)  second (368)  world (276)  viewing (2849)  premiere (1167)  

Tweet : Like the tweetdeck for iPhone - can't sync columns ATM tho  
Feature Words : Like (25)  tweetdeck (623)  iPhone (24)  n't (10)  sync (2202)  columns (897)  tho (421)  

Tweet : sick! fever-ish 
Feature Words : sick (62)  fever (564)  ish (1345)  

Tweet : @MusicSnob75 I want starbucks and have no time to stop before work  
Feature Words : want (4)  starbucks (36)  have (69)  time (13)  stop (191)  work (23)  

Tweet : has a stupid stomach ache and fever  
Feature Words : has (69)  stupid (155)  stomach (1358)  ache (2334)  fev

Feature Words : Apple (680)  Charging (855)  Download (3736)  Apps (2328)  iPhone (24)  makes (112)  sence (2869)  

Tweet : I was watching Pride and Prejudice 
Feature Words : was (3020)  watching (6)  Pride (1010)  Prejudice (3343)  

Tweet : nice song by Taylor Swift http://bit.ly/hcsm7 
Feature Words : nice (72)  song (82)  Taylor (34)  Swift (37)  

Tweet : @Lee_Knight lmao! thanks Lee XD 
Feature Words : lmao (597)  thanks (61)  Lee (1169)  

Tweet : @ThePartyScene &quot;you dream in black and white 
Feature Words : quot;you (2194)  dream (555)  white (670)  

Tweet : Happy birthday sydney elizabeth mitchin  
Feature Words : Happy (60)  birthday (320)  sydney (109)  

Tweet : Love how my sister thinks she's Miley Cyrus or Taylor Swift  
Feature Words : Love (11)  sister (243)  thinks (8)  's (3020)  Miley (459)  Cyrus (620)  Taylor (34)  Swift (37)  

Tweet : @BrazillofBLAK cant wait 2 hear it  
Feature Words : nt (10)  wait (20)  2 (31)  hear (101)  

Tweet : @NILANTI atleast ur

Feature Words : 's (3020)  hoping (189)  come (17)  home (95)  Las (663)  Vegas (50)  bailout (2825)  

Tweet : Morning everyone! What a beautiful Day...Yay!  
Feature Words : Morning (1351)  beautiful (233)  Day (3)  Yay (1286)  

Tweet : Hacked my PSP with ChickHEN 
Feature Words : Hacked (640)  PSP (138)  ChickHEN (1510)  

Tweet : @meowmixfever we should go to the grocery store and buy a large box of them that'd make my life complete haha _Myana&lt;3 
Feature Words : go (0)  grocery (2950)  store (2980)  buy (149)  large (569)  box (1249)  'd (1401)  make (112)  life (150)  complete (1183)  haha (136)  

Tweet : @ddlovato yesterday &quot;sonny with a chance&quot; came to brazil 
Feature Words : @ddlovato (1182)  yesterday (125)  quot;sonny (3442)  chance&quot (3480)  came (17)  brazil (2469)  

Tweet : @sdownes1972 thx Stu will do!  
Feature Words : thx (3653)  Stu (1898)  will (121)  

Tweet : @DJTinaSapp Ha! I'll get it back to you as soon as possible!  
Feature Words : Ha (709) 

Feature Words : oh (28)  'm (3020)  sorry (117)  n't (10)  think (8)  retweeting (1391)  

Tweet : Matt was allowed a McDonalds. I wasnt. So unfair! All I had was yoghart!  
Feature Words : Matt (1068)  was (3020)  allowed (400)  McDonalds (55)  was (3020)  unfair (2845)  had (69)  was (3020)  yoghart (1837)  

Tweet : @Bonedwarf MGS4 isn't on XBox  
Feature Words : MGS4 (3526)  is (3020)  n't (10)  XBox (59)  

Tweet : @PrincessSuperC Cavs down by 20 3 mins left... just in case u arent watching.. its over for Lebron  
Feature Words : Cavs (3378)  20 (589)  3 (46)  mins (1284)  left (75)  case (343)  u (488)  are (3020)  nt (10)  watching (6)  Lebron (35)  

Tweet : @JonasAustralia absolutely nothing. like watching JONAS on youtube and homework  
Feature Words : @JonasAustralia (557)  absolutely (1423)  like (25)  watching (6)  JONAS (419)  youtube (66)  homework (280)  

Tweet : Shiner is taking up all my bed and blankets!  
Feature Words : is (3020)  taking (85)  bed (65)  blankets (

Feature Words : is (3020)  work (23)  saddened (1184)  Ted (1767)  Baker (1953)  London (48)  not (10)  being (3020)  able (721)  ship (378)  continental (812)  want (4)  hat (1088)  

Tweet : Been to opticians 
Feature Words : Been (3020)  opticians (3730)  

Tweet : @teefury Not seeing it. The Facebook preview? looked all up and down your body.... of your web page Still nvr got shrt U snt. BURROWISH  
Feature Words : Not (10)  seeing (64)  Facebook (81)  preview (1290)  looked (21)  body (563)  web (1121)  page (307)  nvr (3277)  got (1)  shrt (2525)  U (1052)  snt (1477)  BURROWISH (3479)  

Tweet : @ddlovato Wish i could  
Feature Words : @ddlovato (355)  Wish (15)  

Tweet : Nasty budget due and my iphone is being sent to Apple today.  
Feature Words : budget (3281)  iphone (71)  is (3020)  being (3020)  sent (301)  Apple (788)  today (5)  

Tweet : I had to bury my iPod and buy a new one.  
Feature Words : had (69)  iPod (92)  buy (149)  new (18)  

Tweet : @joshuadejong Don't kn

Tweet : Not so good of a morning The lakers lost 
Feature Words : Not (10)  good (2)  morning (57)  lakers (52)  lost (22)  

Tweet : Still doing my homework!!!  
Feature Words : homework (280)  

Tweet : Heard my baby is running a fever and not feeling good. Wishing I was home.  
Feature Words : Heard (101)  baby (129)  is (3020)  running (164)  fever (27)  not (10)  feeling (12)  good (2)  Wishing (15)  was (3020)  home (95)  

Tweet : #Cavs lost 
Feature Words : Cavs (33)  lost (22)  

Tweet : cant be bothered to get a starbuckss if my sis was here get it for me I MISS HER!!! i feel sooooooooo feel like upset 
Feature Words : nt (10)  be (3020)  bothered (777)  get (1)  starbuckss (2008)  sis (292)  was (3020)  get (1)  MISS (19)  feel (12)  feel (12)  like (25)  upset (238)  

Tweet : Crying 
Feature Words : Crying (154)  

Tweet : Fever 
Feature Words : Fever (27)  

Tweet : @_despina my friend took off the obama pin and lit it on fire  
Feature Words : friend (63)  took (85)  oba

Feature Words : sister (243)  told (90)  'm (3020)  n't (10)  have (69)  fever (27)  

Tweet : No ipod today. Ughh  
Feature Words : ipod (92)  today (5)  Ughh (1727)  

Tweet : @MrJRGregory Im 20 - believe me 
Feature Words : m (3020)  20 (589)  believe (183)  

Tweet : Does anyone want to buy me Taylor Swift tickets? I just want to take my sister. And I didn't win them on the radio.  
Feature Words : want (4)  buy (149)  Taylor (34)  Swift (37)  tickets (235)  want (4)  take (85)  sister (243)  n't (10)  win (49)  radio (508)  

Tweet : iPhone update 3.0 scheduled cor hume 17th... They have only 12 hours left 
Feature Words : iPhone (71)  update (209)  3.0 (188)  scheduled (3670)  cor (2711)  hume (1775)  17th (2213)  have (69)  12 (729)  hours (80)  left (75)  

Tweet : xbox live is Down?  
Feature Words : xbox (73)  live (74)  is (3020)  

Tweet : i miss my psp  
Feature Words : miss (19)  psp (138)  

Tweet : @BruceDangle While gloating will be fun 
Feature Words : gloating (2028)

Feature Words : Starbucks (36)  best (2)  way (119)  start (126)  morning (57)  

Tweet : Attending MeM2009 conference in London. Very boring so far  
Feature Words : Attending (600)  MeM2009 (3160)  conference (1385)  London (48)  boring (402)  far (202)  

Tweet : @sickathanavg know how that goes. How old is your son? I have baby fever. I get it every spring my nephew has given me the worst case. 
Feature Words : know (32)  goes (0)  old (110)  is (3020)  son (1245)  have (69)  baby (129)  fever (27)  get (1)  spring (492)  nephew (2310)  has (69)  given (351)  worst (26)  case (343)  

Tweet : My iPod touch is broken the on/off button doesn't work! 
Feature Words : iPod (87)  touch (293)  is (3020)  broken (104)  on/off (2141)  button (900)  n't (10)  work (30)  

Tweet : iphone 
Feature Words : iphone (71)  

Tweet : @ddlovato I will add prayers for your friends dad while I also pray for my own. He has cancer I hate having parents sick 
Feature Words : @ddlovato (355)  will (121)  

Feature Words : rt (1758)  @NASA (986)  NASA (268)  managers (847)  have (69)  decided (1002)  postpone (1253)  launch (373)  space (731)  shuttle (523)  Endeavour (1040)  hydrogen (533)  leak (454)  

Tweet : @sweeetnspicy hiii im on my ipod...i cant fall asleep  
Feature Words : m (3020)  ipod (92)  nt (10)  fall (169)  asleep (302)  

Tweet : Sucks to be me having exactly 0GB left on my iphone. I want new apps! time to live through Abbie 
Feature Words : Sucks (836)  be (3020)  having (69)  exactly (1073)  0 (507)  GB (857)  left (75)  iphone (71)  want (4)  new (18)  apps (344)  time (13)  live (74)  Abbie (1716)  

Tweet : Btw 
Feature Words : Btw (3646)  

Tweet : 6:39am: getting ready to go to school. i'm looking for my PSP though 
Feature Words : 6:39am (3265)  getting (1)  ready (147)  go (0)  school (193)  'm (3020)  looking (21)  PSP (138)  

Tweet : @thecoolestout Ha the sun's already gone  
Feature Words : @thecoolestout (1116)  Ha (3084)  sun (422)  's (3020)  gone (0)  


Tweet : @SashaBaby22 haha! Ps 
Feature Words : haha (136)  Ps (606)  

Tweet : SissyDawnie: @CokieTheCat - Marvelous on 10 cancer-free years!!!!! YAY!! *** Thanks! Yeah. We're all pretty psyched about that!  
Feature Words : SissyDawnie (3117)  Marvelous (2878)  10 (420)  cancer (14)  free (190)  years (43)  YAY (676)  Thanks (61)  Yeah (107)  're (3020)  pretty (168)  psyched (1276)  

Tweet : @sydney_sider yes thanks I think they're amazing too! the images were taken by @insidecuisine photographer the very talented @rovingrob  
Feature Words : yes (231)  thanks (61)  think (8)  're (3020)  amazing (118)  images (675)  were (3020)  taken (85)  photographer (3102)  talented (651)  

Tweet : I wonder what jon thinks when he see's all his tweets 
Feature Words : wonder (236)  jon (3733)  thinks (8)  see (64)  's (3020)  tweets (179)  

Tweet : and now off to bed after an amazing night chatting with a pretty amazing guy ( you know who you are) 
Feature Words : bed (65)  amazing (118)  nig

Feature Words : HAHA (577)  funny (380)  encouraged (2649)  create (1400)  Twitter (161)  

Tweet : @alicayaba so cuuute! hey 
Feature Words : cuuute (2746)  hey (103)  

Tweet : road trip with the boy. so far we have discovered an Obama cafe. haha 
Feature Words : road (724)  trip (172)  boy (248)  far (202)  have (69)  discovered (1440)  Obama (47)  cafe (2115)  haha (136)  

Tweet : @Ayerad no 
Feature Words : 

Tweet : @onehunnidt Amen to that. I hate the Lakers so I really hope Orlando takes it cuz the Cavs lost it  
Feature Words : Amen (1790)  hate (54)  Lakers (40)  hope (106)  Orlando (426)  takes (85)  cuz (994)  Cavs (33)  lost (22)  

Tweet : very sad after cavs loss  
Feature Words : sad (9)  cavs (835)  loss (550)  

Tweet : Hoping that just a single #squarespace might get me another iPhone. I need a 3G to test against !! Tired of sacrificing my personal phone  
Feature Words : Hoping (189)  single (522)  squarespace (524)  get (1)  iPhone (24)  need (42)  3 (46)  G (706)

Feature Words : Ugh (3467)  Las (2528)  vegas (108)  airport (512)  is (3020)  quot;ground (3473)  stop&quot (1617)  means (211)  'm (3020)  stuck (485)  plane (1258)  tarmac (2582)  LAX (1532)  

Tweet : xbox live is down no netflix 
Feature Words : xbox (73)  live (74)  is (3020)  

Tweet : Gpsphone is no longer free by the update it asks for a donation  
Feature Words : Gpsphone (3782)  is (3020)  longer (311)  free (190)  update (209)  asks (352)  donation (3333)  

Tweet : mad because i have to watch Obama give his speech on why GM is now Obama motors while i should be watching the showcase on price is right  
Feature Words : mad (668)  have (69)  watch (6)  Obama (47)  give (351)  speech (863)  GM (3366)  is (3020)  Obama (47)  motors (2098)  be (3020)  watching (6)  price (451)  is (3020)  right (247)  

Tweet : this is now two in the row - burnt starbucks coffeee #burntcoffee 
Feature Words : is (3020)  row (664)  burnt (468)  starbucks (36)  coffeee (2346)  burntcoffee (2937) 

Feature Words : monkeys (2257)  found (39)  twin (610)  wo (88)  nt (10)  write (325)  'm (3020)  heartbroken (962)  

Tweet : The new PSP looks crap! Such a disappointment after the conceot pics that were floating around  
Feature Words : new (18)  PSP (83)  looks (21)  crap (200)  disappointment (1315)  conceot (3215)  pics (177)  were (3020)  floating (3250)  

Tweet : @enia59 Aw 
Feature Words : Aw (768)  

Tweet : My soarthroats and coughs have been joined by fellow fever and flue. It's like a gathering of illnesses!... Did I get struck by the H1N1?  
Feature Words : soarthroats (1938)  coughs (1058)  have (69)  been (3020)  joined (810)  fellow (1304)  fever (27)  flue (3687)  's (3020)  like (25)  gathering (2460)  get (1)  struck (622)  H1N1 (2754)  

Tweet : I cant sleep and i think im getting a fever  
Feature Words : nt (10)  sleep (165)  think (8)  m (3020)  getting (1)  fever (27)  

Tweet : I feel like shit. This is NOT the way I want to spend my birthday's eve  
Feature 

Feature Words : Xbox (59)  live (74)  is (3020)  

Tweet : Sleep time. Tomorrow is gonna suck  
Feature Words : time (13)  Tomorrow (93)  is (3020)  gon (114)  na (84)  suck (91)  

Tweet : GAH! I hate it when my children get a fever in the summer stupid bugs! STUPID WEATHER!!! 
Feature Words : GAH (802)  hate (54)  children (754)  get (1)  fever (27)  summer (287)  stupid (155)  bugs (3702)  STUPID (155)  WEATHER (694)  

Tweet : Just lost $160  
Feature Words : lost (22)  $ (303)  160 (3232)  

Tweet : sad that the 'feet' of my macbook just fell off : sad that the 'feet' of my macbook just fell off  
Feature Words : sad (9)  feet (560)  macbook (807)  fell (169)  sad (9)  feet (560)  macbook (807)  fell (169)  

Tweet : London had to get five shots today I felt terrible were both miserable him from the shots me from my root canal  
Feature Words : London (48)  had (69)  get (1)  shots (742)  today (5)  felt (12)  terrible (431)  were (3020)  miserable (433)  shots (742)  root (2653) 

Feature Words : is (3020)  hacked (640)  wonder (236)  fuck (546)  sold (279)  Oh (28)  know (32)  was (3020)  white (670)  thought (8)  was (3020)  ugly (494)  PSP (83)  

Tweet : gutted that our trip to london NEVER happened. maybe another time ...  
Feature Words : gutted (648)  trip (172)  london (842)  happened (212)  maybe (197)  time (13)  

Tweet : Now on park and ride bus back to my car but haven't got my Ipod today  
Feature Words : park (776)  ride (3494)  bus (920)  car (230)  haven't (608)  got (1)  Ipod (87)  today (5)  

Tweet : oh... burning up. this is not good. fever  
Feature Words : oh (28)  burning (468)  is (3020)  not (10)  good (2)  fever (27)  

Tweet : @FollowSavvy I never found her. everytime I click on her twitter thing through your myspace..... it goes to some dude's page  
Feature Words : found (39)  everytime (366)  click (2100)  twitter (113)  thing (105)  goes (0)  dude (1150)  page (307)  

Tweet : I am officially banning godaddy.com from my comp. My h

Feature Words : Retweeting (1391)  Reuters (3621)  Actress (2606)  Farrah (226)  Fawcett (221)  dies (124)  cancer (14)  

Tweet : @katebornstein which is pretty anti memorial tattoos but for all but the strictest 
Feature Words : is (3020)  pretty (168)  anti (3446)  memorial (2358)  tattoos (1007)  strictest (3623)  

Tweet : @coollike Hey how much friends have you got on Xbox cause your friends list is full it would not let me add you.  
Feature Words : Hey (103)  friends (63)  have (69)  got (1)  Xbox (59)  cause (364)  friends (63)  list (2400)  is (3020)  would (194)  not (10)  let (130)  add (242)  

Tweet : @alielayus I want to go to promote GEAR AND GROOVE but unfornately no ride there I may b going to the one in Anaheim in May though 
Feature Words : want (4)  go (0)  promote (2276)  GEAR (1586)  GROOVE (2111)  unfornately (2876)  ride (336)  b (1043)  going (0)  Anaheim (1562)  

Tweet : Is feeling alittle better but is in a hayfever craze  
Feature Words : Is (3020)  feelin

Tweet : hey guess was @magicmanil the Lakers won and KOBE is mvp just thought I would tell ya haha 
Feature Words : hey (103)  guess (570)  was (3020)  Lakers (40)  won (49)  KOBE (213)  is (3020)  thought (8)  would (194)  tell (90)  ya (505)  haha (136)  

 Word-Counts as a sparse matrix
  (0, 0)	1
  (0, 19)	1
  (0, 153)	1
  (0, 1324)	1
  (0, 3020)	1
  (0, 3270)	1
  (1, 1)	1
  (1, 21)	1
  (1, 26)	1
  (1, 35)	1
  (1, 38)	1
  (1, 140)	1
  (1, 142)	1
  (1, 339)	1
  (1, 422)	1
  (1, 548)	1
  (1, 727)	1
  (1, 2483)	1
  (1, 3020)	2
  (1, 3573)	1
  (2, 1307)	1
  (2, 2195)	1
  (2, 2441)	1
  (2, 2838)	1
  (2, 3293)	1
  :	:
  (2016, 66)	1
  (2016, 77)	1
  (2016, 124)	1
  (2016, 464)	1
  (2016, 3020)	1
  (2017, 53)	1
  (2017, 83)	1
  (2018, 2)	1
  (2018, 3)	1
  (2018, 51)	1
  (2018, 57)	1
  (2018, 233)	1
  (2018, 783)	1
  (2018, 3020)	1
  (2019, 8)	1
  (2019, 40)	1
  (2019, 49)	1
  (2019, 90)	1
  (2019, 103)	1
  (2019, 136)	1
  (2019, 194)	1
  (2019, 213)	1
  (2019, 505)	1
  (2019, 570)	1
  (20

In [18]:
# Featurise one new sentence. getTermFreq caches its model in tfidf_model, so
# if an earlier cell already fitted it, that model is reused here rather than
# being refitted on this single sentence.
print tweets2features(['xbox is boring. I would rather have a psp than an xbox'] ,vb=True,idf=True)



Tweet : xbox is boring. I would rather have an psp than an xbox 
Feature Words : xbox (73)  is (3020)  boring (402)  would (194)  have (69)  psp (138)  xbox (73)  

 Word-Counts as a sparse matrix
  (0, 69)	1
  (0, 73)	2
  (0, 138)	1
  (0, 194)	1
  (0, 402)	1
  (0, 3020)	1
  (0, 3020)	0.120873649125
  (0, 402)	0.447919718278
  (0, 194)	0.317167734246
  (0, 138)	0.383553411694
  (0, 73)	0.708532199142
  (0, 69)	0.187165751233
