In [1]:
import pandas as pd
from applied_data_science.sentiment_pipeline import SentimentPipeline

In [2]:
# Load the example tweets. The CSV has no header row, so the six column names are
# supplied here; only the columns used below are parsed (`usecols`), which avoids
# reading 'sentiment', 'query' and 'user_id' for every row just to drop them again.
df = pd.read_csv(
    'data.csv',
    encoding='latin',
    header=None,
    names=['sentiment', 'id', 'date', 'query', 'user_id', 'text'],
    usecols=['id', 'date', 'text'],
)
# Parse dates such as "Tue Jun 16 08:40:50 PDT 2009". "PDT" is matched as literal
# text, so the parsed values are timezone-naive and a row carrying any other zone
# abbreviation would fail to parse.
df['date'] = pd.to_datetime(df['date'], format="%a %b %d %H:%M:%S PDT %Y")
df.head(10)

Unnamed: 0,id,date,text
0,1467810369,2009-04-06 22:19:45,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,1467810672,2009-04-06 22:19:49,is upset that he can't update his Facebook by ...
2,1467810917,2009-04-06 22:19:53,@Kenichan I dived many times for the ball. Man...
3,1467811184,2009-04-06 22:19:57,my whole body feels itchy and like its on fire
4,1467811193,2009-04-06 22:19:57,"@nationwideclass no, it's not behaving at all...."
5,1467811372,2009-04-06 22:20:00,@Kwesidei not the whole crew
6,1467811592,2009-04-06 22:20:03,Need a hug
7,1467811594,2009-04-06 22:20:03,@LOLTrish hey long time no see! Yes.. Rains a...
8,1467811795,2009-04-06 22:20:05,@Tatiana_K nope they didn't have it
9,1467812025,2009-04-06 22:20:09,@twittera que me muera ?


In [3]:
# Pipeline settings:
#   workers   - keep at or below the number of CPU cores
#   chunksize - choose relative to dataset size (this dataset has 1.6 million tweets)
#   threshold - 0.05 is the value recommended by the VADER docs
#   *_label   - labels for positive / neutral / negative; change them if you want
pipe = SentimentPipeline(
    workers=10,
    chunksize=100000,
    threshold=0.05,
    pos_label=1,
    neu_label=0,
    neg_label=-1,
)
output = pipe(df['text'])
output.head(10)


CLEANING
100%|██████████| 1600000/1600000 [00:01<00:00, 876691.35it/s]

ANALYSING
100%|██████████| 1600000/1600000 [00:10<00:00, 147925.56it/s]

CLASSIFYING
100%|██████████| 1600000/1600000 [00:00<00:00, 2518399.11it/s]

DONE


Unnamed: 0,clean_text,sentiment_score,sentiment
0,"- Awww, that's a bummer. You shoulda got Davi...",-0.0173,0
1,is upset that he can't update his Facebook by ...,-0.75,-1
2,I dived many times for the ball. Managed to s...,0.4939,1
3,my whole body feels itchy and like its on fire,-0.25,-1
4,"no, it's not behaving at all. i'm mad. why am...",-0.4939,-1
5,not the whole crew,0.0,0
6,Need a hug,0.4767,1
7,"hey long time no see! Yes.. Rains a bit ,only...",0.6208,1
8,nope they didn't have it,0.0,0
9,que me muera ?,0.0,0


In [4]:
# How to merge the pipeline output with the existing df: a left join on the index.
# Columns left over from a previous run of this cell are dropped first, so
# re-running it does not produce suffixed duplicates (`clean_text_x`, `clean_text_y`, ...).
# NOTE: this also replaces any df column that shares a name with an output column;
# the raw 'sentiment' column was already dropped when the data was loaded.
df = df.drop(columns=output.columns, errors='ignore').join(output, how='left')
df.head(10)

Unnamed: 0,id,date,text,clean_text,sentiment_score,sentiment
0,1467810369,2009-04-06 22:19:45,"@switchfoot http://twitpic.com/2y1zl - Awww, t...","- Awww, that's a bummer. You shoulda got Davi...",-0.0173,0
1,1467810672,2009-04-06 22:19:49,is upset that he can't update his Facebook by ...,is upset that he can't update his Facebook by ...,-0.75,-1
2,1467810917,2009-04-06 22:19:53,@Kenichan I dived many times for the ball. Man...,I dived many times for the ball. Managed to s...,0.4939,1
3,1467811184,2009-04-06 22:19:57,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire,-0.25,-1
4,1467811193,2009-04-06 22:19:57,"@nationwideclass no, it's not behaving at all....","no, it's not behaving at all. i'm mad. why am...",-0.4939,-1
5,1467811372,2009-04-06 22:20:00,@Kwesidei not the whole crew,not the whole crew,0.0,0
6,1467811592,2009-04-06 22:20:03,Need a hug,Need a hug,0.4767,1
7,1467811594,2009-04-06 22:20:03,@LOLTrish hey long time no see! Yes.. Rains a...,"hey long time no see! Yes.. Rains a bit ,only...",0.6208,1
8,1467811795,2009-04-06 22:20:05,@Tatiana_K nope they didn't have it,nope they didn't have it,0.0,0
9,1467812025,2009-04-06 22:20:09,@twittera que me muera ?,que me muera ?,0.0,0


In [5]:

# NOTE(review): everything below is disabled scratch code for plotting daily counts of
# positive vs. negative tweets. It will not run as-is if uncommented in this notebook:
#   - `pickle` and `plt` are used but are not imported in the import cell;
#   - it filters on sentiment == 0 / sentiment == 4, whereas the pipeline above is
#     configured with labels -1 / 0 / 1 (presumably 0/4 were the raw CSV labels - confirm);
#   - `pickle.load` executes arbitrary code from the file, so only load a trusted 'clean.pkl'.
# Either restore it as a working, documented helper or delete this cell.

# with open('clean.pkl', 'rb') as f:
#     df = pickle.load(f)

# def filter_by_keywords(df, keywords):
#     return df[pd.DataFrame(df.text.tolist()).isin(keywords).any(1).values]

# def time_series(df, keywords=[]):
#     if keywords:
#         df = filter_by_keywords(df, keywords)
#     df = df.set_index(df.date)
#     df = df.drop(['id', 'text', 'date'], axis=1)
#     # print(df[df.sentiment==4].head(100))

#     neg = df[df.sentiment==0]
#     pos = df[df.sentiment==4]

#     pos = pos.resample('D').agg({'sentiment':'size'}).fillna(0) 
#     pos = pos.rename(columns={'sentiment':'count'})


#     neg = neg.resample('D').agg({'sentiment':'size'}).fillna(0) 
#     neg = neg.rename(columns={'sentiment':'count'})
#     # df = df.resample('D').agg({'sentiment':'size'}).fillna(0)
#     # df.rename(columns={"date": "count"})


#     # neg.set_index(pos.date, inplace=True)
#     # neg = pos.resample('D').sum().fillna(0)
#     return pos, neg

# # print(df.head(100000))
# pos, neg = time_series(df)
# ax = plt.gca()
# pos.plot(kind='line',y='count', ax=ax)
# neg.plot(kind='line',y='count', ax=ax)
# ax.legend(["Positive", "Negative"])

# plt.show()