In [3]:
import pandas as pd
from pathlib import Path
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer 


## Data importation
Technical difficulties prevented me from importing full dataset. Despite encoding of the original csv in UTC-8, 
Jupyter Lab insisted the original csv. I was able to open the original csv in Excel, export it as a new csv specified to be encoded in UTC-8, and then import that new csv in Jupyter Lab successfully. This limited the dataset to 1M lines, removing some 600,000 rows. This should still be a sizable dataset for analysis, however as the original data was ordered by sentiment, this removed a significant amount of positive sentimented text.

In [4]:
filepath = Path("training8.csv")
tweets = pd.read_csv(filepath, header=None, names=["sentiment", "id", "time", "query", "name", "text"])
tweets

Unnamed: 0,sentiment,id,time,query,name,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
...,...,...,...,...,...,...
999995,4,1879942807,Thu May 21 23:36:19 PDT 2009,NO_QUERY,divabat,"@healingsinger thank you, i needed that"
999996,4,1879942922,Thu May 21 23:36:20 PDT 2009,NO_QUERY,nick1975,@vactress http://bit.ly/cADea Maybe this is m...
999997,4,1879942975,Thu May 21 23:36:21 PDT 2009,NO_QUERY,znmeb,"@Brat13 Hell, Windows 7 will be out of my pric..."
999998,4,1879943113,Thu May 21 23:36:22 PDT 2009,NO_QUERY,virmani,@jigardoshi neah.. i wish! just reminiscing r...


In [5]:
#Does column 'NO_QUERY' contain any useful data or is every row just 'NO_QUERY'?
tweets['query'].value_counts()

NO_QUERY    1000000
Name: query, dtype: int64

In [6]:
# Tweet id's and query have no data useful for sentiment analysis. There could be something in 'time,' but
# without timezone data it can't be determined what time of day tweets were sent. Names are unlikely to be useful.
tweets = tweets.drop(columns=['id', 'time', 'query', 'name'])
tweets.head()

Unnamed: 0,sentiment,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


In [60]:
# data cleaning ideas:
# count tokens beginning with "@," store as new column "reply_depth," remove tokens"
#     keep count of "@" token 
# count tokens beginning with "http://," store as new column "links," remove tokens"
#     keep count of "http" token

#     word_tokenize 
#     

reply_depth = []

for row in tweets:
    tweet_text = row[2]
    mention_pattern = r"@\w+"
    mention_count = len(re.findall(mention_pattern, tweet_text))
    reply_depth.append(mention_count)
    
tweets['reply_depth'] = pd.Series(reply_depth)



In [61]:
tweets.head()

Unnamed: 0,sentiment,text,reply_depth
0,0,@ switchfoot httptwitpiccomyzl awww thats bumm...,0.0
1,0,upset cant update facebook texting might cry r...,0.0
2,0,@ kenichan dived many time ball managed save r...,0.0
3,0,whole body feel itchy like fire,
4,0,@ nationwideclass behaving im mad cant see,


In [52]:
tweets.iloc[0:2]

Unnamed: 0,sentiment,text,reply_depth
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",0.0
1,0,is upset that he can't update his Facebook by ...,0.0


In [56]:
# Lemmatizer script
lemmatizer = WordNetLemmatizer()

def process_text(text): 
    sw = set(stopwords.words('english')) 
    regex = re.compile("[^a-zA-Z@ ]") 
    re_clean = regex.sub('', text) 
    words = word_tokenize(re_clean) 
    lem = [lemmatizer.lemmatize(word) for word in words] 
    output = ' '.join([word.lower() for word in lem if word.lower() not in sw]) 
    return output

In [57]:
tweets['text'] = tweets['text'].apply(lambda x: process_text(x))

In [58]:
tweets.head()

Unnamed: 0,sentiment,text,reply_depth
0,0,@ switchfoot httptwitpiccomyzl awww thats bumm...,0.0
1,0,upset cant update facebook texting might cry r...,0.0
2,0,@ kenichan dived many time ball managed save r...,0.0
3,0,whole body feel itchy like fire,
4,0,@ nationwideclass behaving im mad cant see,
