In [1]:
from sklearn.preprocessing import LabelEncoder
from textblob import TextBlob
import pandas as pd
import numpy as np
import re

In [2]:
# Load the previously cleaned tweet export and preview its first rows.
# NOTE(review): the preview shows an 'Unnamed: 0' column, which suggests the
# CSV was saved together with its index -- confirm before relying on it.
df = pd.read_csv(
    filepath_or_buffer='../csv/elonmusk_cleaned.csv',
    encoding='UTF-8',
)
df.head()

Unnamed: 0,ID,Date,Tweet,Retweets,Tweet Length
0,0,2010-06-05,"Please ignore prior tweets, as that was someon...",5453,106
1,1,2011-12-01,Went to Iceland on Sat to ride bumper cars on ...,188,129
2,2,2011-12-01,I made the volume on the Model S http://ow.ly/...,78,125
3,3,2011-12-03,"Great Voltaire quote, arguably better than Twa...",44,145
4,4,2011-12-03,That was a total non sequitur btw\n26\n14\n50,50,42


In [3]:
# Remove links from tweet (cuts the text at the first link)
def clean_tweet_link(dirty_string):
    """Return the part of a tweet that precedes its first link.

    The text is cut at the earliest occurrence of ``http://`` or
    ``https://``; the link and everything after it are dropped. A tweet
    without a link is returned unchanged.

    Parameters
    ----------
    dirty_string : str
        Raw tweet text.

    Returns
    -------
    str
        Text before the first link (trailing whitespace is kept).
    """
    # A single pattern covers both schemes, so the cut always happens at
    # whichever link comes first instead of preferring "http://" even when
    # an "https://" link appears earlier in the text.
    return re.split(r'https?://', dirty_string, maxsplit=1)[0]

In [4]:
# Strip links from every tweet.
# .assign returns a new DataFrame, so the raw `df` loaded above keeps its
# original 'Tweet' column instead of being overwritten through an alias.
cleaned_tweet1 = df.assign(Tweet=df['Tweet'].apply(clean_tweet_link))

In [5]:
# Preview the first five rows after link removal.
cleaned_tweet1.head(n=5)

Unnamed: 0,ID,Date,Tweet,Retweets,Tweet Length
0,0,2010-06-05,"Please ignore prior tweets, as that was someon...",5453,106
1,1,2011-12-01,Went to Iceland on Sat to ride bumper cars on ...,188,129
2,2,2011-12-01,I made the volume on the Model S,78,125
3,3,2011-12-03,"Great Voltaire quote, arguably better than Twa...",44,145
4,4,2011-12-03,That was a total non sequitur btw\n26\n14\n50,50,42


In [6]:
def clean_tweet_all(dirty_string):
    """Strip reply markers, mentions, counters, hashtags and symbols from a tweet.

    Steps, in order:
      1. drop the literal text ``Replying to``;
      2. drop ``@username`` mentions (letters, digits and underscores);
      3. drop any run of three consecutive digits-only lines
         (``\\n<digits>\\n<digits>\\n<digits>``);
      4. drop hashtags that are preceded by whitespace (the whitespace goes too);
      5. turn remaining newlines into spaces;
      6. drop every character that is not an ASCII letter, digit or space.

    Parameters
    ----------
    dirty_string : str
        Tweet text.

    Returns
    -------
    str
        Cleaned text containing only ``A-Z``, ``a-z``, ``0-9`` and spaces.
    """
    # Remove replies
    text = re.sub(r'Replying to', '', dirty_string)
    # Remove @username (underscore included so a handle such as "@a_b" is
    # removed whole rather than leaving "_b" behind)
    text = re.sub(r'@[A-Za-z0-9_]+', '', text)
    # Remove the three digits-only lines - unnecessary info
    text = re.sub(r'\n[0-9]+\n[0-9]+\n[0-9]+', '', text)
    # Remove hashtags
    text = re.sub(r'\s#[A-Za-z0-9_]+', '', text)
    # Replace newlines with spaces. This has to run before the symbol filter:
    # that filter deletes "\n" as well, which would glue the last word of one
    # line to the first word of the next.
    text = re.sub(r'\n', ' ', text)
    # Remove symbols
    text = re.sub(r'[^A-Za-z0-9 ]', '', text)

    return text

In [7]:
# Apply the full text clean-up to every tweet.
# .assign returns a new DataFrame, so `cleaned_tweet1` keeps the link-only
# cleaned text instead of being overwritten through an alias.
cleaned_tweet2 = cleaned_tweet1.assign(
    Tweet=cleaned_tweet1['Tweet'].apply(clean_tweet_all)
)

In [8]:
# Inspect rows at positions 40-54 of the fully cleaned frame.
cleaned_tweet2.iloc[40:55]

Unnamed: 0,ID,Date,Tweet,Retweets,Tweet Length
40,40,2011-12-30,Right mood scented candles Barry White singing...,517,151
41,41,2011-12-29,Just returned from Haiti For those who want ...,31,166
42,42,2011-12-29,Liked Screw Business as Usual a lot This appr...,119,143
43,43,2011-12-29,So true,22,52
44,44,2011-12-29,Cool personal essay It really resonated with...,31,162
45,45,2012-01-02,However Chinas real estate crisis will explo...,6,170
46,46,2012-01-02,Yeah 2012 will be great for the American eco...,24,165
47,47,2012-01-07,Model S Signature series sold out as of today,10,82
48,48,2012-01-03,What everyone really thinks RT If Mayans wer...,29,123
49,49,2012-01-07,Next month is also when our Dragon spaceship d...,48,125


In [9]:
def get_tweet_sentiment(tweet):
    """Score a tweet with TextBlob and label the sign of its polarity.

    The text is lower-cased before it is passed to ``TextBlob``.

    Returns
    -------
    tuple
        ``(label, polarity)`` where ``label`` is ``1`` when the polarity is
        greater than zero, ``0`` when it equals zero and ``-1`` otherwise, and
        ``polarity`` is the value reported by ``TextBlob(...).sentiment``.
    """
    polarity = TextBlob(tweet.lower()).sentiment.polarity
    # Map the polarity onto a three-way label: positive / neutral / negative
    label = 1 if polarity > 0 else (0 if polarity == 0 else -1)
    return label, polarity

In [10]:
# Extract tweet (Sentiment, Polarity)
tmp_list = cleaned_tweet2['Tweet'].apply(get_tweet_sentiment).tolist()
# Build the helper frame on the source frame's own index so the column
# assignment below lines up row-for-row even when that index is not the
# default 0..n-1 range. Passing the column names to the constructor also
# keeps an empty input from raising a length-mismatch error.
tmp_df = pd.DataFrame(
    tmp_list,
    columns=['Sentiment', 'Polarity'],
    index=cleaned_tweet2.index,
)

# Append Sentiment and Polarity into final dataset.
# .assign returns a new DataFrame, so `cleaned_tweet2` is left untouched.
df_final = cleaned_tweet2.assign(
    Sentiment=tmp_df['Sentiment'],
    Polarity=tmp_df['Polarity'],
)

In [11]:
# Preview the first five rows, now including Sentiment and Polarity.
df_final.head(n=5)

Unnamed: 0,ID,Date,Tweet,Retweets,Tweet Length,Sentiment,Polarity
0,0,2010-06-05,Please ignore prior tweets as that was someone...,5453,106,0,0.0
1,1,2011-12-01,Went to Iceland on Sat to ride bumper cars on ...,188,129,1,0.65
2,2,2011-12-01,I made the volume on the Model S,78,125,0,0.0
3,3,2011-12-03,Great Voltaire quote arguably better than Twai...,44,145,1,0.45
4,4,2011-12-03,That was a total non sequitur btw,50,42,0,0.0


In [12]:
# Persist the enriched dataset for downstream use.
# NOTE(review): the index is written as an extra unnamed column here;
# confirm whether later consumers of this file expect that column.
df_final.to_csv(path_or_buf='../csv/base_elon_data.csv')