**Twitter Sentiment Analysis**
---

In [1]:
# imports 
import pandas as pd
import numpy as np

In [26]:
# Load the data

data = 'tweets.txt'
# df = pd.read_csv(data, sep='\t', header=None)
# df.columns = ['tweet']

In [27]:
import json

In [58]:
# treat each line as a dictionary
tweets = []
for line in open(data, 'r'):
    tweet = json.loads(line)
    try:
        #print(tweet.get('quoted_status').get('text'))
        desired = tweet.get('quoted_status').get('text')
        tweets.append(desired)
    except AttributeError:
        pass

    
    #print(tweet)

In [59]:
# add the tweets to a dataframe
df = pd.DataFrame(tweets)
df.columns = ['tweet']

In [60]:
df.head()

Unnamed: 0,tweet
0,gonna make a peach acc too :)
1,@RebeccaFMusic It would be amazing if one day ...
2,you woke up in a middle of a night and noticed...
3,All of our tagline and hashtags are trending N...
4,All of our tagline and hashtags are trending N...


In [61]:
# create new column 'mood' and takes happy :) or sad :( emoticons fro the tweets
df['mood'] = df['tweet'].apply(lambda tweet: ':)' if ':)' in tweet else ':(')

In [62]:
df.head()

Unnamed: 0,tweet,mood
0,gonna make a peach acc too :),:)
1,@RebeccaFMusic It would be amazing if one day ...,:(
2,you woke up in a middle of a night and noticed...,:(
3,All of our tagline and hashtags are trending N...,:)
4,All of our tagline and hashtags are trending N...,:)


In [63]:
# create new column 'text' and takes the text from the tweets without the emoticons
df['text'] = df['tweet'].apply(lambda tweet: tweet.replace(':)', '').replace(':(', ''))

In [65]:
df.drop('tweet', axis=1, inplace=True)

In [66]:
df.head()

Unnamed: 0,mood,text
0,:),gonna make a peach acc too
1,:(,@RebeccaFMusic It would be amazing if one day ...
2,:(,you woke up in a middle of a night and noticed...
3,:),All of our tagline and hashtags are trending N...
4,:),All of our tagline and hashtags are trending N...


In [97]:
df.shape

(377, 2)

In [67]:
# clean the text column
import re
def clean_text(text):
    text = re.sub(r'@\w+', '', text) # remove mentions
    text = re.sub(r'#\w+', '', text) # remove hashtags
    text = re.sub(r'http\S+', '', text) # remove links
    text = re.sub(r'\d+', '', text) # remove digits
    text = text.strip('\'"') # remove quotation marks
    text = text.lower() # lower case
    text = re.sub(r'[^a-zA-Z0-9]', ' ', text) # remove special characters
    text = re.sub(r'\s+', ' ', text) # remove multiple spaces
    # remove punctuation
    text = text.replace('[^\w\s]','')
    return text

In [68]:
# apply the clean_text function to the text column
df['text'] = df['text'].apply(lambda tweet: clean_text(tweet))

In [69]:
df.head()

Unnamed: 0,mood,text
0,:),gonna make a peach acc too
1,:(,it would be amazing if one day the industry w...
2,:(,you woke up in a middle of a night and noticed...
3,:),all of our tagline and hashtags are trending n...
4,:),all of our tagline and hashtags are trending n...


In [70]:
# Processing 

# For tokenization
import nltk

# For converting words into frequency counts
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [106]:
# First step in pipeline
# Keep words that appear in atleast 2 documents, keeps 5000 most common words
preprocessor = CountVectorizer(min_df=2, tokenizer=nltk.word_tokenize, max_features=1000, binary=True)
# preprocessor = TfidfVectorizer(min_df=2, tokenizer=nltk.word_tokenize, max_features=5000)

In [116]:
# Demo the preprocessor:
X_example = preprocessor.fit_transform(df['text'].iloc[:1000])
print(f'Preprocessing output shape: {X_example.shape}')

# Show the process for the first datapoint
first_datapoint = df['text'].iloc[0]
print(f'First datapoint: {first_datapoint[:100]}')

first_tokens = nltk.word_tokenize(first_datapoint)
print(f'First datapoint tokens: {first_tokens[:10]}')

first_bow = preprocessor.transform([first_datapoint])
first_bow.maxprint = 5  # Change how many of the non-zero elements are printing to not clutter the notebook
print(f'First datapoint Binary Bag of Words (sparse) representation:\n{first_bow}')
print(f'First datapoint Binary Bag of Words (dense) representation:\n{first_bow.todense()}')

preprocessor.vocabulary_

Preprocessing output shape: (377, 533)
First datapoint: gonna make a peach acc too 
First datapoint tokens: ['gon', 'na', 'make', 'a', 'peach', 'acc', 'too']
First datapoint Binary Bag of Words (sparse) representation:
  (0, 0)	1
  (0, 182)	1
  (0, 263)	1
  (0, 287)	1
  (0, 450)	1
First datapoint Binary Bag of Words (dense) representation:
[[1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 



{'gon': 182,
 'na': 287,
 'make': 263,
 'a': 0,
 'too': 450,
 'it': 221,
 'would': 514,
 'be': 48,
 'amazing': 23,
 'if': 209,
 'one': 305,
 'day': 110,
 'the': 427,
 'industry': 215,
 'let': 244,
 'artists': 38,
 'completely': 92,
 'free': 164,
 'unfortunately': 465,
 'they': 432,
 'you': 528,
 'up': 467,
 'in': 212,
 'of': 301,
 'night': 295,
 'and': 26,
 'at': 41,
 'your': 530,
 'all': 17,
 'our': 312,
 'tagline': 418,
 'hashtags': 192,
 'are': 33,
 'trending': 455,
 'nationwide': 288,
 'worldwide': 512,
 'great': 184,
 'job': 229,
 'fam': 147,
 's': 361,
 'keep': 237,
 'on': 304,
 'tweeting': 459,
 'f': 145,
 'most': 280,
 'with': 507,
 'pls': 331,
 'its': 222,
 'so': 392,
 'here': 200,
 'is': 220,
 'replay': 354,
 'artist': 37,
 'out': 313,
 'may': 267,
 'will': 506,
 'to': 447,
 'help': 197,
 'rt': 360,
 'wts': 517,
 'lfb': 246,
 'ph': 324,
 'photocards': 325,
 'clean': 78,
 'amp': 24,
 'onhand': 306,
 'check': 75,
 'i': 208,
 'think': 435,
 'community': 89,
 'san': 366,
 'absolu

# Modeling

In [113]:
from sklearn.naive_bayes import BernoulliNB # Bernoulli because we have binary features
from sklearn.pipeline import Pipeline

pipeline = Pipeline([('preprocessing', preprocessor), 
                     ('model', BernoulliNB())])

In [114]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df['text'], df['mood'], test_size=0.30, random_state=27)

In [115]:
pipeline.fit(X_train, y_train)
train_accuracy = pipeline.score(X_train, y_train)
test_accuracy = pipeline.score(X_test, y_test)

print(f'Train accuracy:\t{train_accuracy}')
print(f'Test accuracy:\t{test_accuracy}')

Train accuracy:	0.9011406844106464
Test accuracy:	0.8596491228070176




In [None]:
# make predictions to text

text = 

# Method 2:

In [102]:
# Set up the TF-IDF vectorizer
# Keep words that appear in atleast 2 documents, keeps 5000 most common words
preprocessor = TfidfVectorizer(min_df=2, tokenizer=nltk.word_tokenize, max_features=1000, ngram_range=(1,2), stop_words='english')

In [103]:
# fit and transform data
X = preprocessor.fit_transform(df['text'])



In [104]:
# split data
X_train, X_test, y_train, y_test = train_test_split(X.toarray(), df['mood'], test_size=0.20, random_state=27)

In [105]:
from sklearn.naive_bayes import GaussianNB

model = GaussianNB()
model.fit(X_train,y_train)

train_accuracy = model.score(X_train, y_train)
test_accuracy = model.score(X_test, y_test)

print(f'Train accuracy:\t{train_accuracy}')
print(f'Test accuracy:\t{test_accuracy}')

Train accuracy:	0.840531561461794
Test accuracy:	0.7368421052631579


In [None]:
# hyperparameter search using grid CV
