# 8. Twitter Sentiment Analysis

In [1]:
import numpy as np
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

In [2]:
train_data = pd.read_csv('data/train_tweets.csv')

In [3]:
train_data.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [4]:
train_data['label'].value_counts()

0    29720
1     2242
Name: label, dtype: int64

## Preparing Training Data

In [5]:
def drop_features(features, data):
    data.drop(features, inplace=True, axis=1)

In [6]:
def clean_tweet(tweet):
    return " ".join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])", " ",tweet.lower()).split())

In [7]:
train_data['cleaned_tweet'] = train_data['tweet'].apply(clean_tweet)

In [8]:
train_data.head()

Unnamed: 0,id,label,tweet,cleaned_tweet
0,1,0,@user when a father is dysfunctional and is s...,when a father is dysfunctional and is so selfi...
1,2,0,@user @user thanks for #lyft credit i can't us...,thanks for lyft credit i can t use cause they ...
2,3,0,bihday your majesty,bihday your majesty
3,4,0,#model i love u take with u all the time in ...,model i love u take with u all the time in ur
4,5,0,factsguide: society now #motivation,factsguide society now motivation


In [9]:
drop_features(['id','tweet'],train_data)

In [10]:
train_data.head()

Unnamed: 0,label,cleaned_tweet
0,0,when a father is dysfunctional and is so selfi...
1,0,thanks for lyft credit i can t use cause they ...
2,0,bihday your majesty
3,0,model i love u take with u all the time in ur
4,0,factsguide society now motivation


## Splitting training data into training and test set

In [11]:
x_train, x_test, y_train, y_test = train_test_split(train_data['cleaned_tweet'], train_data['label'], test_size=0.2, 
random_state=0)

### CountVectorizer
Convert a collection of text documents to a matrix of token counts.
### TdidfTransformer
Transform a count matrix to a normalized tf or tf-idf representation.<br>
**tf**: Term-frequency<br>
**tf-idf**: Term-frequency times inverse document-frequency<br>
Normalization is cosine when ``norm='l2'``
<br><br>
**Note**: The TfidfTransformer **transforms a count matrix** to a **normalized** tf or tf-idf representation. So although both the CountVectorizer and TfidfTransformer produce term frequencies, TfidfTransformer is normalizing the count.

In [12]:
count_vec = CountVectorizer(stop_words='english')
transformer = TfidfTransformer(norm='l2', sublinear_tf=True)

In [13]:
x_train_counts = count_vec.fit_transform(x_train)
x_train_tfidf = transformer.fit_transform(x_train_counts)

In [14]:
print(x_train_counts.shape)
print(x_train_tfidf.shape)

(25569, 33693)
(25569, 33693)


In [15]:
x_test_counts = count_vec.transform(x_test)
x_test_tfidf = transformer.transform(x_test_counts)

In [16]:
print(x_test_counts.shape)
print(x_test_tfidf.shape)

(6393, 33693)
(6393, 33693)


In [17]:
clf = RandomForestClassifier(n_estimators=200)
clf.fit(x_train_tfidf, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [18]:
y_pred = clf.predict(x_test_tfidf)
print(f1_score(y_test, y_pred))

0.6687306501547987


## Preparing test data

In [19]:
test_data = pd.read_csv('data/test_tweets.csv')

In [20]:
test_data.head()

Unnamed: 0,id,tweet
0,31963,#studiolife #aislife #requires #passion #dedic...
1,31964,@user #white #supremacists want everyone to s...
2,31965,safe ways to heal your #acne!! #altwaystohe...
3,31966,is the hp and the cursed child book up for res...
4,31967,"3rd #bihday to my amazing, hilarious #nephew..."


In [21]:
test_data['cleaned_tweet'] = test_data['tweet'].apply(clean_tweet)

In [22]:
test_data.head()

Unnamed: 0,id,tweet,cleaned_tweet
0,31963,#studiolife #aislife #requires #passion #dedic...,studiolife aislife requires passion dedication...
1,31964,@user #white #supremacists want everyone to s...,white supremacists want everyone to see the ne...
2,31965,safe ways to heal your #acne!! #altwaystohe...,safe ways to heal your acne altwaystoheal heal...
3,31966,is the hp and the cursed child book up for res...,is the hp and the cursed child book up for res...
4,31967,"3rd #bihday to my amazing, hilarious #nephew...",3rd bihday to my amazing hilarious nephew eli ...


In [23]:
drop_features(['tweet'], test_data)

In [24]:
test_data.head()

Unnamed: 0,id,cleaned_tweet
0,31963,studiolife aislife requires passion dedication...
1,31964,white supremacists want everyone to see the ne...
2,31965,safe ways to heal your acne altwaystoheal heal...
3,31966,is the hp and the cursed child book up for res...
4,31967,3rd bihday to my amazing hilarious nephew eli ...


## Tokenizing and normalizing training data (train_tweets.csv) and test data (test_tweet.csv)

In [25]:
train_counts = count_vec.fit_transform(train_data['cleaned_tweet'])
test_counts = count_vec.transform(test_data['cleaned_tweet'])

In [26]:
print(train_counts.shape)
print(test_counts.shape)

(31962, 38763)
(17197, 38763)


In [27]:
train_tfidf = transformer.fit_transform(train_counts)
test_tfidf = transformer.transform(test_counts)

In [28]:
print(train_tfidf.shape)
print(test_tfidf.shape)

(31962, 38763)
(17197, 38763)


In [29]:
clf.fit(train_tfidf, train_data['label'])

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [30]:
pred = clf.predict(test_tfidf)

In [32]:
result = pd.DataFrame({'id': test_data['id'], 'label': pred})
result.to_csv('output.csv', index=False)