In [28]:
import pandas as pd
import nltk
import numpy as np
import matplotlib.pyplot as plt
import re
from nltk.tokenize import WordPunctTokenizer
import warnings
warnings.filterwarnings("ignore")
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn import naive_bayes
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import operator

In [2]:
data = pd.read_csv("./import_tweets1.csv")

In [3]:
data.head()

Unnamed: 0,tweet_id,created_on,text
0,986283984303816000,Tue Apr 17 16:43:20 +0000 2018,"@u6239 @Argyll_IslesApp @Arsenal Wow, brave pe..."
1,986283986149404000,Tue Apr 17 16:43:21 +0000 2018,"RT @PeterMooreLFC: They have a stadium, we hav..."
2,986283986480717000,Tue Apr 17 16:43:21 +0000 2018,RT @BarcaTimes: ðŸ“°[SPORT] | King of Spain Ph...
3,986283990842781000,Tue Apr 17 16:43:22 +0000 2018,RT @LFC: We are Liverpool. The story continues...
4,986283991090286000,Tue Apr 17 16:43:22 +0000 2018,Panathinaikos clear ex-Chelsea star Michael Es...


In [4]:
df = data[['text']]

In [5]:
for i in range(0, len(df)):
    df.text[i] = re.sub(r'@[A-Za-z0-9]+','',str(df.text[i]))

In [6]:
df.head()

Unnamed: 0,text
0,"_IslesApp Wow, brave people ðŸ™„ Bit cold fo..."
1,"RT : They have a stadium, we have a home. They..."
2,RT : ðŸ“°[SPORT] | King of Spain Philippe VI c...
3,RT : We are Liverpool. The story continues... ...
4,Panathinaikos clear ex-Chelsea star Michael Es...


In [8]:
for t in df: 
    df[t].replace(u"\ufffd", "?")

In [9]:
tok = WordPunctTokenizer()
pat1 = r'@[A-Za-z0-9]+'
pat2 = r'https?://[A-Za-z0-9./]+'
combined_pat = r'|'.join((pat1, pat2))
def tweet_cleaner(text):
    soup = BeautifulSoup(text, 'lxml')
    souped = soup.get_text()
    stripped = re.sub(combined_pat, '', souped)
    try:
        clean = stripped.decode("utf-8-sig").replace(u"\ufffd", "?")
    except:
        clean = stripped
    letters_only = re.sub("[^a-zA-Z]", " ", clean)
    lower_case = letters_only.lower()
    # During the letters_only process two lines above, it has created unnecessay white spaces,
    # I will tokenize and join together to remove unneccessary white spaces
    words = tok.tokenize(lower_case)
    return (" ".join(words)).strip()
testing = df['text']
test_result = []
for t in testing:
    test_result.append(tweet_cleaner(t))
test_result

['islesapp wow brave people bit cold for me',
 'rt they have a stadium we have a home they have songs we have an anthem they have a manager we have a guardian they h',
 'rt sport king of spain philippe vi confirms his attendance at the final of copa del rey to be held coming saturday at the',
 'rt we are liverpool the story continues',
 'panathinaikos clear ex chelsea star michael essien debt in bid to avoid relegation',
 'rt chelsea winning against spurs is by far my favourite recent memory of us at wembley',
 'rt chelsea to rival arsenal for aleksandr golovin cfc',
 'rt we are liverpool the story continues',
 'united are starting voting for the poty awards just seen the shortlist for goal of the season and for me it s out',
 'rt they have a stadium we have a home they have songs we have an anthem they have a manager we have a guardian they h',
 'these people really think it s as simple as picking the right formation and plugging the players in',
 'rt we are liverpool the story contin

In [10]:
df.shape

(41640, 1)

In [11]:
nums = [0,10000,20000,30000,41000]
print("Cleaning and parsing the tweets...\n")
clean_tweet_texts = []
for i in range(nums[0],nums[-1]):
    if( (i+1)%10000 == 0 ):
        print("Tweets %d of %d has been processed" % ( i+1, nums[-1] ))                                                                    
    clean_tweet_texts.append(tweet_cleaner(df['text'][i]))

Cleaning and parsing the tweets...

Tweets 10000 of 41000 has been processed
Tweets 20000 of 41000 has been processed
Tweets 30000 of 41000 has been processed
Tweets 40000 of 41000 has been processed


In [12]:
clean_df = pd.DataFrame(clean_tweet_texts,columns=['text'])
clean_df.head()

Unnamed: 0,text
0,islesapp wow brave people bit cold for me
1,rt they have a stadium we have a home they hav...
2,rt sport king of spain philippe vi confirms hi...
3,rt we are liverpool the story continues
4,panathinaikos clear ex chelsea star michael es...


In [13]:
for i in range(0, len(clean_df)):
    if clean_df.text[i][0:2] == 'rt':
        clean_df.text[i] = clean_df.text[i][3:-1]

In [14]:
clean_df['tokenized_sents'] = clean_df.apply(lambda row: nltk.word_tokenize(row['text']), axis=1)

In [16]:
sid = SentimentIntensityAnalyzer()

In [17]:
clean_df['sentiments'] = clean_df.apply(lambda row: sid.polarity_scores(row['text']), axis=1)

In [18]:
clean_df.head()

Unnamed: 0,text,tokenized_sents,sentiments
0,islesapp wow brave people bit cold for me,"[islesapp, wow, brave, people, bit, cold, for,...","{'neg': 0.0, 'neu': 0.455, 'pos': 0.545, 'comp..."
1,they have a stadium we have a home they have s...,"[they, have, a, stadium, we, have, a, home, th...","{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound..."
2,sport king of spain philippe vi confirms his a...,"[sport, king, of, spain, philippe, vi, confirm...","{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound..."
3,we are liverpool the story continue,"[we, are, liverpool, the, story, continue]","{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound..."
4,panathinaikos clear ex chelsea star michael es...,"[panathinaikos, clear, ex, chelsea, star, mich...","{'neg': 0.272, 'neu': 0.578, 'pos': 0.15, 'com..."


In [19]:
new_df = clean_df[['text','sentiments']]

In [20]:
new_df.head()

Unnamed: 0,text,sentiments
0,islesapp wow brave people bit cold for me,"{'neg': 0.0, 'neu': 0.455, 'pos': 0.545, 'comp..."
1,they have a stadium we have a home they have s...,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound..."
2,sport king of spain philippe vi confirms his a...,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound..."
3,we are liverpool the story continue,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound..."
4,panathinaikos clear ex chelsea star michael es...,"{'neg': 0.272, 'neu': 0.578, 'pos': 0.15, 'com..."


In [21]:
a = []
for i in range(0, len(new_df)):    
    a.append(max(new_df.sentiments[i].items(), key=operator.itemgetter(1))[0])

In [22]:
new_df['Sentiment'] = a

In [23]:
new_df.head()

Unnamed: 0,text,sentiments,Sentiment
0,islesapp wow brave people bit cold for me,"{'neg': 0.0, 'neu': 0.455, 'pos': 0.545, 'comp...",compound
1,they have a stadium we have a home they have s...,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",neu
2,sport king of spain philippe vi confirms his a...,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",neu
3,we are liverpool the story continue,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",neu
4,panathinaikos clear ex chelsea star michael es...,"{'neg': 0.272, 'neu': 0.578, 'pos': 0.15, 'com...",neu


In [25]:
new_df = new_df.drop(['sentiments'], axis = 1)

In [26]:
new_df = new_df[new_df.Sentiment != 'compound']
new_df = new_df.replace('pos', 1)
new_df = new_df.replace('neg', -1)
new_df = new_df.replace('neu', 0)

In [27]:
new_df.head()

Unnamed: 0,text,Sentiment
1,they have a stadium we have a home they have s...,0
2,sport king of spain philippe vi confirms his a...,0
3,we are liverpool the story continue,0
4,panathinaikos clear ex chelsea star michael es...,0
5,chelsea winning against spurs is by far my fav...,0


In [29]:
X = new_df['text']
y = new_df['Sentiment']

In [30]:
text_clf = Pipeline([('vect', CountVectorizer()),
                      ('tfidf', TfidfTransformer()),
                      ('clf', naive_bayes.MultinomialNB()), ])

In [31]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [32]:
print("\nFitting training data using Multinomial Naive Bayes...")
text_clf.fit(X_train, np.asarray(y_train, dtype = np.float64))  
print("Model fit.")
pred = text_clf.predict(X_test)
print("Accuracy of Multinomial Naive Bayes model: ", np.mean(pred == y_test)*100, "%")


Fitting training data using Multinomial Naive Bayes...
Model fit.
Accuracy of Multinomial Naive Bayes model:  93.7442137283 %


In [33]:
text_clf1 = Pipeline([('vect', CountVectorizer()),
                      ('tfidf', TfidfTransformer()),
                      ('clf', SGDClassifier(loss='hinge', penalty='l2',
                                            alpha=1e-3, random_state=42,
                                            max_iter=5, tol=None)),
 ])

In [34]:
print("\nFitting training data using SGDClassifier...")
text_clf1.fit(X_train, np.asarray(y_train, dtype = np.float64))
print("Model fit.")
pred1 = text_clf1.predict(X_test)
print("Accuracy of SGDClassifier model: ", np.mean(pred1 == y_test)*100, "%")


Fitting training data using SGDClassifier...
Model fit.
Accuracy of SGDClassifier model:  93.5987303267 %
