In [0]:
from sklearn.metrics import  accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from nltk.tokenize.casual import TweetTokenizer
from nltk.stem.lancaster import LancasterStemmer
from spacy.lang.en.stop_words import STOP_WORDS
from nltk.probability import FreqDist
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
from wordcloud import WordCloud
from wordcloud import STOPWORDS
from textblob import TextBlob
from nltk.text import Text  
from pandas import Series
import seaborn as sns
import nltk, string
import pandas as pd
import numpy as np
import nltk as nlp
import warnings
import sys
import os
import re
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import LSTM, Bidirectional, CuDNNLSTM
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.models import model_from_json

from imblearn.over_sampling import SMOTE
import nltk
# Fetch the NLTK resources used below by word_tokenize and WordNetLemmatizer.
nltk.download('punkt')
nltk.download('wordnet')

# Extend wordcloud's STOPWORDS set with extra tokens that should be filtered
# out of the tweets during cleaning.
STOPWORDS.update(("rt", "s", "u", "amp", "th", "will", "t", "m"))

Using TensorFlow backend.




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [0]:
# Mount Google Drive at /content/drive so the dataset and saved model files
# under "My Drive/tweet" are reachable (prompts for authorization in Colab).
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
# Load the tweets, drop rows with any missing value, and add a binary target
# column: 1 for "Democrat", 0 for every other Party value.
df = (
    pd.read_csv("/content/drive/My Drive/tweet/ExtractedTweets.csv")
    .dropna(axis=0)
    .assign(Party_log=lambda frame: (frame.Party == "Democrat").astype(int))
)

In [0]:
top_words = 10000  # vocabulary size: only the most frequent terms get their own id
tokenizer = TweetTokenizer(reduce_len=True)
tweets = df.copy()  # work on a copy so the raw frame stays untouched

# cleaning
tweet_arr = tweets.Tweet.to_numpy()
tweet_list = []
lemma = nlp.WordNetLemmatizer()

for raw_tweet in tweet_arr:
    text = re.sub(r'http\S+', '', raw_tweet)  # remove links
    text = re.sub("[^a-zA-Z]", " ", text)  # remove all characters except letters
    words = nltk.word_tokenize(text.lower())  # lowercase, then split into words
    words = [word for word in words if word not in STOPWORDS]  # remove the stopwords
    # reduce each word to its dictionary (lemma) form and rebuild the sentence
    tweet_list.append(" ".join(lemma.lemmatize(word) for word in words))

tweet_arr = np.asarray(tweet_list)

tweets['Tweet'] = tweet_arr

# tokenizing
tweets['Tweet'] = tweets.Tweet.apply(tokenizer.tokenize)

# converting tweet tokens to freq dist ranks
fdist = FreqDist(word for tweet in tweets.Tweet for word in tweet)
terms = [term for term, count in fdist.most_common(top_words)]

# Dict lookup replaces `terms.index(term) if term in terms else 0`, which
# scanned the whole list twice for every token. The resulting ids are the same.
# NOTE(review): id 0 is shared by the most frequent term and by every
# out-of-vocabulary token; kept as-is so convert_input stays consistent.
term_rank = {term: rank for rank, term in enumerate(terms)}
tweets.Tweet = tweets.Tweet.apply(
    lambda tweet: [term_rank.get(term, 0) for term in tweet])

In [0]:
# padding every tweet to max review length

# Bring every encoded tweet to a fixed length of max_review_length so the
# inputs stack into a single 2-D array; the labels are taken unchanged.
max_review_length = 50
x = sequence.pad_sequences(tweets.Tweet, maxlen=max_review_length)
y = tweets.Party_log

print(x.shape)
print(y.shape)

(86460, 50)
(86460,)


In [0]:
# Matthews correlation coefficient as a Keras metric (original source link missing -- TODO: add attribution)
import keras.backend as K
def matthews_correlation(y_true, y_pred):
    """Matthews correlation coefficient expressed with Keras backend ops.

    Both tensors are clipped to [0, 1] and rounded to hard 0/1 values, the
    four confusion-matrix counts are summed over the whole tensor, and the
    result is ``(tp*tn - fp*fn) / (sqrt(...) + K.epsilon())``; the epsilon
    keeps the division defined when the denominator is zero.

    :param y_true: tensor of ground-truth labels
    :param y_pred: tensor of predicted scores
    :return: scalar tensor holding the coefficient
    """
    def _to_binary(tensor):
        # Clip to the unit interval, then round to 0/1.
        return K.round(K.clip(tensor, 0, 1))

    predicted_positive = _to_binary(y_pred)
    predicted_negative = 1 - predicted_positive

    actual_positive = _to_binary(y_true)
    actual_negative = 1 - actual_positive

    true_pos = K.sum(actual_positive * predicted_positive)
    true_neg = K.sum(actual_negative * predicted_negative)

    false_pos = K.sum(actual_negative * predicted_positive)
    false_neg = K.sum(actual_positive * predicted_negative)

    top = (true_pos * true_neg - false_pos * false_neg)
    bottom = K.sqrt((true_pos + false_pos) * (true_pos + false_neg)
                    * (true_neg + false_pos) * (true_neg + false_neg))

    return top / (bottom + K.epsilon())

In [0]:
def get_model():
  """Build and compile the binary tweet classifier.

  Stack, in order: embedding over ``top_words`` ids -> Conv1D -> max pooling
  -> two bidirectional LSTMs -> two dense ReLU layers -> one sigmoid unit,
  with 0.5 dropout after each recurrent and hidden dense layer.
  Compiled with binary cross-entropy, Adam, and ``matthews_correlation`` as
  the reported metric.

  :return: the compiled Sequential model
  """
  embedding_dim = 32
  layer_stack = [
      Embedding(top_words, embedding_dim, input_length=max_review_length),
      Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'),
      MaxPooling1D(pool_size=2),
      Bidirectional(LSTM(256, return_sequences=True)),
      Dropout(0.5),
      Bidirectional(LSTM(256)),
      Dropout(0.5),
      Dense(1024, activation='relu'),
      Dropout(0.5),
      Dense(512, activation='relu'),
      Dropout(0.5),
      Dense(1, activation='sigmoid'),
  ]

  model = Sequential()
  for layer in layer_stack:
    model.add(layer)

  model.compile(loss='binary_crossentropy',
                optimizer='adam', metrics=[matthews_correlation])
  return model

In [0]:
def get_stats(y_test, y_pred):
  """Print MCC, accuracy, F1 and the confusion matrix for a set of predictions.

  :param y_test: ground-truth labels
  :param y_pred: predicted scores; rounded with ``np.round`` before scoring
  :return: None (results are printed)
  """
  from sklearn.metrics import matthews_corrcoef, confusion_matrix, accuracy_score, f1_score

  # Round once and reuse the hard labels for every metric.
  y_hat = np.round(y_pred)

  scalar_metrics = (
      ('MCC: ', matthews_corrcoef),
      ('Accuracy Score: ', accuracy_score),
      ('F1 Score: ', f1_score),
  )
  for label, metric in scalar_metrics:
    print(label, metric(y_test, y_hat))

  print('Confusion Matrix ')
  print(confusion_matrix(y_test, y_hat))

In [0]:
# Hold out 20% of the padded tweets for the final evaluation; random_state
# is fixed so the split is the same on every run.
x_train, x_test, y_train, y_test = train_test_split(x, y, 
                                                    test_size=0.2, 
                                                    random_state=0)

In [0]:
model = get_model()
# summary() prints the layer table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()
# Train for 5 epochs; validation_split reserves 33% of the training data for
# per-epoch validation.
history = model.fit(x_train, y_train, epochs=5,
                    batch_size=64, validation_split=0.33)






Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 50, 32)            320000    
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 50, 32)            3104      
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 25, 32)            0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 25, 512)           591872    
_________________________________________________________________
dropout_1 (Dropout)          (None, 25, 512)           0         
____________________________________

In [0]:
# Score the held-out split; get_stats rounds the sigmoid outputs to 0/1
# before computing the metrics.
y_pred = model.predict(x_test, use_multiprocessing=True)
get_stats(y_test, y_pred)

MCC:  0.5388015476495792
Accuracy Score:  0.7698935924126764
F1 Score:  0.7520099719538798
Confusion Matrix 
[[7280 1728]
 [2251 6033]]


In [0]:
# serialize model to JSON
model_json = model.to_json()
with open("/content/drive/My Drive/tweet/model.json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
model.save_weights("/content/drive/My Drive/tweet/model.h5")
print("Saved model to disk")

Saved model to disk


In [0]:
# load json and create model
json_file = open('/content/drive/My Drive/tweet/model.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
loaded_model = model_from_json(loaded_model_json)
# load weights into new model
loaded_model.load_weights("/content/drive/My Drive/tweet/model.h5")
print("Loaded model from disk")

Loaded model from disk


In [0]:
def clean_input(tweet):
    """Normalize one raw tweet string the same way the training data was cleaned.

    Steps: strip links, replace every non-letter with a space, lowercase,
    tokenize with NLTK, drop STOPWORDS, lemmatize, and re-join with spaces.

    :param tweet: raw tweet text
    :return: cleaned tweet as a single space-separated string
    """
    without_links = re.sub(r'http\S+', '', tweet)
    letters_only = re.sub("[^a-zA-Z]", " ", without_links).lower()
    kept_words = [
        word for word in nltk.word_tokenize(letters_only)
        if word not in STOPWORDS
    ]
    return " ".join(lemma.lemmatize(word) for word in kept_words)

def convert_input(tweet):
    """Turn one raw tweet into the padded id sequence the model expects.

    The tweet is cleaned, tokenized, mapped to frequency ranks taken from the
    training ``fdist`` (tokens outside the ``top_words`` most common terms
    map to 0), and padded to ``max_review_length``.

    :param tweet: raw tweet text
    :return: array returned by ``pad_sequences`` for a batch of one tweet
    """
    tokens = tokenizer.tokenize(clean_input(tweet))

    # Build a term -> rank table once per call and look tokens up in O(1),
    # instead of scanning the 10k-entry term list twice for every token.
    term_rank = {
        term: rank
        for rank, (term, _count) in enumerate(fdist.most_common(top_words))
    }
    encoded_tweet = [term_rank.get(token, 0) for token in tokens]

    tweet_arr = np.array([encoded_tweet])  # batch of one
    padded_tweet = sequence.pad_sequences(tweet_arr, maxlen=max_review_length)
    return padded_tweet

def predict_tweet(tweet):
    """Classify a single raw tweet with the reloaded model.

    The raw model output is printed, then thresholded at 0.5.

    :param tweet: raw tweet text
    :return: 'Democrat' when the output exceeds 0.5, otherwise 'Republican'
    """
    prediction = loaded_model.predict(convert_input(tweet))
    print(prediction)
    return 'Democrat' if prediction > 0.5 else 'Republican'

In [0]:
def make_predictions(tweets):
    """Classify every tweet and tally how often each label was predicted.

    :param tweets: iterable of raw tweet strings
    :return: dict mapping each predicted label to its count
    """
    tally = {}
    for text in tweets:
        label = predict_tweet(text)
        tally[label] = tally.get(label, 0) + 1
    return tally

In [0]:
import tweepy

# Twitter API credentials are read from environment variables so that no
# secret is stored in the notebook.
# NOTE(review): the literal keys that used to be committed here must be
# treated as leaked and revoked in the Twitter developer console.
_TWITTER_ENV_VARS = dict(consumer_key="TWITTER_CONSUMER_KEY",
                         consumer_secret="TWITTER_CONSUMER_SECRET",
                         access_token="TWITTER_ACCESS_TOKEN",
                         access_token_secret="TWITTER_ACCESS_TOKEN_SECRET"
                         )

# Same four entries as before; a missing variable yields an empty string and
# a warning here, rather than failing at import time.
keys = {name: os.environ.get(env_var, "")
        for name, env_var in _TWITTER_ENV_VARS.items()}

_missing_twitter_vars = [env_var for name, env_var in _TWITTER_ENV_VARS.items()
                         if not keys[name]]
if _missing_twitter_vars:
    warnings.warn("Twitter credentials not set in environment: "
                  + ", ".join(_missing_twitter_vars))


__author__ = 'Shivchander Sudalairaj'


class Tweet:
    """Small tweepy wrapper bound to a single Twitter account.

    Every public getter catches ``tweepy.TweepError``, prints what failed,
    and returns an empty result instead of raising.
    """

    # Ids are sent to lookup_users in batches; the users/lookup endpoint
    # accepts at most 100 ids per request.
    _LOOKUP_BATCH_SIZE = 100

    def __init__(self, user_handle):
        """
        Build an authenticated tweepy client from the module-level ``keys``.

        :param user_handle: twitter username without '@' symbol
        """
        self._consumer_key = keys['consumer_key']
        self._consumer_secret = keys['consumer_secret']
        self._access_token = keys['access_token']
        self._access_token_secret = keys['access_token_secret']

        # configure OAUTH
        self.auth = tweepy.OAuthHandler(self._consumer_key, self._consumer_secret)
        self.auth.set_access_token(self._access_token, self._access_token_secret)

        # set up tweepy client
        self.api = tweepy.API(
            self.auth,
            wait_on_rate_limit=True,
            wait_on_rate_limit_notify=True,
            timeout=60,
            compression=True
        )

        self.user_handle = user_handle

    @staticmethod
    def _report_failure(action, error):
        """Print which request failed together with the underlying error."""
        print('Oops somethings not right while {}: {}'.format(action, error))

    def _lookup_handles(self, user_ids):
        """
        Resolve user ids to screen names, one batch of ids per API request.

        :param user_ids: flat list of user ids
        :return: list of screen names in the order the API returns them
        """
        handles = []
        for start in range(0, len(user_ids), self._LOOKUP_BATCH_SIZE):
            batch = user_ids[start:start + self._LOOKUP_BATCH_SIZE]
            handles.extend(user.screen_name
                           for user in self.api.lookup_users(user_ids=batch))
        return handles

    def get_friends(self):
        """
        :return: list with the screen names of the users followed by self
                 (empty list on API error).
        """
        try:
            # .items() yields individual ids; .pages() yielded whole pages,
            # which produced a list of lists that lookup_users cannot use.
            friends_ids = list(
                tweepy.Cursor(self.api.friends_ids, screen_name=self.user_handle).items())
            return self._lookup_handles(friends_ids)

        except tweepy.TweepError as error:
            self._report_failure('fetching friends', error)
            return []

    def get_followers(self):
        """
        :return: list with the screen names of the users following self
                 (empty list on API error).
        """
        try:
            # Flatten to individual ids (see get_friends).
            followers_ids = list(
                tweepy.Cursor(self.api.followers_ids, id=self.user_handle).items())
            return self._lookup_handles(followers_ids)

        except tweepy.TweepError as error:
            self._report_failure('fetching followers', error)
            return []

    def get_tweets(self, limit=100):
        """
        :param limit: max limit of tweets
        :return: list with the full text of up to ``limit`` tweets from
                 self.user_handle, retweets excluded (empty list on API error).
        """
        # Non-positive limits always produced an empty list; answer directly
        # instead of issuing a request.
        if limit <= 0:
            return []
        try:
            timeline = tweepy.Cursor(self.api.user_timeline, screen_name=self.user_handle,
                                     include_rts=False, tweet_mode='extended')
            return [status.full_text for status in timeline.items(limit)]

        except tweepy.TweepError as error:
            self._report_failure('fetching tweets', error)
            return []

    def get_retweets(self, limit=100):
        """
        :param limit: max limit of tweets
        :return: list with the full text of up to ``limit`` timeline entries
                 whose text starts with 'RT' (empty list on API error).
        """
        try:
            retweets = []
            timeline = tweepy.Cursor(self.api.user_timeline, screen_name=self.user_handle,
                                     include_rts=True, tweet_mode='extended')
            for status in timeline.items():
                # Stop as soon as enough retweets are collected, rather than
                # paging on until another retweet happens to show up.
                if len(retweets) >= limit:
                    break
                if status.full_text.startswith('RT'):
                    retweets.append(status.full_text)
            return retweets

        except tweepy.TweepError as error:
            self._report_failure('fetching retweets', error)
            return []

    def get_favtweets(self, limit=100):
        """
        :param limit: max limit of tweets
        :return: list with the text of up to ``limit`` tweets favorite-ed by
                 self.user_handle (empty list on API error).
        """
        # Non-positive limits always produced an empty list.
        if limit <= 0:
            return []
        try:
            favorites = tweepy.Cursor(self.api.favorites, id=self.user_handle)
            return [status.text for status in favorites.items(limit)]

        except tweepy.TweepError as error:
            self._report_failure('fetching favorites', error)
            return []

    def get_location(self):
        """
        Print and return the profile location of self.user_handle.

        :return: the location reported by the API, or None on API error
        """
        try:
            location = self.api.get_user(screen_name=self.user_handle).location
        except tweepy.TweepError as error:
            self._report_failure('fetching location', error)
            return None
        # Printing is kept for existing callers; the value is now also
        # returned, as the docstring always promised.
        print(location)
        return location




In [0]:
# NOTE(review): `twats` is only assigned in the following cell, so this cell
# fails on a fresh Restart & Run All; it has to run after that cell.
len(twats)

100

In [0]:
# Fetch recent tweets of one account and tally the predicted party per tweet.
# NOTE(review): this rebinds `x`, which earlier held the padded feature
# matrix; re-running the train/test split cell after this one would pick up
# the Tweet object instead.
x = Tweet('michaelcburgess')
twats = x.get_tweets()
print(make_predictions(twats))

[[0.00011886]]
[[3.7343452e-06]]
[[0.9999999]]
[[0.00601882]]
[[1.0605115e-05]]
[[0.02232434]]
[[0.21906336]]
[[0.00027008]]
[[0.00207869]]
[[0.9240636]]
[[0.21803892]]
[[1.1970317e-07]]
[[1.1483521e-05]]
[[0.06434233]]
[[5.4345306e-05]]
[[0.00015291]]
[[0.00040688]]
[[0.7604855]]
[[0.99788314]]
[[0.11742231]]
[[0.]]
[[0.00105121]]
[[0.00054462]]
[[0.01844192]]
[[2.7575018e-07]]
[[0.9570874]]
[[0.00486509]]
[[0.03847144]]
[[0.46559602]]
[[1.]]
[[0.06542029]]
[[0.99882096]]
[[0.9999999]]
[[0.829195]]
[[0.99520206]]
[[0.00028185]]
[[0.22009279]]
[[5.902534e-07]]
[[0.01150742]]
[[0.38496304]]
[[0.06976964]]
[[0.9952192]]
[[0.98852867]]
[[0.00320679]]
[[0.2165661]]
[[0.07545079]]
[[0.00017789]]
[[0.37562644]]
[[0.15925439]]
[[3.4425448e-06]]
[[0.01492222]]
[[0.01836975]]
[[0.12205904]]
[[0.50705713]]
[[0.5148638]]
[[0.02035884]]
[[0.00257082]]
[[0.99985325]]
[[0.22752823]]
[[0.00098995]]
[[3.388642e-05]]
[[0.00017865]]
[[7.479429e-06]]
[[2.7436818e-05]]
[[0.00530713]]
[[0.94214666]]
[[1.07

https://xiangyutang2.github.io/tweet-classification/
https://www.kaggle.com/zaslee/bert-text-classification-demo-nlp-experiment
https://github.com/google-research/bert/blob/master/predicting_movie_reviews_with_bert_on_tf_hub.ipynb
https://colab.research.google.com/github/tensorflow/tpu/blob/master/tools/colab/bert_finetuning_with_cloud_tpus.ipynb
https://colab.research.google.com/github/jalammar/jalammar.github.io/blob/master/notebooks/bert/A_Visual_Notebook_to_Using_BERT_for_the_First_Time.ipynb
https://github.com/sebsk/CS224N-Project
https://colab.research.google.com/drive/18SVeIFXWCiA9HL4WVCAFxlfH59ez6atc

