In [30]:
# Compare how different ML algorithms perform on Twitter sentiment data

# Standard library
import re # for regular expressions
import string
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

# Third-party
import gensim
import matplotlib.pyplot as plt
import nltk # for text manipulation
import numpy as np
import pandas as pd
# from beautifultable import BeautifulTable  #report
from nltk.stem.porter import *
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.externals import joblib
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score,precision_score,recall_score
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier

# Notebook configuration
pd.set_option("display.max_colwidth", 200)
%matplotlib inline


In [31]:
def import_tweets(filename, header = None):
    """Read the raw tweet CSV and keep only the sentiment label and the text.

    Parameters
    ----------
    filename : str or file-like
        Handed straight to ``pd.read_csv``. The data must have exactly six
        columns, which are renamed to label, id, date, flag, user, text.
    header : int or None
        Forwarded to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``label`` and ``text``; every label equal to 4 is
        rewritten as 1.
    """
    column_names = ['label','id','date','flag','user','text']
    unused_columns = ['flag','id','user','date']

    frame = pd.read_csv(filename, header = header)
    frame.columns = column_names
    # Drop the metadata columns in one go; only label and text are used later.
    frame = frame.drop(unused_columns, axis=1)
    frame['label'] = frame['label'].replace(4,1)
    return frame

In [11]:
# Load the raw training CSV; after this call `tweets` holds only the
# `label` and `text` columns (see import_tweets).
tweets = import_tweets("train_140.csv")

In [16]:
#Preprocess the text in a single tweet
def preprocess_tweet(tweet):
    """Normalise one tweet to lowercase alphanumeric words separated by single spaces.

    The input is converted with ``str()`` and lowercased. Then @mentions,
    ``http:``/``https:`` URLs and every run of non-alphanumeric characters
    are each replaced by a space, and runs of whitespace are collapsed to one
    space. A leading or trailing space may remain (e.g. when the tweet starts
    with a mention).

    Parameters
    ----------
    tweet : object
        Raw tweet; anything accepted by ``str()``.

    Returns
    -------
    str
        The cleaned text.
    """
    # Raw string: keeps \S as a regex escape instead of an invalid string escape.
    TEXT_CLEANING_RE = r"@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"
    tweet = re.sub(TEXT_CLEANING_RE, ' ', str(tweet).lower())
    # The former "URL" / "AT_USER" / "#topic" substitutions were removed: after
    # the cleaning pass above the text contains only letters, digits and
    # spaces, so those patterns (which need '.', ':', '@' or '#') could never
    # match. The output is unchanged.
    # Collapse multiple white spaces into a single white space.
    return re.sub(r'\s+', ' ', tweet)

def tokenization(tweet):
    """Split a cleaned tweet on whitespace and keep only words longer than two characters.

    Parameters
    ----------
    tweet : str
        Text to split.

    Returns
    -------
    list of str
        The surviving words, in their original order.
    """
    return [word for word in tweet.split() if len(word) > 2]

def stemming(tokenized_tweet):
    """Stem every token with a PorterStemmer and join the stems with single spaces.

    Parameters
    ----------
    tokenized_tweet : iterable of str
        Tokens of one tweet.

    Returns
    -------
    str
        The stemmed tokens joined by ' ' (empty string for no tokens).
    """
    porter = PorterStemmer()
    return ' '.join([porter.stem(token) for token in tokenized_tweet])


In [17]:
# Clean -> tokenize -> stem every tweet, then overwrite the `text` column
# with the stemmed, space-joined result. The intermediate Series are kept
# as separate variables so each stage can be inspected.
processed_tweets = tweets['text'].apply(preprocess_tweet)
tokenized_tweets = processed_tweets.apply(tokenization)
tweets['text'] = tokenized_tweets.apply(stemming)

In [97]:
# Uncomment this to Save tweet to a csv file
# tweets.to_csv('text.csv')

In [32]:
# Directly load the cleaned tweets.
# keep_default_na=False: a tweet whose cleaned text is empty (or is literally
# "nan"/"null") would otherwise be read back as a float NaN, which breaks the
# string-based feature extractors (e.g. x.split() in word2vec).
tweets = pd.read_csv('text.csv', keep_default_na=False)

In [11]:
# Sample 100000 Tweets from each category
# random_state is fixed so the same rows are drawn on every run. The
# concatenated frame replaces `tweets` and keeps the sampled rows' original
# index labels (label-0 rows first, then label-1 rows).
zero_tweets = tweets[tweets.label == 0].sample(n=100000, random_state=1)
one_tweets = tweets[tweets.label == 1].sample(n=100000, random_state=1)
tweets = pd.concat([zero_tweets, one_tweets])

In [42]:
#TF-IDF
def tf_idf(tweets):
    """Fit a TF-IDF vectorizer on ``tweets['text']`` and return the transformed matrix.

    The fitted vectorizer is also written to 'tfidf_vectorizer.pkl' in the
    working directory so it can be reloaded later.

    Parameters
    ----------
    tweets : pandas.DataFrame
        Must contain a ``text`` column; values are cast to unicode strings.

    Returns
    -------
    The result of ``TfidfVectorizer.fit_transform`` on the text column.
    """
    documents = tweets['text'].values.astype('U')
    vectorizer = TfidfVectorizer(
        stop_words='english',
        max_features=1000,
        min_df=2,
        max_df=0.90,
    )
    matrix = vectorizer.fit_transform(documents)
    # Persist the fitted vectorizer next to the notebook.
    joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')
    return matrix

In [34]:
#bag of words 

def bow(tweets):
    """Fit a bag-of-words CountVectorizer on ``tweets['text']`` and return the count matrix.

    Parameters
    ----------
    tweets : pandas.DataFrame
        Must contain a ``text`` column.

    Returns
    -------
    The result of ``CountVectorizer.fit_transform`` on the text column.
    (Previously the matrix was computed but never returned, so callers
    received None.)
    """
    bow_vectorizer = CountVectorizer(max_df=0.90, min_df=2, max_features=1000, stop_words='english')
    # Cast to unicode exactly as tf_idf does, so non-string cells do not
    # make the vectorizer fail.
    return bow_vectorizer.fit_transform(tweets['text'].values.astype('U'))

In [35]:
#word to vec

def word_vector(tokens, size, model_w2v):
    """Average the embedding vectors of the tokens that ``model_w2v`` knows.

    Parameters
    ----------
    tokens : iterable of str
        Tokens of one tweet.
    size : int
        Length of each embedding vector.
    model_w2v : indexable by token
        ``model_w2v[token]`` must return an array of ``size`` values and
        raise ``KeyError`` for tokens outside its vocabulary.

    Returns
    -------
    numpy.ndarray
        Shape ``(1, size)``: the mean of the found vectors, or all zeros when
        no token was found.
    """
    total = np.zeros((1, size))
    found = 0.
    for token in tokens:
        try:
            embedding = model_w2v[token]
        except KeyError:
            # Token is not in the vocabulary: leave it out of the average.
            continue
        total += embedding.reshape((1, size))
        found += 1.
    if found:
        total /= found
    return total

def word2vec(tweets):
    """Train a skip-gram Word2Vec model on the tweets and return one averaged vector per tweet.

    Parameters
    ----------
    tweets : pandas.DataFrame
        Must contain a ``text`` column of whitespace-separated tokens.

    Returns
    -------
    pandas.DataFrame
        One row per tweet (in the input's positional order, with a fresh
        default index) and 200 columns, each row being the result of
        ``word_vector`` for that tweet.
    """
    dim = 200  # number of features per tweet

    token_lists = tweets['text'].apply(lambda text: text.split())

    model_w2v = gensim.models.Word2Vec(
        token_lists,
        size=dim,
        window=5,      # context window size
        min_count=2,
        sg=1,          # 1 for skip-gram model
        hs=0,
        negative=10,   # for negative sampling
        workers=2,     # no. of cores
        seed=34,
    )

    # One averaged embedding per tweet, filled row by row.
    embeddings = np.zeros((len(token_lists), dim))
    for row, tokens in enumerate(token_lists):
        embeddings[row, :] = word_vector(tokens, dim, model_w2v)

    return pd.DataFrame(embeddings)

In [36]:
def run_feature_extraction(tweets, flag):
    """Build the feature set selected by ``flag`` from the tweets.

    Parameters
    ----------
    tweets : pandas.DataFrame
        Passed unchanged to the chosen extractor.
    flag : int
        1 = bag of words (``bow``), 2 = TF-IDF (``tf_idf``),
        3 = word2vec (``word2vec``).

    Returns
    -------
    Whatever the chosen extractor returns, or None for any other flag.
    """
    if flag == 1:
        return bow(tweets)
    elif flag == 2:
        return tf_idf(tweets)
    elif flag == 3:
        # Compute the features here instead of returning a notebook global
        # (`wordvec_df`) that is undefined on a fresh kernel.
        return word2vec(tweets)
    else:
        return None

In [3]:
#Directly Load Word2Vec data for 1.6M tweets
# NOTE(review): joblib.load unpickles the file, so only load a trusted wordvec_df.pkl.
# NOTE(review): presumably these features cover the full data set rather than
# the 100000-per-class sample; confirm they are row-aligned with `tweets`
# before splitting.
features = joblib.load('wordvec_df.pkl')  

In [43]:
#Enter the appropriate flag number
# (1 = bag of words, 2 = TF-IDF, 3 = word2vec; see run_feature_extraction).
# This rebinds `features`, replacing anything loaded into it earlier.
features = run_feature_extraction(tweets, 2)
# Uncomment this to Save the model as a pickle in a file 
# joblib.dump(wordvec_df, 'wordvec_df_small.pkl') 

In [None]:
#Split the data
# 80% training / 20% validation; random_state is fixed so the split is
# repeatable. `features` and tweets['label'] must have the same number of rows.
xtrain_data, xvalid_data, ytrain, yvalid = train_test_split(features, tweets['label'],  
                                                            random_state=42, 
                                                            test_size=0.2)

In [29]:
def train_model(xtrain_data, ytrain, flag):
    """Fit the classifier selected by ``flag`` on the training data.

    Parameters
    ----------
    xtrain_data : array-like or sparse matrix
        Training features, passed to the model's ``fit``.
    ytrain : array-like
        Training labels, passed to the model's ``fit``.
    flag : int
        1 = LogisticRegression, 2 = linear-kernel SVC (with probabilities),
        3 = XGBClassifier, 4 = RandomForestClassifier,
        5 = GradientBoostingClassifier.

    Returns
    -------
    The fitted model, or None when ``flag`` is not one of 1-5.
    """
    if flag == 1:
        model = LogisticRegression()
    elif flag == 2:
        model = svm.SVC(kernel='linear', C=1, probability=True)
    elif flag == 3:
        # print(...) with one argument behaves the same on Python 2 and 3.
        print("XG Boost")
        model = XGBClassifier(max_depth=5, n_estimators=2000, verbosity=2, n_jobs = 6)
    elif flag == 4:
        print("Random Forest")
        model = RandomForestClassifier(n_estimators=500, random_state=11, verbose=1)
    elif flag == 5:
        print("Gradient Boosting")
        model = GradientBoostingClassifier(n_estimators=1000, max_depth=5, verbose=1)
    else:
        return None
    # Every supported model is fitted the same way, so do it once here.
    model.fit(xtrain_data, ytrain)
    return model

In [30]:
def predictions(model, xvalid_data, yvalid):
    """Print AUC, accuracy and F1 score of ``model`` on the validation data.

    Parameters
    ----------
    model : fitted classifier
        Must provide ``predict_proba`` (two-dimensional result whose second
        column is used as the score) and ``predict``.
    xvalid_data : array-like or sparse matrix
        Validation features.
    yvalid : array-like
        True validation labels.

    Returns
    -------
    None. The three metrics are only printed.
    """
    prediction_prob = model.predict_proba(xvalid_data)[:,1]
    prediction_int = model.predict(xvalid_data)

    # print(...) with one argument behaves the same on Python 2 and 3.
    print("AUC: "+str(roc_auc_score(yvalid, prediction_prob)))
    print("Accuracy: "+str(accuracy_score(yvalid, prediction_int)))
    print("F1 score: "+str(f1_score(yvalid, prediction_int)))

In [None]:
# Train the model by entering the appropriate flag number and predict
# (flag 4 selects the Random Forest branch of train_model; the metrics are
# printed by predictions on the validation split).
model = train_model(xtrain_data, ytrain, 4)
predictions(model, xvalid_data, yvalid)

Random Forest


In [44]:
# Predict a new tweet in real time

def predict_realtime(tweet, vectorizer_path='tfidf_vectorizer.pkl', model_path='xgb.pkl'):
    """Classify a single raw tweet with a saved vectorizer and model and print the verdict.

    Parameters
    ----------
    tweet : str
        Raw tweet text.
    vectorizer_path : str
        Path of the pickled, fitted vectorizer (default: the file written by
        ``tf_idf``).
    model_path : str
        Path of the pickled classifier; it must provide ``predict_proba``
        and ``predict``.

    Returns
    -------
    None. Prints "Negative Tweet, ..." when the predicted class is 0 and
    "Positive Tweet, ..." otherwise, followed by the positive-class
    probability.
    """
    processed_tweet = preprocess_tweet(tweet)
    # Tokenize the *cleaned* text; the raw tweet was passed here before, so
    # the cleaning step had no effect on the features.
    tokens = tokenization(processed_tweet)
    clean_tweet = stemming(tokens)

    # NOTE: joblib.load unpickles arbitrary objects; only load trusted files.
    tfidf_vectorizer = joblib.load(vectorizer_path)
    features = tfidf_vectorizer.transform([clean_tweet])

    load_model = joblib.load(model_path)

    prediction_prob = load_model.predict_proba(features)[:,1]
    prediction_int = load_model.predict(features)

    # Exactly one tweet was scored, so read its label explicitly.
    if prediction_int[0] == 0:
        print("Negative Tweet, Prediction Probability: "+str(prediction_prob))
    else:
        print("Positive Tweet, Prediction Probability: "+str(prediction_prob))

In [48]:
# Enter a Tweet 
# predict_realtime loads the saved vectorizer and model from disk, so
# 'tfidf_vectorizer.pkl' and 'xgb.pkl' must exist in the working directory.
realtime_tweet = "@ramesh Today I am feeling happy and awesome"
predict_realtime(realtime_tweet)

Positive Tweet, Prediction Probability: [ 0.77577937]
